Import all the libraries we need for this project

In [None]:
# fraud_detection_with_improvements.py

import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import joblib
import shap

# Reproducibility
def seed_everything(seed=42):
    """Seed the global NumPy and stdlib RNGs and set PYTHONHASHSEED.

    Args:
        seed: Value passed to ``np.random.seed`` and ``random.seed`` and
            written (as a string) to the ``PYTHONHASHSEED`` environment
            variable.
    """
    # Same order as before: NumPy first, then the stdlib generator.
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(42)

# Load and preprocess data
df = pd.read_csv('The Dataset.csv')
df['Transaction_ID'] = df.index

# Transaction_ID is just the row index; it is kept on `df` for reference but
# excluded from the features so the models cannot learn from row order.
X = df.drop(['Class', 'Transaction_ID'], axis=1)
y = df['Class']

# Train-test split
# Done BEFORE scaling so the scaler statistics come from training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Work on explicit copies so the column assignments below do not write into
# views of `X`.
X_train, X_test = X_train.copy(), X_test.copy()

# One scaler fitted on both columns (StandardScaler standardises each column
# independently), so the `scaler` object holds the statistics for Amount AND
# Time instead of being overwritten by the second fit.
raw_cols = ['Amount', 'Time']
scaled_cols = ['scaled_amount', 'scaled_time']
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train[raw_cols])
test_scaled = scaler.transform(X_test[raw_cols])
for col_pos, col_name in enumerate(scaled_cols):
    X_train[col_name] = train_scaled[:, col_pos]
    X_test[col_name] = test_scaled[:, col_pos]
X_train.drop(raw_cols, axis=1, inplace=True)
X_test.drop(raw_cols, axis=1, inplace=True)

# Keep `X` in the same schema as before (scaled columns, no Amount/Time) in
# case anything else reads it.
X = pd.concat([X_train, X_test]).sort_index()

# Ensemble models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Base learners, all seeded with 42. Each is paired with a short key that is
# later used to look the fitted model up by name.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
lgb_clf = LGBMClassifier(random_state=42)
cat_clf = CatBoostClassifier(verbose=0, random_state=42)

models = list(zip(('xgb', 'lgb', 'cat'), (xgb_clf, lgb_clf, cat_clf)))

# Stratified CV + SMOTE
# For every fold: oversample the training part with SMOTE, fit each base model,
# average their positive-class probabilities on the untouched validation part,
# pick the threshold that maximises F1 there, and record the fold metrics.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []
thresholds = []

print("\n🔍 Starting Stratified 5-Fold Cross Validation with SMOTE + Ensemble...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\n Fold {fold} training...")
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # SMOTE is applied to the training part only; validation rows stay real.
    sm = SMOTE(random_state=42)
    X_tr_res, y_tr_res = sm.fit_resample(X_tr, y_tr)

    # Simple average of the base models' positive-class probabilities.
    fold_preds_val = np.zeros(len(X_val))

    for name, model in tqdm(models, desc="Training models"):
        model.fit(X_tr_res, y_tr_res)
        val_proba = model.predict_proba(X_val)[:, 1]
        fold_preds_val += val_proba / len(models)

    precisions, recalls, thresh = precision_recall_curve(y_val, fold_preds_val)
    # precision_recall_curve returns one more precision/recall point than
    # thresholds; the trailing (precision=1, recall=0) point has no threshold,
    # so it is dropped to keep the F1 array aligned with `thresh`.
    f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-6)
    best_thresh = thresh[np.argmax(f1_scores)]
    thresholds.append(best_thresh)

    # The curve treats score >= threshold as positive, so use >= here as well;
    # a strict > would drop the sample sitting exactly on the chosen threshold.
    val_pred = (fold_preds_val >= best_thresh).astype(int)

    acc = accuracy_score(y_val, val_pred)
    prec = precision_score(y_val, val_pred)
    rec = recall_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred)
    auc = roc_auc_score(y_val, fold_preds_val)
    print(f"Fold {fold} - Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")
    fold_metrics.append((acc, prec, rec, f1, auc))

# Threshold applied later to the test-set predictions.
avg_thresh = np.mean(thresholds)

print("\n Cross-validation complete.")
avg_metrics = np.mean(fold_metrics, axis=0)
print(f"Average Accuracy: {avg_metrics[0]:.4f}, Precision: {avg_metrics[1]:.4f}, Recall: {avg_metrics[2]:.4f}, F1: {avg_metrics[3]:.4f}, AUC: {avg_metrics[4]:.4f}")
print(f"Optimal Threshold (Avg): {avg_thresh:.4f}")

# Final model training on full training data
print("\n Training final ensemble models on full train data with SMOTE...")
X_train_res, y_train_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Out-of-fold base-model probabilities for the meta-model.
# Training the meta-model on predictions the base models make for rows they
# were fitted on (and on synthetic SMOTE rows) leaks the target: those
# predictions are far more confident than anything seen at test time. Instead,
# every real training row is scored by models that never saw it.
stack_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = {name: np.zeros(len(X_train)) for name, _ in models}
for oof_fit_idx, oof_val_idx in stack_cv.split(X_train, y_train):
    # SMOTE only on the fitting part of the fold; held-out rows stay real.
    X_fold_res, y_fold_res = SMOTE(random_state=42).fit_resample(
        X_train.iloc[oof_fit_idx], y_train.iloc[oof_fit_idx]
    )
    for name, model in models:
        model.fit(X_fold_res, y_fold_res)
        oof_preds[name][oof_val_idx] = model.predict_proba(X_train.iloc[oof_val_idx])[:, 1]

base_model_preds_train = [oof_preds[name] for name, _ in models]
base_model_preds_test = []
ensemble_models = {}

# Refit every base model on the full resampled training set; these are the
# models that get saved and that score the test set.
for name, model in models:
    print(f"Training {name}...")
    model.fit(X_train_res, y_train_res)
    ensemble_models[name] = model
    base_model_preds_test.append(model.predict_proba(X_test)[:, 1])

# Train meta-model
print("\n Training meta-model (Logistic Regression)...")
# One column per base model, in the order of `models`.
X_stack_train = np.vstack(base_model_preds_train).T
X_stack_test = np.vstack(base_model_preds_test).T
meta_model = LogisticRegression()
# Out-of-fold rows correspond to the real training rows, so the target is
# y_train (not the SMOTE-resampled labels).
meta_model.fit(X_stack_train, y_train)
final_test_probas = meta_model.predict_proba(X_stack_test)[:, 1]
# NOTE(review): avg_thresh was tuned on the plain average of base-model
# probabilities, not on meta-model output — consider tuning a threshold on the
# meta-model's own out-of-fold probabilities.
# >= matches the convention of precision_recall_curve, from which the
# threshold was derived.
final_test_preds = (final_test_probas >= avg_thresh).astype(int)

# Save models
# Persist the fitted base models, the meta-model and the scaler, each to its
# own pickle file in the current working directory.
artifacts = (
    ("ensemble_models.pkl", ensemble_models),
    ("meta_model.pkl", meta_model),
    ("scaler.pkl", scaler),
)
for artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

print("\n Models and scaler saved.")

# Evaluation on test set
# Label-based metrics use the thresholded predictions; AUC uses the raw
# meta-model probabilities.
print("\n Evaluating final ensemble model on test set...")
test_acc = accuracy_score(y_test, final_test_preds)
print(f"Test Accuracy: {test_acc:.4f}")
test_prec = precision_score(y_test, final_test_preds)
print(f"Test Precision: {test_prec:.4f}")
test_rec = recall_score(y_test, final_test_preds)
print(f"Test Recall: {test_rec:.4f}")
test_f1 = f1_score(y_test, final_test_preds)
print(f"Test F1 Score: {test_f1:.4f}")
test_auc = roc_auc_score(y_test, final_test_probas)
print(f"Test AUC: {test_auc:.4f}")

# Save predictions
# One row per test sample: its index label in X_test, the true class and the
# thresholded prediction.
prediction_columns = {}
prediction_columns['Transaction_ID'] = X_test.index
prediction_columns['Actual'] = y_test.values
prediction_columns['Predicted'] = final_test_preds
result_df = pd.DataFrame(prediction_columns)
result_df.to_csv('test_predictions.csv', index=False)
print("\n Test predictions saved to 'test_predictions.csv'.")

# SHAP explainability
# Explain the fitted LightGBM base model on the test features and draw the
# summary plot.
print("\n Running SHAP analysis on LGBM model...")
lgb_model = ensemble_models['lgb']
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)