In [1]:
%pip install pandas numpy seaborn matplotlib scikit-learn joblib tqdm xgboost lightgbm

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import os
import warnings
from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

Note: you may need to restart the kernel to use updated packages.


In [2]:

# Suppress all Python warnings from this point on in the session.
warnings.filterwarnings("ignore")

# Load dataset
# Both CSV files are expected to sit directly under `data_dir`.
data_dir = "./assets"
train_transaction_path = os.path.join(data_dir, "train_transaction.csv")
train_identity_path = os.path.join(data_dir, "train_identity.csv")

# Fail fast with a single readable error if either input file is absent,
# before any (potentially slow) CSV parsing starts.
required_paths = (train_transaction_path, train_identity_path)
if not all(os.path.exists(csv_path) for csv_path in required_paths):
    raise FileNotFoundError("One or both dataset files are missing. Please check the file paths.")

# Transaction table first, then the identity table.
df_train = pd.read_csv(train_transaction_path)
df_identity = pd.read_csv(train_identity_path)


In [3]:

# Merge datasets
# Left join on TransactionID: every row of df_train is kept, and rows with no
# match in df_identity get NaN in the identity columns.
df = df_train.merge(df_identity, on="TransactionID", how="left")

# Handle missing values
# -999 is used as a sentinel in every column (numeric and object alike).
# Assigning the result instead of using inplace=True keeps the cell a plain
# "inputs -> new frame" step that is safe to re-run.
df = df.fillna(-999)

# Encode categorical variables
# Object columns are cast to str first, so the -999 sentinel becomes the
# category "-999" and mixed str/int columns encode without a type error.
cat_cols = df.select_dtypes(include=['object']).columns

# Keep each fitted encoder, keyed by column name. Without them the
# category -> integer mapping used for training cannot be reproduced when the
# trained model is later applied to new, raw data.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder


In [4]:

# Feature selection
# Every column except the row identifier and the target is used as a feature.
excluded_columns = {"TransactionID", "isFraud"}
features = [col for col in df.columns if col not in excluded_columns]
X, y = df[features], df["isFraud"]

# Train-test split
# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize data
# The scaler statistics are learned from the training part only and then
# applied to both parts; the results are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [5]:

# Train Supervised Models with progress bar
# Seed every stochastic estimator so the metrics below -- and therefore the
# choice of best model -- are repeatable across kernel restarts. The value
# matches the random_state already used for the train/test split.
RANDOM_STATE = 42

# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# confirm against the installed version before removing it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE),
}

# Running record of the best candidate seen so far (ranked by ROC AUC).
best_model = None
best_model_name = None
best_roc_auc = 0

# NOTE(review): candidates are ranked on X_test/y_test, the same split that is
# later used to report the winner's score, so that score is optimistically
# biased. A separate validation split would give an unbiased estimate.
for name, model in tqdm(models.items(), desc="Training Models"):
    model.fit(X_train, y_train)

    # Hard labels feed the classification report; the positive-class
    # probability (column 1) feeds the threshold-independent ROC AUC.
    y_pred = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    print(f"\n{name} Evaluation:")
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc)

    # Strict ">" keeps the earlier model when two candidates tie.
    if roc_auc > best_roc_auc:
        best_roc_auc = roc_auc
        best_model = model
        best_model_name = name

print(f"\nBest model: {best_model_name} (ROC AUC = {best_roc_auc:.4f})")


Training Models:  25%|██▌       | 1/4 [02:08<06:25, 128.40s/it]


Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98    113975
           1       0.76      0.14      0.24      4133

    accuracy                           0.97    118108
   macro avg       0.86      0.57      0.61    118108
weighted avg       0.96      0.97      0.96    118108

ROC AUC: 0.8357124959008557


Training Models:  50%|█████     | 2/4 [04:55<05:02, 151.26s/it]


Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.94      0.45      0.61      4133

    accuracy                           0.98    118108
   macro avg       0.96      0.72      0.80    118108
weighted avg       0.98      0.98      0.98    118108

ROC AUC: 0.9290264498366365


Training Models:  75%|███████▌  | 3/4 [05:50<01:47, 107.37s/it]


XGBoost Evaluation:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.90      0.50      0.64      4133

    accuracy                           0.98    118108
   macro avg       0.94      0.75      0.81    118108
weighted avg       0.98      0.98      0.98    118108

ROC AUC: 0.9409962474419986
[LightGBM] [Info] Number of positive: 16530, number of negative: 455902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.083583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39130
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 431
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317101
[LightGBM] [Info] Start training from score -3.317101


Training Models: 100%|██████████| 4/4 [06:37<00:00, 99.46s/it] 


LightGBM Evaluation:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.88      0.44      0.59      4133

    accuracy                           0.98    118108
   macro avg       0.93      0.72      0.79    118108
weighted avg       0.98      0.98      0.97    118108

ROC AUC: 0.9278149553237716





In [6]:

# Save the best model
# Persist the selected estimator together with the fitted scaler; the dict
# preserves insertion order, so the model file is written first.
artifacts = {
    "best_fraud_detection_model.pkl": best_model,
    "scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("\nBest model saved successfully!")

# Evaluate the best model on the test set
# Hard labels for the classification report, positive-class probabilities
# (column 1) for ROC AUC.
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

print("\nBest Model Test Evaluation:")
print(classification_report(y_true=y_test, y_pred=y_pred_best))
print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=y_pred_proba_best))


Best model saved successfully!

Best Model Test Evaluation:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.90      0.50      0.64      4133

    accuracy                           0.98    118108
   macro avg       0.94      0.75      0.81    118108
weighted avg       0.98      0.98      0.98    118108

ROC AUC: 0.9409962474419986
