In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score
)

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("../data/raw/creditcard.csv")

X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
## random forest

rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]


In [4]:
print("Random Forest ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print(classification_report(y_test, y_pred_rf, digits=4))


Random Forest ROC-AUC: 0.952908497036969
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     56864
           1     0.9605    0.7449    0.8391        98

    accuracy                         0.9995     56962
   macro avg     0.9800    0.8724    0.9194     56962
weighted avg     0.9995    0.9995    0.9995     56962



In [5]:
## gradient boosting

In [6]:
gb = GradientBoostingClassifier(random_state=42)

gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
y_proba_gb = gb.predict_proba(X_test)[:, 1]


In [7]:
print("Gradient Boosting ROC-AUC:", roc_auc_score(y_test, y_proba_gb))
print(classification_report(y_test, y_pred_gb, digits=4))


Gradient Boosting ROC-AUC: 0.3468859283302516
              precision    recall  f1-score   support

           0     0.9986    0.9997    0.9992     56864
           1     0.5294    0.1837    0.2727        98

    accuracy                         0.9983     56962
   macro avg     0.7640    0.5917    0.6359     56962
weighted avg     0.9978    0.9983    0.9979     56962



In [8]:
## Isolation Forest (ANOMALY DETECTION)
iso = IsolationForest(
    n_estimators=100,
    contamination=0.0017,  # approx fraud ratio
    random_state=42
)

iso.fit(X_train_scaled)


In [9]:
iso_pred = iso.predict(X_test_scaled)
# Convert: -1 = anomaly (fraud), 1 = normal
iso_pred = np.where(iso_pred == -1, 1, 0)


In [10]:
print(classification_report(y_test, iso_pred, digits=4))


              precision    recall  f1-score   support

           0     0.9989    0.9987    0.9988     56864
           1     0.3113    0.3367    0.3235        98

    accuracy                         0.9976     56962
   macro avg     0.6551    0.6677    0.6612     56962
weighted avg     0.9977    0.9976    0.9976     56962



In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Balanced Logistic Regression (baseline reference)
lr_balanced = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42
)

lr_balanced.fit(X_train_scaled, y_train)

y_proba_bal = lr_balanced.predict_proba(X_test_scaled)[:, 1]
roc_auc_bal = roc_auc_score(y_test, y_proba_bal)

roc_auc_bal


np.float64(0.9720834996210077)

In [15]:
# Compute sample weights to handle imbalance
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(
    class_weight="balanced",
    y=y_train
)

gb_balanced = GradientBoostingClassifier(random_state=42)
gb_balanced.fit(X_train, y_train, sample_weight=sample_weights)

y_proba_gb_bal = gb_balanced.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_proba_gb_bal)


np.float64(0.9772056385159578)

In [16]:
## model comparison table
model_results = pd.DataFrame({
    "Model": [
        "Logistic Regression (Balanced)",
        "Random Forest",
        "Gradient Boosting (Balanced)",
        "Isolation Forest"
    ],
    "ROC-AUC": [
        roc_auc_bal,
        roc_auc_score(y_test, y_proba_rf),
        roc_auc_score(y_test, y_proba_gb_bal),
        np.nan
    ]
})

model_results




Unnamed: 0,Model,ROC-AUC
0,Logistic Regression (Balanced),0.972083
1,Random Forest,0.952908
2,Gradient Boosting (Balanced),0.977206
3,Isolation Forest,
