# Task 4: Credit Card Fraud Modeling
This notebook trains and evaluates Logistic Regression and Random Forest models on the credit card fraud dataset using SMOTE for class imbalance.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, precision_recall_curve, auc

from datetime import datetime

sns.set(style='whitegrid')
os.makedirs("outputs/plots", exist_ok=True)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

credit_df = pd.read_csv("/content/drive/MyDrive/fraud-detection-adey-2025/creditcard_cleaned.csv")
print("✅ Loaded credit card data. Shape:", credit_df.shape)

credit_df_proc = credit_df.drop(columns=['Time'])
y_credit = credit_df_proc['Class']
X_credit = credit_df_proc.drop(columns=['Class'])

scaler_credit = StandardScaler()
X_credit['Amount'] = scaler_credit.fit_transform(X_credit[['Amount']])

Mounted at /content/drive
✅ Loaded credit card data. Shape: (283726, 31)


In [3]:
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42
)

print("Before SMOTE:", np.bincount(yc_train))

smote = SMOTE(random_state=42)
Xc_train_res, yc_train_res = smote.fit_resample(Xc_train, yc_train)

print("After SMOTE:", np.bincount(yc_train_res))

Before SMOTE: [226602    378]
After SMOTE: [226602 226602]


In [4]:
def evaluate_model(model, X_test, y_test, model_name="Model", dataset_name="Dataset"):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n🔍 {model_name} on {dataset_name}")
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred, average='binary'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    print("PR AUC:", pr_auc)

    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {model_name} ({dataset_name})')
    plt.legend()
    plt.tight_layout()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{dataset_name.lower()}_{model_name.lower().replace(' ', '_')}_pr_curve_{timestamp}.png"
    path = os.path.join("/content/drive/MyDrive/fraud-detection-adey-2025/outputs/plots", filename)
    plt.savefig(path)
    plt.close()
    print(f"✅ Saved PR Curve: {path}")

In [5]:
log_credit = LogisticRegression(max_iter=1000, random_state=42)
log_credit.fit(Xc_train_res, yc_train_res)
evaluate_model(log_credit, Xc_test, yc_test, "Logistic Regression", "Credit Card")


🔍 Logistic Regression on Credit Card
              precision    recall  f1-score   support

           0     0.9998    0.9741    0.9867     56651
           1     0.0534    0.8737    0.1007        95

    accuracy                         0.9739     56746
   macro avg     0.5266    0.9239    0.5437     56746
weighted avg     0.9982    0.9739    0.9853     56746

F1 Score: 0.10072815533980582
Confusion Matrix:
 [[55181  1470]
 [   12    83]]
PR AUC: 0.7112006665727456
✅ Saved PR Curve: /content/drive/MyDrive/fraud-detection-adey-2025/outputs/plots/credit card_logistic_regression_pr_curve_20250815_161932.png


In [6]:
rf_credit = RandomForestClassifier(n_estimators=100, random_state=42)
rf_credit.fit(Xc_train_res, yc_train_res)
evaluate_model(rf_credit, Xc_test, yc_test, "Random Forest", "Credit Card")


🔍 Random Forest on Credit Card
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9997     56651
           1     0.9114    0.7579    0.8276        95

    accuracy                         0.9995     56746
   macro avg     0.9555    0.8789    0.9137     56746
weighted avg     0.9994    0.9995    0.9994     56746

F1 Score: 0.8275862068965517
Confusion Matrix:
 [[56644     7]
 [   23    72]]
PR AUC: 0.8101896414824552
✅ Saved PR Curve: /content/drive/MyDrive/fraud-detection-adey-2025/outputs/plots/credit card_random_forest_pr_curve_20250815_162621.png
