In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib


## Load data

In [2]:
# Read the processed fraud CSV, then report its dimensions and how many rows
# fall into each value of the `class` column.
processed_csv = '../data/processed/Fraud_Data_Processed.csv'
df = pd.read_csv(processed_csv)

frame_shape = df.shape
class_counts = df['class'].value_counts()
print(frame_shape)
print(class_counts)


(151112, 157)
class
0    136961
1     14151
Name: count, dtype: int64


## Split Features & Target

In [3]:
# Separate the label column from the remaining columns; `df` itself is left
# untouched because `drop` returns a new frame.
target_name = 'class'
y = df[target_name]
X = df.drop(columns=[target_name])


## Handle Class Imbalance

In [None]:
# Oversample with SMOTE using a fixed seed, then print the class proportions
# before and after resampling.
# NOTE(review): X_res / y_res contain synthetic rows generated by SMOTE; any
# score computed on data drawn from them will be optimistic — confirm they are
# only ever used for fitting, never for held-out evaluation.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_res, y_res = resampled

share_before = y.value_counts(normalize=True)
share_after = pd.Series(y_res).value_counts(normalize=True)
print("Before SMOTE:", share_before)
print("After SMOTE:", share_after)


## Train–Test Split

In [None]:
# Hold out the test set from the ORIGINAL data first, and only then oversample
# the training portion. Splitting data that has already been through SMOTE
# puts synthetic rows (interpolated from rows that land in the training set)
# into the test set and evaluates on an artificially balanced class mix, which
# inflates every reported metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# X_test / y_test keep the real class imbalance; only the training rows are
# balanced. A dedicated sampler is built here so this cell does not depend on
# sampler state from any other cell.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

# Sanity checks: the split partitions the original rows, and resampling never
# removes training rows (SMOTE only adds to the minority class).
assert len(X_train_raw) + len(X_test) == len(X)
assert len(X_train) >= len(X_train_raw)


## Baseline Model — Logistic Regression

In [None]:
# Baseline model: standardise numeric columns, one-hot encode categorical
# columns, fit a class-weighted logistic regression on the training split and
# score it on X_test / y_test.
num_cols = X_train.select_dtypes(include=['int64','float64']).columns
cat_cols = X_train.select_dtypes(include=['object','category']).columns

# remainder='passthrough': ColumnTransformer drops every column not named in a
# transformer by default. Columns whose dtype matches neither selector above
# (for example bool indicator columns, which read_csv produces for True/False
# values) would otherwise vanish from the model without any warning.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='passthrough',
)

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])

lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)
# Second probability column = the second entry of the model's classes_.
y_proba = lr_pipeline.predict_proba(X_test)[:,1]

# Area under the precision-recall curve, integrated over the curve's points.
precision, recall, _ = precision_recall_curve(y_test, y_proba)
auc_pr = auc(recall, precision)
print(f"LogReg F1 = {f1_score(y_test,y_pred):.4f} AUC-PR = {auc_pr:.4f}")

# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, fmt='d', ax=ax)
ax.set(title='LR Confusion Matrix', xlabel='Predicted', ylabel='Actual');


## Ensemble Model — Random Forest

In [None]:
# Ensemble model: same preprocessing recipe as the baseline, followed by a
# depth-limited, class-weighted random forest.
#
# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make this pipeline and any other pipeline built from it share ONE transformer
# instance — fitting either would silently refit the other's preprocessing.
# clone() gives this pipeline its own unfitted copy with identical parameters.
rf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(
        n_estimators=150, max_depth=10,
        class_weight='balanced', random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
# Second probability column = the second entry of the model's classes_.
y_proba_rf = rf_pipeline.predict_proba(X_test)[:,1]

# Area under the precision-recall curve, integrated over the curve's points.
precision, recall, _ = precision_recall_curve(y_test, y_proba_rf)
auc_pr_rf = auc(recall, precision)
print(f"RandomForest F1 = {f1_score(y_test,y_pred_rf):.4f} AUC-PR = {auc_pr_rf:.4f}")

# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test,y_pred_rf), annot=True, fmt='d', ax=ax)
ax.set(title='RF Confusion Matrix', xlabel='Predicted', ylabel='Actual');


## Stratified K-Fold Cross-Validation

In [None]:
# 5-fold stratified cross-validation of both models, scored by F1.
#
# Oversampling must happen INSIDE each fold. Cross-validating on data that was
# resampled up front lets synthetic rows derived from a validation fold's
# neighbours sit in the training folds (and vice versa), and scores every fold
# on an artificially balanced class mix. Wrapping SMOTE in an imblearn
# Pipeline applies it during fit only, so each validation fold stays made of
# original, untouched rows.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score clones the estimator for every fold, so the already-fitted
# lr_pipeline / rf_pipeline objects are not modified by these runs.
lr_cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', lr_pipeline),
])
rf_cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', rf_pipeline),
])

lr_cv = cross_val_score(lr_cv_pipeline, X, y, cv=skf, scoring='f1')
rf_cv = cross_val_score(rf_cv_pipeline, X, y, cv=skf, scoring='f1')

print(f"LogReg CV F1 = {lr_cv.mean():.4f} ± {lr_cv.std():.4f}")
print(f"RF CV F1 = {rf_cv.mean():.4f} ± {rf_cv.std():.4f}")


## Comparison & Save Model

In [None]:
# Side-by-side summary of the held-out and cross-validated scores for both
# models (one row per model).
comparison = pd.DataFrame({
    'Model':['Logistic Regression','Random Forest'],
    'F1':[f1_score(y_test,y_pred),f1_score(y_test,y_pred_rf)],
    'AUC-PR':[auc_pr,auc_pr_rf],
    'CV Mean F1':[lr_cv.mean(),rf_cv.mean()],
    'CV Std F1':[lr_cv.std(),rf_cv.std()]
})
print(comparison)

# Persist the fitted random forest pipeline. The target directory is created
# first: joblib.dump raises FileNotFoundError when the parent folder is
# missing (e.g. on a fresh checkout), which would otherwise lose the result of
# every cell above at the very last step.
model_path = Path('../models/random_forest_fraud.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_pipeline, model_path)
print("✅ Saved Random Forest model to models/random_forest_fraud.pkl")
