# 02 - Modeling

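Train baseline models (logistic regression, random forest and, if installed, XGBoost), run a minimal `RandomizedSearchCV` on the random forest, plot ROC and precision-recall curves, save the tuned model, and finish with a quick SHAP demo.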



In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

# Load raw
# Read the raw CSV; the 'Class' column is the label and every other column is
# used as a feature.
df = pd.read_csv('data/raw/creditcard.csv')
y = df['Class'].astype(int)
X = df.drop('Class', axis=1)

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline estimators, keyed by a short name used in the printed report.
# class_weight='balanced' re-weights the classes for the sklearn models.
models = {
    'log_reg': LogisticRegression(max_iter=500, class_weight='balanced', n_jobs=None),
    'rf': RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42, n_jobs=-1)
}
# XGBoost is optional: only added when the guarded import succeeded.
if HAS_XGB:
    models['xgb'] = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, eval_metric='logloss', tree_method='hist', random_state=42, n_jobs=-1)

# Fit every baseline, report ROC-AUC and PR-AUC on the held-out split, and keep
# each PR-AUC so the winner can be picked without scoring the test set again.
pr_auc_by_model = {}
for name, m in models.items():
    m.fit(X_train, y_train)
    proba = m.predict_proba(X_test)[:, 1]  # probability of the positive class
    roc = roc_auc_score(y_test, proba)
    pr = average_precision_score(y_test, proba)
    pr_auc_by_model[name] = pr
    print(name, 'ROC-AUC=', roc, 'PR-AUC=', pr)

# Best baseline = highest PR-AUC; on ties the first model in insertion order wins.
# NOTE(review): the choice is made on the test split, so the reported score of
# `best` is optimistic; use a separate validation split if that matters.
best = models[max(pr_auc_by_model, key=pr_auc_by_model.get)]

# Minimal RandomizedSearch on RF
# Small hyper-parameter search: 8 random draws from the candidate values below,
# each scored by average precision with 3-fold cross-validation on the
# training split.
param_dist = dict(
    n_estimators=[200, 300, 400, 600],
    max_depth=[None, 6, 8, 12],
    min_samples_split=[2, 5, 10],
)
rs = RandomizedSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
    param_distributions=param_dist,
    n_iter=8,
    scoring='average_precision',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
rs.fit(X_train, y_train)
print('Best RF params:', rs.best_params_)

# Keep the winning random forest for plotting and export.
best_rs = rs.best_estimator_

# Curves
# Score each model once on the held-out split and reuse the probabilities for
# both plots.
curve_scores = [
    (label, mdl.predict_proba(X_test)[:, 1])
    for mdl, label in [(best, 'Best baseline'), (best_rs, 'Tuned RF')]
]

# from_predictions() opens a new figure unless it is given an axes, so both
# models are drawn on one shared axes to overlay them for comparison.
_, roc_ax = plt.subplots()
for label, proba in curve_scores:
    RocCurveDisplay.from_predictions(y_test, proba, name=label, ax=roc_ax)
roc_ax.set_title('ROC Curves')
plt.show()

_, pr_ax = plt.subplots()
for label, proba in curve_scores:
    PrecisionRecallDisplay.from_predictions(y_test, proba, name=label, ax=pr_ax)
pr_ax.set_title('Precision-Recall Curves')
plt.show()

# Export the tuned random forest (best_rs) so it can be loaded elsewhere.
# NOTE(review): the tuned RF is saved unconditionally, even if a baseline
# scored higher above; confirm that is intended.
import joblib

model_path = Path('models/fraud_model.joblib')
# joblib.dump does not create missing directories; make sure 'models/' exists.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rs, model_path)
print('Saved best model to models/fraud_model.joblib')



> SHAP Note: If `shap` is not available in your environment, skip the following cell or install it with `pip install shap`.



In [None]:
# Quick SHAP demo (may be slow on large data)
# Best-effort: any failure (shap missing, explainer error, plotting error) is
# reported and the notebook carries on.
try:
    import shap

    # Draw the sample once so the SHAP values and the feature matrix passed to
    # the plot are the same rows; cap the size so a small test set does not
    # make sample() raise.
    shap_sample = X_test.sample(min(500, len(X_test)), random_state=42)

    explainer = shap.TreeExplainer(best_rs)
    shap_values = explainer.shap_values(shap_sample)

    # For a binary classifier the result is per class: a list of arrays in
    # older shap releases, a single 3-D array in newer ones (version-dependent;
    # confirm against the installed shap). Keep only the positive-class slice
    # so the plot receives a plain (samples x features) matrix.
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    elif getattr(shap_values, 'ndim', 2) == 3:
        shap_values = shap_values[:, :, 1]

    shap.summary_plot(shap_values, shap_sample)
except Exception as e:
    print('SHAP not available or failed:', e)

