Model Building, Training & Performance Profiling

In [None]:
# ========================================================
# Task 2: Model Building, Training & Performance Profiling
# ========================================================

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

# -----------------------------
# Professional Plotting Setup
# -----------------------------
plt.style.use('ggplot')
sns.set_palette("viridis")

# --------------------------------------------------------
# 1️⃣ Load All Processed Data
# --------------------------------------------------------
# Directory holding the processed train/test CSV splits. The path is relative
# to the current working directory, exactly as the original literal paths were.
PROCESSED_DIR = Path("../data/processed")


def _load_split(prefix, data_dir=PROCESSED_DIR):
    """Load the four processed CSV files belonging to one dataset.

    Args:
        prefix: File-name prefix of the dataset (``"fraud"`` or ``"credit"``).
        data_dir: Directory containing ``{prefix}_X_train.csv``,
            ``{prefix}_y_train.csv``, ``{prefix}_X_test.csv`` and
            ``{prefix}_y_test.csv``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Feature sets are
        DataFrames; targets are flattened to 1-D arrays via ``.values.ravel()``.

    Raises:
        FileNotFoundError: If any of the four files is missing. All missing
            paths are reported at once instead of failing on the first one
            after part of the data has already been read.
    """
    data_dir = Path(data_dir)
    parts = ("X_train", "y_train", "X_test", "y_test")
    paths = {part: data_dir / f"{prefix}_{part}.csv" for part in parts}

    # Validate everything up front so the error names every absent file.
    missing = [str(path) for path in paths.values() if not path.is_file()]
    if missing:
        raise FileNotFoundError(
            f"Missing processed file(s) for '{prefix}' "
            f"(looked in {data_dir.resolve()}): {', '.join(missing)}"
        )

    return (
        pd.read_csv(paths["X_train"]),
        pd.read_csv(paths["y_train"]).values.ravel(),
        pd.read_csv(paths["X_test"]),
        pd.read_csv(paths["y_test"]).values.ravel(),
    )


# E-commerce Fraud
X_train_f, y_train_f, X_test_f, y_test_f = _load_split("fraud")

# Bank Credit Card
X_train_c, y_train_c, X_test_c, y_test_c = _load_split("credit")

print("‚úÖ Data Loading Complete.")

# --------------------------------------------------------
# 2️⃣ Master Evaluation Function
# --------------------------------------------------------
def _top_feature_scores(model, X_train, top_n=10):
    """Return the ``top_n`` largest per-feature scores of a fitted model.

    Tree ensembles expose ``feature_importances_``; linear models expose
    ``coef_`` instead, for which the mean absolute coefficient per feature
    is used.

    Returns:
        ``(scores, title)`` where ``scores`` is a ``pd.Series`` indexed by
        feature name, or ``None`` when the model exposes neither attribute
        or the score vector does not line up with the feature columns.
    """
    if hasattr(model, 'feature_importances_'):
        raw = np.asarray(model.feature_importances_)
        title = f"Top {top_n} Feature Importances"
    elif hasattr(model, 'coef_'):
        # coef_ may be 1-D or (n_classes, n_features); collapse to one value
        # per feature.
        # NOTE(review): magnitudes are only comparable across features if the
        # inputs share a common scale - confirm against the preprocessing step.
        raw = np.abs(np.atleast_2d(model.coef_)).mean(axis=0)
        title = f"Top {top_n} |Coefficients|"
    else:
        return None

    # Fall back to positional names when X_train is not a DataFrame.
    columns = getattr(X_train, 'columns', None)
    if columns is None:
        columns = [f"feature_{i}" for i in range(len(raw))]
    if len(columns) != len(raw):
        return None

    return pd.Series(raw, index=columns).nlargest(top_n), title


def run_evaluation(model, X_train, y_train, X_test, y_test, dataset_name):
    """
    Cross-validate, fit and profile a binary classifier on one dataset.

    Steps: 5-fold stratified CV on the training set (scored with
    ``average_precision``), a fit on the full training set, train/test
    PR-AUC, a test classification report and confusion matrix, and a
    three-panel figure (PR curve, ROC curve, top-10 feature scores).

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``. It is fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        dataset_name: Label used in the printed header.

    Returns:
        The same ``model`` instance, fitted on ``(X_train, y_train)``.
    """
    model_name = model.__class__.__name__
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # -----------------------------
    # 1. Cross-Validation (PR-AUC)
    # -----------------------------
    # NOTE(review): if X_train was oversampled before being passed in (the
    # caller's log message mentions SMOTE), validation folds contain synthetic
    # neighbours of training rows and this score is likely optimistic - confirm.
    cv_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='average_precision')

    # -----------------------------
    # 2. Fit and Predict
    # -----------------------------
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the positive-class probability.
    y_train_prob = model.predict_proba(X_train)[:, 1]
    y_test_prob = model.predict_proba(X_test)[:, 1]
    y_test_pred = model.predict(X_test)

    # -----------------------------
    # 3. Metrics
    # -----------------------------
    # PR-AUC via trapezoidal area under the precision-recall curve. This is
    # not the same estimator as the 'average_precision' scorer used for CV,
    # so the CV figure and these two figures are not directly comparable.
    p_train, r_train, _ = precision_recall_curve(y_train, y_train_prob)
    p_test, r_test, _ = precision_recall_curve(y_test, y_test_prob)
    auc_train, auc_test = auc(r_train, p_train), auc(r_test, p_test)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_test_pred)

    print(f"\n--- {model_name} on {dataset_name} ---")
    print(f"CV PR-AUC: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
    print(f"Train vs Test PR-AUC: {auc_train:.4f} / {auc_test:.4f} (Gap: {auc_train-auc_test:.4f})")
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    print("Confusion Matrix (Test):")
    print(cm)

    # -----------------------------
    # 4. Visualizations
    # -----------------------------
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # PR Curve
    axes[0].plot(r_test, p_test, label=f'Test PR-AUC={auc_test:.4f}', color='darkblue', lw=2)
    axes[0].fill_between(r_test, p_test, alpha=0.1, color='blue')
    axes[0].set_title(f"PR Curve: {model_name}")
    axes[0].set_xlabel("Recall")
    axes[0].set_ylabel("Precision")
    axes[0].legend()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_test, y_test_prob, ax=axes[1])
    axes[1].set_title(f"ROC Curve: {model_name}")

    # Feature scores: importances for tree models, |coef| for linear models.
    # Previously models without feature_importances_ left an empty frame here.
    top_features = _top_feature_scores(model, X_train)
    if top_features is not None:
        scores, title = top_features
        scores.plot(kind='barh', ax=axes[2], color='teal')
        axes[2].set_title(title)
    else:
        axes[2].axis('off')
        axes[2].text(0.5, 0.5, "No feature scores available",
                     ha='center', va='center', transform=axes[2].transAxes)

    plt.tight_layout()
    plt.show()

    return model

# --------------------------------------------------------
# 3️⃣ E-commerce Fraud Modeling
# --------------------------------------------------------
print("\nüöÄ Modeling E-commerce Fraud (SMOTE Balanced)")

# Baseline: logistic regression with the iteration cap raised to 1000.
lr_f = run_evaluation(
    model=LogisticRegression(max_iter=1000),
    X_train=X_train_f,
    y_train=y_train_f,
    X_test=X_test_f,
    y_test=y_test_f,
    dataset_name="E-commerce Fraud",
)

# Ensemble: 150 depth-limited trees with a fixed seed for reproducibility.
rf_f = run_evaluation(
    model=RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42),
    X_train=X_train_f,
    y_train=y_train_f,
    X_test=X_test_f,
    y_test=y_test_f,
    dataset_name="E-commerce Fraud",
)

# XGBoost T


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/fraud_X_test.csv'