In [2]:
# ===============================
# Modeling: Fraud Detection
# ===============================

import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, average_precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler

# -------------------------------
# Load processed data
# -------------------------------
DATA_PATH = "../data/processed/fraud_processed.csv"
fraud = pd.read_csv(DATA_PATH)

# -------------------------------
# Features and target
# -------------------------------
# Columns removed before modeling: the target itself plus five raw columns.
NON_FEATURE_COLUMNS = ['class', 'signup_time', 'purchase_time', 'ip_address', 'device_id', 'user_id']

y = fraud['class']
# One-hot encode whatever categorical columns remain, dropping the first
# level of each so the dummies are not perfectly collinear.
X = pd.get_dummies(fraud.drop(columns=NON_FEATURE_COLUMNS), drop_first=True)

# -------------------------------
# Train-test split
# -------------------------------
# 80/20 split, stratified on the target so both parts keep the class ratio.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -------------------------------
# Scaling
# -------------------------------
# Mean/variance are learned from the training rows only and then applied
# to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------------
# Handle class imbalance with SMOTE
# -------------------------------
# Only the training part is oversampled; the test part is left as-is.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# -------------------------------
# Logistic Regression (Baseline)
# -------------------------------
# Baseline linear model trained on the scaled, SMOTE-resampled training data.
lr = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train_res, y_train_res)

# Positive-class probabilities feed AUC-PR; hard labels feed F1 and the
# confusion matrix.
lr_probs = lr.predict_proba(X_test_scaled)[:,1]
lr_preds = lr.predict(X_test_scaled)

print("Logistic Regression Performance:")
lr_report = (
    ("F1:", f1_score(y_test, lr_preds)),
    ("AUC-PR:", average_precision_score(y_test, lr_probs)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, lr_preds)),
)
for metric_label, metric_value in lr_report:
    print(metric_label, metric_value)

# -------------------------------
# Random Forest with Hyperparameter Tuning
# -------------------------------
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

scorer = make_scorer(f1_score)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scaling and SMOTE live inside the pipeline so they are re-fitted on the
# training part of every fold. Searching on data that was resampled up front
# would put synthetic minority samples (interpolated from training-fold
# neighbours) into the validation folds and inflate the F1 used to pick
# hyperparameters.
rf_pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', rf),
])

grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    # Route every grid entry to the classifier step of the pipeline.
    param_grid={f"clf__{param}": values for param, values in param_grid_rf.items()},
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Fit on the un-resampled training data; the pipeline resamples per fold.
grid_rf.fit(X_train, y_train)

# GridSearchCV refits the winning pipeline on all of X_train by default.
# Its classifier step was therefore trained on standardized, SMOTE-resampled
# features, the same feature space as X_train_res / X_test_scaled, so it can
# be used as a plain fitted classifier from here on.
best_rf = grid_rf.best_estimator_.named_steps['clf']

# Strip the "clf__" routing prefix so the printed names match param_grid_rf.
print(
    "\nBest RF Hyperparameters:",
    {param.split('__', 1)[1]: value for param, value in grid_rf.best_params_.items()},
)

# -------------------------------
# XGBoost with Hyperparameter Tuning
# -------------------------------
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    # The model is trained on SMOTE-resampled data, so the weight must reflect
    # the class ratio *after* resampling. Using the raw y_train ratio here
    # would correct for the imbalance twice and over-weight the positive class.
    scale_pos_weight=(y_train_res == 0).sum() / (y_train_res == 1).sum(),
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1]
}

# Scaling and SMOTE live inside the pipeline so they are re-fitted on the
# training part of every fold. Searching on data that was resampled up front
# would leak synthetic minority samples into the validation folds and
# inflate the F1 used to pick hyperparameters.
xgb_pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb_model),
])

grid_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    # Route every grid entry to the classifier step of the pipeline.
    param_grid={f"clf__{param}": values for param, values in param_grid_xgb.items()},
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Fit on the un-resampled training data; the pipeline resamples per fold.
grid_xgb.fit(X_train, y_train)

# GridSearchCV refits the winning pipeline on all of X_train by default.
# Its classifier step was therefore trained on standardized, SMOTE-resampled
# features, the same feature space as X_train_res / X_test_scaled, so it can
# be used as a plain fitted classifier from here on.
best_xgb = grid_xgb.best_estimator_.named_steps['clf']

# Strip the "clf__" routing prefix so the printed names match param_grid_xgb.
print(
    "\nBest XGBoost Hyperparameters:",
    {param.split('__', 1)[1]: value for param, value in grid_xgb.best_params_.items()},
)

# -------------------------------
# Cross-Validation Metrics
# -------------------------------
models = {
    "Logistic Regression": lr,
    "Random Forest": best_rf,
    "XGBoost": best_xgb
}

cv_results = []

for name, model in models.items():
    # Wrap the classifier so scaling and SMOTE are fitted on the training part
    # of each fold only. Cross-validating directly on the already-resampled
    # data would score the model on synthetic samples derived from its own
    # training fold and report optimistic F1 / AUC-PR. cross_validate clones
    # the pipeline, so the fitted objects in `models` are left untouched.
    fold_pipeline = ImbPipeline(steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('clf', model),
    ])
    scores = cross_validate(
        fold_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring={'F1': make_scorer(f1_score), 'AUC-PR': 'average_precision'},
        return_train_score=False
    )
    # One summary row per model: mean and spread across the folds.
    cv_results.append({
        'Model': name,
        'F1_mean': np.mean(scores['test_F1']),
        'F1_std': np.std(scores['test_F1']),
        'AUC-PR_mean': np.mean(scores['test_AUC-PR']),
        'AUC-PR_std': np.std(scores['test_AUC-PR'])
    })

cv_df = pd.DataFrame(cv_results)
print("\nCross-Validation Results:\n", cv_df)

# -------------------------------
# Model Evaluation on Test Set
# -------------------------------
for name, model in models.items():
    # No refit here: every entry in `models` is already trained on the
    # scaled, SMOTE-resampled training data (the baseline by its own fit call,
    # the tuned models by GridSearchCV's default refit), and cross_validate
    # only ever fits clones. Training again would repeat the same fit and
    # only cost time.
    preds = model.predict(X_test_scaled)
    # Probability of the positive class, used for the ranking metric.
    probs = model.predict_proba(X_test_scaled)[:,1]
    f1 = f1_score(y_test, preds)
    auc_pr = average_precision_score(y_test, probs)
    print(f"\n{name} Test Performance:")
    print("F1:", f1)
    print("AUC-PR:", auc_pr)
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))

# -------------------------------
# Model Selection Justification
# -------------------------------
# NOTE(review): the text below is a static string; it is not computed from
# cv_df or from the test metrics printed above. Re-check the ranking and the
# selected model against the actual numbers after every run.
"""
Based on cross-validation metrics and test performance:
- Logistic Regression: interpretable but lower F1 and AUC-PR
- Random Forest: higher performance, medium interpretability
- XGBoost: best F1 and AUC-PR, slightly less interpretable but manageable with SHAP

=> Selected Model: XGBoost as the best balance of predictive power and interpretability.
"""


ModuleNotFoundError: No module named 'pandas'