# Task 2: Model Building & Training - Fraud Detection

This notebook implements the machine learning pipeline for fraud detection on two datasets:
1. **E-commerce Fraud Data**: Transactional data from an e-commerce platform.
2. **Bank Credit Card Fraud Data**: Anonymized credit card transactions.

**Objective**: Build, train, and evaluate classification models focusing on imbalanced data performance.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_recall_curve, auc, f1_score, confusion_matrix, 
    classification_report, average_precision_score, PrecisionRecallDisplay
)
from xgboost import XGBClassifier

import joblib
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter would also hide solver/library warnings
# raised while fitting the models below - consider narrowing it.
warnings.filterwarnings('ignore')

# Display options: show all DataFrame columns and use the ggplot plot style
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')
# Seed reused for the train/test split, the CV folds and the model constructors
RANDOM_STATE = 42

## 1. Data Preparation

We load the processed datasets and prepare them for modeling. This includes:
- Selecting features and target
- Dropping non-predictive identifiers and non-numeric columns
- Converting boolean columns to integers
- Stratified train-test split

In [None]:
def prepare_data(df, target_col, drop_cols=None):
    """Split ``df`` into stratified train/test feature matrices and targets.

    Args:
        df: DataFrame holding the target column and the candidate features.
        target_col: Name of the label column. It is removed from the features
            and used to stratify the split.
        drop_cols: Optional iterable of extra column names to discard
            (e.g. identifiers). Names that are not present are ignored.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20 split
        seeded with ``RANDOM_STATE``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    X = df.drop(columns=[target_col])
    if drop_cols:
        X = X.drop(columns=[col for col in drop_cols if col in X.columns])

    # Keep only numeric and boolean features. Filtering on ``object`` alone
    # would let string-, categorical- or datetime-typed columns through to the
    # estimators, which cannot fit on them.
    non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns
    if len(non_numeric_cols) > 0:
        X = X.drop(columns=non_numeric_cols)

    # Explicitly convert boolean columns to int
    bool_cols = X.select_dtypes(include=['bool']).columns
    if len(bool_cols) > 0:
        X[bool_cols] = X[bool_cols].astype(int)

    y = df[target_col]

    # Stratify on the label so both splits keep the original class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    return X_train, X_test, y_train, y_test

# Locations of the processed CSVs, relative to the notebook's working directory
FRAUD_DATA_PATH = Path('../data/processed/Fraud_Data_Processed.csv')
CREDIT_DATA_PATH = Path('../data/processed/creditcard_Processed.csv')

# Load both datasets into memory
fraud_df = pd.read_csv(FRAUD_DATA_PATH)
credit_df = pd.read_csv(CREDIT_DATA_PATH)

# Report (rows, columns) of each dataset as loaded
print(f"Fraud Data Shape: {fraud_df.shape}")
print(f"Credit Card Data Shape: {credit_df.shape}")

# E-commerce fraud data: the label column is 'class'.
# user_id is an identifier, not a predictor, so it is dropped explicitly.
# prepare_data additionally drops any remaining string (object) columns.
# NOTE(review): other numeric identifier-like columns, if present in the
# processed CSV, would be kept as features - confirm against the schema.
X_train_f, X_test_f, y_train_f, y_test_f = prepare_data(fraud_df, 'class', drop_cols=['user_id'])

# Credit card data: the label column is 'Class'; no extra columns are dropped
X_train_c, X_test_c, y_train_c, y_test_c = prepare_data(credit_df, 'Class')

# Report the resulting train/test feature-matrix shapes
print("\nData Splits:")
print(f"Fraud Dataset - Train: {X_train_f.shape}, Test: {X_test_f.shape}")
print(f"Credit Card Dataset - Train: {X_train_c.shape}, Test: {X_test_c.shape}")

## 2. Evaluation Helper Function

To ensure consistent evaluation, we define a function that calculates and displays key metrics.

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data, print a report and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Label used in the printed header and the plot title.

    Returns:
        Dict with keys ``'model_name'``, ``'f1'``, ``'auc_pr'`` and
        ``'y_prob'`` (the positive-class probabilities).
    """
    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Threshold-based F1 uses the hard labels; average precision uses the scores
    f1 = f1_score(y_test, hard_labels)
    auc_pr = average_precision_score(y_test, positive_scores)

    print(f"--- {model_name} Evaluation ---")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-PR: {auc_pr:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, hard_labels))

    # Confusion matrix heatmap (rows: actual, columns: predicted)
    plt.figure(figsize=(6, 4))
    heatmap_ax = sns.heatmap(
        confusion_matrix(y_test, hard_labels), annot=True, fmt='d', cmap='Blues'
    )
    heatmap_ax.set_title(f'Confusion Matrix - {model_name}')
    heatmap_ax.set_ylabel('Actual')
    heatmap_ax.set_xlabel('Predicted')
    plt.show()

    summary = {}
    summary['model_name'] = model_name
    summary['f1'] = f1
    summary['auc_pr'] = auc_pr
    summary['y_prob'] = positive_scores
    return summary

## 3. Baseline Model: Logistic Regression

We use `class_weight='balanced'` to handle the high class imbalance.

In [None]:
# Collects one metrics dict per evaluated model; read again when plotting PR curves
results = []

# Baseline on the e-commerce data; class_weight='balanced' counters the class imbalance.
# NOTE(review): no feature scaling happens in this notebook - presumably the
# processed CSVs are already scaled. Confirm, since warnings are globally
# ignored and a non-converged fit would go unnoticed.
print("### Training Logistic Regression on Fraud Data ###")
lr_f = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
lr_f.fit(X_train_f, y_train_f)
results.append(evaluate_model(lr_f, X_test_f, y_test_f, 'Logistic Regression (Fraud Data)'))

# Same baseline configuration on the credit card data
print("\n### Training Logistic Regression on Credit Card Data ###")
lr_c = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
lr_c.fit(X_train_c, y_train_c)
results.append(evaluate_model(lr_c, X_test_c, y_test_c, 'Logistic Regression (Credit Data)'))

## 4. Ensemble Model: XGBoost

XGBoost is used with `scale_pos_weight` to focus on the minority (fraud) class.

In [None]:
def train_xgb(X_train, y_train, X_test, y_test, name):
    """Fit an imbalance-weighted XGBoost classifier and evaluate it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, with 0 for the negative and 1 for the positive class.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.
        name: Dataset label inserted into the reported model name.

    Returns:
        Tuple ``(metrics, model)``: the dict returned by ``evaluate_model``
        and the fitted ``XGBClassifier``.

    Raises:
        ZeroDivisionError: If ``y_train`` contains no positive samples.
    """
    # Negative-to-positive ratio of the training labels, used as scale_pos_weight.
    # Plain ints keep the division raising ZeroDivisionError when there are no positives.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    imbalance_ratio = n_negative / n_positive

    # NOTE(review): use_label_encoder is reported as deprecated in recent
    # xgboost releases - confirm against the installed version.
    hyperparams = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'scale_pos_weight': imbalance_ratio,
        'random_state': RANDOM_STATE,
        'eval_metric': 'aucpr',
        'use_label_encoder': False,
    }
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(X_train, y_train)

    metrics = evaluate_model(classifier, X_test, y_test, f'XGBoost ({name})')
    return metrics, classifier

# Train and evaluate XGBoost on the e-commerce data; keep the fitted model for CV
print("### Training XGBoost on Fraud Data ###")
res_xgb_f, xgb_f = train_xgb(X_train_f, y_train_f, X_test_f, y_test_f, 'Fraud Data')
results.append(res_xgb_f)

# Same procedure on the credit card data
print("\n### Training XGBoost on Credit Card Data ###")
res_xgb_c, xgb_c = train_xgb(X_train_c, y_train_c, X_test_c, y_test_c, 'Credit Data')
results.append(res_xgb_c)

## 5. Cross-Validation

We perform 5-fold stratified cross-validation on the training split to check that model performance is stable across folds.

In [None]:
def run_cv(model, X, y, name):
    """Print mean and standard deviation of 5-fold stratified CV F1 and average precision.

    Both metrics are computed from the same fitted fold models, so each fold
    is trained once.

    Args:
        model: Estimator to cross-validate.
        X: Feature matrix.
        y: Binary labels used for stratification and scoring.
        name: Label shown in the printed header.

    Returns:
        None. Results are printed only.
    """
    # Local import: cross_validate evaluates several scorers from a single
    # fit per fold, whereas one cross_val_score call per metric refits every fold.
    from sklearn.model_selection import cross_validate

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    cv_results = cross_validate(
        model, X, y, cv=skf, scoring=('f1', 'average_precision')
    )
    f1_scores = cv_results['test_f1']
    auc_pr_scores = cv_results['test_average_precision']

    print(f"--- {name} Cross-Validation ---")
    print(f"F1-Score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")
    print(f"AUC-PR: {auc_pr_scores.mean():.4f} (+/- {auc_pr_scores.std():.4f})")
    print()

# Cross-validate on the training splits only, so the held-out test splits stay untouched.
# The already-fitted estimators are passed in; scikit-learn's CV helpers fit
# fresh clones per fold, so only their hyper-parameters are reused.
print("Running CV for Fraud Data models...")
run_cv(lr_f, X_train_f, y_train_f, 'LogReg Fraud')
run_cv(xgb_f, X_train_f, y_train_f, 'XGBoost Fraud')

print("Running CV for Credit Card models...")
run_cv(lr_c, X_train_c, y_train_c, 'LogReg Credit')
run_cv(xgb_c, X_train_c, y_train_c, 'XGBoost Credit')

## 6. Model Comparison

We compare the models' Precision-Recall curves on the test split, with one figure per dataset.

In [None]:
# Precision-recall comparison on the e-commerce fraud test split
plt.figure(figsize=(10, 6))
ax_fraud = plt.gca()
for res in results:
    # Only models evaluated on the e-commerce data belong on this figure
    if 'Fraud Data' not in res['model_name']:
        continue
    label = f"{res['model_name']} (AUC-PR={res['auc_pr']:.3f})"
    PrecisionRecallDisplay.from_predictions(
        y_test_f, res['y_prob'], name=label, ax=ax_fraud
    )

# Horizontal reference line at the positive-class rate of the test labels
ax_fraud.axhline(y=y_test_f.mean(), color='r', linestyle='--', label='No Skill')
ax_fraud.set_title('Precision-Recall Curve Comparison - E-commerce Fraud Data')
ax_fraud.legend()
plt.show()

# Precision-recall comparison on the credit card test split
plt.figure(figsize=(10, 6))
ax_credit = plt.gca()
for res in results:
    # Only models evaluated on the credit card data belong on this figure
    if 'Credit' not in res['model_name']:
        continue
    label = f"{res['model_name']} (AUC-PR={res['auc_pr']:.3f})"
    PrecisionRecallDisplay.from_predictions(
        y_test_c, res['y_prob'], name=label, ax=ax_credit
    )

# Horizontal reference line at the positive-class rate of the test labels
ax_credit.axhline(y=y_test_c.mean(), color='r', linestyle='--', label='No Skill')
ax_credit.set_title('Precision-Recall Curve Comparison - Credit Card Fraud Data')
ax_credit.legend()
plt.show()

## 7. Conclusion & Model Selection

Based on the results above:

- **AUC-PR** is the primary metric for selection as it accounts for the precision-recall trade-off across all thresholds.
- **XGBoost** generally outperforms Logistic Regression on non-linear patterns in the Fraud Data.
- For the **Credit Card** data (PCA features), both models perform exceptionally well, but XGBoost often has a slight edge on recall for very low false positive rates.

### Business Impact:
- **False Positives**: Lead to customer friction (declined legitimate transactions).
- **False Negatives**: Lead to financial loss (missed fraud).

The chosen model allows for threshold tuning to balance these depending on the company's risk appetite.