# Model Building and Training for Fraud Detection

This notebook trains, evaluates, and saves fraud-detection models using preprocessed train/test splits derived from two datasets:
1. Fraud_Data.csv - E-commerce transaction data
2. creditcard.csv - Bank transaction data

We will build and compare two models for each dataset:
1. Logistic Regression - As a simple, interpretable baseline
2. XGBoost - As a powerful ensemble model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import joblib

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))

# Import custom modules
from model import (
    train_logistic_regression, train_xgboost, evaluate_model,
    plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve,
    save_model
)

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only ship 'seaborn-whitegrid'. Use whichever name this installation
# provides so the cell does not abort a Restart & Run All on older Matplotlib.
_whitegrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    # Neither bundled style is available; fall back to seaborn's own whitegrid.
    sns.set_style('whitegrid')
sns.set_palette('viridis')

# Display all columns
pd.set_option('display.max_columns', None)

## 1. Load the Preprocessed Data

In [None]:
# NOTE: joblib.load unpickles its input, so these files must come from this
# project's own preprocessing step and never from an untrusted source.

# Fraud_Data artifacts, listed in the order they are unpacked below
fraud_files = (
    '../data/processed/X_train_fraud.pkl',
    '../data/processed/X_test_fraud.pkl',
    '../data/processed/y_train_fraud.pkl',
    '../data/processed/y_test_fraud.pkl',
    '../data/processed/X_train_fraud_resampled.pkl',
    '../data/processed/y_train_fraud_resampled.pkl',
)
(
    X_train_fraud, X_test_fraud,
    y_train_fraud, y_test_fraud,
    X_train_fraud_resampled, y_train_fraud_resampled,
) = map(joblib.load, fraud_files)

# Creditcard artifacts, same layout as the Fraud_Data ones
cc_files = (
    '../data/processed/X_train_cc.pkl',
    '../data/processed/X_test_cc.pkl',
    '../data/processed/y_train_cc.pkl',
    '../data/processed/y_test_cc.pkl',
    '../data/processed/X_train_cc_resampled.pkl',
    '../data/processed/y_train_cc_resampled.pkl',
)
(
    X_train_cc, X_test_cc,
    y_train_cc, y_test_cc,
    X_train_cc_resampled, y_train_cc_resampled,
) = map(joblib.load, cc_files)

# Report the shape of every feature matrix as a quick sanity check
print(f"Fraud_Data: X_train shape = {X_train_fraud.shape}, X_test shape = {X_test_fraud.shape}")
print(f"Fraud_Data (resampled): X_train shape = {X_train_fraud_resampled.shape}")
print(f"Creditcard: X_train shape = {X_train_cc.shape}, X_test shape = {X_test_cc.shape}")
print(f"Creditcard (resampled): X_train shape = {X_train_cc_resampled.shape}")

## 2. Train Models for Fraud_Data

In [None]:
# Both Fraud_Data models are fitted on the same resampled training pair
fraud_train_data = (X_train_fraud_resampled, y_train_fraud_resampled)

# Logistic Regression baseline
lr_fraud = train_logistic_regression(*fraud_train_data)

# XGBoost model
xgb_fraud = train_xgboost(*fraud_train_data)

In [None]:
# Evaluate the Logistic Regression model on the Fraud_Data test split
lr_fraud_title = 'Logistic Regression (Fraud_Data)'
lr_fraud_metrics = evaluate_model(lr_fraud, X_test_fraud, y_test_fraud)
print("Logistic Regression Metrics:")
for metric, value in lr_fraud_metrics.items():
    # The confusion matrix is plotted below rather than printed as a number
    if metric != 'confusion_matrix':
        print(f"{metric}: {value:.4f}")

# Plot confusion matrix
plot_confusion_matrix(lr_fraud_metrics['confusion_matrix'], lr_fraud_title)

# Score the test split once and reuse the result for both curves instead of
# running inference twice. Column 1 of predict_proba is the class-1 score
# (assumed to be the fraud class — confirm against the label encoding).
lr_fraud_proba = lr_fraud.predict_proba(X_test_fraud)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test_fraud, lr_fraud_proba, lr_fraud_title)

# Plot Precision-Recall curve
plot_precision_recall_curve(y_test_fraud, lr_fraud_proba, lr_fraud_title)

In [None]:
# Evaluate the XGBoost model on the Fraud_Data test split
xgb_fraud_title = 'XGBoost (Fraud_Data)'
xgb_fraud_metrics = evaluate_model(xgb_fraud, X_test_fraud, y_test_fraud)
print("XGBoost Metrics:")
for metric, value in xgb_fraud_metrics.items():
    # The confusion matrix is plotted below rather than printed as a number
    if metric != 'confusion_matrix':
        print(f"{metric}: {value:.4f}")

# Plot confusion matrix
plot_confusion_matrix(xgb_fraud_metrics['confusion_matrix'], xgb_fraud_title)

# Score the test split once and reuse the result for both curves instead of
# running inference twice. Column 1 of predict_proba is the class-1 score
# (assumed to be the fraud class — confirm against the label encoding).
xgb_fraud_proba = xgb_fraud.predict_proba(X_test_fraud)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test_fraud, xgb_fraud_proba, xgb_fraud_title)

# Plot Precision-Recall curve
plot_precision_recall_curve(y_test_fraud, xgb_fraud_proba, xgb_fraud_title)

In [None]:
# Compare models for Fraud_Data
# Row label shown in the table -> key in the dict returned by evaluate_model
metric_labels = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1_score',
    'ROC AUC': 'roc_auc',
    'PR AUC': 'pr_auc',
}
# Column header -> metrics dict of the corresponding model
fraud_results = {
    'Logistic Regression': lr_fraud_metrics,
    'XGBoost': xgb_fraud_metrics,
}

# One column per model, one row per metric
fraud_metrics = pd.DataFrame({
    model_name: {label: metrics[key] for label, key in metric_labels.items()}
    for model_name, metrics in fraud_results.items()
})

fraud_metrics

In [None]:
# Save the models
# Hand both Fraud_Data models to the project's save_model helper under the
# names 'lr_fraud' and 'xgb_fraud'.
# NOTE(review): the destination and file format are decided inside save_model
# (imported from the `model` module), not in this notebook — confirm there
# before relying on specific artifact paths.
save_model(lr_fraud, 'lr_fraud')
save_model(xgb_fraud, 'xgb_fraud')

## 3. Train Models for Creditcard

In [None]:
# Both Creditcard models are fitted on the same resampled training pair
cc_train_data = (X_train_cc_resampled, y_train_cc_resampled)

# Logistic Regression baseline
lr_cc = train_logistic_regression(*cc_train_data)

# XGBoost model
xgb_cc = train_xgboost(*cc_train_data)

In [None]:
# Evaluate the Logistic Regression model on the Creditcard test split
lr_cc_title = 'Logistic Regression (Creditcard)'
lr_cc_metrics = evaluate_model(lr_cc, X_test_cc, y_test_cc)
print("Logistic Regression Metrics:")
for metric, value in lr_cc_metrics.items():
    # The confusion matrix is plotted below rather than printed as a number
    if metric != 'confusion_matrix':
        print(f"{metric}: {value:.4f}")

# Plot confusion matrix
plot_confusion_matrix(lr_cc_metrics['confusion_matrix'], lr_cc_title)

# Score the test split once and reuse the result for both curves instead of
# running inference twice. Column 1 of predict_proba is the class-1 score
# (assumed to be the fraud class — confirm against the label encoding).
lr_cc_proba = lr_cc.predict_proba(X_test_cc)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test_cc, lr_cc_proba, lr_cc_title)

# Plot Precision-Recall curve
plot_precision_recall_curve(y_test_cc, lr_cc_proba, lr_cc_title)

In [None]:
# Evaluate the XGBoost model on the Creditcard test split
xgb_cc_title = 'XGBoost (Creditcard)'
xgb_cc_metrics = evaluate_model(xgb_cc, X_test_cc, y_test_cc)
print("XGBoost Metrics:")
for metric, value in xgb_cc_metrics.items():
    # The confusion matrix is plotted below rather than printed as a number
    if metric != 'confusion_matrix':
        print(f"{metric}: {value:.4f}")

# Plot confusion matrix
plot_confusion_matrix(xgb_cc_metrics['confusion_matrix'], xgb_cc_title)

# Score the test split once and reuse the result for both curves instead of
# running inference twice. Column 1 of predict_proba is the class-1 score
# (assumed to be the fraud class — confirm against the label encoding).
xgb_cc_proba = xgb_cc.predict_proba(X_test_cc)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test_cc, xgb_cc_proba, xgb_cc_title)

# Plot Precision-Recall curve
plot_precision_recall_curve(y_test_cc, xgb_cc_proba, xgb_cc_title)

In [None]:
# Compare models for Creditcard
# Row label shown in the table -> key in the dict returned by evaluate_model
metric_labels = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1_score',
    'ROC AUC': 'roc_auc',
    'PR AUC': 'pr_auc',
}
# Column header -> metrics dict of the corresponding model
cc_results = {
    'Logistic Regression': lr_cc_metrics,
    'XGBoost': xgb_cc_metrics,
}

# One column per model, one row per metric
cc_metrics = pd.DataFrame({
    model_name: {label: metrics[key] for label, key in metric_labels.items()}
    for model_name, metrics in cc_results.items()
})

cc_metrics

In [None]:
# Save the models
# Hand both Creditcard models to the project's save_model helper under the
# names 'lr_cc' and 'xgb_cc'.
# NOTE(review): the destination and file format are decided inside save_model
# (imported from the `model` module), not in this notebook — confirm there
# before relying on specific artifact paths.
save_model(lr_cc, 'lr_cc')
save_model(xgb_cc, 'xgb_cc')

## 4. Model Selection and Justification

### Fraud_Data

**TODO:** After running the notebook, analyze the Fraud_Data results above and justify the choice of the best model here.

### Creditcard

**TODO:** After running the notebook, analyze the Creditcard results above and justify the choice of the best model here.