# Machine Learning Analysis - Fraud Detection

This notebook implements machine learning models for fraud detection:
1. Data preprocessing and feature engineering
2. Model training (Logistic Regression, Random Forest, XGBoost, LightGBM)
3. Model evaluation and validation
4. Feature importance analysis
5. Model interpretation


In [None]:
# Import libraries
import warnings
from pathlib import Path

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Notebook-wide settings: silence library warnings so cell output stays
# readable, seed NumPy's global RNG and pick the seaborn grid theme.
warnings.simplefilter('ignore')
np.random.seed(42)
sns.set_style(style="whitegrid")

print("Libraries imported successfully!")


In [None]:
# Load data
# Read the transaction table from disk, then report its size and the
# balance of the `isFraud` target column.
data_path = Path('../../data/fraud_data.csv')
df = pd.read_csv(data_path)
print(f"Data loaded: {df.shape}")

fraud_flag = df['isFraud']
print(f"Fraud rate: {fraud_flag.mean():.4f}")
print(f"Target distribution:\n{fraud_flag.value_counts()}")


## 1. Data Preprocessing


In [None]:
# Select features for modeling
# Candidate columns are filtered against df.columns so the cell also runs on
# datasets that only contain a subset of them.
key_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5',
                'addr1', 'addr2', 'dist1', 'dist2']
key_features = [f for f in key_features if f in df.columns]

# Add up to ten C<n> and ten D<n> columns (the first ten in column order), if available
c_features = [col for col in df.columns if col.startswith('C') and col[1:].isdigit()][:10]
d_features = [col for col in df.columns if col.startswith('D') and col[1:].isdigit()][:10]

# Every name above was taken from (or checked against) df.columns already,
# so no further existence filter is needed here.
features = key_features + c_features + d_features

print(f"Selected {len(features)} features for modeling")

# Prepare data
X = df[features].copy()
y = df['isFraud'].copy()

# Handle infinite and missing values.
# +/-inf is converted to NaN *before* the medians are computed, so infinite
# entries cannot shift the median used for imputation, and a single fill pass
# covers both the originally-missing and the originally-infinite cells.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split in the next cell, so test rows influence the imputation
# values. Fitting the imputation on the training split only would avoid this
# leakage; that requires moving this step after the split.

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
# A non-zero count here means some column had no finite value to take a median from.
print(f"Missing values in X: {X.isnull().sum().sum()}")


In [None]:
# Split data into train and test sets
# 80/20 hold-out, stratified on the target so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for split_name, X_part, y_part in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{split_name} set: {X_part.shape}, Fraud rate: {y_part.mean():.4f}")

# Scale features
# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data preprocessing complete!")


## 2. Model Training


In [None]:
# Model 1: Logistic Regression
# Linear baseline; it is the only model here that is fitted on the
# standardized feature matrices.
print("Training Logistic Regression...")
lr_params = {
    'random_state': 42,
    'max_iter': 1000,
    'class_weight': 'balanced',
}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train_scaled, y_train)

# Hard labels for the classification report, class-1 probabilities for AUC.
lr_pred = lr_model.predict(X_test_scaled)
lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression trained!")
lr_auc = roc_auc_score(y_test, lr_pred_proba)
print(f"AUC-ROC: {lr_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_pred))


In [None]:
# Model 2: Random Forest
# Fitted on the unscaled feature frames (X_train / X_test).
print("Training Random Forest...")
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hard labels for the classification report, class-1 probabilities for AUC.
rf_pred = rf_model.predict(X_test)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest trained!")
rf_auc = roc_auc_score(y_test, rf_pred_proba)
print(f"AUC-ROC: {rf_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))


In [None]:
# Model 3: XGBoost
# Fitted on the unscaled feature frames. The positive class is up-weighted by
# the negative-to-positive count ratio of the training labels.
print("Training XGBoost...")
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'scale_pos_weight': n_negative / n_positive,
    'eval_metric': 'auc',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Hard labels for the classification report, class-1 probabilities for AUC.
xgb_pred = xgb_model.predict(X_test)
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost trained!")
xgb_auc = roc_auc_score(y_test, xgb_pred_proba)
print(f"AUC-ROC: {xgb_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, xgb_pred))


In [None]:
# Model 4: LightGBM
# Fitted on the unscaled feature frames; verbose=-1 keeps the training log quiet.
print("Training LightGBM...")
lgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'class_weight': 'balanced',
    'random_state': 42,
    'verbose': -1,
}
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Hard labels for the classification report, class-1 probabilities for AUC.
lgb_pred = lgb_model.predict(X_test)
lgb_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

print("LightGBM trained!")
lgb_auc = roc_auc_score(y_test, lgb_pred_proba)
print(f"AUC-ROC: {lgb_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lgb_pred))


## 3. Model Evaluation


In [None]:
# Compare models
# Each entry maps a display name to (class-1 probabilities, hard labels) on the test set.
models = {
    'Logistic Regression': (lr_pred_proba, lr_pred),
    'Random Forest': (rf_pred_proba, rf_pred),
    'XGBoost': (xgb_pred_proba, xgb_pred),
    'LightGBM': (lgb_pred_proba, lgb_pred)
}

# savefig does not create missing directories, so make sure the target exists.
roc_fig_path = Path('../../outputs/figures/roc_curves.png')
roc_fig_path.parent.mkdir(parents=True, exist_ok=True)

# ROC curve + comparison table: every model is scored once and the same
# numbers feed both the plot legend and the table.
comparison = []
fig, ax = plt.subplots(figsize=(10, 8))
for name, (pred_proba, pred) in models.items():
    fpr, tpr, _thresholds = roc_curve(y_test, pred_proba)
    auc = roc_auc_score(y_test, pred_proba)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})')
    comparison.append({
        'Model': name,
        'AUC-ROC': auc,
        # Average precision summarizes the precision-recall curve; it is reported
        # next to accuracy because accuracy alone is dominated by the majority
        # (non-fraud) class on an imbalanced target.
        'PR-AUC': average_precision_score(y_test, pred_proba),
        'Accuracy': (pred == y_test).mean()
    })

ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(roc_fig_path, dpi=300, bbox_inches='tight')
plt.show()

# Model comparison table
comparison_df = pd.DataFrame(comparison)
print("Model Comparison:")
print("="*80)
display(comparison_df.sort_values('AUC-ROC', ascending=False))


In [None]:
# Feature importance (using Random Forest and XGBoost)
# Both panels are built by the same loop: rank the fitted model's
# feature_importances_ and draw the top entries as a horizontal bar chart.
TOP_N = 20

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

importance_tables = {}
panels = [('Random Forest', rf_model), ('XGBoost', xgb_model)]
for ax, (model_label, fitted_model) in zip(axes, panels):
    table = pd.DataFrame({
        'feature': X_train.columns,
        'importance': fitted_model.feature_importances_
    }).sort_values('importance', ascending=False).head(TOP_N)
    importance_tables[model_label] = table

    positions = range(len(table))
    ax.barh(positions, table['importance'])
    ax.set_yticks(positions)
    ax.set_yticklabels(table['feature'])
    ax.set_xlabel('Importance')
    # Use the real row count so the title stays accurate with fewer than TOP_N features.
    ax.set_title(f'{model_label} - Top {len(table)} Feature Importance')
    ax.invert_yaxis()  # most important feature at the top

# Keep the original variable names available for later cells.
rf_importance = importance_tables['Random Forest']
xgb_importance = importance_tables['XGBoost']

# savefig does not create missing directories, so make sure the target exists.
importance_fig_path = Path('../../outputs/figures/feature_importance.png')
importance_fig_path.parent.mkdir(parents=True, exist_ok=True)

fig.tight_layout()
fig.savefig(importance_fig_path, dpi=300, bbox_inches='tight')
plt.show()

print("Top 10 Features (Random Forest):")
display(rf_importance.head(10))
print("\nTop 10 Features (XGBoost):")
display(xgb_importance.head(10))


In [None]:
# Save best model
# The winner is chosen from the measured test-set AUC-ROC rather than assumed.
# NOTE(review): selecting on the test set makes the reported "best" score
# slightly optimistic; a separate validation split would avoid that.
candidates = {
    'Logistic Regression': (lr_model, lr_pred_proba),
    'Random Forest': (rf_model, rf_pred_proba),
    'XGBoost': (xgb_model, xgb_pred_proba),
    'LightGBM': (lgb_model, lgb_pred_proba),
}
auc_by_model = {
    name: roc_auc_score(y_test, proba)
    for name, (_fitted, proba) in candidates.items()
}
best_name = max(auc_by_model, key=auc_by_model.get)
best_model = candidates[best_name][0]
best_auc = auc_by_model[best_name]

# joblib.dump does not create missing directories, so make sure the target exists.
model_dir = Path('../../outputs/models')
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_dir / 'best_model.pkl')
joblib.dump(scaler, model_dir / 'scaler.pkl')
print("Best model saved to outputs/models/best_model.pkl")

# Only Logistic Regression was fitted on scaled inputs; make explicit whether
# the saved scaler has to be applied before calling the saved model.
if best_name == 'Logistic Regression':
    print("Note: apply outputs/models/scaler.pkl to the features before predicting with this model.")
else:
    print("Note: this model was trained on unscaled features; do not apply scaler.pkl before predicting.")

# Summary
print("\n" + "="*80)
print("MACHINE LEARNING ANALYSIS SUMMARY")
print("="*80)
print(f"\nBest Model: {best_name}")
print(f"Best AUC-ROC: {best_auc:.4f}")
print(f"\nModels trained: {len(candidates)}")
print(f"Features used: {len(features)}")
print("\nAnalysis complete!")
print("="*80)
