# 03 — Modeling

Compare three modeling approaches:
- Isolation Forest (unsupervised)
- Autoencoder (unsupervised)
- Gradient Boosting (supervised; LightGBM/XGBoost/CatBoost)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import sys
sys.path.append('../')

from src.models.isolation_forest import train_isolation_forest, predict_fraud_probability as predict_if
from src.models.autoencoder import train_autoencoder, predict_fraud_probability as predict_ae
from src.models.gradient_boosting import train_lightgbm, predict_proba_gbm
from src.evaluation.metrics import calculate_metrics, print_metrics_report


## 1. Load Processed Data


In [None]:
# Read the processed feature table; the "fraud" column is the target.
df = pd.read_csv("../data/processed/features.csv")
y = df["fraud"]
X = df.drop(columns="fraud")

# Identifier-style columns are not model inputs. Any that are absent from
# the table are skipped rather than raising.
metadata_cols = ['transaction_id', 'customer_id', 'timestamp']
X = X.drop(columns=metadata_cols, errors="ignore")

# Step 1: hold out 20% of the rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: take 20% of the remaining rows as the validation set, which gives
# roughly a 64/16/20 train/val/test partition overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Report split sizes and the positive-class rate of each split.
print(f"Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")
print(f"Fraud rate - Train: {y_train.mean():.2%}, Val: {y_val.mean():.2%}, Test: {y_test.mean():.2%}")


## 2. Train Models


In [None]:
# Fit each candidate model on the training split and score X_test with it.
# The resulting scores (proba_if, proba_ae, proba_lgb) are kept in separate
# variables so the evaluation cell can compare the models side by side.

# Train Isolation Forest
# Fitted on the features only -- y_train is not passed to the trainer.
# NOTE(review): contamination=0.01 is hard-coded; confirm it is in line with
# the train fraud rate printed by the data-loading cell.
print("Training Isolation Forest...")
model_if = train_isolation_forest(X_train, contamination=0.01, random_state=42)
proba_if = predict_if(model_if, X_test)

# Train Autoencoder
# Also fitted without labels; X_val is handed to the trainer alongside X_train.
# NOTE(review): train_autoencoder returns (autoencoder, encoder, scaler) but
# only `autoencoder` is passed to predict_ae. Confirm that
# predict_fraud_probability does not need `scaler` to preprocess X_test.
print("\nTraining Autoencoder...")
autoencoder, encoder, scaler = train_autoencoder(X_train, X_val=X_val, epochs=20, verbose=0)
proba_ae = predict_ae(autoencoder, X_test)

# Train LightGBM
# The only model here that is given labels (y_train / y_val).
# [:, 1] keeps the second column of the returned 2-D output -- presumably the
# positive (fraud) class probability; verify against predict_proba_gbm.
print("\nTraining LightGBM...")
model_lgb = train_lightgbm(X_train, y_train, X_val=X_val, y_val=y_val, verbose_eval=0)
proba_lgb = predict_proba_gbm(model_lgb, X_test, model_type='lightgbm')[:, 1]

print("\nAll models trained!")


## 3. Evaluate Models

Evaluate each model with PR-AUC, Recall@K, and cost-based metrics.


In [None]:
# For each model: binarise its test-set scores at a fixed 0.5 cut-off, pass
# the true labels, hard predictions and raw scores to calculate_metrics, and
# print the resulting report under the model's name.
# NOTE(review): the same 0.5 threshold is applied to all three models. That
# is only meaningful if the Isolation Forest and Autoencoder scores are
# scaled to [0, 1] like the LightGBM probabilities -- confirm in
# src/models, or pick the threshold on the validation split instead.

# Evaluate Isolation Forest
pred_if = (proba_if >= 0.5).astype(int)
metrics_if = calculate_metrics(y_test, pred_if, proba_if)
print_metrics_report(metrics_if, "Isolation Forest")

# Evaluate Autoencoder
pred_ae = (proba_ae >= 0.5).astype(int)
metrics_ae = calculate_metrics(y_test, pred_ae, proba_ae)
print_metrics_report(metrics_ae, "Autoencoder")

# Evaluate LightGBM
pred_lgb = (proba_lgb >= 0.5).astype(int)
metrics_lgb = calculate_metrics(y_test, pred_lgb, proba_lgb)
print_metrics_report(metrics_lgb, "LightGBM")
