## 4. Ensemble Model Training (Random Forest & XGBoost)

In [None]:
# 4_ensemble_model_training.ipynb

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import GridSearchCV

In [None]:
# 4.1 Load resampled data and test set
def load_parquet(name):
    """Read ``<name>.parquet`` with pandas and return the result.

    ``name`` is the file path without the ``.parquet`` extension; it is
    resolved relative to the current working directory unless it already
    contains a directory component.
    """
    parquet_path = f"{name}.parquet"
    return pd.read_parquet(parquet_path)

def _load_labels(name):
    """Load a label table and flatten its values into a 1-D array."""
    return load_parquet(name).values.ravel()


# Training sets: one feature/label pair per variant (smote, adasyn, tomek, orig).
X_train_smote, y_train_smote = load_parquet("X_train_smote"), _load_labels("y_train_smote")
X_train_adasyn, y_train_adasyn = load_parquet("X_train_adasyn"), _load_labels("y_train_adasyn")
X_train_tomek, y_train_tomek = load_parquet("X_train_tomek"), _load_labels("y_train_tomek")
X_train_orig, y_train_orig = load_parquet("X_train_orig"), _load_labels("y_train_orig")

# Test set shared by every evaluation below.
X_test, y_test = load_parquet("X_test"), _load_labels("y_test")

In [None]:
# 4.2 Define a helper to train & evaluate
def train_and_evaluate(model, X_tr, y_tr, X_te, y_te, label):
    """Fit ``model`` on the training split and print its test-set metrics.

    Output, in order: a header containing ``label``, the classification
    report (4 digits), AUC-ROC, and AUC-PR (area under the precision-recall
    curve). Both AUC values are computed from column 1 of ``predict_proba``.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_tr, y_tr : training features and labels passed to ``model.fit``.
    X_te, y_te : test features and true labels used for every metric.
    label : text shown in the printed header.

    Returns
    -------
    The same ``model`` object that was passed in, after ``fit`` was called
    on it (no copy is made).
    """
    print(f"\n#### {label} ####")
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_te)
    class_probs = model.predict_proba(X_te)
    pos_scores = class_probs[:, 1]  # score of the class in column 1

    print(classification_report(y_te, y_pred, digits=4))

    roc_auc = roc_auc_score(y_te, pos_scores)
    print(f"AUC-ROC: {roc_auc:.4f}")

    # PR-AUC: integrate precision over recall
    prec, rec, _thresholds = precision_recall_curve(y_te, pos_scores)
    pr_auc = auc(rec, prec)
    print(f"AUC-PR: {pr_auc:.4f}")

    return model

In [None]:
# 4.3 Random Forest baselines (default hyperparameters) on each training-set variant
# `rf_default` is an unfitted template; every run trains its own clone.
# train_and_evaluate fits and returns the object it is given, so passing the same
# instance four times would leave rf_smote, rf_adasyn, rf_tomek and rf_orig all
# pointing at ONE estimator trained on the last dataset only.
rf_default = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_smote = train_and_evaluate(clone(rf_default), X_train_smote, y_train_smote, X_test, y_test, "RF + SMOTE (default)")
rf_adasyn = train_and_evaluate(clone(rf_default), X_train_adasyn, y_train_adasyn, X_test, y_test, "RF + ADASYN (default)")
rf_tomek = train_and_evaluate(clone(rf_default), X_train_tomek, y_train_tomek, X_test, y_test, "RF + Tomek (default)")
rf_orig = train_and_evaluate(clone(rf_default), X_train_orig, y_train_orig, X_test, y_test, "RF + Original (default)")


In [None]:
# 4.4 Hyperparameter tuning for XGBoost on the best-performing resampled set (e.g. SMOTE if that was best)
# Base estimator: log-loss eval metric, fixed seed.
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)

# Search space: two values per hyperparameter -> 2**5 = 32 candidates.
param_grid = dict(
    max_depth=[6, 8],
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Use SMOTE set for grid search (as an example)
# NOTE(review): the CV folds are drawn from already-resampled data, so synthetic
# points may land in both the training and validation folds -- CV AUC is
# presumably optimistic; confirm the chosen model against X_test.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_smote, y_train_smote)

In [None]:
# 4.6 Save the best-performing models to disk
# `xgb_smote` was never defined, so this cell raised NameError on a fresh
# Restart & Run All. Bind it to the winner of the 4.4 grid search: with the
# default refit=True, GridSearchCV retrains the best parameter combination on
# the full data passed to fit() (the SMOTE training set).
xgb_smote = grid_search.best_estimator_

joblib.dump(rf_smote, "rf_smote.pkl")
joblib.dump(xgb_smote, "xgb_smote.pkl")