In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [47]:
# ==========================================================
# 1️⃣ Préparation des données
# ==========================================================
X = pd.read_csv("data/X_final_selected.csv")
y = pd.read_csv("data/y.csv").squeeze()


# 3) Nettoyer les noms de colonnes (évite erreurs XGB)
X.columns = (X.columns.astype(str)
             .str.replace('[', '(', regex=False)
             .str.replace(']', ')', regex=False)
             .str.replace('<', 'inf_', regex=False)
             .str.replace('>', 'sup_', regex=False)
             .str.replace(',', '_', regex=False)
             .str.replace(' ', '_', regex=False))

In [48]:
# 4) Pondération si déséquilibre (auto)
pos_ratio = y.mean()
# évite division par 0/1
scale_pos = (1 - pos_ratio) / pos_ratio if 0 < pos_ratio < 1 else 1.0

In [49]:
# 5) Définir les 2–3 modèles
models = {
    "LightGBM": LGBMClassifier(
        n_estimators=800, learning_rate=0.03,
        num_leaves=63, max_depth=-1, subsample=0.8, colsample_bytree=0.7,
        reg_lambda=1.0, random_state=42,
        class_weight=None if not (0 < pos_ratio < 1) else {0:1, 1:scale_pos}
    ),
    "CatBoost": CatBoostClassifier(
        iterations=1000, learning_rate=0.03, depth=8,
        l2_leaf_reg=3.0, subsample=0.8, random_state=42,
        verbose=0,
        scale_pos_weight=None if not (0 < pos_ratio < 1) else scale_pos
    ),
    "XGBoost": XGBClassifier(
        n_estimators=900, learning_rate=0.03, max_depth=7,
        subsample=0.8, colsample_bytree=0.7,
        reg_lambda=1.0, random_state=42,
        eval_metric="auc", tree_method="hist",
        scale_pos_weight=1.0 if not (0 < pos_ratio < 1) else scale_pos
    )
}

In [50]:
# 6) Validation croisée AUC
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
res = []
for name, model in models.items():
    auc = cross_val_score(model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    res.append((name, auc.mean(), auc.std()))
res = sorted(res, key=lambda t: t[1], reverse=True)

for name, m, s in res:
    print(f"{name:10s}  AUC = {m:.4f} ± {s:.4f}")

CatBoost    AUC = 0.6616 ± 0.0053
LightGBM    AUC = 0.6598 ± 0.0093
XGBoost     AUC = 0.6489 ± 0.0058


In [51]:
# 7) Fit final sur train/test + AUC test (optionnel mais utile)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
best_name = res[0][0]
best_model = models[best_name]
best_model.fit(X_tr, y_tr)
from sklearn.metrics import roc_auc_score
auc_test = roc_auc_score(y_te, best_model.predict_proba(X_te)[:,1])
print(f"\nBest: {best_name} — Test AUC = {auc_test:.4f}")


Best: CatBoost — Test AUC = 0.6590
