In [1]:
# Imports & chargement des données préparées
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report


In [2]:
# Load the training data and separate the label from the feature columns.
df = pd.read_csv("../data/train.csv")

y = df["target"]
X = df.drop(columns=["target", "ID_code"])

# Stratified hold-out split (20% validation); the fixed seed keeps it reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Reuse the scaler persisted on disk instead of fitting a new one here.
# NOTE(review): presumably it was fitted on the training split only — confirm.
scaler = joblib.load("../models/scaler.pkl")
X_train_scaled, X_val_scaled = (
    scaler.transform(part) for part in (X_train, X_val)
)


In [3]:
# Logistic Regression — baseline trained on the scaled features
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" matters here because the classes are imbalanced.
# fit() returns the estimator itself, so construction and training can be chained.
logreg = LogisticRegression(class_weight="balanced", max_iter=2000).fit(
    X_train_scaled, y_train
)

# Keep only column 1 of predict_proba and score it with ROC-AUC on validation.
y_val_proba_lr = logreg.predict_proba(X_val_scaled)[:, 1]
auc_lr = roc_auc_score(y_val, y_val_proba_lr)

auc_lr


0.8598998891036755

In [4]:
# Per-class precision, recall and F1 for the baseline's hard predictions.
y_val_pred_lr = logreg.predict(X_val_scaled)
print(classification_report(y_val, y_val_pred_lr))

              precision    recall  f1-score   support

           0       0.97      0.78      0.87     35980
           1       0.29      0.78      0.42      4020

    accuracy                           0.78     40000
   macro avg       0.63      0.78      0.64     40000
weighted avg       0.90      0.78      0.82     40000



In [5]:
# Random Forest — robust ensemble, trained on the unscaled features
from sklearn.ensemble import RandomForestClassifier

rf_settings = {
    "n_estimators": 300,
    "class_weight": "balanced",
    "n_jobs": -1,        # use every available core
    "random_state": 42,  # reproducible forest
}

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Keep only column 1 of predict_proba and score it with ROC-AUC on validation.
y_val_proba_rf = rf.predict_proba(X_val)[:, 1]
auc_rf = roc_auc_score(y_val, y_val_proba_rf)

auc_rf


0.8187174985273743

In [6]:
# Per-class precision, recall and F1 for the random forest's hard predictions.
# zero_division=0 reports undefined metrics as 0 (the same values the default
# prints) without emitting the repeated undefined-metric warnings. In the
# recorded run the forest appears to predict no positive sample at all, which
# is what makes the precision of class 1 undefined.
y_val_pred_rf = rf.predict(X_val)
print(classification_report(y_val, y_val_pred_rf, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35980
           1       0.00      0.00      0.00      4020

    accuracy                           0.90     40000
   macro avg       0.45      0.50      0.47     40000
weighted avg       0.81      0.90      0.85     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# XGBoost — State-of-the-art (si dispo)
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---- ----------------------------------- 7.9/72.0 MB 38.8 MB/s eta 0:00:02
   --------- ------------------------------ 16.5/72.0 MB 40.0 MB/s eta 0:00:02
   ------------ --------------------------- 23.3/72.0 MB 37.4 MB/s eta 0:00:02
   ---------------- ----------------------- 30.1/72.0 MB 35.6 MB/s eta 0:00:02
   --------------------- ------------------ 39.3/72.0 MB 37.0 MB/s eta 0:00:01
   --------------------------- ------------ 49.0/72.0 MB 38.2 MB/s eta 0:00:01
   -------------------------------- ------- 58.7/72.0 MB 39.2 MB/s eta 0:00:01
   ------------------------------------- -- 67.6/72.0 MB 39.9 MB/s eta 0:00:01
   ---------------------------------------  71.8/72.0 MB 40.4 MB/s eta 0:00:01
   

In [8]:
# Training — XGBoost on the unscaled features
from xgboost import XGBClassifier

# Ratio of negative to positive training labels, passed as scale_pos_weight.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

xgb_settings = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    scale_pos_weight=neg_pos_ratio,
    random_state=42,
)

xgb = XGBClassifier(**xgb_settings)
xgb.fit(X_train, y_train)

# Keep only column 1 of predict_proba and score it with ROC-AUC on validation.
y_val_proba_xgb = xgb.predict_proba(X_val)[:, 1]
auc_xgb = roc_auc_score(y_val, y_val_proba_xgb)

auc_xgb


0.8746944128717171

In [9]:
#LightGBM — Rapide et performant (optionnel)
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 34.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 12.0 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [10]:
# LightGBM on the unscaled features
from lightgbm import LGBMClassifier

lgbm_settings = {
    "n_estimators": 300,
    "class_weight": "balanced",
    "random_state": 42,
}

lgbm = LGBMClassifier(**lgbm_settings)
lgbm.fit(X_train, y_train)

# Keep only column 1 of predict_proba and score it with ROC-AUC on validation.
y_val_proba_lgbm = lgbm.predict_proba(X_val)[:, 1]
auc_lgbm = roc_auc_score(y_val, y_val_proba_lgbm)

auc_lgbm


[LightGBM] [Info] Number of positive: 16078, number of negative: 143922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.458579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


0.8876726843824236

In [11]:
# Model comparison: validation ROC-AUC keyed by model name.
results = {
    "Logistic Regression": auc_lr,
    "Random Forest": auc_rf,
}

# The boosting models are optional, so their score variables exist only if the
# corresponding training cell was run. A missing variable raises NameError;
# catch exactly that so any other failure (or a keyboard interrupt) still
# surfaces instead of being silently swallowed.
try:
    results["XGBoost"] = auc_xgb
except NameError:
    pass

try:
    results["LightGBM"] = auc_lgbm
except NameError:
    pass

results


{'Logistic Regression': 0.8598998891036755,
 'Random Forest': 0.8187174985273743,
 'XGBoost': 0.8746944128717171,
 'LightGBM': 0.8876726843824236}

In [12]:
# Persist the model with the highest validation ROC-AUC.
# Iterating over the items keeps the dict order, so ties resolve to the first
# maximal entry.
best_model_name, best_auc = max(results.items(), key=lambda entry: entry[1])

# Map each scored model name to its fitted estimator; the optional boosting
# models are registered only when they were scored.
fitted_models = {
    "Logistic Regression": logreg,
    "Random Forest": rf,
}
if "XGBoost" in results:
    fitted_models["XGBoost"] = xgb
if "LightGBM" in results:
    fitted_models["LightGBM"] = lgbm

best_model = fitted_models[best_model_name]

joblib.dump(best_model, "../models/best_model.pkl")

best_model_name, best_auc



('LightGBM', 0.8876726843824236)

Plusieurs modèles de classification ont été entraînés et comparés, incluant une
régression logistique en tant que baseline, un Random Forest, ainsi que des
modèles de boosting (XGBoost et LightGBM). L’évaluation a été réalisée à l’aide du
ROC-AUC, métrique adaptée aux données déséquilibrées. Le modèle présentant la
meilleure performance sur le jeu de validation a été sélectionné et sauvegardé
pour une utilisation ultérieure via l’API.

In [13]:
# GridSearchCV for Logistic Regression (quick to run)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

lr = LogisticRegression(class_weight="balanced", max_iter=5000)

# Only the regularisation strength C varies; penalty and solver stay fixed.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["lbfgs"],
)

# 3-fold cross-validation scored by ROC-AUC, run on all available cores.
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_lr.fit(X_train_scaled, y_train)

print("Best params LR:", grid_lr.best_params_)
print("Best CV AUC LR:", grid_lr.best_score_)

# Score the best estimator found by the search on the hold-out validation split.
best_lr = grid_lr.best_estimator_
p_val_lr = best_lr.predict_proba(X_val_scaled)[:, 1]
val_auc_lr = roc_auc_score(y_val, p_val_lr)
print("Val AUC LR:", val_auc_lr)


Fitting 3 folds for each of 4 candidates, totalling 12 fits




Best params LR: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV AUC LR: 0.8584713342274782
Val AUC LR: 0.8599015345728278
