In [1]:
# # 02 - Entraînement des modèles de crédit scoring
#
# Objectifs :
# - Entraîner plusieurs modèles (baseline + modèle avancé)
# - Gérer le déséquilibre de classes
# - Utiliser la validation croisée et l'AUC
# - Définir une métrique métier (coût FN/FP)
# - Optimiser le seuil de décision
# - Tracker les expériences avec MLflow et exporter le modèle final


import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

import mlflow
import mlflow.sklearn

# pour importer les modules du dossier src/
sys.path.append("..")

from src.data_prep import build_datasets, build_preprocessor
from src.metrics import compute_classic_metrics, business_cost, cost_curve

# ## 1. Chargement des données et préparation de base


train_df, test_df = build_datasets()

X = train_df.drop(columns=["TARGET"])
y = train_df["TARGET"]

X.shape, y.shape




((307511, 192), (307511,))

In [2]:
# ## 2. Déséquilibre des classes

n_pos = int((y == 1).sum())
n_neg = int((y == 0).sum())
imbalance_ratio = n_neg / n_pos

print("Nombre de bons payeurs (0):", n_neg)
print("Nombre de mauvais payeurs (1):", n_pos)
print("Ratio négatifs/positifs (scale_pos_weight):", imbalance_ratio)

Nombre de bons payeurs (0): 282686
Nombre de mauvais payeurs (1): 24825
Ratio négatifs/positifs (scale_pos_weight): 11.387150050352467


In [3]:
# ## 3. Échantillon pour l'expérimentation
#
# Pour limiter le temps de calcul, la validation croisée est faite
# sur un échantillon stratifié de 50 000 clients.
# Le modèle final sera ensuite ré-entraîné sur toutes les données.


sample_size = 50000
X_small = X.sample(n=min(sample_size, len(X)), random_state=42)
y_small = y.loc[X_small.index]

X_small.shape, y_small.shape

((50000, 192), (50000,))

In [4]:
# ## 4. Définition des modèles
#
# - Régression logistique (baseline)
# - LightGBM (modèle avancé)


logreg_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1,
)

lgbm_model = LGBMClassifier(
    n_estimators=200,  # valeur modérée pour limiter le temps de calcul
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    scale_pos_weight=imbalance_ratio,
    n_jobs=-1,
    random_state=42,
)

models = {
    "logreg_baseline": logreg_model,
    "lgbm_baseline": lgbm_model,
}

models

{'logreg_baseline': LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1),
 'lgbm_baseline': LGBMClassifier(colsample_bytree=0.8, learning_rate=0.05, n_estimators=200,
                n_jobs=-1, objective='binary', random_state=42,
                scale_pos_weight=11.387150050352467, subsample=0.8)}

In [8]:
# # 02 - Entraînement des modèles de crédit scoring
#
# Objectifs :
# - Entraîner plusieurs modèles (baseline + modèle avancé)
# - Gérer le déséquilibre de classes
# - Utiliser la validation croisée et l'AUC
# - Définir une métrique métier (coût FN/FP)
# - Optimiser le seuil de décision
# - Tracker les expériences avec MLflow et exporter le modèle final


import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

import mlflow
import mlflow.sklearn

# pour importer les modules du dossier src/
sys.path.append("..")

from src.data_prep import build_datasets, build_preprocessor
from src.metrics import compute_classic_metrics, business_cost, cost_curve

# ## 1. Chargement des données et préparation de base


train_df, test_df = build_datasets()

X = train_df.drop(columns=["TARGET"])
y = train_df["TARGET"]

X.shape, y.shape

preprocessor = build_preprocessor(train_df)
preprocessor

# ## 2. Déséquilibre des classes


n_pos = int((y == 1).sum())
n_neg = int((y == 0).sum())
imbalance_ratio = n_neg / n_pos

print("Nombre de bons payeurs (0):", n_neg)
print("Nombre de mauvais payeurs (1):", n_pos)
print("Ratio négatifs/positifs (scale_pos_weight):", imbalance_ratio)

# ## 3. Échantillon pour l'expérimentation
#
# Pour limiter le temps de calcul, la validation croisée est faite
# sur un échantillon stratifié de 50 000 clients.
# Le modèle final sera ensuite ré-entraîné sur toutes les données.


sample_size = 50000
X_small = X.sample(n=min(sample_size, len(X)), random_state=42)
y_small = y.loc[X_small.index]

X_small.shape, y_small.shape

# ## 4. Définition des modèles
#
# - Régression logistique (baseline)
# - LightGBM (modèle avancé)


logreg_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1,
)

lgbm_model = LGBMClassifier(
    n_estimators=200,  # valeur modérée pour limiter le temps de calcul
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    scale_pos_weight=imbalance_ratio,
    n_jobs=-1,
    random_state=42,
)

models = {
    "logreg_baseline": logreg_model,
    "lgbm_baseline": lgbm_model,
}

models

# ## 5. Validation croisée (AUC) + tracking MLflow
#
# - StratifiedKFold pour respecter le déséquilibre
# - AUC comme métrique principale
# - Tracking MLflow : paramètres, métriques, modèle


# IMPORTANT : on pointe sur le mlruns à la racine du projet
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("credit_scoring")

mlflow.get_tracking_uri()

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

results_auc = {}

for name, model in models.items():
    print(f"\n=== Modèle : {name} ===")

    pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model),
    ])

    with mlflow.start_run(run_name=name):
        # Validation croisée sur l'échantillon
        scores = cross_val_score(
            pipeline,
            X_small,
            y_small,
            cv=cv,
            scoring="roc_auc",
            n_jobs=1,  # évite les problèmes de parallélisation sur Windows
        )
        auc_mean = float(scores.mean())
        auc_std = float(scores.std())

        results_auc[name] = (auc_mean, auc_std)

        # Log des principaux hyperparamètres
        mlflow.log_param("model_type", name)

        if isinstance(model, LogisticRegression):
            mlflow.log_param("max_iter", model.max_iter)
            mlflow.log_param("class_weight", "balanced")

        if isinstance(model, LGBMClassifier):
            mlflow.log_param("n_estimators", model.n_estimators)
            mlflow.log_param("learning_rate", model.learning_rate)
            mlflow.log_param("num_leaves", model.num_leaves)
            mlflow.log_param("subsample", model.subsample)
            mlflow.log_param("colsample_bytree", model.colsample_bytree)
            mlflow.log_param("scale_pos_weight", imbalance_ratio)

        # Log des métriques de CV
        mlflow.log_metric("cv_auc_mean", auc_mean)
        mlflow.log_metric("cv_auc_std", auc_std)

        # Entraîne le pipeline sur tout l'échantillon pour logguer un modèle
        pipeline.fit(X_small, y_small)
        mlflow.sklearn.log_model(pipeline, artifact_path="model")

        print("AUC moyenne (3-fold):", auc_mean)
        print("Écart-type AUC:", auc_std)

results_auc


Nombre de bons payeurs (0): 282686
Nombre de mauvais payeurs (1): 24825
Ratio négatifs/positifs (scale_pos_weight): 11.387150050352467

=== Modèle : logreg_baseline ===




AUC moyenne (3-fold): 0.6347927347356171
Écart-type AUC: 0.005508871059131385

=== Modèle : lgbm_baseline ===
[LightGBM] [Info] Number of positive: 2682, number of negative: 30651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21349
[LightGBM] [Info] Number of data points in the train set: 33333, number of used features: 285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080461 -> initscore=-2.436103
[LightGBM] [Info] Start training from score -2.436103




[LightGBM] [Info] Number of positive: 2682, number of negative: 30651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21337
[LightGBM] [Info] Number of data points in the train set: 33333, number of used features: 285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080461 -> initscore=-2.436103
[LightGBM] [Info] Start training from score -2.436103




[LightGBM] [Info] Number of positive: 2682, number of negative: 30652
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21342
[LightGBM] [Info] Number of data points in the train set: 33334, number of used features: 285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080458 -> initscore=-2.436135
[LightGBM] [Info] Start training from score -2.436135




[LightGBM] [Info] Number of positive: 4023, number of negative: 45977
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22067
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 287
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080460 -> initscore=-2.436113
[LightGBM] [Info] Start training from score -2.436113




AUC moyenne (3-fold): 0.7448715111676378
Écart-type AUC: 0.004166649015972984


{'logreg_baseline': (0.6347927347356171, 0.005508871059131385),
 'lgbm_baseline': (0.7448715111676378, 0.004166649015972984)}

In [10]:
# ## 6. Split train/validation pour le modèle final LightGBM


X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.shape, X_valid.shape

((246008, 192), (61503, 192))

In [11]:
# ## 7. Pipeline LightGBM final (sur le train) et métriques classiques
lgbm_final = LGBMClassifier(
    n_estimators=500,  # un peu plus élevé pour le modèle final
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    scale_pos_weight=imbalance_ratio,
    n_jobs=-1,
    random_state=42,
)

final_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", lgbm_final),
])
final_pipeline

final_pipeline.fit(X_train, y_train)
# Probabilités de défaut sur le jeu de validation
y_valid_proba = final_pipeline.predict_proba(X_valid)[:, 1]

# Métriques classiques pour le seuil 0.5
metrics_05 = compute_classic_metrics(
    y_true=y_valid,
    y_proba=y_valid_proba,
    threshold=0.5,
)
cost_05, conf_05 = business_cost(
    y_true=y_valid,
    y_proba=y_valid_proba,
    threshold=0.5,
    cost_fn=10.0,
    cost_fp=1.0,
    normalize=True,
)
metrics_05, cost_05, conf_05

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.192220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23444
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 298
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482




({'auc': np.float64(0.7679064000196357),
  'precision': 0.17520187548840843,
  'recall': 0.6773413897280967,
  'f1': 0.27839403973509935},
 np.float64(0.5178934360925483),
 {'tn': np.int64(40706),
  'fp': np.int64(15832),
  'fn': np.int64(1602),
  'tp': np.int64(3363)})