In [1]:
# === 1. IMPORTS ===
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, precision_recall_curve,
    confusion_matrix, classification_report, brier_score_loss
)

# Modèles
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# === 2. LOAD DATA ===
# Feature matrix and target are read from two separate CSV files.
X = pd.read_csv("data/X_final_selected.csv")
# squeeze("columns") turns the single-column frame into a Series; unlike a bare
# squeeze() it never collapses a one-row file all the way down to a scalar.
y = pd.read_csv("data/y.csv").squeeze("columns")

# Fail fast if the feature and target files are out of sync.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)}"

print(f"Shape X: {X.shape}")
print(f"Shape y: {y.shape}")
print("\nDistribution de la cible:")
print(y.value_counts(normalize=True).round(3))


Shape X: (98053, 22)
Shape y: (98053,)

Distribution de la cible:
readmitted
0    0.887
1    0.113
Name: proportion, dtype: float64


In [2]:
# X and y were loaded in an earlier cell (X: ~98k rows x 22 columns; y: binary).
# Give the target Series a uniform name.
y.name = "target"

# Hold out 20% of the rows, keeping the class ratio of y in both parts;
# the fixed seed makes the split repeatable.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(X_train.shape, X_test.shape)
print(y_train.value_counts(normalize=True).round(3))

(78442, 22) (19611, 22)
target
0    0.887
1    0.113
Name: proportion, dtype: float64


In [4]:

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (average_precision_score, roc_auc_score, f1_score,
                             precision_recall_curve, confusion_matrix, classification_report,
                             brier_score_loss)

# Standalone scaler instance. It is no longer placed inside the pipelines below;
# retained only so any later cell that references `scaler` keeps working
# (TODO: remove if nothing downstream uses it).
scaler = StandardScaler()


def _make_logreg_pipeline(penalty):
    """Build an unfitted scale-then-classify pipeline for the given penalty.

    Every call creates its own StandardScaler. Sharing one scaler object
    between several pipelines means that fitting one pipeline overwrites the
    scaler state the other pipeline relies on.
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(penalty=penalty, solver="liblinear",
                                   class_weight="balanced", max_iter=200,
                                   # liblinear shuffles the data; pin the seed
                                   # so repeated runs give the same model.
                                   random_state=42)),
    ])


# Two candidates that differ only in the regularisation penalty.
logreg_L1 = _make_logreg_pipeline("l1")
logreg_L2 = _make_logreg_pipeline("l2")

# 5-fold stratified CV with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Metric label -> scikit-learn scorer name.
scoring = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}

def cv_scores(pipe, name, features=None, target=None):
    """Cross-validate ``pipe`` and print mean ± std of PR-AUC and ROC-AUC.

    Parameters
    ----------
    pipe : estimator
        Passed as-is to ``cross_validate``.
    name : str
        Label printed in front of the scores (right-aligned to 16 characters).
    features, target : optional
        Data to evaluate on. When omitted, the module-level ``X_train`` and
        ``y_train`` are used, which is what the original two-argument call did.

    Returns
    -------
    The object returned by ``cross_validate``; the ``"test_pr_auc"`` and
    ``"test_roc_auc"`` entries are the ones summarised in the printed line.

    Notes
    -----
    The splitter and metrics come from the module-level ``cv`` and ``scoring``.
    """
    # Resolve the defaults at call time so the current training split is used.
    if features is None:
        features = X_train
    if target is None:
        target = y_train

    scores = cross_validate(pipe, features, target, cv=cv, scoring=scoring, n_jobs=-1)
    pr_auc = scores["test_pr_auc"]
    roc_auc = scores["test_roc_auc"]
    print(f"{name:>16} | PR-AUC: {pr_auc.mean():.3f} ± {pr_auc.std():.3f} "
          f"| ROC-AUC: {roc_auc.mean():.3f} ± {roc_auc.std():.3f}")
    return scores

# Cross-validate each penalty variant (each call prints its own summary line).
scores_L1 = cv_scores(logreg_L1, "LogReg L1")
scores_L2 = cv_scores(logreg_L2, "LogReg L2")

# Keep the variant with the higher mean PR-AUC; a tie goes to L1.
mean_pr_L1 = scores_L1["test_pr_auc"].mean()
mean_pr_L2 = scores_L2["test_pr_auc"].mean()
if mean_pr_L1 >= mean_pr_L2:
    best_pipe, best_name = logreg_L1, "LogReg L1"
else:
    best_pipe, best_name = logreg_L2, "LogReg L2"
print("\n=> Meilleur sur PR-AUC (CV):", best_name)


       LogReg L1 | PR-AUC: 0.197 ± 0.005 | ROC-AUC: 0.638 ± 0.007
       LogReg L2 | PR-AUC: 0.197 ± 0.005 | ROC-AUC: 0.638 ± 0.007

=> Meilleur sur PR-AUC (CV): LogReg L1
