In [11]:
# --- Setup (uncomment next 2 lines if you need to read your CSV) ---
# CSV_PATH = "./your_data.csv"
# df = pd.read_csv(CSV_PATH)

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [12]:
import re
import numpy as np
import pandas as pd

# --- Helpers de nettoyage ---
_non_num_pat = re.compile(r"[^0-9\-\.,]")  # garde chiffres, signe, point, virgule

def to_numeric_safe(s: pd.Series) -> pd.Series:
    """Convertit en numérique en retirant les caractères parasites (espaces, _, %, etc.)."""
    s = s.astype(str).str.replace(r"\s+", "", regex=True) \
                     .str.replace("_", "", regex=False) \
                     .str.replace("%", "", regex=False)
    s = s.str.replace(_non_num_pat, "", regex=True).str.replace(",", ".", regex=False)
    # Evite '' -> NaN puis cast en float
    s = pd.to_numeric(s.replace({"": np.nan, ".": np.nan, "-": np.nan}), errors="coerce")
    return s

def parse_credit_history_age(col: pd.Series) -> pd.Series:
    """
    Convertit 'X Years Y Months' en nombre total de mois.
    Gère les variantes et parasitages simples.
    """
    def _one(x: str) -> float:
        if pd.isna(x): return np.nan
        x = str(x).lower().replace("_", " ").replace("-", " ")
        yrs = re.search(r"(\d+)\s*year", x)
        mos = re.search(r"(\d+)\s*month", x)
        y = int(yrs.group(1)) if yrs else 0
        m = int(mos.group(1)) if mos else 0
        # fallback: si juste un nombre « 123 », on le prend comme mois déjà
        if not yrs and not mos:
            m_only = re.search(r"\d+", x)
            return float(m_only.group(0)) if m_only else np.nan
        return float(y * 12 + m)
    return col.apply(_one)

def normalize_text(col: pd.Series) -> pd.Series:
    """Nettoie les catégorielles: trim, minuscule, supprime ponctuation légère et underscores."""
    return (col.astype(str)
              .str.strip()
              .str.lower()
              .str.replace("_", " ", regex=False)
              .str.replace(r"[!\|\.]+", "", regex=True)
              .str.replace(r"\s+", " ", regex=True)
              .replace({"nan": np.nan, "none": np.nan, "": np.nan}))


In [15]:

# Lecture (plus robuste)
df = pd.read_csv("train-3.csv", low_memory=False)

# Harmonise les noms de colonnes (trim + retire espaces/underscores doubles)
df.columns = (df.columns.str.strip()
                        .str.replace(r"\s+", "_", regex=True)
                        .str.replace("__+", "_", regex=True))

# 1) Normalisation catégorielles "texte"
text_like = [
    "Occupation","Type_of_Loan","Payment_Behaviour","Payment_of_Min_Amount",
    "Credit_Mix","Credit_Score","SSN","Name","Month","Customer_ID","ID"
]
for c in [col for col in text_like]:
    df[c] = normalize_text(df[c])

# 2) Colonnes attendues numériques -> nettoyage dur
num_should_be = [
    "Age","Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card",
    "Interest_Rate","Delay_from_due_date","Num_Credit_Inquiries","Credit_Utilization_Ratio",
    "Total_EMI_per_month","Amount_invested_monthly","Outstanding_Debt","Monthly_Balance",
    "Num_of_Loan","Changed_Credit_Limit"
]
for c in [col for col in num_should_be]:
    df[c] = to_numeric_safe(df[c])

# 3) Champ spécial: Credit_History_Age -> mois
if "Credit_History_Age" in df.columns:
    df["Credit_History_Age_Months"] = parse_credit_history_age(df["Credit_History_Age"])
    # on peut garder l’original pour debug si tu veux ; sinon:
    df.drop(columns=["Credit_History_Age"], inplace=True)

# 4) Standardise quelques catégorielles clés
if "Payment_of_Min_Amount" in df.columns:
    df["Payment_of_Min_Amount"] = df["Payment_of_Min_Amount"].replace({
        "yes":"yes","y":"yes",
        "no":"no","n":"no",
        "nm":"no"  # souvent « Not Mentioned » -> on l'assimile à 'no' ou mets np.nan si tu préfères
    })

if "Credit_Mix" in df.columns:
    # garde en texte propre; l’encodage viendra plus tard
    df["Credit_Mix"] = df["Credit_Mix"].replace({
        "good":"good","standard":"standard","bad":"bad"
    })

if "Payment_Behaviour" in df.columns:
    # Exemple de simplification légère (retire 'spent', 'avg', etc. si bruit)
    df["Payment_Behaviour"] = df["Payment_Behaviour"].str.replace(r"(spent|avg|high|low)", "", regex=True).str.strip()
    df["Payment_Behaviour"] = df["Payment_Behaviour"].replace({"": np.nan})

# 5) Drop colonnes identifiants / fuites de cible
for col in ["ID","Customer_ID","Name","SSN","Month"]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)


In [16]:


# -----------------------------
# 2) Feature/target split + type inference
# -----------------------------
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Infer categorical vs numerical (heuristic: object/category -> categorical;
# low-cardinality integers can be categorical too if you want. Here we keep ints as numeric.)
cat_cols: List[str] = [c for c in X.columns if X[c].dtype == "object"]
num_cols: List[str] = [c for c in X.columns if c not in cat_cols]

print(f"Dropping columns: {drop_cols}")
print(f"Detected {len(cat_cols)} categorical: {cat_cols[:10]}{' ...' if len(cat_cols)>10 else ''}")
print(f"Detected {len(num_cols)} numerical: {num_cols[:10]}{' ...' if len(num_cols)>10 else ''}")


Dropping columns: ['ID', 'Customer_ID', 'Name', 'SSN', 'Month']
Detected 6 categorical: ['Occupation', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
Detected 16 numerical: ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Changed_Credit_Limit', 'Num_Credit_Inquiries'] ...


In [5]:
# -----------------------------
# 3) Preprocess + Model
# -----------------------------
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    batch_size=256,
    learning_rate="adaptive",
    max_iter=200,
    early_stopping=True,
    n_iter_no_change=15,
    random_state=42,
    verbose=False
)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", clf),
])


In [19]:
# Sépare X / y
assert "Credit_Score" in df.columns, "La colonne cible 'Credit_Score' est introuvable."
y = df["Credit_Score"].astype("category")
X = df.drop(columns=["Credit_Score"])

# Détection robuste des types APRÈS nettoyage
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]
print(f"{len(num_cols)} numériques / {len(cat_cols)} catégorielles")


16 numériques / 6 catégorielles


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
    n_jobs=None
)

clf = Pipeline(steps=[
    ("prep", preproc),
    ("model", LogisticRegression(
        max_iter=2000,
        multi_class="multinomial",
        class_weight="balanced",
        n_jobs=-1
    ))
])


In [21]:
clf.fit(X_train, y_train)



              precision    recall  f1-score   support

        good       0.51      0.86      0.64      3566
        poor       0.74      0.76      0.75      5799
    standard       0.86      0.65      0.74     10635

    accuracy                           0.72     20000
   macro avg       0.70      0.76      0.71     20000
weighted avg       0.76      0.72      0.73     20000

[[3065   40  461]
 [ 663 4436  700]
 [2228 1499 6908]]


In [22]:
# --- Évaluation enrichie (multiclass) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, cohen_kappa_score, roc_auc_score
)

# Prédictions
y_pred = clf.predict(X_test)





In [None]:

# Libellés (ordre fixé pour cohérence)
labels = sorted(pd.unique(pd.concat([pd.Series(y_test), pd.Series(y_pred)], ignore_index=True)))
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_norm = cm / cm.sum(axis=1, keepdims=True)

# --- Métriques globales
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_test, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
    "precision_weighted": precision_score(y_test, y_pred, average="weighted", zero_division=0),
    "recall_weighted": recall_score(y_test, y_pred, average="weighted", zero_division=0),
    "f1_weighted": f1_score(y_test, y_pred, average="weighted", zero_division=0),
    "cohen_kappa": cohen_kappa_score(y_test, y_pred),
}

# ROC-AUC macro (si le modèle expose predict_proba)
try:
    if hasattr(clf, "predict_proba"):
        proba = clf.predict_proba(X_test)
        # Gestion des cibles catégorielles
        y_true_idx = pd.Series(y_test).astype(pd.CategoricalDtype(categories=clf.classes_)).cat.codes.values
        metrics["roc_auc_ovr_macro"] = roc_auc_score(
            y_true_idx,
            proba,
            multi_class="ovr",
            average="macro"
        )
except Exception:
    pass

# --- Rapport détaillé par classe en DataFrame
report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report_dict).transpose()
display(report_df.style.format({
    "precision": "{:.3f}", "recall": "{:.3f}", "f1-score": "{:.3f}", "support": "{:.0f}"
}))

# --- Tableau des métriques globales
metrics_df = pd.DataFrame([metrics]).T.rename(columns={0: "value"})
display(metrics_df.style.format("{:.4f}"))

# --- Matrice de confusion normalisée (%) pour lecture rapide
cm_norm_df = pd.DataFrame(cm_norm, index=[f"true_{l}" for l in labels], columns=[f"pred_{l}" for l in labels])
display(cm_norm_df.applymap(lambda x: f"{x*100:.1f}%"))

# --- Matrice de confusion (image) ---
fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
im = ax.imshow(cm)  # Ne pas fixer de palette pour respecter ta contrainte
ax.set_title("Matrice de confusion (comptes)")
ax.set_xticks(np.arange(len(labels)))
ax.set_yticks(np.arange(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_yticklabels(labels)
ax.set_xlabel("Prédit")
ax.set_ylabel("Vrai")

# Annotations des cases
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, int(cm[i, j]), ha="center", va="center")

fig.tight_layout()

plt.show()