In [18]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [19]:
_non_num_pat = re.compile(r"[^0-9\-\.,]")

def to_numeric_safe(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.replace(r"\s+", "", regex=True) \
                     .str.replace("_", "", regex=False) \
                     .str.replace("%", "", regex=False)
    s = s.str.replace(_non_num_pat, "", regex=True).str.replace(",", ".", regex=False)
    s = pd.to_numeric(s.replace({"": np.nan, ".": np.nan, "-": np.nan}), errors="coerce")
    return s

def parse_credit_history_age(col: pd.Series) -> pd.Series:
    """
    Convertir 'X años y X meses ' en meses.
    """
    def _one(x: str) -> float:
        if pd.isna(x): return np.nan
        x = str(x).lower().replace("_", " ").replace("-", " ")
        yrs = re.search(r"(\d+)\s*year", x)
        mos = re.search(r"(\d+)\s*month", x)
        y = int(yrs.group(1)) if yrs else 0
        m = int(mos.group(1)) if mos else 0

        if not yrs and not mos:
            m_only = re.search(r"\d+", x)
            return float(m_only.group(0)) if m_only else np.nan
        return float(y * 12 + m)
    return col.apply(_one)

def normalize_text(col: pd.Series) -> pd.Series:
    return (col.astype(str)
              .str.strip()
              .str.lower()
              .str.replace("_", " ", regex=False)
              .str.replace(r"[!\|\.]+", "", regex=True)
              .str.replace(r"\s+", " ", regex=True)
              .replace({"nan": np.nan, "none": np.nan, "": np.nan}))

In [20]:
df = pd.read_csv("train-3.csv", low_memory=False)

df.columns = (df.columns.str.strip()
                        .str.replace(r"\s+", "_", regex=True)
                        .str.replace("__+", "_", regex=True))

# Normalizar texto
text_like = [
    "Occupation","Type_of_Loan","Payment_Behaviour","Payment_of_Min_Amount",
    "Credit_Mix","Credit_Score","SSN","Name","Month","Customer_ID","ID"
]
for c in [col for col in text_like]:
    df[c] = normalize_text(df[c])

# Columnas numéricas
num_should_be = [
    "Age","Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card",
    "Interest_Rate","Delay_from_due_date","Num_Credit_Inquiries","Credit_Utilization_Ratio",
    "Total_EMI_per_month","Amount_invested_monthly","Outstanding_Debt","Monthly_Balance",
    "Num_of_Loan","Changed_Credit_Limit"
]
for c in [col for col in num_should_be]:
    df[c] = to_numeric_safe(df[c])

# Credit_History_Age -> mois
if "Credit_History_Age" in df.columns:
    df["Credit_History_Age_Months"] = parse_credit_history_age(df["Credit_History_Age"])
    df.drop(columns=["Credit_History_Age"], inplace=True)

# Estandarización
if "Payment_of_Min_Amount" in df.columns:
    df["Payment_of_Min_Amount"] = df["Payment_of_Min_Amount"].replace({
        "yes":"yes","y":"yes",
        "no":"no","n":"no",
        "nm":"no" 
    })

if "Credit_Mix" in df.columns:
    df["Credit_Mix"] = df["Credit_Mix"].replace({
        "good":"good","standard":"standard","bad":"bad"
    })

if "Payment_Behaviour" in df.columns:
    df["Payment_Behaviour"] = df["Payment_Behaviour"].str.replace(r"(spent|avg|high|low)", "", regex=True).str.strip()
    df["Payment_Behaviour"] = df["Payment_Behaviour"].replace({"": np.nan})

for col in ["ID","Customer_ID","Name","SSN","Month"]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

In [22]:

X = df.drop(columns=['Credit_Score'])
y = df['Credit_Score']


cat_cols: List[str] = [c for c in X.columns if X[c].dtype == "object"]
num_cols: List[str] = [c for c in X.columns if c not in cat_cols]


print(f"Detected {len(cat_cols)} categorical: {cat_cols[:10]}{' ...' if len(cat_cols)>10 else ''}")
print(f"Detected {len(num_cols)} numerical: {num_cols[:10]}{' ...' if len(num_cols)>10 else ''}")

Detected 6 categorical: ['Occupation', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
Detected 16 numerical: ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Changed_Credit_Limit', 'Num_Credit_Inquiries'] ...


In [23]:
# Modelo
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    batch_size=256,
    learning_rate="adaptive",
    max_iter=200,
    early_stopping=True,
    n_iter_no_change=15,
    random_state=42,
    verbose=False
)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", clf),
])

In [24]:

assert "Credit_Score" in df.columns, "La colonne cible 'Credit_Score' est introuvable."
y = df["Credit_Score"].astype("category")
X = df.drop(columns=["Credit_Score"])

# Détection robuste des types APRÈS nettoyage
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]
print(f"{len(num_cols)} numériques / {len(cat_cols)} catégorielles")

16 numériques / 6 catégorielles


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
    n_jobs=None
)

clf = Pipeline(steps=[
    ("prep", preproc),
    ("model", LogisticRegression(
        max_iter=2000,
        multi_class="multinomial",
        class_weight="balanced",
        n_jobs=-1
    ))
])

In [None]:
clf.fit(X_train, y_train)



In [None]:
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, cohen_kappa_score, roc_auc_score
)

# Prédictions
y_pred = clf.predict(X_test)