# notebooks/model_selection.ipynb

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

## 1. Chargement

In [20]:
# Load the cleaned training set, then separate the label column
# ('Credit_Score') from the predictor columns.
df = pd.read_csv(r"../../data/datasets/cleaned_train.csv")
y = df['Credit_Score']
X = df.drop('Credit_Score', axis=1)

## 2. Preprocessing

In [21]:
# Group the feature columns by dtype so that each group can be routed to its
# own transformer.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Standardise the numeric columns and one-hot encode the string columns.
# NOTE: `preprocessor` is assigned again in a later cell from the same dtype
# selection, so the instance built here is superseded before it is used.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

## 3. Définition du prétraitement

In [22]:
# Partition the feature columns by dtype. Only int64/float64 columns count as
# numeric and only object columns as categorical; a column of any other dtype
# ends up in neither list.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Show both lists so the split can be checked by eye.
# (The accented characters in these messages were mis-encoded; restored here.)
print(f"Colonnes numériques ({len(numeric_features)}) : {numeric_features}")
print(f"Colonnes catégorielles ({len(categorical_features)}) : {categorical_features}")

# Standardise numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.
# NOTE: this replaces the `preprocessor` object created by the earlier
# preprocessing cell.
# NOTE(review): no `remainder` is passed, so columns listed in neither group
# are presumably dropped (scikit-learn's documented default) -- confirm that
# this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

Colonnes numériques (17) : ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance', 'Credit_History_Months']
Colonnes catégorielles (6) : ['Month', 'Occupation', 'Type_of_Loan', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']


## 4. Test de plusieurs modèles

In [23]:
# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Candidate classifiers, keyed by the display name used in the printed report.
# (The accented characters in the names/messages below were mis-encoded;
# restored here.)
models = {
    "Régression Logistique": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Validation accuracy for each model name.
results = {}

for name, model in models.items():
    print(f"\n🚀 Entraînement de : {name}...")

    # Chain preprocessing and the estimator so that both are fitted on the
    # training split only.
    # NOTE: every pipeline receives the same `preprocessor` object, so that
    # object is refitted on each pass through the loop.
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    results[name] = acc

    print(f"Résultat pour {name} :")
    print(classification_report(y_val, y_pred))

# Pick the name with the highest validation accuracy; on a tie, max() keeps
# the first such entry in dict order. The figure labelled "précision" in the
# message is the accuracy_score value, not a per-class precision.
best_model = max(results, key=results.get)
print(f"\nBest model : {best_model} avec une précision de {results[best_model]:.4f}")


🚀 Entraînement de : Régression Logistique...
Résultat pour Régression Logistique :
              precision    recall  f1-score   support

        Good       0.58      0.60      0.59      3289
        Poor       0.78      0.68      0.73      5355
    Standard       0.74      0.78      0.76      9836

    accuracy                           0.72     18480
   macro avg       0.70      0.69      0.69     18480
weighted avg       0.73      0.72      0.72     18480


🚀 Entraînement de : Random Forest...
Résultat pour Random Forest :
              precision    recall  f1-score   support

        Good       0.69      0.68      0.68      3289
        Poor       0.77      0.75      0.76      5355
    Standard       0.78      0.80      0.79      9836

    accuracy                           0.76     18480
   macro avg       0.75      0.74      0.74     18480
weighted avg       0.76      0.76      0.76     18480


🚀 Entraînement de : Gradient Boosting...
RÃ©sultat pour Gradient Boos