# notebooks/model_selection.ipynb

In [155]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score



## 1. Chargement

In [156]:
df = pd.read_csv(r"../../data/datasets/cleaned_train.csv")

cols = ['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Outstanding_Debt', 'Num_Credit_Card', 'Credit_History_Months']
df = df[cols + ['Credit_Score']]


In [157]:
df['Debt_Income_Ratio'] = df['Outstanding_Debt'] / (df['Annual_Income'] + 1)
df['Cards_per_Account'] = df['Num_Credit_Card'] / (df['Num_Bank_Accounts'] + 1)
df['Credit_Age_Years'] = df['Credit_History_Months'] / 12


In [158]:
df['Annual_Income'] = np.log1p(df['Annual_Income'])
df['Outstanding_Debt'] = np.log1p(df['Outstanding_Debt'])

In [159]:
df['Income_Bin'] = pd.qcut(df['Annual_Income'], q=5, duplicates='drop')
df['Debt_Bin'] = pd.qcut(df['Outstanding_Debt'], q=5, duplicates='drop')
df['Credit_Age_Bin'] = pd.cut(df['Credit_History_Months'], bins=[0, 12, 36, 60, 120, 300])


In [160]:
df['Income_per_Account'] = df['Annual_Income'] / (df['Num_Bank_Accounts'] + 1)
df['Debt_per_Card'] = df['Outstanding_Debt'] / (df['Num_Credit_Card'] + 1)


In [161]:
X = df.drop(columns=['Credit_Score'])
X = X.replace([np.inf, -np.inf], np.nan)

y = df['Credit_Score']

## 2. Preprocessing

In [162]:
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

## 3. D√©finition du pr√©traitement

In [163]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Colonnes num√©riques ({len(numeric_features)}) : {numeric_features}")
print(f"Colonnes cat√©gorielles ({len(categorical_features)}) : {categorical_features}")

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

Colonnes num√©riques (11) : ['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Outstanding_Debt', 'Num_Credit_Card', 'Credit_History_Months', 'Debt_Income_Ratio', 'Cards_per_Account', 'Credit_Age_Years', 'Income_per_Account', 'Debt_per_Card']
Colonnes cat√©gorielles (0) : []


In [164]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

## 4. Test de plusieurs mod√®les

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

models = {
    "R√©gression Logistique": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "xgboost": XGBClassifier(n_estimators=800,max_depth=6,learning_rate=0.03,subsample=0.85,colsample_bytree=0.85,min_child_weight=6,gamma=0.3,reg_alpha=0.2,reg_lambda=1.5,objective="multi:softprob", num_class=3,eval_metric="mlogloss",random_state=42),
    "Extra Trees" : ExtraTreesClassifier(n_estimators=800,max_depth=20,min_samples_leaf=2,min_samples_split=5,max_features="sqrt",random_state=42,n_jobs=-1)
}

trained_pipelines = {} 
results = {}

for name, model in models.items():
    print(f"\nüöÄ Entra√Ænement de : {name}...")
    
    # Create the pipeline
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train
    clf.fit(X_train, y_train)
    
    # Store the TRAINED pipeline in our dictionary
    trained_pipelines[name] = clf  
    
    # Evaluate
    y_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc
    
    print(f"R√©sultat pour {name} : {acc:.4f}")

# Find the name of the best model
best_model_name = max(results, key=results.get)
print(f"\nüèÜ Best model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

# Retrieve the actual model object using the name
best_pipeline = trained_pipelines[best_model_name]


üöÄ Entra√Ænement de : R√©gression Logistique...
R√©sultat pour R√©gression Logistique : 0.5937

üöÄ Entra√Ænement de : Random Forest...
R√©sultat pour Random Forest : 0.7960

üöÄ Entra√Ænement de : Gradient Boosting...
R√©sultat pour Gradient Boosting : 0.6666

üöÄ Entra√Ænement de : xgboost...
R√©sultat pour xgboost : 0.7146

üöÄ Entra√Ænement de : Extra Trees...
R√©sultat pour Extra Trees : 0.7504

üèÜ Best model: Random Forest with accuracy 0.7960


In [166]:
import joblib
# Save locally to the root or a 'models' folder
filename = "../../../Backend/best_model.joblib"
joblib.dump(best_pipeline, filename)

print(f"‚úÖ Model saved locally as {filename}")

‚úÖ Model saved locally as ../../../Backend/best_model.joblib
