# notebooks/model_selection.ipynb

In [3]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## 1. Chargement

In [4]:
# Read the cleaned training CSV, then separate the label column (`y`) from the
# remaining feature columns (`X`). `df` itself is left unmodified.
target_column = 'Credit_Score'
df = pd.read_csv(r"../../data/datasets/cleaned_train.csv")
y = df[target_column]
X = df.drop(columns=[target_column])

## 2. Preprocessing

In [5]:
# Group the feature columns by dtype: float64/int64 columns are standardised,
# object columns are one-hot encoded (categories not seen during fit are
# ignored at transform time).
# NOTE: `preprocessor` is assigned again in the next section, which replaces
# the object built here.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['float64', 'int64']).columns

numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## 3. Définition du prétraitement

In [6]:
# Collect the column names of each dtype family as plain Python lists so they
# can be printed and handed to the ColumnTransformer.
categorical_features = X.select_dtypes(include=['object']).columns.to_list()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.to_list()

# Report how many columns fall into each group.
print(f"Colonnes num√©riques ({len(numeric_features)}) : {numeric_features}")
print(f"Colonnes cat√©gorielles ({len(categorical_features)}) : {categorical_features}")

# Numeric columns are standardised; object columns are one-hot encoded, with
# categories unseen at fit time ignored at transform time.
scaling_step = ('num', StandardScaler(), numeric_features)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

Colonnes num√©riques (17) : ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance', 'Credit_History_Months']
Colonnes cat√©gorielles (6) : ['Month', 'Occupation', 'Type_of_Loan', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']


## 4. Test de plusieurs modèles

In [7]:
# Hold out 20% of the rows for validation. The split is stratified on `y` so
# both parts keep the same class proportions, and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate classifiers, keyed by the display name used in the log lines below.
models = {
    "R√©gression Logistique": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

trained_pipelines = {}  # display name -> fitted Pipeline
results = {}            # display name -> accuracy on the validation split

for name, model in models.items():
    print(f"\nüöÄ Entra√Ænement de : {name}...")

    # Pipeline does not copy its steps: passing `preprocessor` directly would
    # make every stored pipeline share one ColumnTransformer that each fit()
    # call refits in place. clone() gives each pipeline its own unfitted copy
    # with the same parameters.
    clf = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    # Fit preprocessing and classifier together on the training split only.
    clf.fit(X_train, y_train)

    # Keep the fitted pipeline so the winner can be retrieved by name afterwards.
    trained_pipelines[name] = clf

    # Score on the held-out validation split.
    y_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc

    print(f"R√©sultat pour {name} : {acc:.4f}")

# Pick the display name with the highest validation accuracy.
best_model_name = max(results, key=results.get)
print(f"\nüèÜ Best model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

# Fitted pipeline (preprocessing + classifier) corresponding to that name.
best_pipeline = trained_pipelines[best_model_name]


üöÄ Entra√Ænement de : R√©gression Logistique...
R√©sultat pour R√©gression Logistique : 0.7226

üöÄ Entra√Ænement de : Random Forest...
R√©sultat pour Random Forest : 0.7622

üöÄ Entra√Ænement de : Gradient Boosting...
R√©sultat pour Gradient Boosting : 0.7026

üèÜ Best model: Random Forest with accuracy 0.7622


In [10]:
# Serialise the selected pipeline (preprocessing + classifier) with joblib.
# The path is relative to the notebook's working directory and points at the
# Backend folder; joblib.dump does not create missing directories, so that
# folder must already exist. (`joblib` is imported in the import cell.)
filename = "../../../Backend/best_model.joblib"
joblib.dump(best_pipeline, filename)

print(f"‚úÖ Model saved locally as {filename}")

‚úÖ Model saved locally as ../../../Backend/best_model.joblib
