# notebooks/model_selection.ipynb

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## 1. Chargement

In [17]:
# Locate the training set independently of the kernel's working directory:
# the hard-coded "../../" prefix only resolves from one specific folder, and
# the recorded run of this cell failed with FileNotFoundError.
_dataset_rel = Path("src") / "data" / "datasets" / "cleaned_train.csv"
_legacy_path = Path(r"../../src/data/datasets/cleaned_train.csv")

# Treat the working directory, then each of its ancestors (nearest first), as
# a candidate project root. The original relative path is kept as the
# last-resort candidate, so a missing file still surfaces as
# FileNotFoundError from read_csv rather than being hidden.
data_path = next(
    (
        root / _dataset_rel
        for root in (Path.cwd(), *Path.cwd().parents)
        if (root / _dataset_rel).is_file()
    ),
    _legacy_path,
)

df = pd.read_csv(data_path)

# Separate the feature columns from the target column.
X = df.drop(columns=['Credit_Score'])
y = df['Credit_Score']

FileNotFoundError: [Errno 2] No such file or directory: '../../src/data/datasets/cleaned_train.csv'

## 2. Preprocessing

In [None]:
# Partition the feature columns by dtype: 64-bit numeric columns are scaled,
# object columns are one-hot encoded.
# NOTE(review): section 3 below builds an equivalent ColumnTransformer and
# rebinds `preprocessor`, so this cell looks redundant — confirm which one is
# meant to stay.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include='object').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        # Categories not seen during fit are ignored instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
)

## 3. Définition du prétraitement

In [None]:
# Split the feature columns by dtype: int64/float64 columns go to the scaler,
# object columns go to the one-hot encoder.
# NOTE(review): a column of any other dtype (bool, category, datetime, ...)
# lands in neither list and is then dropped by ColumnTransformer's default
# remainder='drop' — confirm the cleaned dataset contains no such column.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Report what was selected; the accented characters in these labels were
# mis-encoded and are restored here.
print(f"Colonnes numériques ({len(numeric_features)}) : {numeric_features}")
print(f"Colonnes catégorielles ({len(categorical_features)}) : {categorical_features}")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': categories absent at fit time do not raise
        # at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

## 4. Test de plusieurs modèles

In [None]:
# Hold out 20% of the rows for validation. stratify=y asks the split to keep
# the target's class proportions in both parts; random_state makes it
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate classifiers, keyed by the label used in the printed output.
models = {
    "Régression Logistique": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Validation accuracy of each candidate, keyed like `models`.
results = {}

for name, model in models.items():
    print(f"\n🚀 Entraînement de : {name}...")

    # Give every pipeline its own unfitted copy of the preprocessing step.
    # Passing `preprocessor` itself would put the same object in all three
    # pipelines, each fit() refitting it in place.
    clf = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', model),
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    results[name] = acc

    print(f"Résultat pour {name} :")
    print(classification_report(y_val, y_pred))

# Label of the candidate with the highest validation accuracy; on a tie, max()
# returns the first one in insertion order. The score printed below is the
# accuracy_score value stored in `results`.
best_model = max(results, key=results.get)
print(f"\nBest model : {best_model} avec une précision de {results[best_model]:.4f}")