In [32]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
import pickle

In [9]:
# Locate the processed training set. The project root is taken to be the
# parent of the directory the notebook is executed from.
now = Path.cwd()
root = now.parent
path = root.joinpath("data", "processed", "train_processed.csv")

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,1.0,0.0,0.0,0.0,0.0,5849,0.0,128.0,360.0,1.0,2.0,1
1,1.0,1.0,1.0,1.0,0.0,0.0,4583,1508.0,128.0,360.0,1.0,0.0,0
2,2.0,1.0,1.0,0.0,0.0,1.0,3000,0.0,66.0,360.0,1.0,2.0,1
3,3.0,1.0,1.0,0.0,1.0,0.0,2583,2358.0,120.0,360.0,1.0,2.0,1
4,4.0,1.0,0.0,0.0,0.0,0.0,6000,0.0,141.0,360.0,1.0,2.0,1


In [15]:
# Features are every column except the row identifier and the target.
# NOTE(review): the preview of `df` shows an "Unnamed: 0" header. If that is a
# real column (a saved index) rather than the DataFrame index, it should be
# dropped here as well -- confirm against the CSV.
X = df.drop(columns=["Loan_ID", "Loan_Status"])
y = df["Loan_Status"]

# Stratify on the target so this hold-out split is exactly the one that
# gridsearch_compare_models builds internally (same X, y, test_size,
# random_state and stratify). Without it, X_test here would contain rows the
# saved models were trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [33]:
def gridsearch_compare_models(
    X, y,
    test_size=0.2,
    random_state=42,
    scoring="f1",
    pos_label=1,
    save_dir="."   # current working directory
):
    """Grid-search several classifiers and compare them on a hold-out set.

    The data is split once (stratified on ``y``). Each candidate model is tuned
    with a 5-fold stratified ``GridSearchCV`` on the training part, the refitted
    best estimator is pickled to ``<save_dir>/model_<name>.pkl``, and it is then
    scored on the held-out part.

    Parameters
    ----------
    X, y
        Feature matrix and binary target passed to ``train_test_split``.
    test_size : float
        Fraction of the data held out for the final evaluation.
    random_state : int
        Seed shared by the split, the CV folds and the boosting models.
    scoring : str or callable
        Metric optimised by ``GridSearchCV``.
    pos_label
        Label treated as the positive class for precision / recall / F1.
    save_dir : str or path-like
        Directory receiving the pickled models; created if it does not exist.

    Returns
    -------
    (df_scores, df_best_params) : tuple of pandas.DataFrame
        ``df_scores`` has one row per model, sorted by hold-out F1 (descending).
        Its ``best_cv_f1`` column holds ``GridSearchCV.best_score_``, i.e. the
        mean CV score for ``scoring`` -- it is an F1 only when ``scoring`` is
        an F1 scorer. ``df_best_params`` lists the selected hyper-parameters per
        model in the order the models are defined.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # The string scorers "f1" / "precision" / "recall" always treat label 1 as
    # the positive class. When the caller asks for another pos_label, build an
    # explicit scorer so that tuning and hold-out evaluation target the same
    # class. The default (pos_label == 1) keeps the plain string untouched.
    binary_metrics = {"f1": f1_score, "precision": precision_score, "recall": recall_score}
    cv_scoring = scoring
    if pos_label != 1 and isinstance(scoring, str) and scoring in binary_metrics:
        cv_scoring = make_scorer(
            binary_metrics[scoring], pos_label=pos_label, zero_division=0
        )

    # Make sure the output directory exists before spending time on the search;
    # otherwise a missing directory only fails at the first pickle.dump.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # name -> (estimator, parameter grid). Distance/margin based models are
    # wrapped in a pipeline so scaling is fitted inside each CV fold.
    models = {
        "LogReg": (
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(max_iter=5000))
            ]),
            {
                "clf__C": [0.01, 0.1, 1, 10],
                "clf__solver": ["lbfgs", "liblinear"],
                "clf__class_weight": [None, "balanced"],
            }
        ),

        "KNN": (
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", KNeighborsClassifier())
            ]),
            {
                "clf__n_neighbors": [3, 5, 7, 11],
                "clf__weights": ["uniform", "distance"],
                "clf__p": [1, 2],
            }
        ),

        "SVC": (
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", SVC())
            ]),
            {
                "clf__C": [0.1, 1, 10],
                "clf__kernel": ["linear", "rbf"],
                # NOTE: gamma has no effect with the linear kernel, so those
                # combinations are evaluated twice with identical results.
                "clf__gamma": ["scale", "auto"],
                "clf__class_weight": [None, "balanced"],
            }
        ),

        "GradientBoosting": (
            GradientBoostingClassifier(random_state=random_state),
            {
                "n_estimators": [100, 200],
                "learning_rate": [0.05, 0.1],
                "max_depth": [2, 3],
            }
        ),

        "AdaBoost": (
            AdaBoostClassifier(random_state=random_state),
            {
                "n_estimators": [100, 200],
                "learning_rate": [0.05, 0.1, 0.5],
            }
        ),
    }

    results = []
    best_params = []

    for name, (estimator, param_grid) in models.items():
        gs = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=cv_scoring,
            cv=cv,
            n_jobs=-1,
            refit=True
        )
        gs.fit(X_train, y_train)

        # Persist the best (refitted) model as a pickle.
        model_path = os.path.join(save_dir, f"model_{name}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(gs.best_estimator_, f)

        # Evaluate on the held-out split.
        y_pred = gs.predict(X_test)

        results.append({
            "model": name,
            "best_cv_f1": gs.best_score_,
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, pos_label=pos_label, zero_division=0),
            "recall": recall_score(y_test, y_pred, pos_label=pos_label, zero_division=0),
            "f1": f1_score(y_test, y_pred, pos_label=pos_label, zero_division=0),
        })

        best_params.append({
            "model": name,
            "best_params": gs.best_params_
        })

    df_scores = pd.DataFrame(results).sort_values("f1", ascending=False).reset_index(drop=True)
    df_best_params = pd.DataFrame(best_params)

    return df_scores, df_best_params


In [34]:
# Tune and compare every candidate model, optimising F1 with label 1 as the
# positive class.
df_scores, df_best_params = gridsearch_compare_models(X, y, scoring="f1", pos_label=1)
print(df_scores)
# to_string() does not truncate cell contents, so each best_params dict is
# printed in full instead of being cut off with "...".
print(df_best_params.to_string())


              model  best_cv_f1  accuracy  precision  recall   f1
0            LogReg        0.87      0.85       0.83    0.98 0.90
1               SVC        0.87      0.85       0.83    0.98 0.90
2          AdaBoost        0.87      0.85       0.83    0.98 0.90
3  GradientBoosting        0.86      0.83       0.82    0.96 0.89
4               KNN        0.85      0.83       0.83    0.95 0.89
              model                                        best_params
0            LogReg  {'clf__C': 0.01, 'clf__class_weight': None, 'c...
1               KNN  {'clf__n_neighbors': 11, 'clf__p': 2, 'clf__we...
2               SVC  {'clf__C': 0.1, 'clf__class_weight': None, 'cl...
3  GradientBoosting  {'learning_rate': 0.05, 'max_depth': 2, 'n_est...
4          AdaBoost       {'learning_rate': 0.05, 'n_estimators': 100}
