# Notebook de choix de modèle de classification binaire

Notebook ChatGPT : 

**Modèles à comparer**

a) Modèles classiques (baseline)

* Logistic Regression
* Decision Tree
* Random Forest
* Gradient Boosting (XGBoost / LightGBM / CatBoost)
* Support Vector Machine (SVM) avec kernel linéaire et RBF)

b) Modèles linéaires avancés

* Ridge / Lasso / ElasticNet
* Stochastic Gradient Descent (SGDClassifier)

c) Réseaux de neurones simples

* MLP (1 à 3 couches cachées)

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import data_preparation


In [7]:
df = pd.read_csv("./data/train.csv", sep=",")

X, _ = data_preparation.prepare_df(df.drop(columns=["Survived", "PassengerId"]))
y = df.loc[:, "Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Optimisation sur l'ensemble des modèles

In [9]:
# Normalisation pour les modèles linéaires et SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dictionnaire des modèles
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": lgb.LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "SVM": SVC(),
    "Ridge": RidgeClassifier(),
    "Lasso": SGDClassifier(loss='squared_hinge', penalty='l1', max_iter=1000),  # Lasso adapté pour classification
    "ElasticNet": SGDClassifier(loss='squared_hinge', penalty='elasticnet', max_iter=1000),
    "SGDClassifier": SGDClassifier(max_iter=1000, tol=1e-3),
    "MLP": MLPClassifier(max_iter=1000)
}

# Evaluation
results = {}
for name, model in models.items():
    # Choisir si on utilise les features normalisées
    if name in ["Logistic Regression", "SVM", "Ridge", "Lasso", "ElasticNet", "SGDClassifier", "MLP"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 211
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838




In [10]:
for name, acc in results.items():
    print(f"{name}: Accuracy = {acc:.4f}")


Logistic Regression: Accuracy = 0.8156
Decision Tree: Accuracy = 0.7709
Random Forest: Accuracy = 0.8268
XGBoost: Accuracy = 0.8212
LightGBM: Accuracy = 0.8101
CatBoost: Accuracy = 0.8268
SVM: Accuracy = 0.8156
Ridge: Accuracy = 0.7877
Lasso: Accuracy = 0.6425
ElasticNet: Accuracy = 0.6760
SGDClassifier: Accuracy = 0.7654
MLP: Accuracy = 0.8492


## Optimisation sur les paramètres

In [27]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint
import json

In [19]:
def extract_results_from_rand_search(rand_search):
    results = {
        "best_params": rand_search.best_params_,
        "best_score": float(rand_search.best_score_),  # convertir en float pour JSON
        "best_estimator": str(rand_search.best_estimator_),  # ou str() pour une description
        "cv_results": {}
    }

    # Extraire cv_results_
    cv_results = rand_search.cv_results_
    for key, value in cv_results.items():
        # Convertir les arrays numpy en liste pour JSON
        if isinstance(value, np.ndarray):
            results["cv_results"][key] = value.tolist()
        else:
            results["cv_results"][key] = value
    
    return results

### RandomSearch

#### a. MLP

In [None]:
# 4min 06.3
mlp = MLPClassifier(max_iter=1000, random_state=42)

mlp_param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (100,50), (50,50,25)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': uniform(0.0001, 0.01),  # régularisation
    'learning_rate': ['constant', 'adaptive']
}

mlp_rand = RandomizedSearchCV(
    mlp, mlp_param_dist, n_iter=30, cv=3, scoring='accuracy', n_jobs=-1, random_state=42
)
mlp_rand.fit(X_train_scaled, y_train)
print("MLP Best params:", mlp_rand.best_params_)
print("MLP Best CV accuracy:", mlp_rand.best_score_)

MLP Best params: {'activation': 'relu', 'alpha': np.float64(0.003062735057040824), 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'solver': 'adam'}
MLP Best CV accuracy: 0.8216560885957759


In [None]:
with open("./data/MLP_randomized_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(mlp_rand), f, indent=4)

#### b. Random Forest

In [21]:
rf = RandomForestClassifier(random_state=42)

rf_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

rf_rand = RandomizedSearchCV(rf, rf_param_dist, n_iter=30, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
rf_rand.fit(X_train, y_train)
print("RF Best params:", rf_rand.best_params_)
print("RF Best CV accuracy:", rf_rand.best_score_)

RF Best params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 140}
RF Best CV accuracy: 0.8286529801794136


In [22]:
with open("./data/RF_randomized_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(rf_rand), f, indent=4)

#### c. XGBoost

In [23]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

xgb_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3)
}

xgb_rand = RandomizedSearchCV(xgb_model, xgb_param_dist, n_iter=30, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
xgb_rand.fit(X_train, y_train)
print("XGBoost Best params:", xgb_rand.best_params_)
print("XGBoost Best CV accuracy:", xgb_rand.best_score_)

XGBoost Best params: {'colsample_bytree': np.float64(0.8834959481464842), 'learning_rate': np.float64(0.01070663052197174), 'max_depth': 3, 'n_estimators': 148, 'subsample': np.float64(0.8574323980775167)}
XGBoost Best CV accuracy: 0.8328901180725454


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
with open("./data/XGBoost_randomized_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(xgb_rand), f, indent=4)

#### d. CatBoost

In [25]:
cat_model = CatBoostClassifier(verbose=0, random_state=42)

cat_param_dist = {
    'iterations': randint(200, 1000),
    'depth': randint(4, 8),
    'learning_rate': uniform(0.01, 0.1),
    'l2_leaf_reg': randint(1, 5),
    'border_count': [32, 50, 100]
}

cat_rand = RandomizedSearchCV(cat_model, cat_param_dist, n_iter=30, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
cat_rand.fit(X_train, y_train)
print("CatBoost Best params:", cat_rand.best_params_)
print("CatBoost Best CV accuracy:", cat_rand.best_score_)

CatBoost Best params: {'border_count': 100, 'depth': 4, 'iterations': 204, 'l2_leaf_reg': 2, 'learning_rate': np.float64(0.033598491974895575)}
CatBoost Best CV accuracy: 0.8328723894621138


In [26]:
with open("./data/CatBoost_randomized_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(cat_rand), f, indent=4)

### Grid Search

In [28]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

xgb_param_grid = {
    'n_estimators': [120, 140, 160, 180],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.008, 0.010, 0.012, 0.015],
    'subsample': [0.857],
    'colsample_bytree': [0.8, 0.88, 0.95]
}

xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
print("XGBoost Best params:", xgb_grid.best_params_)
print("XGBoost Best accuracy:", xgb_grid.best_score_)

XGBoost Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.015, 'max_depth': 3, 'n_estimators': 160, 'subsample': 0.857}
XGBoost Best accuracy: 0.8356643356643356


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [30]:
with open("./data/XGBoost_grid_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(xgb_grid), f, indent=4)

In [29]:
cat_model = CatBoostClassifier(verbose=0, random_state=42)

cat_param_grid = {
    'iterations': [180, 200, 220, 250],
    'depth': [4, 5, 6],
    'learning_rate': [0.025, 0.030, 0.035, 0.040],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [100]
}

cat_grid = GridSearchCV(cat_model, cat_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
cat_grid.fit(X_train, y_train)
print("CatBoost Best params:", cat_grid.best_params_)
print("CatBoost Best accuracy:", cat_grid.best_score_)

CatBoost Best params: {'border_count': 100, 'depth': 5, 'iterations': 220, 'l2_leaf_reg': 3, 'learning_rate': 0.03}
CatBoost Best accuracy: 0.8342558849601103


In [31]:
with open("./data/CatBoost_grid_search_results.json", "w") as f:
    json.dump(extract_results_from_rand_search(cat_grid), f, indent=4)