In [None]:

# Importation des bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import joblib

# Load the cleaned dataset from disk
data = pd.read_csv('../data/clean_data.csv')

# Every column except the last is a feature (X); the last column is the target (y)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Hyper-parameter search helper shared by every model below
def evaluate_model(model, param_grid):
    """Tune ``model`` over ``param_grid`` with a 5-fold grid search.

    The search is fitted on the module-level ``X_train`` / ``y_train``,
    scored by accuracy, and run with ``n_jobs=-1``.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter names to lists of candidate values.

    Returns:
        A ``(best_estimator, best_score)`` tuple taken from the fitted
        search (``best_estimator_`` and ``best_score_``).
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_score_

# Each section below tunes one classifier with evaluate_model() and keeps the
# best estimator together with its mean cross-validated accuracy.

# Logistic Regression: search over the inverse regularisation strength C
lr_params = {'C': [0.01, 0.1, 1, 10]}
best_lr, best_lr_score = evaluate_model(LogisticRegression(max_iter=1000), lr_params)

# Decision Tree Classifier
# Seeded (same value as the train/test split) so repeated runs build the same tree
dt_params = {'max_depth': [3, 5, 10, None]}
best_dt, best_dt_score = evaluate_model(DecisionTreeClassifier(random_state=42), dt_params)

# Random Forest Classifier
# Seeded so bootstrap sampling and feature sub-sampling are reproducible
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
best_rf, best_rf_score = evaluate_model(RandomForestClassifier(random_state=42), rf_params)

# Support Vector Machine: search over C and the kernel type
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
best_svm, best_svm_score = evaluate_model(SVC(), svm_params)

# XGBoost Classifier: search over the number of trees and the learning rate
xgb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
best_xgb, best_xgb_score = evaluate_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params)

# Cross-validated comparison of the tuned models
models = {
    "Logistic Regression": best_lr,
    "Decision Tree": best_dt,
    "Random Forest": best_rf,
    "SVM": best_svm,
    "XGBoost": best_xgb,
}

# Report mean and standard deviation of the 5-fold accuracy for each model
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)
    mean_accuracy, std_accuracy = cv_scores.mean(), cv_scores.std()
    print(f"{model_name} - Cross Validation Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")

# Final evaluation on the test set, with printed metrics
def _roc_auc(model, X_test, y_test):
    """Return the ROC AUC of ``model`` on the test data.

    Uses ``predict_proba`` when the model exposes it and falls back to
    ``decision_function`` otherwise (e.g. ``SVC`` built without
    ``probability=True``). Returns NaN when the model offers neither.
    """
    if hasattr(model, "predict_proba"):
        scores = np.asarray(model.predict_proba(X_test))
        if scores.ndim == 2 and scores.shape[1] == 2:
            # Binary problem: roc_auc_score expects a 1-D score for the
            # positive class, not the two-column probability matrix.
            return roc_auc_score(y_test, scores[:, 1])
        return roc_auc_score(y_test, scores, multi_class='ovr')

    if hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(X_test))
        if scores.ndim == 1:
            # Binary problem: signed margins are valid ranking scores.
            return roc_auc_score(y_test, scores)
        # Multi-class margins do not sum to 1, which the multi_class='ovr'
        # path requires; score each class one-vs-rest against a one-hot
        # target instead and macro-average the per-class AUCs.
        classes = np.asarray(model.classes_)
        one_hot = (np.asarray(y_test)[:, None] == classes[None, :]).astype(int)
        return roc_auc_score(one_hot, scores, average='macro')

    return float("nan")


def print_metrics(model, X_test, y_test, model_name):
    """Print test-set metrics for a fitted classifier.

    Prints accuracy, weighted precision, weighted recall, ROC AUC and the
    full classification report.

    Args:
        model: Fitted classifier exposing ``predict`` and, for the AUC,
            either ``predict_proba`` or ``decision_function``.
        X_test: Test features, preprocessed the same way as the training data.
        y_test: True labels for ``X_test``.
        model_name: Label used as a prefix in the printed lines.
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    auc = _roc_auc(model, X_test, y_test)
    print(f"{model_name} Test Accuracy: {acc:.4f}")
    print(f"{model_name} Precision: {prec:.4f}")
    print(f"{model_name} Recall: {rec:.4f}")
    print(f"{model_name} AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))

# Print the test-set metrics of every tuned model
for model_name, model in models.items():
    print_metrics(model, X_test, y_test, model_name)

# Confusion-matrix visualisation for the best model, i.e. the one with the
# highest mean 5-fold cross-validation score on the training data
best_model = max(
    models.values(),
    key=lambda candidate: cross_val_score(candidate, X_train, y_train, cv=5).mean(),
)
y_pred_best = best_model.predict(X_test)

# Fix the label order explicitly (sorted union of true and predicted labels,
# which is also confusion_matrix's default) so the axes can show class labels
# instead of positional indices.
class_labels = sorted(set(y_test) | set(y_pred_best))
conf_matrix = confusion_matrix(y_test, y_pred_best, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f"Matrice de confusion pour {type(best_model).__name__}")
plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")
plt.show()

# Save the best model
joblib.dump(best_model, "../models/best_model.pkl")
# The model was trained on standardised features, so the fitted scaler is
# saved next to it; without it the pickled model cannot be applied to raw data.
joblib.dump(scaler, "../models/scaler.pkl")


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
