In [1]:

# Importation des bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# remove warning
import warnings
warnings.filterwarnings("ignore")

# Charger et inspecter le jeu de données
data = pd.read_csv('../data/balanced_clean_data.csv')

# Diviser les données en caractéristiques (X) et cible (y)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardiser les données
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fonction d'optimisation et d'évaluation des modèles
def evaluate_model(model, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    return best_model, best_score

In [None]:
# Logistic Regression
lr_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['liblinear', 'saga']
}

best_lr, best_lr_score = evaluate_model(LogisticRegression(max_iter=1000), lr_params)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
best_lr_score

0.7332287866907579

In [None]:
# Sauvegarder le meilleur modèle
best_lrmodel_path = '../models/lr_model.pkl'
joblib.dump(best_lr, best_lrmodel_path)

['/content/lr_model.pkl']

In [None]:
# Decision Tree Classifier
dt_params = {
    'max_depth': [3, 5, 10, 15, None],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}
best_dt, best_dt_score = evaluate_model(DecisionTreeClassifier(), dt_params)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


In [None]:
best_dt_score

0.8022432073091741

In [None]:
# Sauvegarder le meilleur modèle
best_dtmodel_path = '../models/dt_model.pkl'
joblib.dump(best_dt, best_dtmodel_path)

['/content/dt_model.pkl']

In [None]:
# Random Forest Classifier
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
best_rf, best_rf_score = evaluate_model(RandomForestClassifier(), rf_params)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [None]:
best_rf_score

In [None]:
# XGBoost Classifier
xgb_params = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10, 15],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.5, 1],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 100]
}
best_xgb, best_xgb_score = evaluate_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params)

Fitting 5 folds for each of 62208 candidates, totalling 311040 fits


In [None]:
# Évaluation des modèles avec Cross Validation
models = {
    "Logistic Regression": best_lr,
    "Decision Tree": best_dt,
    "Random Forest": best_rf,
    "XGBoost": best_xgb
}

for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{model_name} - Cross Validation Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
import numpy as np
from sklearn.preprocessing import label_binarize

def print_metrics(model, X_test, y_test, model_name):
    # Vérification et conversion de y_test
    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
        y_test_onehot = y_test
        y_test = np.argmax(y_test, axis=1)  # Convertir en étiquettes simples
    else:
        y_test_onehot = label_binarize(y_test, classes=np.unique(y_test))  # Encodage one-hot pour AUC si nécessaire

    # Prédictions
    y_pred = model.predict(X_test)

    # Calcul des métriques
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')

    # Gestion de roc_auc_score (uniquement si le modèle a predict_proba)
    auc = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
        try:
            auc = roc_auc_score(y_test_onehot, y_proba, multi_class='ovr')
        except ValueError as e:
            print(f"Warning: Unable to compute AUC for {model_name}. Reason: {e}")

    # Affichage des résultats
    print(f"{model_name} Test Accuracy: {acc:.4f}")
    print(f"{model_name} Precision: {prec:.4f}")
    print(f"{model_name} Recall: {rec:.4f}")
    if auc is not None:
        print(f"{model_name} AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))

# Appeler la fonction pour chaque modèle
for model_name, model in models.items():
    print_metrics(model, X_test, y_test, model_name)


In [None]:
# Visualisation de la matrice de confusion pour le meilleur modèle
best_model = max(models.items(), key=lambda item: cross_val_score(item[1], X_train, y_train, cv=5).mean())[1]
y_pred_best = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.title(f"Matrice de confusion pour {type(best_model).__name__}")
plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")
plt.show()

In [None]:
# Sauvegarder le meilleur modèle
best_model_path = '../models/best_model.pkl'
joblib.dump(best_model, best_model_path)

['/content/best_model.pkl']

tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Define a simple deep learning model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.6222 - loss: 0.7122 - val_accuracy: 0.7321 - val_loss: 0.5916
Epoch 2/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7147 - loss: 0.6141 - val_accuracy: 0.7382 - val_loss: 0.5721
Epoch 3/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7280 - loss: 0.5896 - val_accuracy: 0.7414 - val_loss: 0.5613
Epoch 4/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.7285 - loss: 0.5782 - val_accuracy: 0.7432 - val_loss: 0.5519
Epoch 5/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7328 - loss: 0.5647 - val_accuracy: 0.7439 - val_loss: 0.5448
Epoch 6/50
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7373 - loss: 0.5571 - val_accuracy: 0.7438 - val_loss: 0.5395
Epoch 7/50
[1

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7488 - loss: 0.5074
Test Loss: 0.5075079202651978, Test Accuracy: 0.7485477328300476


In [None]:
# Save the entire model
model.save('../models/tfmodel.keras')