In [1]:
random_state = 1  # shared seed passed to the train/test split and to the estimators below
test_size = 0.2  # value passed as train_test_split's test_size (share of rows held out for testing)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

# Raise pandas' column display limit to 80 so wide frames are shown without truncation
pd.options.display.max_columns = 80

In [2]:
# Load the dataset from a CSV located relative to the notebook's working directory
data = pd.read_csv('../data/Data_binary.csv')

In [3]:
# Isolate the target column, then keep every remaining column as a feature
y = data['Label']
X = data.drop(columns='Label')

# Hold out a test set; the fixed seed makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Report the (rows, columns) shape of both partitions
print(f"Taille de l'ensemble d'entraînement: {X_train.shape}")
print(f"Taille de l'ensemble de test: {X_test.shape}")

Taille de l'ensemble d'entraînement: (2016638, 70)
Taille de l'ensemble de test: (504160, 70)


In [None]:
# Baseline random forest: only the seed and n_jobs are set, every other
# hyperparameter is left at its default. n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
rf.fit(X_train, y_train)

# Predict the test set once and derive both metrics from the same predictions.
# rf.score(X_test, y_test) would trigger a second full prediction pass over
# X_test only to compute the same accuracy value.
y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9985857664233576
0.9958074853732396


In [6]:
# Display the complete hyperparameter set of the baseline forest (defaults included)
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [7]:
# Base estimator handed to the search; only the seed is fixed here
rf_model = RandomForestClassifier(random_state=random_state)

# Candidate values: 3 forest sizes x 4 depth limits = 12 combinations.
# min_samples_split ([2, 5, 10]) and min_samples_leaf ([1, 2, 4]) are
# currently left out of the search.
param_grid_rf = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 20, 30, None],
)

# Exhaustive grid search: 4-fold cross-validation, accuracy as the selection
# metric, candidates evaluated in parallel on all cores, per-fit progress logs
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid_rf,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=3,
)

# Run the search on the training partition
print("Début du Grid Search pour Random Forest...")
grid_search_rf.fit(X_train, y_train)

Début du Grid Search pour Random Forest...
Fitting 4 folds for each of 12 candidates, totalling 48 fits


KeyboardInterrupt: 

In [None]:
# Show the winning hyperparameters and their mean cross-validated score
print("\nMeilleurs paramètres:", grid_search_rf.best_params_)
print(f"Meilleur score CV: {grid_search_rf.best_score_:.4f}")

# Predict the held-out test set with the best estimator found by the search
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Evaluation on the test set
print(f"\nAccuracy sur l'ensemble de test: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nRapport de classification:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix. The label order is pinned to the model's classes_ so the
# matrix rows/columns and the heatmap tick labels are guaranteed to agree;
# without explicit tick labels the heatmap only shows positional indices.
class_labels = best_rf.classes_
cm = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

# Explicit figure/axes objects instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Matrice de Confusion - Random Forest')
ax.set_ylabel('Vraie classe')
ax.set_xlabel('Classe prédite')
plt.show()