# 04 - Entraînement des Modèles (Version Simplifiée)

Ce notebook teste 2 modèles :
1. **Random Forest**
2. **XGBoost**

In [1]:
# Imports
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from xgboost import XGBClassifier
# Silence every warning for the rest of the session. This keeps cell output short,
# but it also hides deprecation and convergence notices.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to all figures created below.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
print("‚úì Imports OK")

‚úì Imports OK


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the processed features and the target from disk.
X = pd.read_csv('../data/processed/X_features.csv')
y_frame = pd.read_csv('../data/processed/y_target.csv')
y = y_frame.values.ravel()  # flatten the target frame into a 1-D array
print(f"X: {X.shape}, y: {y.shape}")

# Share of rows labelled 1, reported as a percentage.
fraud_count = (y == 1).sum()
print(f"Fraudes: {fraud_count/len(y)*100:.2f}%")

X: (10000, 10), y: (10000,)
Fraudes: 5.51%


In [3]:
# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Ratio of label-0 rows to label-1 rows in the training split; passed to XGBoost
# as `scale_pos_weight` further down.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"Scale pos weight: {scale_pos_weight:.2f}")

Train: 8000, Test: 2000
Scale pos weight: 17.14


In [4]:
# Evaluation helper
def evaluate(model, X_test, y_test, name):
    """Score a fitted binary classifier on a hold-out set, then print and plot the results.

    Prints accuracy, precision, recall, F1, ROC-AUC and the classification report,
    then draws a confusion-matrix heatmap next to the ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``; column 1
            of ``predict_proba`` is used as the positive-class score.
        X_test: Features of the evaluation set.
        y_test: True labels of the evaluation set (two classes, reported as
            'Normal' and 'Fraude').
        name: Label used in the printed header, the plot titles and the result dict.

    Returns:
        dict with keys ``name``, ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc``.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Compute every metric exactly once; the same values feed the printout,
    # the ROC legend and the returned dictionary.
    metrics = {
        'name': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
    }

    print(f"\n{'='*80}\n{name}\n{'='*80}")
    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f} ‚≠ê")
    print(f"F1:        {metrics['f1']:.4f}")
    print(f"ROC-AUC:   {metrics['roc_auc']:.4f}")
    print(f"\n{classification_report(y_test, y_pred, target_names=['Normal', 'Fraude'])}")

    # Plots: confusion matrix on the left, ROC curve on the right.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0], xticklabels=['Normal', 'Fraude'], yticklabels=['Normal', 'Fraude'])
    axes[0].set_title(f'Matrice - {name}', fontweight='bold')
    axes[0].set_ylabel('Vraie classe')
    axes[0].set_xlabel('Pr√©dite')

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    axes[1].plot(fpr, tpr, linewidth=2, label=f"AUC={metrics['roc_auc']:.4f}")
    axes[1].plot([0,1], [0,1], 'k--', label='Random')  # chance-level diagonal
    axes[1].set_title(f'ROC - {name}', fontweight='bold')
    axes[1].set_xlabel('FPR')
    axes[1].set_ylabel('TPR')
    axes[1].legend()
    axes[1].grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    return metrics

print("‚úì Fonction cr√©√©e")

‚úì Fonction cr√©√©e


In [None]:
# Random Forest: hyperparameter search with Optuna
print("üå≤ RANDOM FOREST - Optuna (30 trials)")


def obj_rf(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of a Random Forest candidate.

    The trial proposes the tree count, depth and split/leaf sizes; class weighting
    and the random seed are held fixed.
    """
    candidate = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 5, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        class_weight='balanced',
        random_state=42,
    )
    fold_f1 = cross_val_score(candidate, X_train, y_train, cv=3, scoring='f1', n_jobs=-1)
    return fold_f1.mean()

# Seed the TPE sampler so the search proposes the same hyperparameters on every
# Restart & Run All; an unseeded sampler makes the selected model change between runs.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(obj_rf, n_trials=30, show_progress_bar=True)
print(f"Meilleur F1 (CV): {study_rf.best_value:.4f}")
print(f"Params: {study_rf.best_params}")

# Refit on the whole training split with the best trial's parameters. The fixed
# settings (class weighting, seed) mirror the ones used inside the objective.
best_rf = RandomForestClassifier(**study_rf.best_params, class_weight='balanced', random_state=42)
best_rf.fit(X_train, y_train)
results_rf = evaluate(best_rf, X_test, y_test, "Random Forest")

[I 2025-12-17 15:41:40,616] A new study created in memory with name: no-name-2889a524-aad6-4996-9c14-d44518a35962


üå≤ RANDOM FOREST - Optuna (30 trials)


  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# XGBoost: hyperparameter search with Optuna
print("üöÄ XGBOOST - Optuna (30 trials)")


def obj_xgb(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of an XGBoost candidate.

    The trial proposes the tree count, depth, learning rate and row subsampling;
    the class-imbalance weight, random seed and eval metric are held fixed.
    """
    candidate = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss',
    )
    fold_f1 = cross_val_score(candidate, X_train, y_train, cv=3, scoring='f1', n_jobs=-1)
    return fold_f1.mean()

# Seed the TPE sampler so the search proposes the same hyperparameters on every
# Restart & Run All; an unseeded sampler makes the selected model change between runs.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(obj_xgb, n_trials=30, show_progress_bar=True)
print(f"Meilleur F1 (CV): {study_xgb.best_value:.4f}")
print(f"Params: {study_xgb.best_params}")

# Refit on the whole training split with the best trial's parameters. The fixed
# settings (imbalance weight, seed, eval metric) mirror the ones used inside the objective.
best_xgb = XGBClassifier(**study_xgb.best_params, scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss')
best_xgb.fit(X_train, y_train)
results_xgb = evaluate(best_xgb, X_test, y_test, "XGBoost")

In [None]:
# Side-by-side comparison of the two evaluated models
comp = pd.DataFrame([results_rf, results_xgb]).set_index('name')

banner = "=" * 80
print("\n" + banner)
print("COMPARAISON")
print(banner)
print(comp.round(4))

# The winner is the model with the highest recall.
# NOTE(review): this ranks on the metrics returned by evaluate(), i.e. the test
# split, while the Optuna searches optimise F1 — confirm this is intended.
best_name = comp['recall'].idxmax()
print(f"\nüèÜ MEILLEUR (Recall): {best_name}")

# Grouped bar chart: one group per model, one bar per metric.
ax = comp.plot(kind='bar', figsize=(12, 5))
ax.set_title('Comparaison', fontweight='bold')
ax.set_ylabel('Score')
plt.setp(ax.get_xticklabels(), rotation=0)
ax.legend(loc='lower right')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Feature importances of the model picked in the comparison step
if best_name == 'Random Forest':
    best_model = best_rf
else:
    best_model = best_xgb

feat_imp = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_,
})
feat_imp = feat_imp.sort_values('importance', ascending=False)
print(f"\nüìä Importance - {best_name}")
print(feat_imp.to_string(index=False))

# Horizontal bars with the most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feat_imp['feature'], feat_imp['importance'])
ax.set_xlabel('Importance')
ax.set_title(f'Features - {best_name}', fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Persist the selected model and the comparison metrics.
model_path = '../artifacts/models/best_model.pkl'
metrics_path = '../artifacts/metrics/comparison.csv'

# Create the output directories up front: open() and to_csv() raise
# FileNotFoundError when the parent folder does not exist yet.
for artifact_path in (model_path, metrics_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)
print(f"‚úì Mod√®le sauvegard√©: {model_path}")

comp.to_csv(metrics_path)
print("‚úì M√©triques sauvegard√©es")