In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, 
    precision_score, recall_score, roc_auc_score,
    precision_recall_curve, average_precision_score,
    roc_curve
)

# Modèles
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Gestion du déséquilibre
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTEENN

# Plot configuration.
# The matplotlib style sheet is applied first; the seaborn palette is set afterwards.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [2]:
# Read the cleaned train and test CSV files from the working directory.
print("Chargement des donn√©es nettoy√©es\n")
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train_cleaned.csv', 'data_test_cleaned.csv')
)


Chargement des donn√©es nettoy√©es



In [3]:
print("PR√âPARATION DES FEATURES")
print("="*70)

# Columns that must not be used as features (this includes the target
# 'FlagImpaye' and the post-transaction 'CodeDecision').
exclude_cols = [
    'ZIBZIN', 'IDAvisAutorisationCheque', 'FlagImpaye',
    'DateTransaction', 'CodeDecision', 'VerifianceCPT2', 'VerifianceCPT3',
]

# 'Mois' and 'Date' only exist when the EDA notebook created them; exclude them too.
exclude_cols += [extra for extra in ('Mois', 'Date') if extra in df_train.columns]

# Every remaining training column is kept as a feature, in file order.
feature_cols = [name for name in df_train.columns if name not in exclude_cols]

print(f"\nFeatures s√©lectionn√©es ({len(feature_cols)}):")
for i, col in enumerate(feature_cols, start=1):
    print(f" {i:2d}. {col}")

print(f"\n Colonnes exclues: {exclude_cols}")
print(" CodeDecision exclu car information post-transaction = FUITE!")

# Build independent copies of the design matrices and of the target vectors.
X_train, y_train = df_train[feature_cols].copy(), df_train['FlagImpaye'].copy()
X_test, y_test = df_test[feature_cols].copy(), df_test['FlagImpaye'].copy()

print(
    "\n‚úì Features extraites:",
    f"  X_train: {X_train.shape}",
    f"  X_test:  {X_test.shape}",
    sep="\n",
)

PR√âPARATION DES FEATURES

Features s√©lectionn√©es (16):
  1. Montant
  2. VerifianceCPT1
  3. D2CB
  4. ScoringFP1
  5. ScoringFP2
  6. ScoringFP3
  7. TauxImpNb_RB
  8. TauxImpNB_CPM
  9. EcartNumCheq
 10. NbrMagasin3J
 11. DiffDateTr1
 12. DiffDateTr2
 13. DiffDateTr3
 14. CA3TRetMtt
 15. CA3TR
 16. Heure

 Colonnes exclues: ['ZIBZIN', 'IDAvisAutorisationCheque', 'FlagImpaye', 'DateTransaction', 'CodeDecision', 'VerifianceCPT2', 'VerifianceCPT3', 'Mois', 'Date']
 CodeDecision exclu car information post-transaction = FUITE!

‚úì Features extraites:
  X_train: (3888468, 16)
  X_test:  (737068, 16)


In [4]:
# Missing-value imputation: medians are learned on the training set and
# re-applied unchanged to the test set.
print("\nImputation des valeurs manquantes...")
imputer = SimpleImputer(strategy='median')

# Wrap the returned arrays back into DataFrames that keep the original labels.
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), index=X_test.index, columns=X_test.columns
)
print("Imputation termin√©e")




Imputation des valeurs manquantes...
Imputation termin√©e


In [5]:
# Standardisation: the scaler is fitted on the training set only, then applied to test.
print("\nNormalisation (StandardScaler)...")
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train_imputed),
    scaler.transform(X_test_imputed),
)
print("Normalisation termin√©e")

# Summary of the prepared matrices and of the fraud counts in each split.
print(
    "\nPr√©paration termin√©e!",
    f"X_train_scaled: {X_train_scaled.shape}",
    f"X_test_scaled:  {X_test_scaled.shape}",
    f"y_train: Fraudes = {(y_train==1).sum():,} / {len(y_train):,}",
    f"y_test:  Fraudes = {(y_test==1).sum():,} / {len(y_test):,}",
    sep="\n",
)


Normalisation (StandardScaler)...
Normalisation termin√©e

Pr√©paration termin√©e!
X_train_scaled: (3888468, 16)
X_test_scaled:  (737068, 16)
y_train: Fraudes = 23,346 / 3,888,468
y_test:  Fraudes = 6,485 / 737,068


In [6]:
def evaluate_model(model, X_test, y_test, model_name, verbose=True):
    """
    Evaluate a fitted binary classifier and return its metrics.

    Parameters
    ----------
    model : estimator exposing ``predict``
        Fitted model.
    X_test : array-like
        Test features, passed as-is to ``model.predict``.
    y_test : array-like
        Test labels; the positive (fraud) class is expected to be 1 and the
        negative class 0.
    model_name : str
        Label stored in the result and shown in the printed report.
    verbose : bool
        When True, print the metrics and the confusion-matrix counts.

    Returns
    -------
    dict
        Keys ``model``, ``f1``, ``precision``, ``recall``, ``tp``, ``fp``,
        ``fn``, ``tn``.
    """
    y_pred = model.predict(X_test)

    # zero_division=0 on all three scores: a model that never predicts the
    # positive class yields 0 instead of a warning / undefined value.
    f1 = f1_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix even when only one class shows up in
    # y_test/y_pred; without it the 4-way unpacking below raises ValueError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    if verbose:
        print(f"R√âSULTATS - {model_name}")
        print(f"F1-score:  {f1:.4f} ‚≠ê")
        print(f"Pr√©cision: {precision:.4f}")
        print(f"Rappel:    {recall:.4f}")
        print(f"\nMatrice de confusion:")
        print(f"  TN: {tn:,}  |  FP: {fp:,}")
        print(f"  FN: {fn:,}  |  TP: {tp:,}")
        print(f"\nInterpr√©tation:")
        print(f"  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): {tp:,}")
        print(f"  ‚Ä¢ Fraudes manqu√©es (FN): {fn:,}")
        print(f"  ‚Ä¢ Fausses alarmes (FP): {fp:,}")

    return {
        'model': model_name,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'tn': int(tn)
    }

print("Fonction d'√©valuation d√©finie")

Fonction d'√©valuation d√©finie


## Modélisation


In [7]:
# Accumulator for the per-model metric dicts: each modelling cell below appends one entry.
# NOTE(review): re-running a modelling cell without re-running this one appends a duplicate entry.
results = []

### Logistic Regression + class_weight

In [8]:
print("Logistic Regression")
# Cost-sensitive variant: class_weight='balanced' gives more weight to the minority class.
model1 = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
).fit(X_train_scaled, y_train)
print("Entra√Ænement termin√©")

# Score on the untouched test set and record the metrics.
result1 = evaluate_model(model1, X_test_scaled, y_test, model_name="LogReg + class_weight")
results.append(result1)

Logistic Regression
Entra√Ænement termin√©
R√âSULTATS - LogReg + class_weight
F1-score:  0.0386 ‚≠ê
Pr√©cision: 0.0200
Rappel:    0.5806

Matrice de confusion:
  TN: 545,877  |  FP: 184,706
  FN: 2,720  |  TP: 3,765

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 3,765
  ‚Ä¢ Fraudes manqu√©es (FN): 2,720
  ‚Ä¢ Fausses alarmes (FP): 184,706


### Random Forest + class_weight

In [9]:
print("Random Forest")
# Tree ensemble with class weighting to compensate for the imbalance.
model2 = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score on the test set and record the metrics.
result2 = evaluate_model(
    model2, X_test_scaled, y_test, model_name="RandomForest + class_weight"
)
results.append(result2)

Random Forest
R√âSULTATS - RandomForest + class_weight
F1-score:  0.0362 ‚≠ê
Pr√©cision: 0.0186
Rappel:    0.6534

Matrice de confusion:
  TN: 507,126  |  FP: 223,457
  FN: 2,248  |  TP: 4,237

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 4,237
  ‚Ä¢ Fraudes manqu√©es (FN): 2,248
  ‚Ä¢ Fausses alarmes (FP): 223,457


### SMOTE + Logistic Regression

In [10]:
# Banner names the actual method so this run can be told apart from the
# plain "Logistic Regression" cell in the log.
print("SMOTE + Logistic Regression")
# Over-sampling: SMOTE generates synthetic examples of the minority (fraud) class.

# sampling_strategy=0.1 is the target minority/majority ratio after resampling,
# so frauds end up at roughly 9% of the resampled rows (not 10%).
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"Taille apr√®s SMOTE: {X_train_smote.shape[0]:,} exemples")
print(f"Fraudes: {(y_train_smote==1).sum():,} ({(y_train_smote==1).sum()/len(y_train_smote)*100:.1f}%)")

# No class_weight here: the imbalance is handled by the resampling above.
model3 = LogisticRegression(max_iter=1000, random_state=42)
model3.fit(X_train_smote, y_train_smote)
print("Entra√Ænement termin√©")

# The test set is never resampled; evaluation uses the original distribution.
result3 = evaluate_model(model3, X_test_scaled, y_test, "SMOTE + LogReg")
results.append(result3)

Logistic Regression
Taille apr√®s SMOTE: 4,251,634 exemples
Fraudes: 386,512 (9.1%)
Entra√Ænement termin√©
R√âSULTATS - SMOTE + LogReg
F1-score:  0.0941 ‚≠ê
Pr√©cision: 0.1557
Rappel:    0.0674

Matrice de confusion:
  TN: 728,213  |  FP: 2,370
  FN: 6,048  |  TP: 437

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 437
  ‚Ä¢ Fraudes manqu√©es (FN): 6,048
  ‚Ä¢ Fausses alarmes (FP): 2,370


### RandomUnderSampler + Logistic Regression

In [11]:
# Banner without the stray "method 5" numbering: this is the fourth model
# and no other cell numbers its method.
print("RandomUnderSampler (under-sampling) + Logistic Regression")
# Under-sampling: the majority class is randomly reduced.

# sampling_strategy=0.5 is the target minority/majority ratio after resampling,
# i.e. two non-fraud rows are kept for every fraud row.
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

print(f"Taille apr√®s under-sampling: {X_train_rus.shape[0]:,} exemples")
print(f"Fraudes: {(y_train_rus==1).sum():,} ({(y_train_rus==1).sum()/len(y_train_rus)*100:.1f}%)")

model4 = LogisticRegression(max_iter=1000, random_state=42)
model4.fit(X_train_rus, y_train_rus)

# The test set is never resampled; evaluation uses the original distribution.
result4 = evaluate_model(model4, X_test_scaled, y_test, "UnderSampling + LogReg")
results.append(result4)

M√âTHODE 5: RandomUnderSampler (under-sampling) + Logistic Regression
Taille apr√®s under-sampling: 70,038 exemples
Fraudes: 23,346 (33.3%)
R√âSULTATS - UnderSampling + LogReg
F1-score:  0.0670 ‚≠ê
Pr√©cision: 0.0370
Rappel:    0.3553

Matrice de confusion:
  TN: 670,566  |  FP: 60,017
  FN: 4,181  |  TP: 2,304

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 2,304
  ‚Ä¢ Fraudes manqu√©es (FN): 4,181
  ‚Ä¢ Fausses alarmes (FP): 60,017


### Balanced Random Forest

In [12]:
print("Balanced Random Forest (imblearn)")
# Random forest variant from imbalanced-learn, designed for imbalanced data.
model5 = BalancedRandomForestClassifier(
    max_depth=10,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score on the test set and record the metrics.
result5 = evaluate_model(model5, X_test_scaled, y_test, model_name="BalancedRandomForest")
results.append(result5)

Balanced Random Forest (imblearn)
R√âSULTATS - BalancedRandomForest
F1-score:  0.0350 ‚≠ê
Pr√©cision: 0.0180
Rappel:    0.6848

Matrice de confusion:
  TN: 487,793  |  FP: 242,790
  FN: 2,044  |  TP: 4,441

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 4,441
  ‚Ä¢ Fraudes manqu√©es (FN): 2,044
  ‚Ä¢ Fausses alarmes (FP): 242,790


### XGBoost + SMOTE

In [13]:
# XGBoost + SMOTE (bonus model)

print("XGBoost + SMOTE")
# Gradient boosting (XGBoost) trained on the SMOTE over-sampled training set.

# Install XGBoost first if needed (uncomment):
# !pip install xgboost --break-system-packages

# Kept local to this cell because xgboost is an optional dependency.
# SMOTE itself is already imported with the other imblearn samplers.
from xgboost import XGBClassifier

# Reuse X_train_smote / y_train_smote produced for the "SMOTE + LogReg" model.
# Re-running the same SMOTE object (integer random_state) on the same data
# only rebuilds the identical resampled set at a large cost.
print(f"Taille apr√®s SMOTE: {X_train_smote.shape[0]:,} exemples")
print(f"Fraudes: {(y_train_smote==1).sum():,} ({(y_train_smote==1).sum()/len(y_train_smote)*100:.1f}%)")

# XGBoost configuration, tuned for speed on a large dataset.
model6 = XGBClassifier(
    n_estimators=50,        # number of trees (kept small for speed)
    max_depth=5,            # maximum tree depth
    learning_rate=0.1,      # shrinkage applied to each tree
    subsample=0.8,          # row subsampling per tree (regularisation)
    colsample_bytree=0.8,   # feature subsampling per tree
    random_state=42,
    eval_metric='logloss',  # evaluation metric
    tree_method='hist',     # histogram-based tree construction
    n_jobs=-1               # use all CPU cores
)

model6.fit(X_train_smote, y_train_smote)

# Evaluate with the shared helper on the original (non-resampled) test set.
result6 = evaluate_model(model6, X_test_scaled, y_test, "XGBoost + SMOTE")
results.append(result6)

XGBoost + SMOTE
Taille apr√®s SMOTE: 4,251,634 exemples
Fraudes: 386,512 (9.1%)
R√âSULTATS - XGBoost + SMOTE
F1-score:  0.1304 ‚≠ê
Pr√©cision: 0.1568
Rappel:    0.1116

Matrice de confusion:
  TN: 726,690  |  FP: 3,893
  FN: 5,761  |  TP: 724

Interpr√©tation:
  ‚Ä¢ Vraies fraudes d√©tect√©es (TP): 724
  ‚Ä¢ Fraudes manqu√©es (FN): 5,761
  ‚Ä¢ Fausses alarmes (FP): 3,893


### SMOTEENN + Logistic Regression

In [None]:
print("SMOTEENN + Logistic Regression")
# Combined resampling: SMOTE over-sampling followed by ENN cleaning (under-sampling).
# NOTE(review): the recorded output stops right after the banner above, so this
# resampling step may not have completed on the full training set; confirm runtime.
smoteenn = SMOTEENN(random_state=42, sampling_strategy=0.1)
X_train_smoteenn, y_train_smoteenn = smoteenn.fit_resample(X_train_scaled, y_train)

print(f"Taille apr√®s SMOTEENN: {X_train_smoteenn.shape[0]:,} exemples")
print(f"Fraudes: {(y_train_smoteenn==1).sum():,} ({(y_train_smoteenn==1).sum()/len(y_train_smoteenn)*100:.1f}%)")

# Plain logistic regression fitted on the resampled data.
model7 = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_smoteenn, y_train_smoteenn
)
print("Entra√Ænement termin√©")

# Score on the original (non-resampled) test set and record the metrics.
result7 = evaluate_model(model7, X_test_scaled, y_test, model_name="SMOTEENN + LogReg")
results.append(result7)

SMOTEENN + Logistic Regression


### Gradient Boosting + sample_weight

In [None]:
print("Gradient Boosting")
# Boosting with per-sample weights (cost-sensitive learning, no resampling).

# Imbalance ratio: number of normal rows per fraud row.
nb_normales = (y_train == 0).sum()
nb_fraudes = (y_train == 1).sum()
ratio = nb_normales / nb_fraudes if nb_fraudes > 0 else 1

# Each fraud row weighs `ratio`, each normal row weighs 1, so both classes
# carry the same total weight.
class_weights = {0: 1, 1: ratio}
# Vectorised lookup instead of a Python loop over every training label.
sample_weights = np.where(
    np.asarray(y_train) == 1, float(class_weights[1]), float(class_weights[0])
)

print(f"Ratio de d√©s√©quilibre: 1:{ratio:.1f}")
model8 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
model8.fit(X_train_scaled, y_train, sample_weight=sample_weights)
print("Entra√Ænement termin√©")

result8 = evaluate_model(model8, X_test_scaled, y_test, "GradientBoosting + sample_weight")
results.append(result8)

## Comparaison des Résultats

In [None]:
# Collect every recorded result, best F1 first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('f1', ascending=False)

# The banner reports the actual number of evaluated models instead of a
# hard-coded count that drifts as models are added or removed.
print(f" COMPARAISON FINALE DES {len(results_df)} M√âTHODES")

print("\n TABLEAU R√âCAPITULATIF (tri√© par F-mesure):\n")
print(results_df[['model', 'f1', 'precision', 'recall', 'tp', 'fp', 'fn']].to_string(index=False))

# Best model = first row after the descending sort on F1.
best_model = results_df.iloc[0]
print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model['model']}")
print(f" F-mesure: {best_model['f1']:.4f}")
print(f" Pr√©cision: {best_model['precision']:.4f}")
print(f" Rappel: {best_model['recall']:.4f}")
print(f" TP (fraudes d√©tect√©es): {best_model['tp']:,}")
print(f" FN (fraudes manqu√©es): {best_model['fn']:,}")
print(f" FP (fausses alarmes): {best_model['fp']:,}")

In [None]:
# Three-panel summary figure of the model comparison.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_f1, ax_pr, ax_tp = axes

# Left panel: F1 per model, with the value written at the end of each bar.
ax_f1.barh(results_df['model'], results_df['f1'], color='steelblue', edgecolor='black')
ax_f1.set_xlabel('F1-Score', fontsize=12, fontweight='bold')
ax_f1.set_title('Comparaison F1-Score', fontsize=14, fontweight='bold')
ax_f1.grid(axis='x', alpha=0.3)
for i, v in enumerate(results_df['f1']):
    ax_f1.text(v, i, f' {v:.4f}', va='center', fontweight='bold')

# Middle panel: one point per model in (recall, precision) space, coloured by F1.
ax_pr.scatter(
    results_df['recall'], results_df['precision'],
    c=results_df['f1'], cmap='viridis',
    s=200, alpha=0.6, edgecolors='black', linewidth=2,
)
for model, x_rec, y_prec in zip(
    results_df['model'], results_df['recall'], results_df['precision']
):
    ax_pr.annotate(model, (x_rec, y_prec), fontsize=8, ha='right')
ax_pr.set_xlabel('Rappel', fontsize=12, fontweight='bold')
ax_pr.set_ylabel('Pr√©cision', fontsize=12, fontweight='bold')
ax_pr.set_title('Pr√©cision vs Rappel', fontsize=14, fontweight='bold')
ax_pr.grid(alpha=0.3)

# Right panel: number of true positives (detected frauds) per model.
ax_tp.barh(results_df['model'], results_df['tp'], color='green', alpha=0.7, edgecolor='black')
ax_tp.set_xlabel('Nombre de TP (fraudes d√©tect√©es)', fontsize=12, fontweight='bold')
ax_tp.set_title('Fraudes Correctement D√©tect√©es', fontsize=14, fontweight='bold')
ax_tp.grid(axis='x', alpha=0.3)
for i, v in enumerate(results_df['tp']):
    ax_tp.text(v, i, f' {v:,}', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## Analyse du Meilleur Modèle

Analyse approfondie du modèle ayant obtenu la meilleure F-mesure.

In [None]:
print("ANALYSE APPROFONDIE DU MEILLEUR MOD√àLE")

best_model_name = best_model["model"]
print(f"\nMod√®le s√©lectionn√©: {best_model_name}")

# Exact mapping from the label stored in the results to the fitted estimator.
trained_models = dict([
    ("LogReg + class_weight", model1),
    ("RandomForest + class_weight", model2),
    ("SMOTE + LogReg", model3),
    ("UnderSampling + LogReg", model4),
    ("BalancedRandomForest", model5),
    ("XGBoost + SMOTE", model6),
    ("SMOTEENN + LogReg", model7),
    ("GradientBoosting + sample_weight", model8),
])

# Look the winner up; fail loudly when the label has no fitted estimator.
best_trained_model = trained_models.get(best_model_name)
if best_trained_model is None:
    raise ValueError(
        f"Mod√®le '{best_model_name}' introuvable dans trained_models. "
        f"Mod√®les dispo: {list(trained_models.keys())}"
    )

print(f"Objet mod√®le r√©cup√©r√©: {type(best_trained_model).__name__}")

# Confusion matrix of the winner: raw counts and per-true-class proportions.
y_pred_best = best_trained_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred_best)
cm_normalized = cm / cm.sum(axis=1, keepdims=True)

# One heatmap per view, side by side; only the data, number format, colour
# map and title differ between the two panels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (cm, 'd', 'Blues', 'Matrice de Confusion'),
    (cm_normalized, '.2%', 'Greens', 'Matrice de Confusion (Normalis√©e)'),
]
for ax, (matrix, fmt, cmap, title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap=cmap, ax=ax,
                xticklabels=['Normal', 'Fraude'], yticklabels=['Normal', 'Fraude'])
    ax.set_ylabel('Vraie Classe', fontsize=12, fontweight='bold')
    ax.set_xlabel('Classe Pr√©dite', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 report.
print("\nüìã Rapport de classification d√©taill√©:\n")
print(classification_report(y_test, y_pred_best, target_names=['Normal', 'Fraude']))

## Optimisation du Seuil de Décision

Pour certains modèles, on peut optimiser le seuil de classification (par défaut 0.5).

In [None]:
print("OPTIMISATION DU SEUIL DE D√âCISION")
# Threshold tuning only makes sense for models exposing class probabilities.
if hasattr(best_trained_model, 'predict_proba'):
    print("\nLe meilleur mod√®le supporte predict_proba ‚Üí optimisation possible")

    # The threshold is selected on the TRAINING data. Picking it on the test
    # set and then reporting the test F1 at that threshold is selection bias:
    # the reported gain would be optimistic.
    # NOTE(review): training-set probabilities are in-sample; a dedicated
    # validation split would be a better selection set for models that overfit.
    y_proba_train = best_trained_model.predict_proba(X_train_scaled)[:, 1]
    y_proba = best_trained_model.predict_proba(X_test_scaled)[:, 1]

    # Candidate thresholds: 0.10, 0.15, ..., 0.90.
    seuils = np.linspace(0.1, 0.9, 17)
    # F1 on train drives the selection ...
    f1_scores_train = [
        f1_score(y_train, (y_proba_train >= seuil).astype(int), zero_division=0)
        for seuil in seuils
    ]
    # ... F1 on test is computed for reporting / plotting only.
    f1_scores = [
        f1_score(y_test, (y_proba >= seuil).astype(int), zero_division=0)
        for seuil in seuils
    ]

    # Best threshold according to the training data, scored on the test set.
    best_threshold_idx = int(np.argmax(f1_scores_train))
    best_threshold = seuils[best_threshold_idx]
    best_f1_threshold = f1_scores[best_threshold_idx]

    print(f"\n Seuil optimal trouv√©: {best_threshold:.2f} (choisi sur le train)")
    print(f"  F1-score avec seuil optimal: {best_f1_threshold:.4f}")
    print(f"  F1-score avec seuil 0.5:     {best_model['f1']:.4f}")
    print(f"  Am√©lioration: {(best_f1_threshold - best_model['f1']):.4f}")

    # F1 as a function of the threshold, for both splits.
    plt.figure(figsize=(10, 6))
    plt.plot(seuils, f1_scores_train, marker='s', linewidth=2, markersize=6,
             alpha=0.7, label='F1 train (choix du seuil)')
    plt.plot(seuils, f1_scores, marker='o', linewidth=2, markersize=8,
             label='F1 test')
    plt.axvline(best_threshold, color='red', linestyle='--', linewidth=2, 
                label=f'Seuil optimal: {best_threshold:.2f}')
    plt.axvline(0.5, color='gray', linestyle='--', linewidth=2, alpha=0.5,
                label='Seuil par d√©faut: 0.50')
    plt.xlabel('Seuil de d√©cision', fontsize=12, fontweight='bold')
    plt.ylabel('F1-Score', fontsize=12, fontweight='bold')
    plt.title('Impact du Seuil de D√©cision sur la F-mesure', fontsize=14, fontweight='bold')
    plt.legend(fontsize=11)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
    
else:
    print("\n Le meilleur mod√®le ne supporte pas predict_proba")
    print(" Optimisation du seuil non applicable")

## Sauvegarde des Résultats

In [None]:
print("SAUVEGARDE DES R√âSULTATS")

# Persist the comparison table.
results_df.to_csv('resultats_modelisation_partie1.csv', index=False)
print("\n‚úì R√©sultats sauvegard√©s: resultats_modelisation_partie1.csv")

# Fraud counts for the summary header. They were referenced below but never
# defined anywhere in the notebook, which made this cell fail with NameError.
train_fraudes = int((y_train == 1).sum())
test_fraudes = int((y_test == 1).sum())

# Write a plain-text summary of the run.
with open('resume_modelisation.txt', 'w', encoding='utf-8') as f:
    f.write("R√âSUM√â - MOD√âLISATION PARTIE 1: MAXIMISATION F-MESURE\n")
    f.write(f"Dataset:\n")
    f.write(f"  Train: {len(df_train):,} transactions ({train_fraudes:,} fraudes)\n")
    f.write(f"  Test:  {len(df_test):,} transactions ({test_fraudes:,} fraudes)\n")
    f.write(f"  Features utilis√©es: {len(feature_cols)}\n\n")
    
    f.write("R√âSULTATS PAR MOD√àLE:\n")
    f.write(results_df[['model', 'f1', 'precision', 'recall', 'tp', 'fn', 'fp']].to_string(index=False))
    
    f.write(f"\n\nMEILLEUR MOD√àLE: {best_model['model']}\n")
    f.write(f"  F-mesure: {best_model['f1']:.4f}\n")
    f.write(f"  Pr√©cision: {best_model['precision']:.4f}\n")
    f.write(f"  Rappel: {best_model['recall']:.4f}\n")
    f.write(f"  TP (fraudes d√©tect√©es): {best_model['tp']:,}\n")
    f.write(f"  FN (fraudes manqu√©es): {best_model['fn']:,}\n")
    f.write(f"  FP (fausses alarmes): {best_model['fp']:,}\n")

print("‚úì R√©sum√© sauvegard√©: resume_modelisation.txt")

print("\n‚úÖ Tous les r√©sultats ont √©t√© sauvegard√©s!")