In [1]:
import pandas as pd
# Read the dataset from the CSV file located at ../data/data.csv (relative path).
data = pd.read_csv(filepath_or_buffer='../data/data.csv')

### Define X and Y

In [6]:
# Feature matrix: the listed columns are removed from the raw frame.
# 'Attrition' is the prediction target, so it is dropped as well; leaving it
# in X would hand the label to any estimator fitted on the whole frame.
X = data.drop(columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber',
                       'TrainingTimesLastYear', 'HourlyRate', 'Attrition'])
# Target vector.
y = data['Attrition']

### Split data 

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions of the target the same in the train and test partitions (the
# usage example at the end of this notebook splits the same way);
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Pipeline construction 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (confusion_matrix, classification_report, 
                             roc_curve, roc_auc_score, accuracy_score,
                             precision_score, recall_score, f1_score)
import warnings
warnings.filterwarnings('ignore')

# Définition des colonnes
# Categorical features (handed to the 'cat' transformer of the preprocessor).
cat_cols = [
    'BusinessTravel',
    'Department',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

# Numeric features (handed to the 'num' transformer of the preprocessor).
num_cols = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Preprocesseur
# Column-wise preprocessing: the numeric columns are standardised and the
# categorical columns are one-hot encoded. handle_unknown='ignore' keeps the
# encoder from raising on categories that were not seen during fit.
# No `remainder` is given, so the ColumnTransformer default applies to every
# column that is not listed in num_cols / cat_cols.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# ========== 1. DÉFINITION DES MODÈLES ET GRILLES DE PARAMÈTRES ==========
# Registry of candidate classifiers, keyed by display name. Every entry holds
#   'model'  - an unfitted estimator instance, and
#   'params' - the hyper-parameter grid explored for it by GridSearchCV.
# Grid keys are prefixed with 'model__' because the estimator is installed as
# the pipeline step named 'model' in train_models.
models_config = dict([
    (
        'Logistic Regression',
        {
            'model': LogisticRegression(max_iter=1000, random_state=42),
            'params': {
                'model__C': [0.01, 0.1, 1, 10, 100],
                'model__penalty': ['l2'],
                'model__solver': ['lbfgs', 'liblinear'],
            },
        },
    ),
    (
        'Random Forest',
        {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'model__n_estimators': [100, 200, 300],
                'model__max_depth': [10, 20, 30, None],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4],
            },
        },
    ),
    (
        'Gradient Boosting',
        {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'model__n_estimators': [100, 200],
                'model__learning_rate': [0.01, 0.1, 0.2],
                'model__max_depth': [3, 5, 7],
                'model__subsample': [0.8, 1.0],
            },
        },
    ),
    (
        'SVM',
        {
            # probability=True is required so that predict_proba is available
            # for the ROC computations done later in this notebook.
            'model': SVC(probability=True, random_state=42),
            'params': {
                'model__C': [0.1, 1, 10],
                'model__kernel': ['rbf', 'linear'],
                'model__gamma': ['scale', 'auto'],
            },
        },
    ),
])

# ========== 2. ENTRAÎNEMENT AVEC GRIDSEARCHCV ==========
def train_models(X_train, y_train):
    """
    Tune every classifier declared in ``models_config`` with a grid search.

    For each registry entry a two-step pipeline is built (the module-level
    ``preprocessor`` followed by the entry's estimator under the step name
    ``'model'``), wrapped in a 5-fold ``GridSearchCV`` scored on ROC-AUC and
    fitted on the training data. The best cross-validation score and the best
    parameter set are printed for each model.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train
        Training labels, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        Maps each model name of ``models_config`` to its fitted
        ``GridSearchCV`` object.
    """
    banner = '=' * 60
    fitted_searches = {}

    for model_name in models_config:
        entry = models_config[model_name]

        print(f"\n{banner}")
        print(f"Entra√Ænement de {model_name}...")
        print(f"{banner}")

        # Preprocessing and estimator are chained so that the grid search
        # receives them as a single estimator.
        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', entry['model']),
        ])

        search = GridSearchCV(
            estimator=model_pipeline,
            param_grid=entry['params'],
            scoring='roc_auc',
            cv=5,
            n_jobs=-1,  # use every available core
            verbose=1,
        )
        search.fit(X_train, y_train)

        print(f"\nMeilleur score CV: {search.best_score_:.4f}")
        print(f"Meilleurs param√®tres: {search.best_params_}")

        fitted_searches[model_name] = search

    return fitted_searches

# ========== 3. MATRICE DE CONFUSION ==========
def plot_confusion_matrix(y_true, y_pred, model_name):
    """
    Draw the confusion matrix of one model as an annotated heatmap.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Labels predicted by the model.
    model_name
        Name inserted in the figure title.

    Returns
    -------
    The matrix returned by ``confusion_matrix(y_true, y_pred)``.
    """
    # NOTE(review): the tick labels assume exactly two classes, with the
    # "no attrition" class first in confusion_matrix's label order - confirm.
    class_names = ['No Attrition', 'Attrition']
    matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Matrice de Confusion - {model_name}', fontsize=14, fontweight='bold')
    plt.xlabel('Valeur Pr√©dite')
    plt.ylabel('Valeur R√©elle')
    plt.tight_layout()
    plt.show()

    return matrix

# ========== 4. COURBE ROC ==========
def plot_roc_curve(models_dict, X_test, y_test):
    """
    Plot the ROC curves of all fitted models on a single figure.

    Parameters
    ----------
    models_dict : dict
        Maps a model name to a fitted search object exposing
        ``predict_proba`` and ``classes_`` (e.g. the ``GridSearchCV``
        objects returned by ``train_models``).
    X_test
        Test features.
    y_test
        Test labels (binary). Labels need not be 0/1: the positive class is
        taken from the fitted model.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 8))

    for model_name, grid_search in models_dict.items():
        # Column 1 of predict_proba is the probability of classes_[1], so
        # that class is the positive one. Passing it explicitly lets
        # roc_curve work with labels such as 'Yes'/'No'; for 0/1 labels it
        # equals the default behaviour.
        positive_label = grid_search.classes_[1]
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

        fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label=positive_label)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        plt.plot(fpr, tpr, linewidth=2,
                 label=f'{model_name} (AUC = {auc_score:.3f})')

    # Diagonal reference line: performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Chance (AUC = 0.5)')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Taux de Faux Positifs (FPR)', fontsize=12)
    plt.ylabel('Taux de Vrais Positifs (TPR)', fontsize=12)
    plt.title('Courbes ROC - Comparaison des Mod√®les', fontsize=14, fontweight='bold')
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

# ========== 5. RAPPORT DE CLASSIFICATION ==========
def print_classification_report(y_true, y_pred, model_name):
    """
    Print a titled scikit-learn classification report for one model.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Labels predicted by the model.
    model_name
        Name shown in the header printed above the report.
    """
    rule = '=' * 60
    report_text = classification_report(
        y_true,
        y_pred,
        target_names=['No Attrition', 'Attrition'],
    )

    print(f"\n{rule}")
    print(f"RAPPORT DE CLASSIFICATION - {model_name}")
    print(f"{rule}")
    print(report_text)

# ========== 6. TABLEAU COMPARATIF DES PERFORMANCES ==========
def create_comparison_table(models_dict, X_test, y_test):
    """
    Build, print and return a table comparing the models on the test set.

    Parameters
    ----------
    models_dict : dict
        Maps a model name to a fitted search object exposing ``predict``,
        ``predict_proba``, ``classes_`` and ``best_score_`` (e.g. the
        ``GridSearchCV`` objects returned by ``train_models``).
    X_test
        Test features.
    y_test
        Test labels (binary). Labels need not be 0/1: the positive class is
        taken from the fitted model.

    Returns
    -------
    pandas.DataFrame
        One row per model with accuracy, precision, recall, F1, test ROC-AUC
        and the best cross-validation score, rounded to 4 decimals and
        sorted by test ROC-AUC in descending order.
    """
    # Declared up front so the frame has its columns even when models_dict
    # is empty (sort_values would otherwise fail on a missing column).
    columns = ['Mod√®le', 'Accuracy', 'Precision', 'Recall',
               'F1-Score', 'ROC-AUC', 'CV Score']
    results = []

    for model_name, grid_search in models_dict.items():
        y_pred = grid_search.predict(X_test)
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

        # predict_proba[:, 1] refers to classes_[1]; use the same class as
        # the positive label for the threshold metrics. With 0/1 labels this
        # is the default (1); with labels such as 'Yes'/'No' the default
        # pos_label=1 would raise.
        positive_label = grid_search.classes_[1]

        results.append({
            'Mod√®le': model_name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, pos_label=positive_label),
            'Recall': recall_score(y_test, y_pred, pos_label=positive_label),
            'F1-Score': f1_score(y_test, y_pred, pos_label=positive_label),
            'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
            'CV Score': grid_search.best_score_
        })

    df_results = pd.DataFrame(results, columns=columns)
    df_results = df_results.round(4)
    df_results = df_results.sort_values('ROC-AUC', ascending=False)

    print(f"\n{'='*80}")
    print("TABLEAU COMPARATIF DES PERFORMANCES")
    print(f"{'='*80}")
    print(df_results.to_string(index=False))
    print(f"{'='*80}\n")

    return df_results

# ========== 7. VISUALISATION COMPARATIVE ==========
def plot_metrics_comparison(df_results):
    """
    Draw one horizontal bar chart per metric to compare the models.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Comparison table holding a ``'Mod√®le'`` column plus the columns
        ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1-Score'`` and
        ``'ROC-AUC'``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']

    # 2 x 3 grid: five panels are used, the sixth is hidden further down.
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for panel, metric in zip(axes, metrics):
        ranked = df_results.sort_values(metric, ascending=True)
        scores = ranked[metric]

        bars = panel.barh(ranked['Mod√®le'], scores, color='skyblue', edgecolor='navy')

        # Highlight the bar of the best-scoring model: translate the index
        # label of the maximum into its position within the sorted frame.
        best_label = scores.idxmax()
        best_position = list(ranked.index).index(best_label)
        bars[best_position].set_color('green')

        panel.set_xlabel('Score', fontsize=11)
        panel.set_title(metric, fontsize=12, fontweight='bold')
        panel.set_xlim([0, 1])
        panel.grid(axis='x', alpha=0.3)

        # Write each score just to the right of the end of its bar.
        for position, score in enumerate(scores):
            panel.text(score + 0.01, position, f'{score:.3f}',
                       va='center', fontsize=9)

    # Hide the unused last panel.
    axes[-1].axis('off')

    plt.tight_layout()
    plt.show()

# ========== 8. FONCTION PRINCIPALE ==========
def evaluate_models(X_train, X_test, y_train, y_test):
    """
    Run the full workflow: train, compare, plot and report every model.

    Steps, in order: grid-search training (``train_models``), comparison
    table (``create_comparison_table``), ROC curves (``plot_roc_curve``),
    metric bar charts (``plot_metrics_comparison``), then a confusion matrix
    and a classification report per model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.

    Returns
    -------
    tuple
        ``(trained_models, df_results, best_model)`` where ``trained_models``
        maps model names to fitted search objects, ``df_results`` is the
        comparison table and ``best_model`` is the search object of the
        first row of that table (the table is sorted by test ROC-AUC,
        descending).
    """
    # Phase 1: training
    print("\nüöÄ PHASE 1: ENTRA√éNEMENT DES MOD√àLES AVEC GRIDSEARCHCV")
    trained_models = train_models(X_train, y_train)

    # Phase 2: comparison table
    print("\nüìä PHASE 2: COMPARAISON DES PERFORMANCES")
    df_results = create_comparison_table(trained_models, X_test, y_test)

    # Phase 3: ROC curves
    print("\nüìà PHASE 3: COURBES ROC")
    plot_roc_curve(trained_models, X_test, y_test)

    # Phase 4: comparative bar charts
    print("\nüìâ PHASE 4: VISUALISATIONS COMPARATIVES")
    plot_metrics_comparison(df_results)

    # Phase 5: per-model confusion matrix and classification report
    print("\nüîç PHASE 5: √âVALUATION D√âTAILL√âE PAR MOD√àLE")
    for name, search in trained_models.items():
        test_predictions = search.predict(X_test)
        plot_confusion_matrix(y_test, test_predictions, name)
        print_classification_report(y_test, test_predictions, name)

    # Phase 6: the table's first row is the model ranked first.
    top_row = df_results.iloc[0]
    best_model_name = top_row['Mod√®le']
    best_model = trained_models[best_model_name]

    print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model_name}")
    print(f"ROC-AUC: {top_row['ROC-AUC']:.4f}")

    return trained_models, df_results, best_model

# ========== EXEMPLE D'UTILISATION ==========
"""
# Charger vos données
df = pd.read_csv('votre_fichier.csv')

# Séparer X et y
X = df[cat_cols + num_cols]
y = df['Attrition']  # Assurez-vous que c'est binaire (0/1)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Lancer l'évaluation complète
trained_models, results_df, best_model = evaluate_models(
    X_train, X_test, y_train, y_test
)

# Faire des prédictions avec le meilleur modèle
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)
"""