In [None]:
# 7. Création du pipeline
def create_pipeline(X_train):
    """Build one unfitted pipeline per candidate regressor.

    Columns of ``X_train`` with dtype ``int64``/``float64`` go through the
    numeric branch (iterative imputation with a ``BayesianRidge`` estimator,
    standard scaling, then ``SelectKBest`` with ``f_regression`` and
    ``k='all'``). Columns with dtype ``object`` go through the categorical
    branch (most-frequent imputation, then one-hot encoding that ignores
    unknown categories). Any other column is dropped (``remainder='drop'``).

    The three linear candidates (``Ridge``, ``Lasso``, ``LinearRegression``)
    additionally get an ``RFECV`` step between the preprocessor and the model.

    Note that every pipeline holds the *same* ``ColumnTransformer`` object,
    which is also the one returned to the caller.

    NOTE(review): the XGBoost candidate sets ``early_stopping_rounds=10`` in
    its constructor; this presumably needs an ``eval_set`` at fit time, which
    a plain ``pipeline.fit(X, y)`` does not supply - confirm against callers.

    Args:
        X_train: Training feature frame; only its column dtypes are inspected.

    Returns:
        A 4-tuple ``(pipelines, preprocessor, numeric_features,
        categorical_features)`` where ``pipelines`` maps each model name to
        its ``Pipeline``.
    """
    seed = 42

    # Column groups are derived purely from the dtypes present in X_train.
    num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    num_branch = Pipeline(steps=[
        ('imputer', IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=seed)),
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(score_func=f_regression, k='all')),
    ])
    cat_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    column_prep = ColumnTransformer(
        transformers=[
            ('num', num_branch, num_cols),
            ('cat', cat_branch, cat_cols),
        ],
        remainder='drop',
    )

    # Candidate regressors, in the order they will be evaluated.
    candidates = {
        'Ridge': RidgeCV(alphas=np.logspace(-3, 3, 100)),
        'Lasso': LassoCV(alphas=np.logspace(-4, 2, 50), cv=5, random_state=seed),
        'RandomForest': RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            min_samples_leaf=10,
            random_state=seed,
            n_jobs=-1,
        ),
        'LinearRegression': LinearRegression(),
        'DecisionTree': DecisionTreeRegressor(
            max_depth=6,
            min_samples_leaf=10,
            random_state=seed,
        ),
        'KNN': KNeighborsRegressor(n_neighbors=5),
        'HistGradientBoosting': HistGradientBoostingRegressor(
            max_iter=200,
            learning_rate=0.05,
            max_depth=3,
            min_samples_leaf=15,
            l2_regularization=0.2,
            random_state=seed,
        ),
        'XGBoost': XGBRegressor(
            n_estimators=300,
            max_depth=4,
            learning_rate=0.05,
            reg_alpha=0.2,
            reg_lambda=1.0,
            early_stopping_rounds=10,
            random_state=seed,
        ),
    }

    # Only the linear candidates get recursive feature elimination.
    linear_names = ('Ridge', 'Lasso', 'LinearRegression')

    built = {}
    for label, regressor in candidates.items():
        if label in linear_names:
            selector = RFECV(
                estimator=regressor,
                step=5,
                cv=3,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
            )
            stages = [
                ('preprocessor', column_prep),
                ('feature_selector', selector),
                ('model', regressor),
            ]
        else:
            stages = [
                ('preprocessor', column_prep),
                ('model', regressor),
            ]
        built[label] = Pipeline(stages)

    return built, column_prep, num_cols, cat_cols

In [None]:
# 8. Évaluation et optimisation des modèles
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

def _fit_xgboost_with_early_stopping(pipeline, X_train, y_train):
    """Fit the XGBoost pipeline in place, using early stopping on a hold-out.

    A slice of the training data is held out as the early-stopping validation
    set so that the test set is never seen during training. The fitted
    preprocessor and model are then installed into ``pipeline``.

    Args:
        pipeline: Pipeline exposing ``'preprocessor'`` and ``'model'`` steps.
        X_train: Training features.
        y_train: Training target.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Fit a clone so that training on the reduced slice does not refit a
    # preprocessor object that other pipelines may share.
    preprocessor = clone(pipeline.named_steps['preprocessor'])
    X_fit_transformed = preprocessor.fit_transform(X_fit, y_fit)
    X_val_transformed = preprocessor.transform(X_val)

    model = XGBRegressor(
        n_estimators=1000,  # upper bound only; early stopping decides when to stop
        max_depth=5,
        learning_rate=0.05,
        reg_alpha=0.1,
        reg_lambda=1.0,
        early_stopping_rounds=50,
        random_state=42,
        n_jobs=-1
    )
    model.fit(
        X_fit_transformed, y_fit,
        eval_set=[(X_val_transformed, y_val)],
        verbose=10  # print the evaluation metric every 10 boosting rounds
    )

    # `named_steps` is a read-only view, so assigning into it does not modify
    # the pipeline; `set_params` actually replaces the named steps.
    pipeline.set_params(preprocessor=preprocessor, model=model)


def _regression_metrics(name, pipeline, X_train, X_test, y_train, y_test):
    """Predict with a fitted pipeline and return its train/test metrics row."""
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    return {
        'Mod√®le': name,
        'MAPE_train': mean_absolute_percentage_error(y_train, y_train_pred),
        'MAPE_test': mean_absolute_percentage_error(y_test, y_test_pred),
        'RMSE_train': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'MAE_train': mean_absolute_error(y_train, y_train_pred),
        'R¬≤_train': r2_score(y_train, y_train_pred),
        'RMSE_test': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'MAE_test': mean_absolute_error(y_test, y_test_pred),
        'R¬≤_test': r2_score(y_test, y_test_pred),
        'Type': type(pipeline.named_steps['model']).__name__
    }


def evaluate_and_optimize_models(pipelines, X_train, X_test, y_train, y_test):
    """Fit every pipeline, score it on train and test, and pick the best one.

    Pipelines whose name contains ``'XGB'`` are trained with early stopping
    against a hold-out carved from the training data; all others are fitted
    once with ``pipeline.fit``. A pipeline that raises during fitting or
    scoring is reported on stdout and skipped.

    The best model minimises ``RMSE_test + MAE_test - R2_test``.

    Args:
        pipelines: Mapping of model name to a pipeline with a ``'model'`` step
            (and a ``'preprocessor'`` step for XGBoost entries). Pipelines are
            fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        A 3-tuple ``(all_results, best_model_name, best_model_final)``:
        metrics dict per successfully evaluated model, the name of the best
        model, and its fitted pipeline.

    Raises:
        RuntimeError: If no pipeline could be fitted and scored.
    """
    all_results = {}

    print("üìä D√©but de l'√©valuation des mod√®les...")

    for name, pipeline in pipelines.items():
        print(f"\nüîç Traitement du mod√®le {name}...")

        try:
            if 'XGB' in name:
                _fit_xgboost_with_early_stopping(pipeline, X_train, y_train)
            else:
                pipeline.fit(X_train, y_train)

            # Predictions are always recomputed from the pipeline that was
            # just fitted, so no model can inherit another model's metrics.
            metrics = _regression_metrics(
                name, pipeline, X_train, X_test, y_train, y_test
            )
        except Exception as e:
            # Deliberate best effort: one failing candidate must not abort
            # the comparison of the remaining ones.
            print(f"‚ùå Erreur avec {name}: {str(e)}")
            continue

        all_results[name] = metrics
        print(f"‚úÖ {name} - R¬≤ Test: {metrics['R¬≤_test']:.3f} | RMSE Test: {metrics['RMSE_test']:.2f}")

    if not all_results:
        # Without this guard the column lookups below fail with an opaque
        # KeyError on an empty DataFrame.
        raise RuntimeError(
            "Aucun pipeline n'a abouti : impossible de choisir un meilleur candidat."
        )

    results_df = pd.DataFrame(list(all_results.values()))

    # Aggregate score, lower is better: errors are added, R2 is subtracted.
    results_df['Score_agr√©g√©'] = (
        results_df['RMSE_test'] +
        results_df['MAE_test'] -
        results_df['R¬≤_test']
    )

    best_idx = results_df['Score_agr√©g√©'].idxmin()
    best_model_name = results_df.loc[best_idx, 'Mod√®le']
    best_model_final = pipelines[best_model_name]

    print("\nüìã R√©sultats d√©taill√©s :")
    print(results_df.sort_values('Score_agr√©g√©').to_string(index=False))

    plot_model_comparisons(results_df)

    return all_results, best_model_name, best_model_final

def plot_model_comparisons(results_df):
    """Draw a 2x2 figure comparing the test metrics of every model.

    The rows are ordered by descending test R2. Three horizontal bar charts
    show R2, RMSE and MAE individually; the fourth panel shows the three
    metrics as grouped vertical bars per model. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        results_df: DataFrame holding the model-name column and the
            test-metric columns read below.
    """
    plt.figure(figsize=(15, 8))

    ranked = results_df.sort_values('R¬≤_test', ascending=False)
    model_names = ranked['Mod√®le']

    # Panels 1-3: one horizontal bar chart per metric.
    single_metric_panels = (
        ('R¬≤_test', 'skyblue', 'Comparaison des R¬≤ (Test)'),
        ('RMSE_test', 'lightgreen', 'Comparaison des RMSE (Test)'),
        ('MAE_test', 'salmon', 'Comparaison des MAE (Test)'),
    )
    for slot, (column, colour, heading) in enumerate(single_metric_panels, start=1):
        plt.subplot(2, 2, slot)
        plt.barh(model_names, ranked[column], color=colour)
        plt.title(heading)
        if column == 'R¬≤_test':
            # Only the R2 panel has its x axis pinned to [0, 1].
            plt.xlim(0, 1)

    # Panel 4: the three metrics side by side for each model.
    plt.subplot(2, 2, 4)
    bar_width = 0.3
    positions = np.arange(len(ranked))
    grouped_bars = (
        ('R¬≤_test', 'R¬≤', positions - bar_width),
        ('RMSE_test', 'RMSE', positions),
        ('MAE_test', 'MAE', positions + bar_width),
    )
    for column, legend_label, bar_positions in grouped_bars:
        plt.bar(bar_positions, ranked[column], bar_width, label=legend_label)
    plt.xticks(positions, model_names, rotation=45)
    plt.legend()
    plt.title('M√©triques combin√©es')

    plt.tight_layout()
    plt.show()