In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../res/clean_train.csv')
df = df.drop(columns= ['New_Price'])
df.head()

Unnamed: 0,Name,Version,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Cng Lxi R Wagon,Mumbai,2010,72000,CNG,Manual,First,37.24,998.0,58.16,5.0,1.75
1,Hyundai,1.6 Crdi Creta Option Sx,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda,Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti,Ertiga Vdi,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi,2.0 A4 Multitronic New Tdi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [2]:
from sklearn.model_selection import train_test_split

X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor as GBR
from pipeline_GBR import pipeline_create

# Construire le pipeline avec GradientBoostingRegressor comme 'regressor'
pipeline = pipeline_create(X_train, GBR(random_state=0))

# Mettre à jour la grille de paramètres pour cibler directement GradientBoostingRegressor
param_grid = {
    'gradientboostingregressor__n_estimators': [100, 200, 300],
    'gradientboostingregressor__learning_rate': [0.01, 0.1, 0.2],
    'gradientboostingregressor__max_depth': [3, 5, 7],
    'gradientboostingregressor__min_samples_split': [2, 4],
    'gradientboostingregressor__min_samples_leaf': [1, 3],
    'gradientboostingregressor__max_features': ['sqrt', 'log2']
}

# Appliquer GridSearchCV sur le pipeline
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

# Configuration de KFold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Initialiser des listes pour enregistrer les scores pour chaque pli
mse_scores = []
rmse_scores = []
r2_scores = []

# Boucler sur chaque pli défini par KFold
for train_index, test_index in kf.split(X):

    # On redéfini les données d'apprentissage et de test pour chaque pli
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calcul du MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    
    # Calcul du RMSE
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)
    
    # Calcul du R2
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print(f"MSE : {mse_scores}")
print(f"RMSE : {rmse_scores}")
print(f"R² : {r2_scores}")

print("Mean MSE:", np.mean(mse_scores))
print("Mean RMSE:", np.mean(rmse_scores))
print("Mean R²:", np.mean(r2_scores))

MSE : [9.228169225839062, 15.270829257942497, 21.18087989429114, 13.032786318007767, 9.410405581007382, 11.911205000275796, 11.204327779596738, 24.634355682194265, 11.172088300060622, 10.675208715850351]
RMSE : [3.037790187922639, 3.907790841120146, 4.602268994125739, 3.6100950566443215, 3.0676384371381484, 3.4512613636576117, 3.3472866294353607, 4.963300885720537, 3.342467397007878, 3.2672937908688824]
R² : [0.9156575919236407, 0.8825546491812187, 0.8454118103675005, 0.8830904539125, 0.9178211978742835, 0.9147445581289406, 0.9140331710786755, 0.819300548309859, 0.910355349496873, 0.9087206434491554]
Mean MSE: 13.772025575506564
Mean RMSE: 3.6597193583641263
Mean R²: 0.8911689973722648
