In [8]:
import pandas as pd
import numpy as np
from csv_cleaner import clean_df

# Read the raw training CSV, run it through the project cleaner, then discard
# the 'Model' column. The three steps are chained so `df` is bound only once.
# NOTE(review): the meaning of clean_df's second argument (100) is not visible
# in this notebook — confirm against csv_cleaner.
df = (
    clean_df(pd.read_csv('../res/train.csv'), 100)
    .drop(columns=['Model'])
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Brand,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,37.24,998.0,58.16,5.0,0,1.75
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,0,12.5
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,1,4.5
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,0,6.0
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,0,17.74


In [9]:
from sklearn.model_selection import train_test_split

# Target is the 'Price' column; every remaining column is used as a feature.
y = df['Price']
X = df.drop(columns='Price')

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor as GBR
from pipeline_GBR import pipeline_create


# Hyperparameter grid. Each key is prefixed with the pipeline step name so that
# GridSearchCV routes the value to the GradientBoostingRegressor step rather
# than to the pipeline itself.
param_grid = {
    f'gradientboostingregressor__{hyperparameter}': candidates
    for hyperparameter, candidates in {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 3],
        'max_features': ['sqrt'],
        'subsample': [0.8, 0.9, 1.0],
    }.items()
}

# Build the pipeline around a seeded GradientBoostingRegressor.
# NOTE(review): pipeline_create is project-local; what it derives from X_train
# is not visible here — confirm against pipeline_GBR.
pipeline = pipeline_create(X_train, GBR(random_state=0))

# Search over the whole pipeline (not just the bare model), scoring each
# candidate by negative MSE with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)


In [11]:
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

# Nested cross-validation: for every outer fold the full grid search is re-run
# on that fold's training rows and scored on that fold's held-out rows.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold scores
mse_scores = []
rmse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):

    # Fold-local names: reusing X_train / X_test / y_train / y_test here would
    # silently overwrite the hold-out split created by train_test_split.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fit an unfitted copy of the search so the already-fitted `grid_search`
    # (tuned on the hold-out training split) is not replaced by a fold model.
    fold_search = clone(grid_search)
    fold_search.fit(X_fold_train, y_fold_train)
    y_pred = fold_search.predict(X_fold_test)

    # MSE
    mse = mean_squared_error(y_fold_test, y_pred)
    mse_scores.append(mse)

    # RMSE
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)

    # R2
    r2 = r2_score(y_fold_test, y_pred)
    r2_scores.append(r2)

print(f"MSE : {mse_scores}")
print(f"RMSE : {rmse_scores}")
print(f"R² : {r2_scores}")

print("Mean MSE:", np.mean(mse_scores))
print("Mean RMSE:", np.mean(rmse_scores))
print("Mean R²:", np.mean(r2_scores))

MSE : [14.319822082745585, 14.73421741872218, 9.638627229841772, 12.85567161407526, 5.700671959394344]
RMSE : [3.784154077564177, 3.8385176069313762, 3.1046138616326786, 3.585480667089876, 2.387607999524701]
R² : [0.8805476771150134, 0.8815314092720951, 0.9241777899320573, 0.9036142110021363, 0.9528133117962397]
Mean MSE: 11.449802060955827
Mean RMSE: 3.3400748425485616
Mean R²: 0.9085368798235084


In [14]:
# Persist the trained model

import pickle

# Destination path for the serialized model
chemin_fichier = 'GBR.pkl'

# Save the fitted pipeline selected by the grid search. `pipeline` itself is
# never fitted: GridSearchCV fits clones of the estimator it is given, so
# pickling `pipeline` would store an untrained model that cannot predict.
# best_estimator_ is the best configuration, refit by GridSearchCV on the data
# passed to its most recent fit() call.
with open(chemin_fichier, 'wb') as fichier:
    pickle.dump(grid_search.best_estimator_, fichier)

print(f"Modèle enregistré avec succès dans {chemin_fichier}")


Modèle enregistré avec succès dans GBR.pkl


In [13]:
# Using the model
# import pickle

# with open('GBR.pkl', 'rb') as fichier:
#     GBR = pickle.load(fichier)

# donnees_de_test = pd.read_csv('../res/clean_train.csv')
# predictions = GBR.predict(donnees_de_test)

# print(predictions)
# print(len(predictions))