In [29]:
import pandas as pd
import numpy as np

# Read '../res/clean_train.csv' into a DataFrame and preview its first
# five rows to check that the columns were parsed as expected.
df = pd.read_csv(filepath_or_buffer='../res/clean_train.csv')
df.head(n=5)

Unnamed: 0,Name,Version,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti,Cng Lxi R Wagon,Mumbai,2010,72000,CNG,Manual,First,37.24,998.0,58.16,5.0,,1.75
1,Hyundai,1.6 Crdi Creta Option Sx,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,,12.5
2,Honda,Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,8.61 Lakh,4.5
3,Maruti,Ertiga Vdi,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,,6.0
4,Audi,2.0 A4 Multitronic New Tdi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,,17.74


In [30]:
# Remove the 'New_Price' column (it is empty in most of the rows previewed
# by the loading cell).
#
# errors='ignore' keeps this cell re-runnable: `df` is rebound to the result,
# so on a second execution the column is already gone and a plain drop()
# would raise KeyError.
df = df.drop(columns=['New_Price'], errors='ignore')
df.head()

Unnamed: 0,Name,Version,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Cng Lxi R Wagon,Mumbai,2010,72000,CNG,Manual,First,37.24,998.0,58.16,5.0,1.75
1,Hyundai,1.6 Crdi Creta Option Sx,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda,Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti,Ertiga Vdi,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi,2.0 A4 Multitronic New Tdi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [31]:
from sklearn.model_selection import train_test_split

# Separate the target column ('Price') from the feature columns.
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from pipeline import pipeline_create

# Hyper-parameter grid explored by the search. The 'elasticnet__' prefix
# targets a pipeline step by name -- presumably the name pipeline_create
# gives to the regressor step; verify against pipeline.py.
param_grid = {
    'elasticnet__alpha': [0.1, 1.0, 10.0],
    'elasticnet__l1_ratio': [0.1, 0.5, 0.9],
    'elasticnet__max_iter': [1000, 5000, 10000],
}

# Build the pipeline with ElasticNet as its regressor.
pipeline = pipeline_create(X_train, ElasticNet(random_state=0))

# Run the grid search over the pipeline: 5-fold cross-validation on the
# training split, scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [33]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

# K-fold configuration: 10 shuffled folds with a fixed seed so the folds are
# reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Per-fold scores, one entry appended per fold.
mse_scores = []
rmse_scores = []
r2_scores = []

# NOTE(review): this loop scores `pipeline` as built in the grid-search cell;
# it does not use grid_search.best_estimator_ / best_params_. Confirm that
# evaluating the untuned pipeline is intended.

# Iterate over every fold produced by KFold on the full dataset.
for train_index, test_index in kf.split(X):

    # Fold-local names on purpose: reusing X_train / X_test / y_train / y_test
    # here would overwrite the hold-out split created by train_test_split and
    # leave those names pointing at the last fold once the loop finishes.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Refit the pipeline on this fold's training rows, then predict the
    # rows that were left out.
    pipeline.fit(X_fold_train, y_fold_train)
    y_pred = pipeline.predict(X_fold_test)

    # Mean squared error
    mse = mean_squared_error(y_fold_test, y_pred)
    mse_scores.append(mse)

    # Root mean squared error
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)

    # Coefficient of determination
    r2 = r2_score(y_fold_test, y_pred)
    r2_scores.append(r2)

print(f"MSE : {mse_scores}")
print(f"RMSE : {rmse_scores}")
print(f"R² : {r2_scores}")

print("Mean MSE:", np.mean(mse_scores))
print("Mean RMSE:", np.mean(rmse_scores))
print("Mean R²:", np.mean(r2_scores))

MSE : [36.086209891156116, 47.59424331965138, 53.33974052680101, 73.18095572980732, 38.99246684870729, 55.542568541516935, 49.09260852756708, 64.21217906801235, 41.48342561970525, 46.76036461665667]
RMSE : [6.007179861728473, 6.89885811708368, 7.303406090777166, 8.554586824026472, 6.24439483446613, 7.452688678692874, 7.006611772288164, 8.013250218732244, 6.440762813495406, 6.838155059418927]
R² : [0.6701840022561674, 0.6339607686515369, 0.6107010679132415, 0.3435362087691286, 0.6594881920901808, 0.6024494395360518, 0.6233298452517415, 0.5289868466985838, 0.667137683531309, 0.6001711902874669]
Mean MSE: 50.628476268958146
Mean RMSE: 7.075989427070954
Mean R²: 0.5939945244985408
