In [272]:
#import

import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [243]:
# Load the labelled training set, the unlabelled test set and the
# accompanying data dictionary from the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_data_dict = pd.read_excel('data dictionary.xlsx')

In [244]:
# Preview the first ten rows of the unlabelled test set.
df_test.head(n=10)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,25.27 Lakh
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,
5,Mahindra XUV500 W8 2WD,Coimbatore,2016,85609,Diesel,Manual,Second,16.0 kmpl,2179 CC,140 bhp,7.0,
6,Toyota Fortuner 4x2 AT TRD Sportivo,Pune,2015,59000,Diesel,Automatic,First,12.55 kmpl,2982 CC,168.7 bhp,7.0,
7,Hyundai EON Era Plus,Jaipur,2013,65000,Petrol,Manual,First,21.1 kmpl,814 CC,55.2 bhp,5.0,
8,Honda City 1.5 S MT,Mumbai,2011,66000,Petrol,Manual,Second,17.0 kmpl,1497 CC,118 bhp,5.0,
9,Mahindra XUV500 W6 2WD,Coimbatore,2015,54684,Diesel,Manual,First,15.1 kmpl,2179 CC,140 bhp,7.0,


In [245]:
# Preview the first ten rows of the raw training set (includes the Price target).
df_train.head(n=10)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
5,Hyundai EON LPG Era Plus Option,Hyderabad,2012,75000,LPG,Manual,First,21.1 km/kg,814 CC,55.2 bhp,5.0,,2.35
6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5
7,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,2016,36000,Diesel,Automatic,First,11.36 kmpl,2755 CC,171.5 bhp,8.0,21 Lakh,17.5
8,Volkswagen Vento Diesel Comfortline,Pune,2013,64430,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0,,5.2
9,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95


In [246]:
# Columns selected as model inputs.
features = [
    'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats',
    'Fuel_Type', 'Transmission', 'Owner_Type',
]

# Feature matrices for the labelled and unlabelled sets, plus the training target.
# NOTE(review): X_train, y_train and X_test are rebound later by the
# train_test_split cell, so these selections are not what the model is fitted on.
X_train, X_test = df_train[features], df_test[features]
y_train = df_train['Price']


In [247]:
# Number of clusters used to group car names.
n_clusters = 10

# Vectorise the "Name" column with TF-IDF (fitted on the training names only).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_train['Name'])

# Fit k-means with a fixed seed so that the cluster assignments, the cluster
# names printed below and every score derived from Name_Cluster are
# reproducible across kernel restarts. n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Store the integer cluster label of each row next to its name.
df_train['Name_Cluster'] = kmeans.labels_

In [248]:
# Label each cluster with the most frequent full car name among its members.
# Membership is read from kmeans.labels_ rather than from the Name_Cluster
# column, so this cell still works when it is re-run after that column has
# been overwritten with the readable cluster names.
# `Counter` is imported with the other dependencies at the top of the notebook.
cluster_names = [
    Counter(df_train.loc[kmeans.labels_ == cluster_id, 'Name']).most_common(1)[0][0]
    for cluster_id in range(n_clusters)
]

# Show the representative name chosen for each cluster.
for i, name in enumerate(cluster_names):
    print(f'Cluster {i}: {name}')

Cluster 0: Mercedes-Benz New C-Class C 220d Avantgarde Edition C
Cluster 1: Honda City 1.5 S MT
Cluster 2: Volkswagen Vento Diesel Highline
Cluster 3: Maruti Swift VDI
Cluster 4: Toyota Fortuner 3.0 Diesel
Cluster 5: Hyundai i10 Sportz
Cluster 6: Ford Figo Diesel Titanium
Cluster 7: Mahindra XUV500 W8 2WD
Cluster 8: Chevrolet Beat LT
Cluster 9: Maruti Wagon R VXI


In [249]:
# Map each integer k-means label to its representative cluster name.
cluster_map = dict(enumerate(cluster_names))

# Build the readable column from kmeans.labels_ instead of re-mapping the
# column onto itself: mapping the already-renamed strings through an
# int-keyed dict would turn every value into NaN on a second run of this cell.
df_train['Name_Cluster'] = pd.Series(kmeans.labels_, index=df_train.index).map(cluster_map)


In [250]:
# Strip the unit suffixes and convert both columns to numeric values.
# - Engine was previously left as strings after removing ' CC', unlike
#   Mileage and Power, so it could not be used as a numeric feature.
# - astype(str) + pd.to_numeric keeps the cell re-runnable: once the columns
#   are already float, the .str accessor alone would raise.
# - errors='coerce' turns missing or unparseable entries into NaN, which the
#   imputer in the modelling pipeline can fill.
# NOTE(review): Mileage mixes 'kmpl' and 'km/kg' values; the unit is dropped
# without any conversion, as in the original cell.
df_train['Engine'] = pd.to_numeric(
    df_train['Engine'].astype(str).str.replace(' CC', '', regex=False),
    errors='coerce',
)
df_train['Mileage'] = pd.to_numeric(
    df_train['Mileage'].astype(str).str.replace(' kmpl| km/kg', '', regex=True),
    errors='coerce',
)


In [251]:
# Strip the ' bhp' suffix and convert Power to float.
# Placeholder entries such as 'null bhp' (visible in the data preview) become
# NaN through errors='coerce'. astype(str) keeps the cell re-runnable once the
# column is already numeric, and regex=False makes the literal match explicit
# instead of relying on the pandas-version-dependent default.
df_train['Power'] = pd.to_numeric(
    df_train['Power'].astype(str).str.replace(' bhp', '', regex=False),
    errors='coerce',
)

In [255]:
# Drop New_Price, which is empty for most of the rows shown in the preview.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df_train = df_train.drop(columns=['New_Price'], errors='ignore')

In [259]:
# Preview the training set after cleaning and cluster labelling.
df_train.head(n=10)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Name_Cluster
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,1.75,Maruti Wagon R VXI
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,12.5,Mahindra XUV500 W8 2WD
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,4.5,Honda City 1.5 S MT
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,6.0,Maruti Swift VDI
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,17.74,Mahindra XUV500 W8 2WD
5,Hyundai EON LPG Era Plus Option,Hyderabad,2012,75000,LPG,Manual,First,21.1,814,55.2,5.0,2.35,Mahindra XUV500 W8 2WD
6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461,63.1,5.0,3.5,Mahindra XUV500 W8 2WD
7,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,2016,36000,Diesel,Automatic,First,11.36,2755,171.5,8.0,17.5,Toyota Fortuner 3.0 Diesel
8,Volkswagen Vento Diesel Comfortline,Pune,2013,64430,Diesel,Manual,First,20.54,1598,103.6,5.0,5.2,Volkswagen Vento Diesel Highline
9,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3,1248,74.0,5.0,1.95,Mahindra XUV500 W8 2WD


In [266]:
# Séparer les données en features et cible
X = df_train.drop(['Price'], axis=1)
y = df_train['Price']


# Définir les transformations pour les colonnes numériques et catégorielles
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combiner les transformations en une seule étape de prétraitement
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Définir le modèle de régression
model = GradientBoostingRegressor(random_state=42)

# Créer la pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])


In [267]:
# Hold out 20% of the labelled data for evaluation, with a fixed seed so the
# split is reproducible. train_test_split comes from sklearn.model_selection
# and is imported with the other dependencies at the top of the notebook
# (it was previously used without being imported).
# NOTE(review): this rebinds X_train, y_train and X_test from the earlier
# feature-selection cell; from here on X_test is the held-out split of
# train.csv, not the rows of test.csv.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing + gradient-boosting pipeline on the training split.
pipeline.fit(X_train, y_train)

# Score on the held-out split (the printed value equals the R² reported by
# r2_score in the metrics cell).
score = pipeline.score(X_test, y_test)
print('Score de la pipeline sur les données de test :', score)

Score de la pipeline sur les données de test : 0.8569287123604391


In [271]:
# Predict on the held-out split.
y_pred = pipeline.predict(X_test)

# Regression metrics: coefficient of determination, mean absolute error,
# mean squared error and its square root.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the metrics in a fixed order, one per line.
for label, value in (("R² :", r2), ("MAE :", mae), ("MSE :", mse), ("RMSE :", rmse)):
    print(label, value)

R² : 0.8569287123604391
MAE : 1.9913500941602176
MSE : 17.60627996143487
RMSE : 4.195983789462832


In [273]:
# Hyper-parameter grid for the pipeline's 'model' step; the 'model__' prefix
# routes each entry to that step of the Pipeline.
params = dict(
    model__n_estimators=[50, 100, 150, 200],
    model__learning_rate=[0.1, 0.01, 0.001],
    model__max_depth=[3, 5, 7],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)


In [274]:
# Exhaustive 5-fold cross-validated search over the hyper-parameter grid,
# scored by R². The grid defined in this notebook has 4*3*3*3*3 = 324
# candidates, i.e. 1620 pipeline fits, so the fits are spread over all
# available cores with n_jobs=-1; the selected parameters and scores are
# unaffected because the regressor is seeded.
grid_search = GridSearchCV(pipeline, params, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [275]:
# Report the winning hyper-parameter combination and its mean
# cross-validated R² score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Meilleur score :", best_score)


Meilleurs hyperparamètres : {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 150}
Meilleur score : 0.8852087503998408


In [276]:
# Evaluate the tuned pipeline on the held-out split: GridSearchCV.predict
# uses the estimator refitted with the best parameters found.
y_pred = grid_search.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print("Score de la pipeline optimisée sur les données de test :", score)

Score de la pipeline optimisée sur les données de test : 0.862451366814387
