In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
file_path = '../data/dataset_clean.csv'
car = pd.read_csv(file_path)
car.head()

Unnamed: 0,model,prix,annee,mise_en_circulation,kilometrage,energie,boite,nb_portes,nb_places,premiere_main,puissance,departement
0,RENAULT TWINGO 3,11080,2020,2020,27297,Essence,mécanique,5,4,oui,90.0,35
1,BMW SERIE 4 F36 GRAN COUPE,50690,2019,2019,59778,Diesel,automatique,5,5,non,258.0,31
2,RENAULT TRAFIC 2 MINIBUS,13890,2007,2007,225334,Diesel,mécanique,4,9,non,135.0,28
3,CITROEN C3 (3E GENERATION),16780,2020,2021,14218,Essence,mécanique,5,5,non,110.0,61
4,CUPRA FORMENTOR,44210,2022,2022,16426,Essence,automatique,5,5,oui,150.0,30


In [3]:
Y = car['prix']
X = car.drop(['prix'], axis=1)
X

Unnamed: 0,model,annee,mise_en_circulation,kilometrage,energie,boite,nb_portes,nb_places,premiere_main,puissance,departement
0,RENAULT TWINGO 3,2020,2020,27297,Essence,mécanique,5,4,oui,90.0,35
1,BMW SERIE 4 F36 GRAN COUPE,2019,2019,59778,Diesel,automatique,5,5,non,258.0,31
2,RENAULT TRAFIC 2 MINIBUS,2007,2007,225334,Diesel,mécanique,4,9,non,135.0,28
3,CITROEN C3 (3E GENERATION),2020,2021,14218,Essence,mécanique,5,5,non,110.0,61
4,CUPRA FORMENTOR,2022,2022,16426,Essence,automatique,5,5,oui,150.0,30
...,...,...,...,...,...,...,...,...,...,...,...
2255,PEUGEOT 2008 (2E GENERATION),2022,2022,1,Diesel,mécanique,5,5,oui,110.0,42
2256,PORSCHE 911 TYPE 992,2020,2020,9523,Essence,automatique,2,4,oui,450.0,27
2257,CITROEN C3 (3E GENERATION),2022,2023,1,Essence,mécanique,5,5,oui,110.0,13
2258,AUDI A3 (3E GENERATION) SPORTBACK,2017,2017,61363,Diesel,automatique,5,5,non,150.0,94


In [4]:
model_column = X[['model']]
boite_column = X[['boite']]
energie_column = X[['energie']]
premiere_main_column = X[['premiere_main']]


In [5]:
encoder = OrdinalEncoder()
model_column = encoder.fit_transform(model_column)
boite_column = encoder.fit_transform(boite_column)
energie_column = encoder.fit_transform(energie_column)
premiere_main_column = encoder.fit_transform(premiere_main_column)

In [6]:
relational_df = car
relational_df[['model_ecoded']] = model_column 
relational_df[['boite_ecoded']] = boite_column
relational_df[['energie_ecoded']] = energie_column
relational_df[['premiere_main_ecoded']] = premiere_main_column
relational_df.to_csv('../data/dataset_relational.csv', index=False)

X[['model']] = model_column 
X[['boite']] = boite_column
X[['energie']] = energie_column
X[['premiere_main']] = premiere_main_column

In [7]:
X.to_csv('../data/dataset_clean_encoded.csv', index=False)

In [8]:
X

Unnamed: 0,model,annee,mise_en_circulation,kilometrage,energie,boite,nb_portes,nb_places,premiere_main,puissance,departement
0,459.0,2020,2020,27297,2.0,2.0,5,4,1.0,90.0,35
1,89.0,2019,2019,59778,0.0,1.0,5,5,0.0,258.0,31
2,455.0,2007,2007,225334,0.0,2.0,4,9,0.0,135.0,28
3,131.0,2020,2021,14218,2.0,2.0,5,5,0.0,110.0,61
4,152.0,2022,2022,16426,2.0,1.0,5,5,1.0,150.0,30
...,...,...,...,...,...,...,...,...,...,...,...
2255,373.0,2022,2022,1,0.0,2.0,5,5,1.0,110.0,42
2256,409.0,2020,2020,9523,2.0,1.0,2,4,1.0,450.0,27
2257,131.0,2022,2023,1,2.0,2.0,5,5,1.0,110.0,13
2258,22.0,2017,2017,61363,0.0,1.0,5,5,0.0,150.0,94


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.1, random_state=42)

In [10]:
# !pip install xgboost
from xgboost.sklearn import XGBRegressor

regressor = XGBRegressor()
regressor.fit(X_train, Y_train)

In [11]:
predictions = regressor.predict(X_test)

In [12]:
mean_squared_error(predictions, Y_test)

73803274.15203938

In [13]:
pickle.dump(regressor, open('model', 'wb'))

In [14]:
r2_score(predictions, Y_test)

0.8034580570108816

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
colonne = X[['kilometrage']]
colonne = scaler.fit_transform(colonne)
X[['kilometrage']] = colonne
X[['kilometrage']]

Unnamed: 0,kilometrage
0,0.111750
1,0.244728
2,0.922517
3,0.058205
4,0.067244
...,...
2255,0.000000
2256,0.038983
2257,0.000000
2258,0.251217


In [16]:
Y = scaler.fit_transform(car[['prix']])
Y

array([[0.04156353],
       [0.24582302],
       [0.05605404],
       ...,
       [0.10483705],
       [0.09751444],
       [0.13515883]])

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.1, random_state=42)

In [18]:
regressor = XGBRegressor()
regressor.fit(X_train, Y_train)

In [19]:
predictions = regressor.predict(X_test)
mean_squared_error(predictions, Y_test)

0.0016167338202413215

In [20]:
r2_score(predictions, Y_test)

0.8182300410019774