In [448]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso,ElasticNet


import os
import mlflow
from mlflow import log_metric, log_param, log_artifacts


In [None]:
# AWS credentials exported as environment variables for this kernel.
# The values are blank in this copy of the notebook; fill them in before running.
# NOTE(review): presumably required by the MLflow artifact store - confirm.
%env AWS_ACCESS_KEY_ID=
%env AWS_SECRET_ACCESS_KEY=

In [450]:
# Point the MLflow client at the remote tracking server (hosted on Heroku)
mlflow.set_tracking_uri("https://getaroundmlflowserver.herokuapp.com")

# Name of the MLflow experiment used by the training cells of this notebook
EXPERIMENT_NAME="GET_ARROUND_EXP_LAST_ONE_MODEL"

# Make it the active experiment (the captured log shows MLflow creating it when it does not exist yet)
mlflow.set_experiment(EXPERIMENT_NAME)

# Fetch the experiment object; its experiment_id is passed to mlflow.start_run in the training cells
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow autologging for scikit-learn
mlflow.sklearn.autolog()

2023/05/28 15:33:38 INFO mlflow.tracking.fluent: Experiment with name 'GET_ARROUND_EXP_LAST_ONE_MODEL' does not exist. Creating a new experiment.


In [451]:
# Load the Getaround pricing dataset directly from its public S3 URL
dataset = pd.read_csv("https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_pricing_project.csv")

In [452]:
# Drop the 'Unnamed: 0' column that comes with the CSV.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises a KeyError because
# the column is already gone.
dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [453]:
# Display the frame (last expression of the cell, rendered by the notebook)
dataset

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True,121
4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True,132
4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True,130
4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True,151


In [454]:
# Print the (rows, columns) shape, then show summary statistics for every
# column, categorical ones included.
print(dataset.shape)
data_desc = dataset.describe(include="all")
data_desc

(4843, 14)


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,28,,,4,10,8,2,2,2,2,2,2,2,
top,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,,140962.8,128.98823,,,,,,,,,,,121.214536
std,,60196.74,38.99336,,,,,,,,,,,33.568268
min,,-64.0,0.0,,,,,,,,,,,10.0
25%,,102913.5,100.0,,,,,,,,,,,104.0
50%,,141080.0,120.0,,,,,,,,,,,119.0
75%,,175195.5,135.0,,,,,,,,,,,136.0


In [455]:
# The mileage column contains a negative value, which is impossible: keep only
# the rows whose mileage is >= 0.
# The comparison is written directly instead of combining `~` with `<`: `~` binds
# tighter than `<`, so `~col < 0` bit-inverts the integers rather than negating a
# boolean mask (right answer only by accident on ints, TypeError on floats).
dataset = dataset[dataset['mileage'] >= 0]

In [456]:
# Percentage of missing values per column (the output below shows there are none)
print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Percentage of missing values: 


model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

In [457]:
# Outliers
# NOTE(review): only the last expression of this cell is displayed; the other
# bare expressions are evaluated and their results discarded.
dataset[dataset['engine_power']>150]
dataset[dataset['engine_power']<70]
# Engine power: one car is at 0 hp and a Citroën at 317 hp... that does not look very consistent, let's remove them.
# NOTE(review): nothing in this cell actually removes those rows.

# The eco-friendly fuel types are very poorly represented (12 rows versus 4640 diesel). The model will struggle to generalize to these cases, but we have to keep them
dataset.groupby(by="fuel").count()

# Paint colors: nothing odd, some colors are under-represented... maybe they should be grouped together
dataset.groupby(by="paint_color").count()

# The car types do not raise any particular issue
dataset.groupby(by="car_type").count()

# Each option column only takes two values: True / False

# Look at the price outliers
dataset[dataset['rental_price_per_day']<50]
dataset[dataset['rental_price_per_day']>300]


# Check the brands
dataset.groupby(by="model_key").count()


Unnamed: 0_level_0,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
model_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alfa Romeo,3,3,3,3,3,3,3,3,3,3,3,3,3
Audi,526,526,526,526,526,526,526,526,526,526,526,526,526
BMW,827,827,827,827,827,827,827,827,827,827,827,827,827
Citroën,969,969,969,969,969,969,969,969,969,969,969,969,969
Ferrari,33,33,33,33,33,33,33,33,33,33,33,33,33
Fiat,2,2,2,2,2,2,2,2,2,2,2,2,2
Ford,5,5,5,5,5,5,5,5,5,5,5,5,5
Honda,1,1,1,1,1,1,1,1,1,1,1,1,1
KIA Motors,3,3,3,3,3,3,3,3,3,3,3,3,3
Lamborghini,2,2,2,2,2,2,2,2,2,2,2,2,2


In [458]:
# Outlier trimming on mileage, engine power and rental price: keep only the rows
# lying strictly within 3 standard deviations of the column mean.
# Columns are handled one after the other, so each column's mean/std is computed
# on the rows that survived the previous columns' filters.
list_col = ['mileage','engine_power','rental_price_per_day']
for col in list_col:
    col_mean = dataset[col].mean()
    col_std = dataset[col].std()
    lower_limit = col_mean - 3*col_std
    upper_limit = col_mean + 3*col_std
    within_bounds = (dataset[col] < upper_limit) & (dataset[col] > lower_limit)
    dataset = dataset[within_bounds]
dataset

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True,131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True,121
4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True,132
4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True,130
4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True,151


In [459]:
#In total, the cleaning removed about a hundred rows

In [460]:
#Ideas to try:
#- Group the rare car colors into an "Autre" (other) category
#- Group the rare fuel types into an "Autre" category
#- Group the rare brands into "Autre" => mandatory, because some brands appear only once
#- Create a column holding the number of options of each car

In [461]:
# Group the rare brands: a brand is kept as-is only when it has more than 50
# cars; every other brand is relabelled 'Autre' in the new 'marque' column.
# size() counts the rows of each group directly and returns one count per brand
# (a Series), instead of one non-null count per column of the frame.
serie = dataset.groupby(by="model_key").size()
marque_to_keep = serie[serie > 50].index.to_list()
# Vectorised relabelling: keep the brand where it is in the list, else 'Autre'
dataset['marque'] = dataset["model_key"].where(dataset["model_key"].isin(marque_to_keep), 'Autre')

In [462]:
# Brands kept under their own name (all the others are mapped to 'Autre')
marque_to_keep

['Audi',
 'BMW',
 'Citroën',
 'Mercedes',
 'Mitsubishi',
 'Nissan',
 'Peugeot',
 'Renault',
 'Volkswagen']

In [463]:
# Group the rare paint colors: colors with more than 300 cars keep their name,
# the others become 'Autre' in the new 'colorclean' column.
serie = dataset.groupby(by="paint_color").count().iloc[:,0]
color_to_keep = serie[serie > 300].index.to_list()
is_frequent_color = dataset["paint_color"].isin(color_to_keep)
dataset['colorclean'] = dataset["paint_color"].where(is_frequent_color, 'Autre')

In [464]:
# Rows per fuel type (non-null count of the first column within each group)
dataset.groupby(by="fuel").count().iloc[:,0]

fuel
diesel           4527
electro             3
hybrid_petrol       5
petrol            170
Name: model_key, dtype: int64

In [465]:
# Group the rare fuel types: fuels with more than 200 cars keep their name,
# the others become 'Autre' in the new 'fuelclean' column.
serie = dataset.groupby(by="fuel").count().iloc[:,0]
fuel_to_keep = serie[serie > 200].index.to_list()
is_frequent_fuel = dataset["fuel"].isin(fuel_to_keep)
dataset['fuelclean'] = dataset["fuel"].where(is_frequent_fuel, 'Autre')

In [466]:
# Number of options per car: row-wise sum of the boolean option flags
option_columns = [
    'private_parking_available',
    'has_gps',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]
dataset['nb_option'] = dataset[option_columns].sum(axis=1)

In [467]:
# Preview the first rows, including the engineered columns
dataset.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day,marque,colorclean,fuelclean,nb_option
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106,Citroën,black,diesel,5
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101,Citroën,white,diesel,2
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158,Citroën,Autre,diesel,5
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183,Citroën,silver,diesel,4
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True,131,Citroën,black,Autre,5


In [468]:
# List the column names available for feature selection
dataset.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day', 'marque',
       'colorclean', 'fuelclean', 'nb_option'],
      dtype='object')

In [469]:
# Split the frame into the feature matrix X and the target vector Y
print("Separating labels from features...")

target_variable = "rental_price_per_day"

# Full feature set: the raw columns, with the brand replaced by its grouped version 'marque'
features_list = [
    'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type',
    'private_parking_available', 'has_gps', 'has_air_conditioning',
    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
    'winter_tires', 'marque',
]

# Alternative: feature set with the rare categories grouped under "Autre"
#features_list = ['marque','mileage', 'engine_power', 'fuelclean', 'colorclean','nb_option']

# Alternative: feature set retained after the Lasso feature selection
#features_list = ['mileage', 'engine_power','car_type', 'private_parking_available', 'has_gps','automatic_car', 'has_getaround_connect','marque']

X = dataset[features_list]
Y = dataset[target_variable]

print("...Done.")
print()

# Preview both parts, with a blank line between them
print('Y : ', Y.head(), '', sep='\n')
print('X :', X.head(), sep='\n')

Separating labels from features...
...Done.

Y : 
0    106
2    101
3    158
4    183
5    131
Name: rental_price_per_day, dtype: int64

X :
   mileage  engine_power    fuel paint_color     car_type  \
0   140411           100  diesel       black  convertible   
2   183297           120  diesel       white  convertible   
3   128035           135  diesel         red  convertible   
4    97097           160  diesel      silver  convertible   
5   152352           225  petrol       black  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True                 False          False   
5                       True     True                 False          False   

   has_getaround_connec

In [470]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so the split is reproducible
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [471]:
# Sort the feature columns by dtype: a column is numeric when its dtype name
# contains 'float' or 'int'; every other column (strings, booleans) is treated
# as categorical.
numeric_features = [
    name
    for name, dtype in X.dtypes.items()
    if 'float' in str(dtype) or 'int' in str(dtype)
]
categorical_features = [name for name in X.columns if name not in numeric_features]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires', 'marque']


In [472]:
# Numeric columns: standardise
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: one-hot encode; the first level of each feature is
# dropped to avoid creating correlations between the encoded columns
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))])

# Route each group of columns to its transformer
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [473]:
# End-to-end model: preprocessing followed by a linear regression
model = Pipeline(steps=[('prep', preprocessor), ('reg', LinearRegression())])

In [474]:
# Disabled: the `model` pipeline applies the preprocessor itself, so X_train is left raw here.
#X_train = preprocessor.fit_transform(X_train)

In [475]:
# Disabled: the `model` pipeline applies the preprocessor itself, so X_test is left raw here.
#X_test = preprocessor.transform(X_test)

In [476]:
# Train the linear-regression pipeline inside an MLflow run and log its test R2
with mlflow.start_run(experiment_id = experiment.experiment_id):

    # Fit the whole pipeline (preprocessing + regression) on the raw training features
    model.fit(X_train, Y_train)

    # Evaluate on the held-out test set.
    # The score gets its own name: assigning it to `r2_score` would rebind the
    # imported sklearn function to a float and break every later call to it.
    Y_test_pred = model.predict(X_test)
    test_r2 = r2_score(Y_test, Y_test_pred)

    # Log Metric
    mlflow.log_metric("R2 SCORE", test_r2)


                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['mileage', 'engine_power']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(drop='first'))]),
                                 ['fuel', 'paint_color', 'car_type',
                                  'private_parking_...`
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['mileage', 'engine_power']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(drop='first'))]),
                                 ['fuel', 'paint_color', 'car_type',
                                  'private_parking_available'...`


In [309]:
# Ridge regression: tune alpha by 5-fold cross-validation and log the test R2 to MLflow.
# Ridge cannot consume the raw string/boolean columns, so the features are
# encoded and scaled first (fit on the training set only, then applied to the
# test set). The preprocessor is fit once on the whole training set, before the
# cross-validation folds are drawn.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

with mlflow.start_run(experiment_id = experiment.experiment_id):
    regressor = Ridge()
    # Grid of values to be tested
    params = {
        'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
    }
    best_ridge = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
    best_ridge.fit(X_train_prep, Y_train)

    # Score the best estimator on the held-out test set.
    # The score gets its own name: assigning it to `r2_score` would rebind the
    # imported sklearn function to a float and break every later call to it.
    Y_test_pred = best_ridge.predict(X_test_prep)
    ridge_test_r2 = r2_score(Y_test, Y_test_pred)

    # Log Metric
    mlflow.log_metric("R2 SCORE", ridge_test_r2)

2023/05/28 11:05:55 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


In [336]:
# Lasso regression: tune alpha by 5-fold cross-validation and log the test R2 to MLflow.
# Lasso cannot consume the raw string/boolean columns, so the features are
# encoded and scaled first (fit on the training set only, then applied to the
# test set). The preprocessor is fit once on the whole training set, before the
# cross-validation folds are drawn.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

with mlflow.start_run(experiment_id = experiment.experiment_id):
    regressor = Lasso()
    # Grid of values to be tested
    params = {
        'alpha': [1, 2, 3, 5, 10, 20, 30]
    }
    best_lasso = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
    best_lasso.fit(X_train_prep, Y_train)

    # Score the best estimator on the held-out test set.
    # The score gets its own name: assigning it to `r2_score` would rebind the
    # imported sklearn function to a float and break every later call to it.
    Y_test_pred = best_lasso.predict(X_test_prep)
    lasso_test_r2 = r2_score(Y_test, Y_test_pred)

    # Log Metric
    mlflow.log_metric("R2 SCORE", lasso_test_r2)

2023/05/28 11:06:17 INFO mlflow.sklearn.utils: Logging the 5 best runs, 2 runs will be omitted.


In [279]:

# Pair each preprocessed feature name with its coefficient in the best Lasso
# estimator (the Ridge column is disabled in this cell)
data_dict = {
    'Feature': preprocessor.get_feature_names_out(),
    #'Best_Ridge': best_ridge.best_estimator_.coef_,
    'Best_Lasso': best_lasso.best_estimator_.coef_
            }

In [282]:
# Put the coefficients of the tuned Ridge and Lasso estimators next to the names
# of the preprocessed features, then list the features selected by Lasso.
data_dict = {
    'Feature': preprocessor.get_feature_names_out(),
    'Best_Ridge': best_ridge.best_estimator_.coef_,
    'Best_Lasso': best_lasso.best_estimator_.coef_
            }

coefficients = pd.DataFrame(data = data_dict)

# A feature counts as selected when its Lasso coefficient is non-zero
mask = coefficients['Best_Lasso'] != 0
best_features = coefficients.loc[mask, 'Feature'].to_list()
best_features

['num__mileage',
 'num__engine_power',
 'cat__car_type_estate',
 'cat__car_type_suv',
 'cat__private_parking_available_True',
 'cat__has_gps_True',
 'cat__automatic_car_True',
 'cat__has_getaround_connect_True',
 'cat__marque_Citroën']

In [None]:
#It is quite impressive how well the selected features match real life.
#Criteria such as the fuel type matter little, and the same goes for the color
#Gearbox and GPS matter, but air conditioning does not

In [None]:
#Conclusion: plain linear regression gives us the best R2 score