In [12]:
import pandas as pd
import plotly.express as px
import json
import os

# Load the pricing dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    '../data/get_around_pricing_project.csv',
    index_col=0,
)

# Quick sanity check: first rows, then (n_rows, n_columns).
for preview in (data.head(), data.shape):
    print(preview)

  model_key  mileage  engine_power    fuel paint_color     car_type  \
0   Citroën   140411           100  diesel       black  convertible   
1   Citroën    13929           317  petrol        grey  convertible   
2   Citroën   183297           120  diesel       white  convertible   
3   Citroën   128035           135  diesel         red  convertible   
4   Citroën    97097           160  diesel      silver  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
1                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True                 False          False   

   has_getaround_connect  has_speed_regulator  winter_tires  \
0                   True                 

In [13]:
# Show the dtype pandas inferred for each column.
print(data.dtypes)


model_key                    object
mileage                       int64
engine_power                  int64
fuel                         object
paint_color                  object
car_type                     object
private_parking_available      bool
has_gps                        bool
has_air_conditioning           bool
automatic_car                  bool
has_getaround_connect          bool
has_speed_regulator            bool
winter_tires                   bool
rental_price_per_day          int64
dtype: object


In [14]:
# Summary statistics of the numeric columns.
# NOTE(review): the summary also reports a negative minimum mileage; it is
# left untouched here -- confirm how such rows should be handled.
print(data.describe())

# An engine power of 0 is treated as a missing value.
invalid_power = data['engine_power'] == 0
print(data[invalid_power])

# Median over the valid rows only, so the 0 placeholders do not bias the
# replacement value; rounded and cast to int so the column keeps its
# integer dtype instead of being assigned a float.
median_engine = int(round(data.loc[~invalid_power, 'engine_power'].median()))
data.loc[invalid_power, 'engine_power'] = median_engine

            mileage  engine_power  rental_price_per_day
count  4.843000e+03    4843.00000           4843.000000
mean   1.409628e+05     128.98823            121.214536
std    6.019674e+04      38.99336             33.568268
min   -6.400000e+01       0.00000             10.000000
25%    1.029135e+05     100.00000            104.000000
50%    1.410800e+05     120.00000            119.000000
75%    1.751955e+05     135.00000            136.000000
max    1.000376e+06     423.00000            422.000000
     model_key  mileage  engine_power    fuel paint_color car_type  \
3765    Nissan    81770             0  diesel       white      suv   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
3765                      False    False                 False          False   

      has_getaround_connect  has_speed_regulator  winter_tires  \
3765                  False                False         False   

      rental_price_per_day  
3765                   108  


In [15]:
# Count the missing values in each column.
data.isna().sum()

model_key                    0
mileage                      0
engine_power                 0
fuel                         0
paint_color                  0
car_type                     0
private_parking_available    0
has_gps                      0
has_air_conditioning         0
automatic_car                0
has_getaround_connect        0
has_speed_regulator          0
winter_tires                 0
rental_price_per_day         0
dtype: int64

In [16]:
# Number of cars per brand, most frequent first.
distri_model = data.model_key.value_counts()

# Name both columns explicitly: the labels produced by a bare
# value_counts().reset_index() differ between pandas versions, whereas the
# plot below needs exactly 'model_key' and 'count'.
distri_model_df = distri_model.rename_axis('model_key').reset_index(name='count')

fig = px.bar(distri_model_df, x='model_key', y='count', title='Distribution modèle de voiture')
fig.show()


In [17]:
# Average daily rental price per brand, most expensive brand first.
mean_by_model = (
    data
    .groupby('model_key')['rental_price_per_day']
    .mean()
    .reset_index()
    .sort_values('rental_price_per_day', ascending=False)
)

fig = px.bar(
    mean_by_model,
    x='model_key',
    y='rental_price_per_day',
    title="Moyenne prix location par marque de voiture",
)
fig.show()

In [18]:
# Display the column labels of the DataFrame.
data.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [19]:
# Distinct values of each categorical column, ordered by decreasing frequency.
# Each list is computed once and reused for both the printout and the export.
colors = data.paint_color.value_counts().index.to_list()
models = data.model_key.value_counts().index.to_list()
car_types = data.car_type.value_counts().index.to_list()
fuels = data.fuel.value_counts().index.to_list()

print(colors)
print(models)
print(len(models))
print(car_types)
print(fuels)

data_dashboard = {
    "colors": colors,
    "models": models,
    "car_type": car_types,
    "fuel": fuels,
}
json_data = json.dumps(data_dashboard)

# makedirs creates every missing parent directory and is a no-op when the
# directory already exists, so the cell can be re-run from a clean checkout.
output_dir = '../getaround/src/lib/server/data'
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'car_attributes.json'), 'w', encoding='utf-8') as w:
    w.write(json_data)

['black', 'grey', 'blue', 'white', 'brown', 'silver', 'red', 'beige', 'green', 'orange']
['Citroën', 'Renault', 'BMW', 'Peugeot', 'Audi', 'Nissan', 'Mitsubishi', 'Mercedes', 'Volkswagen', 'Toyota', 'SEAT', 'Subaru', 'PGO', 'Ferrari', 'Opel', 'Maserati', 'Suzuki', 'Porsche', 'Ford', 'KIA Motors', 'Alfa Romeo', 'Fiat', 'Lexus', 'Lamborghini', 'Mazda', 'Honda', 'Mini', 'Yamaha']
28
['estate', 'sedan', 'suv', 'hatchback', 'subcompact', 'coupe', 'convertible', 'van']
['diesel', 'petrol', 'hybrid_petrol', 'electro']


In [20]:
# Display the column labels of the DataFrame.
data.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [21]:
# Input columns used as model features.
features_to_keep = [
    'model_key',
    'mileage',
    'engine_power',
    'fuel',
    'paint_color',
    'car_type',
    'private_parking_available',
    'has_gps',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]

# Column the models are trained to predict.
target_feature = 'rental_price_per_day'



In [22]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

KeyboardInterrupt: 

In [None]:
y = data[target_feature].copy()
X = data.drop(target_feature,axis=1)
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

num_features = ['mileage', 'engine_power']
cat_features = ['model_key','fuel', 'paint_color','car_type']
bool_features = ['private_parking_available','has_gps','has_air_conditioning','automatic_car','has_getaround_connect','has_speed_regulator','winter_tires']

preprocessor = ColumnTransformer(transformers=[
    ('num',StandardScaler(),num_features),
    ("cat", OneHotEncoder(),cat_features,),
    ("bool","passthrough",bool_features)
])
model_logistic_regre = Pipeline([
    ('preprocessing', preprocessor),
    ('rf', LogisticRegression(max_iter=200))
])
model_forest = Pipeline([
    ('preprocessing', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=300, random_state=42))
])

In [None]:


model_logistic_regre.fit(x_train,y_train)

y_train_pred = model_logistic_regre.predict(x_train)
y_test_pred = model_logistic_regre.predict(x_test)

print(f"Accuracy de train :{accuracy_score(y_train,y_train_pred)}")
print(f"Accuracy de test : {accuracy_score(y_test,y_test_pred)}")

print(
    "Accuracy on training set : ", model_logistic_regre.score(x_train, y_train)
)
print("Accuracy on test set : ", model_logistic_regre.score(x_test, y_test))

In [None]:
# Train the random-forest pipeline on the training split.
model_forest.fit(x_train, y_train)

# Predictions on both splits, used to compare train and test errors.
y_train_pred, y_test_pred = model_forest.predict(x_train), model_forest.predict(x_test)

# Mean absolute error and coefficient of determination for each split.
mae_train, r2_train = mean_absolute_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
mae_test, r2_test = mean_absolute_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

for label, value in (
    ("MAE de train", mae_train),
    ("MAE de test", mae_test),
    ("R2 de train", r2_train),
    ("R2 de test", r2_test),
):
    print(f"{label} : {value}")

Le RandomForest donne de bons résultats, sans sembler trop sur-apprendre (overfitting).

En moyenne, le modèle se trompe d'environ 11 € sur le prix de location journalier. C'est raisonnable au regard de l'échelle des prix (prix moyen d'environ 121 € par jour).

In [None]:
import joblib

# makedirs creates any missing parent directory and does nothing when the
# directory already exists, so the cell is safe to re-run.
inference_dir = '../getaround/inference'
os.makedirs(inference_dir, exist_ok=True)

# Persist the fitted random-forest pipeline (preprocessing + model) to disk.
joblib.dump(model_forest, filename=os.path.join(inference_dir, 'infe_rental_price.joblib'))