<a href="https://colab.research.google.com/github/MiM0ulay/Colab/blob/main/Car_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Phase d'expérimentation - Car Price Prediction

Ce notebook montre comment entraîner un Random Forest pour prédire le prix de vente d'une voiture à partir de ses caractéristiques.

In [None]:
# On télécharge les données
!wget https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/files/workshop_api_ml_cars.csv -O cars.csv -q

In [None]:
import warnings
warnings.filterwarnings('ignore')

#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the downloaded dataset and preview its first rows
cars = pd.read_csv('cars.csv')
cars.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## Nettoyage du jeu de données

In [None]:
# Replace CarName with the manufacturer name (the first word of CarName)
companies = cars['CarName'].apply(lambda x : x.split(' ')[0])
cars.insert(3, "companies", companies)
cars.drop(['CarName'], axis=1, inplace=True)

cars['companies'] = cars['companies'].str.lower()

def replace_name(a,b):
    """Replace every company name equal to `a` with `b` in `cars['companies']`.

    The result is assigned back to the column instead of calling
    `replace(..., inplace=True)` on `cars.companies`: that form mutates an
    intermediate Series, which pandas warns about and which leaves `cars`
    unchanged once Copy-on-Write is enabled.
    """
    cars['companies'] = cars['companies'].replace(a, b)

# Fix misspelled or abbreviated manufacturer names
replace_name('maxda','mazda')
replace_name('porcshce','porsche')
replace_name('toyouta','toyota')
replace_name('vokswagen','volkswagen')
replace_name('vw','volkswagen')

cars.companies.unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'buick', 'mercury', 'mitsubishi',
       'nissan', 'peugeot', 'plymouth', 'porsche', 'renault', 'saab',
       'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [None]:
# Are there any duplicated rows? (the first element of the shape is their count)
cars.loc[cars.duplicated()].shape

(0, 26)

In [None]:
# Combined fuel economy: 55% city mpg plus 45% highway mpg
cars['fueleconomy'] = cars['citympg'] * 0.55 + cars['highwaympg'] * 0.45

# Store the price as integers
cars['price'] = cars['price'].astype('int')

# Average price per company, joined back onto a temporary table.
# Both sides carry a `price` column, so the merge produces `price_x`
# (the car's own price) and `price_y` (the company average).
avg_price = cars.groupby('companies')['price'].mean()
temp = cars.merge(avg_price.reset_index(), how='left', on='companies')
temp.head()

Unnamed: 0,car_ID,symboling,companies,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price_x,fueleconomy,price_y
0,1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,23.7,15498.333333
1,2,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,23.7,15498.333333
2,3,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,22.15,15498.333333
3,4,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,26.7,17859.142857
4,5,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,19.8,17859.142857


In [None]:
# Save the per-company average prices so they can be reloaded later
avg_price.to_csv("avg_price.csv")

In [None]:
# Derive a price tier for each car's company from the company's average price.
# With right=False the intervals are [0, 10000), [10000, 20000) and [20000, 40000).
bins = [0, 10000, 20000, 40000]
cars_bin = ['cheap', 'medium', 'high']
cars['company_price'] = pd.cut(temp['price_y'], bins, right=False, labels=cars_bin)
cars.head()

Unnamed: 0,car_ID,symboling,companies,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,fueleconomy,company_price
0,1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,23.7,medium
1,2,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,23.7,medium
2,3,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,22.15,medium
3,4,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,26.7,medium
4,5,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,19.8,medium


In [None]:
# Keep only the target (price) and the features that go on to the encoding step
primary = cars[[
  'price', 'fueltype', 'aspiration','carbody', 'drivewheel','wheelbase',
  'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower', 
  'fueleconomy', 'carlength','carwidth', 'company_price'
]]
primary.head()

Unnamed: 0,price,fueltype,aspiration,carbody,drivewheel,wheelbase,curbweight,enginetype,cylindernumber,enginesize,boreratio,horsepower,fueleconomy,carlength,carwidth,company_price
0,13495,gas,std,convertible,rwd,88.6,2548,dohc,four,130,3.47,111,23.7,168.8,64.1,medium
1,16500,gas,std,convertible,rwd,88.6,2548,dohc,four,130,3.47,111,23.7,168.8,64.1,medium
2,16500,gas,std,hatchback,rwd,94.5,2823,ohcv,six,152,2.68,154,22.15,171.2,65.5,medium
3,13950,gas,std,sedan,fwd,99.8,2337,ohc,four,109,3.19,102,26.7,176.6,66.2,medium
4,17450,gas,std,sedan,4wd,99.4,2824,ohc,five,136,3.19,115,19.8,176.6,66.4,medium


## Encodage des variables

In [None]:
def dummies(col, df):
    """One-hot encode column `col` of `df`, dropping the first category.

    Returns a tuple made of a new frame, in which `col` is replaced by its
    indicator columns (appended at the end), and the array of the indicator
    column names. The frame passed in is left untouched.
    """
    indicators = pd.get_dummies(df[col], prefix=col, drop_first=True)
    encoded = pd.concat([df, indicators], axis=1).drop([col], axis=1)
    return encoded, indicators.columns.values

# Categorical columns to one-hot encode
dummy_cols = [
    "fueltype", "aspiration", "carbody", "drivewheel",
    "enginetype", "cylindernumber", "company_price"
]

# Names of every indicator column produced by the encoding, in creation order
new_dummies_cols = []

# Each pass replaces one categorical column of `primary` with its indicators
for col in dummy_cols:
  primary, new_cols = dummies(col, primary)
  new_dummies_cols += list(new_cols)

# Save the indicator column names as a comma-separated list
with open("dummies_cols.txt", "w") as f:
  f.write(",".join(new_dummies_cols))

primary.head()

Unnamed: 0,price,wheelbase,curbweight,enginesize,boreratio,horsepower,fueleconomy,carlength,carwidth,fueltype_gas,aspiration_turbo,carbody_hardtop,carbody_hatchback,carbody_sedan,carbody_wagon,drivewheel_fwd,drivewheel_rwd,enginetype_dohcv,enginetype_l,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,company_price_medium,company_price_high
0,13495,88.6,2548,130,3.47,111,23.7,168.8,64.1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,16500,88.6,2548,130,3.47,111,23.7,168.8,64.1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,16500,94.5,2823,152,2.68,154,22.15,171.2,65.5,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
3,13950,99.8,2337,109,3.19,102,26.7,176.6,66.2,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
4,17450,99.4,2824,136,3.19,115,19.8,176.6,66.4,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0


In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator
np.random.seed(0)

# Separate the features from the target, then hold out 30% of the rows for testing
X = primary.drop("price", axis=1)
y = primary["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [None]:
X_train.head()

Unnamed: 0,wheelbase,curbweight,enginesize,boreratio,horsepower,fueleconomy,carlength,carwidth,fueltype_gas,aspiration_turbo,carbody_hardtop,carbody_hatchback,carbody_sedan,carbody_wagon,drivewheel_fwd,drivewheel_rwd,enginetype_dohcv,enginetype_l,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,company_price_medium,company_price_high
122,93.7,2191,98,2.97,68,34.15,167.3,63.8,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
125,94.5,2778,151,3.94,143,22.6,168.9,68.3,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1
166,94.5,2300,98,3.24,112,27.35,168.7,64.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,88.6,2548,130,3.47,111,23.7,168.8,64.1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
199,104.3,3157,130,3.62,162,19.25,188.8,67.2,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [None]:
X_train.columns.values

array(['wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower',
       'fueleconomy', 'carlength', 'carwidth', 'fueltype_gas',
       'aspiration_turbo', 'carbody_hardtop', 'carbody_hatchback',
       'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd',
       'drivewheel_rwd', 'enginetype_dohcv', 'enginetype_l',
       'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three',
       'cylindernumber_twelve', 'cylindernumber_two',
       'company_price_medium', 'company_price_high'], dtype=object)

## Construction du modèle

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest hyper-parameters
hparams = {
    "n_estimators": 400,
    "max_depth": 12,
    "max_samples": 0.8,
    # Consider every feature at each split. This is what "auto" meant for a
    # regressor; the "auto" alias was deprecated in scikit-learn 1.1 and
    # removed in 1.3, whereas the float form is accepted by old and new versions.
    "max_features": 1.0
}

rf = RandomForestRegressor(**hparams)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=12, max_features='auto', max_leaf_nodes=None,
                      max_samples=0.8, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import r2_score

# R² of the fitted forest on the training and test sets, shown as percentages
print("Score train : {:2.1f}%".format(r2_score(y_train, rf.predict(X_train)) * 100))
print("Score test : {:2.1f}%".format(r2_score(y_test, rf.predict(X_test)) * 100))

Score train : 98.5%
Score test : 91.1%


In [None]:
import joblib

# Serialize the fitted model to model.pkl
joblib.dump(rf, "model.pkl")

['model.pkl']

In [None]:
!python3 -V

Python 3.7.10


## Encapsulation des transformations

In [None]:
sample = pd.read_csv("cars.csv").sample(10)

In [None]:
import re

# Company names, as listed by cars.companies.unique() after cleaning
COMPANIES = [
  'alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
  'isuzu', 'jaguar', 'mazda', 'buick', 'mercury', 'mitsubishi',
  'nissan', 'peugeot', 'plymouth', 'porsche', 'renault', 'saab',
  'subaru', 'toyota', 'volkswagen', 'volvo'
]
# Feature columns in the same order as X_train.columns (the input the model was fitted on)
TRUE_COLUMNS = ['wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower',
       'fueleconomy', 'carlength', 'carwidth', 'fueltype_gas',
       'aspiration_turbo', 'carbody_hardtop', 'carbody_hatchback',
       'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd',
       'drivewheel_rwd', 'enginetype_dohcv', 'enginetype_l',
       'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three',
       'cylindernumber_twelve', 'cylindernumber_two',
       'company_price_medium', 'company_price_high']

# Reload the artifacts written earlier in the notebook.
# NOTE(review): both names below rebind earlier globals. `avg_price` was the
# groupby Series and `dummies` was the encoding helper function, so the
# encoding cell cannot be re-run after this one without redefining `dummies`.
avg_price = pd.read_csv("avg_price.csv")
with open("dummies_cols.txt", "r") as f:
  dummies = f.read().split(",")

def transform(data, avg_price_table=None, columns=None):
  """Convert raw car rows into the feature matrix the model was trained on.

  Parameters
  ----------
  data : pd.DataFrame
      Raw rows holding `CarName`, `citympg`, `highwaympg` and the numeric and
      categorical columns listed in the body. A `price` column may be present
      but is not required; it is never part of the output.
  avg_price_table : pd.DataFrame or pd.Series, optional
      Average price per company, either as a frame with `companies` and
      `price` columns or as a Series indexed by company. Defaults to the
      module-level `avg_price`.
  columns : list of str, optional
      Output columns, in order. Defaults to the module-level `TRUE_COLUMNS`.

  Returns
  -------
  pd.DataFrame
      Exactly `columns`, in that order, indexed like `data`. The input frame
      is not modified.
  """
  if avg_price_table is None:
    avg_price_table = avg_price
  if columns is None:
    columns = TRUE_COLUMNS

  X = data.copy()

  # Company = first word of CarName, lower-cased, with known misspellings
  # and abbreviations mapped to the canonical name.
  X['companies'] = X['CarName'].str.split(' ').str[0].str.lower().replace({
      'maxda': 'mazda',
      'porcshce': 'porsche',
      'toyouta': 'toyota',
      'vokswagen': 'volkswagen',
      'vw': 'volkswagen',
  })

  X['fueleconomy'] = (0.55 * X['citympg']) + (0.45 * X['highwaympg'])

  # Accept both the frame read back from avg_price.csv and a company-indexed Series.
  if isinstance(avg_price_table, pd.DataFrame):
    avg_by_company = avg_price_table.set_index('companies')['price']
  else:
    avg_by_company = avg_price_table

  # Series.map keeps X's own index. A merge would hand back a fresh 0..n-1
  # index, and assigning its column to X would misalign every row of an input
  # that does not already use that index (e.g. the result of DataFrame.sample).
  company_avg = X['companies'].map(avg_by_company)

  bins = [0, 10000, 20000, 40000]
  cars_bin = ['cheap', 'medium', 'high']
  # Companies missing from the table get NaN here, hence 0 in every tier indicator.
  X['company_price'] = pd.cut(company_avg, bins, right=False, labels=cars_bin)

  numeric_cols = [
      'wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower',
      'fueleconomy', 'carlength', 'carwidth'
  ]
  dummy_cols = [
      "fueltype", "aspiration", "carbody", "drivewheel",
      "enginetype", "cylindernumber", "company_price"
  ]

  # Encode every category present, then align on the training columns.
  # drop_first is deliberately not used: on a small input it would drop the
  # first category *of that input*, which is not necessarily the category
  # dropped at training time. reindex discards the indicators the model does
  # not know and adds the ones absent from this input, filled with 0.
  encoded = pd.get_dummies(X[numeric_cols + dummy_cols], columns=dummy_cols, dtype=int)
  return encoded.reindex(columns=columns, fill_value=0)

psample = transform(sample)

In [None]:
rf.predict(psample)

array([16586.87416667,  7644.06690675, 15989.04916667,  9765.45104167,
        6580.63958333, 29011.91333333, 17740.8625    , 12605.9025    ,
        8194.22835119, 15844.44583333])

## Tester l'API en ligne

In [None]:
import requests

# TODO: replace with the URL provided by Heroku
ENDPOINT = "https://desolate-mountain-03377.herokuapp.com/"

sample = pd.read_csv("cars.csv").sample(10).to_dict()
# Strip any trailing slash so the request URL does not contain "//predict";
# the timeout keeps the cell from hanging if the service does not answer.
rep = requests.post("{}/predict".format(ENDPOINT.rstrip("/")), json=sample, timeout=30)
# Surface an HTTP error as such instead of a JSON decoding failure on an error page
rep.raise_for_status()
print(rep.json())

{'prices': [29669.98875, 7534.445400793651, 12589.903958333332, 12105.237708333332, 8133.348166666667, 17587.185416666664, 24787.59625, 29570.3575, 7049.3950198412695, 12046.47708333333]}
