In [0]:
# !pip install --upgrade tables
# !pip install eli5
# !pip install xgboost

In [3]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


In [4]:
cd "/content/drive/My Drive/Colab Notebooks/car_prices/Car_price_prediction/"

/content/drive/My Drive/Colab Notebooks/car_prices/Car_price_prediction


In [0]:
# Load the dataset from HDF5; the path is relative to the current working directory.
df = pd.read_hdf('data/car.h5')

In [6]:
df.shape

(106494, 155)

---
## Feature engineering

In [0]:
# Quick feature engineering: integer-encode every column that does not hold lists.
# A column that is not encoded yet gets a sibling column named '<column>__cat'.
# A column whose name already contains the suffix is overwritten in place, so a
# doubled '__cat__cat' column is never created.

SUFFIX_CAT = '__cat'

# Iterate over a snapshot of the column names: '__cat' columns are added to `df`
# inside the loop and must not be visited by this same pass.
for feat in list(df.columns):
  # Lists are unhashable and make factorize() raise TypeError, so list-valued
  # columns are skipped. `.iloc[0]` reads the first row by position, so the
  # check does not depend on the index containing the label 0.
  if isinstance(df[feat].iloc[0], list):
    continue

  factorized_values = df[feat].factorize()[0]

  if SUFFIX_CAT in feat:
    # Already an encoded column: overwrite it with fresh codes.
    df[feat] = factorized_values
  else:
    # Not encoded yet: store the codes next to the original column.
    df[feat + SUFFIX_CAT] = factorized_values

In [0]:
# Model inputs: every factorized column except those derived from the price,
# because 'price_value' is the prediction target.
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on selected columns of the global `df`.

  Parameters
  ----------
  model : estimator
      Unfitted scikit-learn compatible regressor.
  feats : list of str
      Names of the `df` columns used as input features.
  cv : int or cross-validation splitter, default 3
      Forwarded unchanged to `cross_val_score`.
  scoring : str, default 'neg_mean_absolute_error'
      Forwarded unchanged to `cross_val_score`.

  Returns
  -------
  tuple of (float, float)
      Mean and standard deviation of the per-fold scores. With the default
      scoring the values are negated mean absolute errors, so a value closer
      to zero is better.
  """
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

  return np.mean(scores), np.std(scores)

---
## Decision Tree

In [20]:
# Shallow single-tree baseline. `random_state` is pinned, as in the random
# forest and XGBoost runs, so the score is reproducible between executions.
run_model(model=DecisionTreeRegressor(max_depth=5, random_state=0), feats=cat_feats)

(-19695.13091100928, 148.72570644015792)

---
## Random forest

In [23]:
# Random-forest baseline scored on the same factorized features.
run_model(
    RandomForestRegressor(n_estimators=50, max_depth=5, random_state=0),
    cat_feats,
)

(-18718.657185256638, 64.5424578125788)

---
## XGBoost



In [10]:
# XGBoost baseline, using the same depth and number of trees as the random forest.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    seed=0,
)

model = xgb.XGBRegressor(**xgb_params)

run_model(model, cat_feats)



(-13108.379065811214, 74.32158265003798)

---
## Feature engineering for XGBoost

In [0]:
# Full factorized feature matrix and price target as NumPy arrays.
X, y = df[cat_feats].values, df['price_value'].values

In [18]:
# Rank the factorized features by their permutation importance for XGBoost.

xgb_params = dict(max_depth=5, n_estimators=50, learning_rate=0.1, seed=0)

model_2 = xgb.XGBRegressor(**xgb_params)
model_2.fit(X, y)

# NOTE(review): the importance is computed on the same rows the model was
# fitted on; a held-out split may rank the features differently - confirm.
imp = PermutationImportance(
    model_2,
    random_state=0,
).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


In [0]:
# Hand-picked feature subset, all still factorized.
feats = [
    # The ten features listed in the permutation-importance table, same order.
    'param_napęd__cat',
    'param_rok-produkcji__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    # Presumably the next entries of the same ranking - TODO confirm.
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [20]:
feats

['param_napęd__cat',
 'param_rok-produkcji__cat',
 'param_stan__cat',
 'param_skrzynia-biegów__cat',
 'param_faktura-vat__cat',
 'param_moc__cat',
 'param_marka-pojazdu__cat',
 'feature_kamera-cofania__cat',
 'param_typ__cat',
 'param_pojemność-skokowa__cat',
 'seller_name__cat',
 'feature_wspomaganie-kierownicy__cat',
 'param_model-pojazdu__cat',
 'param_wersja__cat',
 'param_kod-silnika__cat',
 'feature_system-start-stop__cat',
 'feature_asystent-pasa-ruchu__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_łopatki-zmiany-biegów__cat',
 'feature_regulowane-zawieszenie__cat']

In [21]:
# Score XGBoost on the hand-picked feature subset instead of all factorized columns.
run_model(model=xgb.XGBRegressor(**xgb_params), feats=feats)



(-13375.230420852275, 65.40441107118909)

In [22]:
df['param_napęd'].unique()

array([None, 'Na przednie koła', '4x4 (dołączany automatycznie)',
       'Na tylne koła', '4x4 (dołączany ręcznie)', '4x4 (stały)'],
      dtype=object)

In [23]:
df['param_rok-produkcji'].unique()

array(['2018', '2011', '2015', '2009', '2017', '2012', '2013', '2007',
       '2001', '2016', '2006', '2008', '2004', '1999', '2000', '2010',
       '2005', '2002', '1998', '2014', '2003', '1982', '1995', '1997',
       '1992', '1993', '1994', '1996', '1989', '1988', '1967', '1987',
       '1970', '1959', '1990', '1991', '1974', None, '1975', '1973',
       '1953', '1985', '1984', '1986', '1981', '1979', '1960', '1983',
       '1978', '1964', '1980', '1972', '1969', '1956', '1966', '1977',
       '1962', '1965', '1971', '1963', '1961', '1952', '1949', '1976',
       '1937', '1968', '1958', '1955', '1933', '1929', '1957', '1944',
       '1954', '1932', '1936', '1947', '1948'], dtype=object)

In [24]:
df['param_rok-produkcji__cat'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, -1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75])

In [0]:
# Factorizing destroyed the ordering of the years (0 -> 2018, 1 -> 2011, ...),
# so the raw column is converted to integers instead; missing years become -1.

def _parse_year(value):
  """Return the production year as an int, or -1 when the value is missing."""
  # `str(value) == 'None'` covers None (and the literal text 'None');
  # `pd.isna` additionally covers NaN, which int() cannot convert.
  if str(value) == 'None' or pd.isna(value):
    return -1
  return int(value)

df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_parse_year)

In [0]:
# Same subset, with the numeric 'param_rok-produkcji' replacing its '__cat' version.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',  # numeric year instead of the factorized code
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [27]:
# Re-score XGBoost with the production year as a number rather than a factorized code.
run_model(model=xgb.XGBRegressor(**xgb_params), feats=feats)



(-11308.885890938496, 27.868488259630677)

In [28]:
df['param_moc'].unique()[:20]

array(['90 KM', '115 KM', '262 KM', '110 KM', '310 KM', '105 KM',
       '140 KM', '175 KM', '125 KM', '185 KM', '190 KM', '440 KM',
       '141 KM', '200 KM', '224 KM', '75 KM', '99 KM', '184 KM', '109 KM',
       '233 KM'], dtype=object)

In [0]:
def _parse_power_km(value):
  """Convert a power entry such as '90 KM' to an int; missing values become -1.

  Values that are already numeric are passed through, so running this cell
  again on the converted column does not fail.
  """
  # `str(value) == 'None'` covers None; `pd.isna` additionally covers NaN.
  if str(value) == 'None' or pd.isna(value):
    return -1
  if isinstance(value, str):
    # The number is the part before the first space: '90 KM' -> 90.
    return int(value.split(' ')[0])
  return int(value)

df['param_moc'] = df['param_moc'].map(_parse_power_km)

In [0]:
# Same subset, with numeric 'param_rok-produkcji' and 'param_moc' replacing their '__cat' versions.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',  # numeric year
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',  # numeric power
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [34]:
# Re-score XGBoost with both the production year and the engine power as numbers.
run_model(model=xgb.XGBRegressor(**xgb_params), feats=feats)



(-9716.450230340148, 62.2164408321879)

In [36]:
df['param_pojemność-skokowa'].unique()[:20]

array(['898 cm3', '1 560 cm3', '3 000 cm3', '1 984 cm3', '1 598 cm3',
       '1 368 cm3', '1 995 cm3', '1 400 cm3', '2 200 cm3', '2 400 cm3',
       '1 968 cm3', '4 399 cm3', '1 800 cm3', '2 000 cm3', '2 967 cm3',
       '1 329 cm3', '1 390 cm3', '2 926 cm3', '1 896 cm3', '1 197 cm3'],
      dtype=object)

In [37]:
df['param_pojemność-skokowa__cat'].unique()[:20]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [0]:
def _parse_displacement_cm3(value):
  """Convert a displacement entry such as '1 560 cm3' to an int; missing values become -1.

  Values that are already numeric are passed through, so running this cell
  again on the converted column does not fail.
  """
  # `str(value) == 'None'` covers None; `pd.isna` additionally covers NaN.
  if str(value) == 'None' or pd.isna(value):
    return -1
  if isinstance(value, str):
    # Keep the part before the unit and drop the thousands-separator spaces:
    # '1 560 cm3' -> '1 560 ' -> '1560'.
    return int(value.split('cm')[0].replace(' ', ''))
  return int(value)

df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(_parse_displacement_cm3)

In [43]:
df['param_pojemność-skokowa'].unique()[:20]

array([ 898, 1560, 3000, 1984, 1598, 1368, 1995, 1400, 2200, 2400, 1968,
       4399, 1800, 2000, 2967, 1329, 1390, 2926, 1896, 1197])

In [0]:
# Same subset, with numeric 'param_rok-produkcji', 'param_moc' and
# 'param_pojemność-skokowa' replacing their '__cat' versions.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',  # numeric year
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',  # numeric power
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa',  # numeric displacement
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [45]:
# Re-score XGBoost with year, power and displacement all supplied as numbers.
run_model(model=xgb.XGBRegressor(**xgb_params), feats=feats)



(-9569.227198767323, 72.83561801421891)