In [1]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 2.6MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 2.9MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix2/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix2/dw_matrix_car


In [0]:
df = pd.read_hdf('data/car.h5')

In [0]:
suffix_cat = '__cat'

for feat in df.columns:
  if isinstance(df[feat][0], list): continue

  factorized_values = df[feat].factorize()[0]
  if suffix_cat in feat:
    df[feat] = factorized_values
  else:
    df[feat + suffix_cat] = factorized_values

In [31]:
cat_feats = [x for x in df.columns if suffix_cat in x]
cat_feats = [x for x in cat_feats if 'price' not in x]
len(cat_feats)

151

In [0]:
def run_model(model, feats):
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X,y, cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores),np.std(scores)


Decision Tree

In [12]:
run_model(DecisionTreeRegressor(max_depth=5), cat_feats)

(-19695.13091100928, 148.72570644015792)

Random forest

In [13]:
model = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0)
run_model(model, cat_feats)

(-18718.657185256638, 64.5424578125788)

XGBoost

In [43]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0
}
run_model(xgb.XGBRegressor(**xgb_params), cat_feats)



(-13108.379065811214, 74.32158265003798)

In [16]:
m = xgb.XGBRegressor(max_depth = 5, n_estimators = 50, learning_rate =  0.1, seed = 0)
m.fit(X,y)

imp = PermutationImportance(m).fit(X,y)
eli5.show_weights(imp,feature_names=cat_feats)



Weight,Feature
0.1184  ± 0.0012,param_napęd__cat
0.1129  ± 0.0019,param_rok-produkcji__cat
0.1087  ± 0.0012,param_stan__cat
0.0615  ± 0.0012,param_skrzynia-biegów__cat
0.0567  ± 0.0011,param_faktura-vat__cat
0.0492  ± 0.0018,param_moc__cat
0.0274  ± 0.0007,param_marka-pojazdu__cat
0.0239  ± 0.0005,feature_kamera-cofania__cat
0.0209  ± 0.0006,param_typ__cat
0.0174  ± 0.0007,seller_name__cat


In [21]:
feats = ['param_napęd__cat','param_rok-produkcji','param_stan__cat','param_skrzynia-biegów__cat','param_faktura-vat__cat','param_moc__cat','param_marka-pojazdu__cat','feature_kamera-cofania__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_wersja__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat','feature_regulowane-zawieszenie__cat' ]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-13108.379065811214, 74.32158265003798)

In [0]:
#df['param_rok-produkcji'].unique()
#df['param_moc'].unique()

df['param_moc']= df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(x.split(' ')[0]))
df['param_rok-produkcji']= df['param_rok-produkcji'].map(lambda x: -1 if str(x)=='None' else int(x))


In [40]:
feats = ['param_napęd__cat','param_rok-produkcji','param_stan__cat','param_skrzynia-biegów__cat','param_faktura-vat__cat','param_moc','param_marka-pojazdu__cat','feature_kamera-cofania__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_wersja__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat','feature_regulowane-zawieszenie__cat' ]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-13108.379065811214, 74.32158265003798)

In [0]:
'param_pojemność-skokowa__cat'
df['param_pojemność-skokowa']= df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else int(x.split('cm')[0].replace(' ','')) )

In [45]:
feats = ['param_napęd__cat','param_rok-produkcji','param_stan__cat','param_skrzynia-biegów__cat','param_faktura-vat__cat','param_moc','param_marka-pojazdu__cat','feature_kamera-cofania__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_wersja__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat','feature_regulowane-zawieszenie__cat' ]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9569.227198767323, 72.83561801421891)