In [6]:
cd '/content/drive/My Drive/Colab Notebooks/dw_matrix_car'

/content/drive/My Drive/Colab Notebooks/dw_matrix_car


In [7]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

Requirement already up-to-date: tables in /usr/local/lib/python3.6/dist-packages (3.6.1)


In [8]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance
import tensorflow
print(tensorflow.__version__)

import xgboost as xgb

Using TensorFlow backend.


1.15.0


In [9]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

# Feature Engineering

In [0]:
SUFFIX_CAT='__cat'
for feat in df.columns:
  if isinstance(df[feat][0], list): continue

  factorized_value=df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat]=factorized_value
  else:
    df[feat +SUFFIX_CAT]=factorized_value
  #print(feat)

In [11]:
cat_feats=[x for x in df.columns if SUFFIX_CAT in x]
cat_feats=[x for x in cat_feats if 'price' not in x]
cat_feats

['created_at__cat',
 'seller_address__cat',
 'seller_name__cat',
 'seller_type__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_poduszka-powietrzna-chroniąca-kolana__cat',
 'feature_kurtyny-powietrzne__cat',
 'feature_klimatyzacja-dwustrefowa__cat',
 'feature_światła-led__cat',
 'feature_czujnik-zmierzchu__cat',
 'feature_elektrycznie-ustawiane-lusterka__cat',
 'feature_asr-(kontrola-trakcji)__cat',
 'feature_poduszka-powietrzna-kierowcy__cat',
 'feature_cd__cat',
 'feature_elektryczne-szyby-przednie__cat',
 'feature_poduszka-powietrzna-pasażera__cat',
 'feature_system-start-stop__cat',
 'feature_światła-do-jazdy-dziennej__cat',
 'feature_komputer-pokładowy__cat',
 'feature_elektryczne-szyby-tylne__cat',
 'feature_klimatyzacja-manualna__cat',
 'feature_tapicerka-welurowa__cat',
 'feature_czujnik-deszczu__cat',
 'feature_światła-przeciwmgielne__cat',
 'feature_ogrzewanie-postojowe__cat',
 'feature_radio-niefabryczne__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_

In [12]:
X=df[cat_feats].values
y=df.price_value.values

model=DecisionTreeRegressor(max_depth=15)
scores=cross_val_score(model, X, y, cv=10, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)


#|max_depth|cv|     mean|    std|
#|        5| 5|-19651.81| 281.07|
#|        7| 5|-17342.16| 201.36|
#|       10| 5|-15489.60| 394.63|
#|       15| 5|-14943.41|1184.19|
#+---------+--+---------+-------+
#|        5| 3|-19695.13| 148.72|
#|        7| 3|-17715.81| 589.25|
#|       10| 3|-15987.08| 610.95|
#|       15| 3|-15210.58| 633.45|
#+---------+--+---------+-------+
#|        5|10|-19745.19| 246.43|
#|        7|10|-17400.25| 214.77|
#|       10|10|-15562.10| 250.89|
#|       15|10|-15210.58| 633.45|
#+---------+--+---------+-------+

(-14302.151567039007, 507.00677879892345)

In [13]:
m= DecisionTreeRegressor(max_depth=7)
m.fit(X,y)

imp = PermutationImportance(m, random_state=0).fit(X,y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.2755  ± 0.0099,param_rok-produkcji__cat
0.2736  ± 0.0239,param_stan__cat
0.2599  ± 0.0040,param_napęd__cat
0.1906  ± 0.0059,param_faktura-vat__cat
0.0789  ± 0.0037,param_moc__cat
0.0544  ± 0.0028,param_skrzynia-biegów__cat
0.0389  ± 0.0030,param_pojemność-skokowa__cat
0.0336  ± 0.0042,param_marka-pojazdu__cat
0.0331  ± 0.0022,feature_kamera-cofania__cat
0.0195  ± 0.0027,param_kod-silnika__cat


In [0]:
def run_model(model, feats, cv=10):
  X=df[feats].values
  y=df.price_value.values
  scores=cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)
  

#DecisionTree

In [34]:
run_model(model=DecisionTreeRegressor(max_depth=7), feats=cat_feats)

(-14247.531634286957, 438.98982919836754)

#RandomForest

In [14]:
run_model(model=RandomForestRegressor(max_depth=7, n_estimators=25, random_state=0), feats=cat_feats)

(-16280.716168398785, 191.0576886313013)

#XGBoost

In [15]:
xgb_param={
    'max_depth':7, 
    'n_estimators':25, 
    'learning_rate':0.1,
    'seed':0
}
m=run_model(model=xgb.XGBRegressor(**xgb_param), feats=cat_feats)



(-10956.56676303147, 117.05186004344151)

In [18]:
mxgb=xgb.XGBRegressor(**xgb_param)
X=df[cat_feats].values
y=df.price_value.values
mxgb.fit(X,y)
imp = PermutationImportance(mxgb, random_state=0).fit(X,y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1940  ± 0.0043,param_rok-produkcji__cat
0.1217  ± 0.0040,param_stan__cat
0.1076  ± 0.0020,param_napęd__cat
0.0883  ± 0.0030,param_skrzynia-biegów__cat
0.0597  ± 0.0016,param_moc__cat
0.0537  ± 0.0013,param_faktura-vat__cat
0.0425  ± 0.0012,param_marka-pojazdu__cat
0.0291  ± 0.0009,param_typ__cat
0.0247  ± 0.0011,param_pojemność-skokowa__cat
0.0206  ± 0.0012,feature_kamera-cofania__cat


In [0]:
xgb_feats=[
'param_rok-produkcji__cat',
'param_stan__cat',
'param_napęd__cat',
'param_skrzynia-biegów__cat',
'param_moc__cat',
'param_faktura-vat__cat',
'param_marka-pojazdu__cat',
'param_typ__cat',
'param_pojemność-skokowa__cat',
'feature_kamera-cofania__cat',
'param_wersja__cat',
'seller_name__cat',
'param_model-pojazdu__cat',
'feature_wspomaganie-kierownicy__cat',
'param_kod-silnika__cat',
'feature_czujniki-parkowania-przednie__cat',
'feature_system-start-stop__cat',
'param_przebieg__cat',
'param_color__cat',
'feature_asystent-pasa-ruchu__cat' ]

In [84]:
xgb_param2={
    'max_depth':7, 
    'n_estimators':50, 
    'learning_rate':0.1,
    'seed':0
}
run_model(model=xgb.XGBRegressor(**xgb_param), feats=xgb_feats)



(-8473.570175987838, 173.54971045738318)



```
1. 'param_rok-produkcji__cat' (-11163.01090813468, 154.46362335181163)
2. 'param_moc' (-8712.125804447995, 151.6976545668947)
3. 'param_przebieg__cat'(-8605.226467932622, 192.37815264047242)
4. 'param_pojemność-skokowa'(-8472.16504242178, 164.8653786320036)
5. 'param_color' (-8473.570175987838, 173.54971045738318)

```



In [62]:
(df['param_pojemność-skokowa'].unique())

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)

In [0]:
df['param_rok-produkcji__cat']=df['param_rok-produkcji'].map(lambda x: -1 if str(x)=='None' else int(x))

In [0]:
df.param_moc__cat=df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(x.replace(' KM', '').replace(' ', '')))

In [0]:
df.param_przebieg__cat=df['param_przebieg'].map(lambda x: -1 if str(x)=='None' else int(str(x).split('km')[0].replace(' ', '')))

In [0]:
df['param_pojemność-skokowa__cat'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else int(str(x).split('cm')[0].replace(' ', '')))
