In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

### Wczytywanie danych

In [20]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car


In [21]:
# Load the dataset from the HDF5 file; the path is relative to the current
# working directory.
df = pd.read_hdf('data/car.h5')
# Show (rows, columns) as a quick sanity check of what was loaded.
df.shape

(106494, 155)

In [22]:
# List the column labels to see which fields the dataset provides.
df.columns

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)

### Dummy Model

In [23]:
# Which columns are already numeric (and thus usable by a model as-is)?
df.select_dtypes(include=np.number).columns

Index(['price_value', 'car_id'], dtype='object')

In [0]:
# Baseline: a DummyRegressor fitted with `car_id` as its only input column and
# `price_value` as the target. Predictions are made for the same rows it was
# fitted on; they are used as the reference error for the later models.
features = ['car_id']
X, y = df[features].values, df['price_value'].values

model = DummyRegressor().fit(X, y)
y_pred = model.predict(X)

In [25]:
# Mean absolute error of the baseline predictions, measured on the same rows
# the baseline was fitted on.
mae(y, y_pred)

39465.934630440985

In [26]:
# Column names that contain the substring 'price'.
[column for column in df.columns if 'price' in column]

['price_currency', 'price_details', 'price_value']

In [31]:
# Percentage share of each value in `price_currency`.
# NOTE(review): the recorded output lists only PLN, and this cell's execution
# count (31) is higher than that of the EUR-filter cell that follows (29), so it
# was apparently last run on already-filtered data -- re-run the notebook in
# order to confirm.
print(df['price_currency'].value_counts(normalize=True)*100)

PLN    100.0
Name: price_currency, dtype: float64


In [29]:
# Drop the rows priced in EUR (presumably so that `price_value` is expressed in
# a single currency -- confirm).
# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['price_currency'] != 'EUR'].copy()
# Shape after filtering.
df.shape

(106290, 155)

### Features

In [33]:
# factorize() assigns an integer id to every distinct value of the column.
# It returns a pair: [0] holds the integer codes, [1] holds the original labels
# (here: the colours). Only the codes are taken.
df['param_color'].factorize()[0]

array([-1, -1, -1, ..., -1, -1, -1])

In [0]:
SUFFIX_CAT = '_cat'

# Encode every column as integer codes, stored in '<name>_cat'.
# Iterate over a snapshot of the labels because the loop adds columns to df.
for feature in list(df.columns):
  # Columns whose cells hold Python lists are skipped. The first row is read by
  # position (.iloc[0]) so the check keeps working even when the row labelled 0
  # has been removed by an earlier row filter.
  if isinstance(df[feature].iloc[0], list): continue

  factorized_values = df[feature].factorize()[0]
  if feature.endswith(SUFFIX_CAT):
    # The column is already an encoded one (the cell was re-run): update it in
    # place instead of creating '<name>_cat_cat'.
    df[feature] = factorized_values
  else:
    df[feature + SUFFIX_CAT] = factorized_values

In [36]:
# Model inputs: every column whose name contains the '_cat' suffix, except the
# ones whose name contains 'price' (those are derived from the target fields).
cat_features = [
  name for name in df.columns
  if SUFFIX_CAT in name and 'price' not in name
]
len(cat_features)

151

In [42]:
# Feature matrix from all encoded columns; target is the listing price.
X, y = df[cat_features].values, df['price_value'].values

# Depth-limited decision tree, scored with 3-fold cross-validation.
# The scorer returns negated MAE, hence the negative number displayed.
model = DecisionTreeRegressor(max_depth=5, random_state=0)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=3)
np.round(scores.mean(), 2)

-19566.59

### Wyszukiwanie najciekawszych cech (najbardziej wartościowych do stworzenia lepszego modelu)

In [44]:
# Fit the same depth-limited tree on all rows, then rank the input columns by
# permutation importance.
m = DecisionTreeRegressor(max_depth = 5, random_state= 0)
m.fit(X, y)

# random_state pins the random shuffles, so the weights and their ordering are
# reproducible between runs (the tree itself is already seeded).
imp = PermutationImportance(m, random_state = 0).fit(X, y)
eli5.show_weights(imp, feature_names = cat_features)

Weight,Feature
0.2531  ± 0.0044,param_napęd_cat
0.2022  ± 0.0075,param_faktura-vat_cat
0.1914  ± 0.0035,param_stan_cat
0.1458  ± 0.0025,param_rok-produkcji_cat
0.0633  ± 0.0038,param_moc_cat
0.0416  ± 0.0011,feature_kamera-cofania_cat
0.0414  ± 0.0015,param_skrzynia-biegów_cat
0.0285  ± 0.0053,param_marka-pojazdu_cat
0.0193  ± 0.0026,param_pojemność-skokowa_cat
0.0163  ± 0.0004,feature_bluetooth_cat


In [0]:
# The five highest-weighted columns from the permutation-importance table.
imp_features = [
  'param_napęd_cat',
  'param_faktura-vat_cat',
  'param_stan_cat',
  'param_rok-produkcji_cat',
  'param_moc_cat',
]
X_imp, y_imp = df[imp_features], df['price_value']

In [56]:
# Same tree settings and 3-fold scoring as for the full feature set, but using
# only the selected columns, so the two negated-MAE values are comparable.
imp_model = DecisionTreeRegressor(max_depth=5, random_state=0)
imp_scores = cross_val_score(
  imp_model, X_imp, y_imp, scoring='neg_mean_absolute_error', cv=3
)
np.round(imp_scores.mean(), 2)

-21391.6