In [2]:
# Install eli5 (used below for permutation importance). %pip, unlike !pip,
# installs into the environment of the running kernel.
%pip install eli5



In [4]:
# Change into the project directory so the relative data path used later resolves.
# Explicit %cd is used because the bare `cd` form depends on IPython automagic,
# which is only applied to single-line cells and can be disabled.
%cd /content/drive/MyDrive/Colab Notebooks/matrix_two/DW_Matrix_car

/content/drive/MyDrive/Colab Notebooks/matrix_two/DW_Matrix_car


In [30]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor 
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest,f_regression, mutual_info_regression

import eli5
from eli5.sklearn import PermutationImportance

## Wczytywanie Danych

In [6]:
# Load the dataset stored in HDF5 format; the path is relative to the current working directory.
df = pd.read_hdf('data/car.h5')
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(106494, 155)

In [9]:
# List every column name as a NumPy array to see which fields are available.
df.columns.values

array(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       'feature_kurtyny-powietrzne', 'feature_klimatyzacja-dwustrefowa',
       'feature_światła-led', 'feature_czujnik-zmierzchu',
       'feature_elektrycznie-ustawiane-lusterka',
       'feature_asr-(kontrola-trakcji)',
       'feature_poduszka-powietrzna-kierowcy', 'feature_cd',
       'feature_elektryczne-szyby-przednie',
       'feature_poduszka-powietrzna-pasażera',
       'feature_system-start-stop', 'feature_światła-do-jazdy-dziennej',
       'feature_komputer-pokładowy', 'feature_elektryczne-szyby-tylne',
       'feature_klimatyzacja-manualna', 'feature_tapicerka-welurowa',
       'feature_czujnik-deszczu', 'feature_światła-przeciwmgielne',
       'feature_ogrzewanie-postojowe', 'feature_radio-niefabryczne',
       'feature_regulowane-zaw

## Dummy model

In [11]:
def get_feats(df,black_list):
    """Return the names of numeric and boolean columns that are not black-listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    black_list : container
        Column names to exclude (for example the target column).

    Returns
    -------
    list
        Selected column names, in the order they appear in ``df``.
    """
    # Keep only numeric and bool columns.
    numeric_or_bool = df.select_dtypes(include = ['number','bool'])

    selected = []
    for column in numeric_or_bool.columns:
        if column in black_list:
            continue
        selected.append(column)
    return selected

In [15]:
# 'price_value' is excluded from the feature list via the black list.
black_list = ['price_value']
# Remaining numeric/bool columns become the input features.
feats = get_feats(df,black_list )

In [16]:
# Baseline: DummyRegressor ignores the features and predicts a constant
# (the mean of y with the default strategy), giving a reference MAE that
# any real model should beat.
X = df[feats].values
# 1-D target array, in the same form as the later modelling cells
# (previously a single-column DataFrame selected through black_list).
y = df['price_value'].values
model = DummyRegressor()
model.fit(X, y)
y_pred = model.predict(X)

# Error is measured on the same data the baseline was fitted on.
mae = mean_absolute_error(y, y_pred)
mae

39465.934630440985

## Przygotowanie Cech

In [20]:
# Suffix appended to the name of every integer-coded (factorized) column.
SUFFIX_CAT = '__cat'
def factorizing_columns(df):
  """Add integer-coded (factorized) versions of the columns of ``df``.

  For each column, the first value decides whether the column is encoded:
  if it is an instance of ``list``, ``bool``, ``int`` or ``float`` the column
  name is printed and the column is skipped. Otherwise the column is
  factorized with ``Series.factorize`` and the codes are stored either in
  place (when the name already contains ``SUFFIX_CAT``) or in a new column
  named ``<name> + SUFFIX_CAT``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to encode. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, for call chaining.

  Notes
  -----
  The type check uses Python built-in types only. NumPy scalars that do not
  subclass them (e.g. ``numpy.bool_``, ``numpy.int64``) are therefore not
  skipped and their columns are factorized.
  """
  # With no rows there is no first value to inspect and nothing to encode.
  if len(df) == 0:
    return df

  for feat in df.columns:
    # Positional access: df[feat][0] would be a *label* lookup and raise
    # KeyError whenever the index does not contain the label 0.
    first_value = df[feat].iloc[0]

    # Skip columns holding lists or built-in scalar values; report the name.
    if isinstance(first_value, (list, bool, int, float)):
      print(feat)
      continue

    factorized_values = df[feat].factorize()[0]
    if SUFFIX_CAT in feat:
      # Already a "__cat" column: overwrite it with the codes.
      df[feat] = factorized_values
    else:
      df[feat + SUFFIX_CAT] = factorized_values

  return df

In [22]:
# factorizing_columns modifies df in place and returns the same object;
# the names it prints are the columns it skipped.
df = factorizing_columns(df)

breadcrumb
price_value


In [23]:
# Collect every column whose name contains the factorized-column suffix.
cat_feats = [cat_feat for cat_feat in df.columns if SUFFIX_CAT in cat_feat]
# Number of factorized feature columns.
len(cat_feats)

153

In [28]:
# Features: all factorized columns; target: the price.
X = df[ cat_feats ].values
y = df['price_value'].values

# 3-fold cross-validated decision tree. The scorer is *negative* MAE,
# so the mean below is negative and values closer to 0 are better.
model = DecisionTreeRegressor(random_state = 0)
scores = cross_val_score(model, X, y, cv = 3, scoring = 'neg_mean_absolute_error')
np.mean(scores), np.std(scores)

(-16660.836319886566, 943.7852337459377)

In [29]:
# Depth-limited tree (max_depth=5) fitted on the full X, y.
m = DecisionTreeRegressor(max_depth = 5, random_state = 0)
m.fit(X,y)

# Permutation importance of each feature for the fitted tree.
# NOTE(review): importance is computed on the same data the tree was fitted on —
# confirm this is intended rather than using a held-out set.
imp = PermutationImportance(m, random_state= 0).fit(X, y)
eli5.show_weights(imp, feature_names = cat_feats)

Weight,Feature
0.1917  ± 0.0061,param_faktura-vat__cat
0.1898  ± 0.0032,param_napęd__cat
0.1769  ± 0.0062,param_stan__cat
0.1502  ± 0.0077,param_rok-produkcji__cat
0.0919  ± 0.0023,param_skrzynia-biegów__cat
0.0611  ± 0.0036,param_moc__cat
0.0419  ± 0.0011,feature_kamera-cofania__cat
0.0196  ± 0.0020,param_pojemność-skokowa__cat
0.0162  ± 0.0003,feature_bluetooth__cat
0.0107  ± 0.0004,feature_łopatki-zmiany-biegów__cat


In [34]:
def train_and_check(model, X, y, cv = 5, scoring= 'neg_mean_absolute_error'):
    """Cross-validate ``model`` and summarise the per-fold scores.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, y : array-like
        Feature matrix and target.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : str, default 'neg_mean_absolute_error'
        Forwarded to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    tuple
        ``(mean, standard deviation)`` of the fold scores.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    score_mean = np.mean(fold_scores)
    score_std = np.std(fold_scores)
    return score_mean, score_std

In [38]:
def es(model,k = 5):
    """Cross-validate ``model`` on the ``k`` best features picked by ``SelectKBest``.

    Features are scored with ``f_regression`` and the ``k`` highest-scoring
    columns are kept before the model is evaluated with ``train_and_check``.

    Reads the notebook-level globals ``X`` and ``y``; they must be defined
    before this function is called.

    Parameters
    ----------
    model : estimator
        Estimator to evaluate.
    k : int, default 5
        Number of features to keep.

    Returns
    -------
    tuple
        ``(mean, standard deviation)`` of the cross-validation scores.

    Notes
    -----
    NOTE(review): the selector is fitted on all of ``X``/``y`` before the
    cross-validation split, so feature selection sees the validation folds.
    Consider wrapping selector and model in a pipeline if an unbiased
    estimate is needed.
    """
    X_new = SelectKBest(f_regression, k = k).fit_transform(X, y)

    return train_and_check(model, X_new, y)

In [45]:
# Evaluate the estimator `m` using only the 10 highest-scoring features.
es(m,k = 10)

(-20487.43705631781, 290.5472853397143)