In [None]:
cd /content/drive/MyDrive/Colab Notebooks/matrix_two/DW_Matrix_car

/content/drive/MyDrive/Colab Notebooks/matrix_two/DW_Matrix_car


In [35]:
import pandas as pd
import numpy as np 

from sklearn.dummy import DummyRegressor 
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest,f_regression, mutual_info_regression

import eli5
from eli5.sklearn import PermutationImportance

from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, space_eval

## Wczytywanie Danych

In [None]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

## Przygotowanie Cech

In [None]:
def get_feats(df,black_list):
    # Wybieramy zmienne numeryczne i bool
    feats = list(df.select_dtypes(include = ['number','bool']).columns)
    
    valid_feats = [feat for feat in feats if feat not in black_list]
    return valid_feats

In [None]:
black_list = ['price_value']
feats = get_feats(df,black_list )

In [None]:
SUFFIX_CAT = '__cat'
def factorizing_columns(df):
  for feat in df.columns:
    # jezeli kolumna to lista, ignoruj
    if isinstance( df[feat][0], (list, bool, int, float ) ): print(feat); continue  

    factorized_values = df[feat].factorize()[0]
    if SUFFIX_CAT in feat:
      df[feat] = factorized_values
    else:
      df[ feat + SUFFIX_CAT] = factorized_values

  return df

In [None]:
df = factorizing_columns(df)

breadcrumb
price_value


  # This is added back by InteractiveShellApp.init_path()


In [None]:
cat_feats = [cat_feat for cat_feat in df.columns if SUFFIX_CAT in cat_feat]
len(cat_feats)

153

In [None]:
def show_importance(model, df, feats, target):
  X = df[ feats ].values
  y = df[ target ].values
  model.fit(X,y)

  imp = PermutationImportance(model, random_state= 0).fit(X, y)
  return eli5.show_weights(imp, feature_names = feats)

In [None]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'random_state': 0,
    'objective': 'reg:squarederror'
}

model = XGBRegressor(**xgb_params)

In [None]:
show_importance(model, df, cat_feats,'price_value')

Weight,Feature
0.1281  ± 0.0028,param_napęd__cat
0.1059  ± 0.0036,param_stan__cat
0.0912  ± 0.0014,param_rok-produkcji__cat
0.0598  ± 0.0010,param_skrzynia-biegów__cat
0.0494  ± 0.0016,param_moc__cat
0.0492  ± 0.0005,param_faktura-vat__cat
0.0240  ± 0.0010,feature_kamera-cofania__cat
0.0235  ± 0.0009,param_marka-pojazdu__cat
0.0199  ± 0.0006,param_typ__cat
0.0189  ± 0.0024,seller_name__cat


In [None]:
important_feats = ['param_napęd__cat','param_stan__cat','param_rok-produkcji__cat','param_skrzynia-biegów__cat','param_moc__cat','param_faktura-vat__cat','feature_kamera-cofania__cat','param_marka-pojazdu__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa__cat','param_wersja__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_światła-led__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat']

In [None]:
def train_and_check(model, X, y, cv = 3, scoring= 'neg_mean_absolute_error'):
    scores = cross_val_score(model, X, y, scoring= scoring, cv = cv, )
    return np.mean(scores),np.std(scores)

In [None]:
def run_model(model, df, feats, target,cv = 5, scoring= 'neg_mean_absolute_error'):
  X = df[ feats ].values
  y = df[ target ].values

  return train_and_check(model, X, y)

In [None]:
# model ze wszystkimi cechami
run_model(model,df,cat_feats,'price_value')

(-13105.09658540803, 66.26770659640614)

In [None]:
# model z top 20 najwazniejszymi cechami
run_model(model,df,important_feats,'price_value')

(-13371.749129325122, 120.74632178909155)

In [None]:
def es(model, df, feats, target, k = 5):
    X = df[ feats ].values
    y = df[ target ].values
    X_new = SelectKBest(mutual_info_regression, k = k).fit_transform(X, y)
    
    return train_and_check(model,X_new ,y)

In [None]:
es(model,df,cat_feats,'price_value', k = 20)

(-16834.07358516897, 41.16506732734249)

lepszy wynik przy trenowaniu modelu z top 20 cech z eli5 niz SelectKBest z biblioteki sklearn

### Przywrocenie oryginalnych wartosci dla kolumn numerycznych i sprawdzenie jakosci modelu

In [None]:
df['param_rok-produkcji'].unique()

array(['2018', '2011', '2015', '2009', '2017', '2012', '2013', '2007',
       '2001', '2016', '2006', '2008', '2004', '1999', '2000', '2010',
       '2005', '2002', '1998', '2014', '2003', '1982', '1995', '1997',
       '1992', '1993', '1994', '1996', '1989', '1988', '1967', '1987',
       '1970', '1959', '1990', '1991', '1974', None, '1975', '1973',
       '1953', '1985', '1984', '1986', '1981', '1979', '1960', '1983',
       '1978', '1964', '1980', '1972', '1969', '1956', '1966', '1977',
       '1962', '1965', '1971', '1963', '1961', '1952', '1949', '1976',
       '1937', '1968', '1958', '1955', '1933', '1929', '1957', '1944',
       '1954', '1932', '1936', '1947', '1948'], dtype=object)

In [None]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x))

In [None]:
important_feats2 = ['param_napęd__cat','param_stan__cat','param_rok-produkcji','param_skrzynia-biegów__cat','param_moc__cat','param_faktura-vat__cat','feature_kamera-cofania__cat','param_marka-pojazdu__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa__cat','param_wersja__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_światła-led__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat']

In [None]:
# model z top 20 najwazniejszymi cechami, z oryginalnym rokiem produkcji
run_model(model,df,important_feats2,'price_value')

(-11386.987581301895, 57.94138820640305)

In [None]:
df['param_moc'].unique()[:10]

array(['90 KM', '115 KM', '262 KM', '110 KM', '310 KM', '105 KM',
       '140 KM', '175 KM', '125 KM', '185 KM'], dtype=object)

In [None]:
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(x.split(' ')[0])) 

In [None]:
important_feats3 = ['param_napęd__cat','param_stan__cat','param_rok-produkcji','param_skrzynia-biegów__cat','param_moc','param_faktura-vat__cat','feature_kamera-cofania__cat','param_marka-pojazdu__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa__cat','param_wersja__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_światła-led__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat']

In [None]:
# model z top 20 najwazniejszymi cechami, z oryginalnym rokiem produkcji, z moca jako int
run_model(model,df,important_feats3,'price_value')

(-9726.16386098245, 80.75857928675383)

In [None]:
df['param_pojemność-skokowa'].unique()[:10]

array(['898 cm3', '1 560 cm3', '3 000 cm3', '1 984 cm3', '1 598 cm3',
       '1 368 cm3', '1 995 cm3', '1 400 cm3', '2 200 cm3', '2 400 cm3'],
      dtype=object)

In [None]:
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else int(x.split('cm')[0].replace(' ',''))) 

In [None]:
important_feats4 = ['param_napęd__cat','param_stan__cat','param_rok-produkcji','param_skrzynia-biegów__cat','param_moc','param_faktura-vat__cat','feature_kamera-cofania__cat','param_marka-pojazdu__cat','param_typ__cat','seller_name__cat','param_pojemność-skokowa','param_wersja__cat','feature_wspomaganie-kierownicy__cat','param_model-pojazdu__cat','param_kod-silnika__cat','feature_system-start-stop__cat','feature_światła-led__cat','feature_asystent-pasa-ruchu__cat','feature_czujniki-parkowania-przednie__cat','feature_łopatki-zmiany-biegów__cat']

In [None]:
# model z top 20 najwazniejszymi cechami, z oryginalnym rokiem produkcji, z moca jako int i pojemnoscia jako int
run_model(model,df,important_feats4,'price_value')

(-9556.426781679105, 101.5185930825203)

## Hyperopt

In [44]:
def obj_func(params):
  print("Training with params: ")
  print(params)

   
  try:
    mean_mae, mean_std = run_model(XGBRegressor(**params), df, important_feats4, target = 'price_value',cv = 5, scoring= 'neg_mean_absolute_error')
    return {'loss': np.abs(mean_mae), "status": STATUS_OK}
  except:
    return {"loss: ", np.abs(mean_mae), "status: ", STATUS_FAIL}


# space
xgb_reg_params = {
    'learning_rate':           hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':               hp.choice('max_depth',        np.arange(5, 16, 1, dtype = int)),
    'subsample':               hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree':        hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'objective':               'reg:squarederror',
    'n_estimators':            100,
    'seed': 0,
}

best = fmin(obj_func, xgb_reg_params, algo = tpe.suggest, max_evals = 25)

space_eval(xgb_reg_params, best)

Training with params: 
{'colsample_bytree': 0.5, 'learning_rate': 0.25, 'max_depth': 9, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}
Training with params: 
{'colsample_bytree': 0.75, 'learning_rate': 0.25, 'max_depth': 9, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params: 
{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.3, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}
Training with params: 
{'colsample_bytree': 0.65, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}
Training with params: 
{'colsample_bytree': 0.7000000

{'colsample_bytree': 0.65,
 'learning_rate': 0.05,
 'max_depth': 14,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'seed': 0,
 'subsample': 0.9}

## Najlepsze parametry dla algorytmu XGBoost
#### Dla ktorych <b> loss </b> wyniosl 7561
{'colsample_bytree': 0.65,
 'learning_rate': 0.05,
 'max_depth': 14,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'seed': 0,
 'subsample': 0.9}