In [1]:
import random

import dalex as dx
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import model_selection

# Activate seaborn's default theme (called with no arguments) once, right after the imports.
sns.set_theme()

  from pandas import MultiIndex, Int64Index


In [2]:
def seed_everything(seed: int) -> int:
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.

    Returns:
        The same ``seed``, so the call can be used as an expression.
    """
    # Same order as before: stdlib generator first, then NumPy's legacy global one.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    return seed

In [3]:
# Single seed reused by the data splits, the XGBoost models and the dalex model_parts calls below.
SEED = 357
# Last expression of the cell, so the notebook echoes the seed that was applied.
seed_everything(SEED)

357

In [4]:
# Raw modelling table; later cells read the target from column 'Y' and features named 'X<number>'.
data = pd.read_csv('../data/raw/temat_2_dane.csv')
# Variable dictionary; later cells look up the 'OPIS' text of a variable by its 'NAZWA' value.
data_cols_description = pd.read_excel('../data/raw/temat_2_opis_zmiennych.xlsx')

In [6]:
def train_test_split_by_income(data, test_size=0.2, bins=5):
    """Split ``data`` into two parts, stratified on quantile bins of column ``'Y'``.

    Args:
        data: DataFrame that contains the target column ``'Y'``.
        test_size: Share of rows placed in the second returned part.
        bins: Number of quantile bins of ``'Y'`` used as stratification classes.

    Returns:
        The ``[first_part, second_part]`` list produced by
        ``model_selection.train_test_split``.

    The split is shuffled and seeded with the notebook-level ``SEED``.
    """
    # duplicates='drop' merges bins whose quantile edges coincide (many tied
    # 'Y' values) instead of raising ValueError; when every edge is unique the
    # bins are exactly the ones qcut produced before.
    income_bined = pd.qcut(data['Y'], q=bins, duplicates='drop')

    return model_selection.train_test_split(data, test_size=test_size, shuffle=True, stratify=income_bined, random_state=SEED)

In [7]:
# Hold out 20% of the rows (the helper's default test_size) as the test set.
data_train_val, data_test = train_test_split_by_income(data)
# Carve a further 10% of the remaining rows out as the early-stopping validation set.
data_train, data_early_stopping = train_test_split_by_income(data_train_val, test_size=0.10)

In [8]:
# Separate the feature columns from the target column 'Y' for each of the three parts.
X_train = data_train.drop(columns='Y')
y_train = data_train['Y']

X_early_stopping = data_early_stopping.drop(columns='Y')
y_early_stopping = data_early_stopping['Y']

X_test = data_test.drop(columns='Y')
y_test = data_test['Y']

In [9]:
# Hyper-parameters unpacked into every XGBRegressor in this notebook.
# NOTE(review): presumably the result of a tuning run that is not part of this notebook — confirm.
xgb_best_params = {
    "max_depth": 23,
    "min_child_weight": 2.404712074481848,
    "colsample_bytree": 0.5095502611913834,
    "learning_rate": 0.0449682801162507,
    "subsample": 0.9196040393707718,
    "alpha": 0.0524693644585006,
    "lambda": 4.5586068865713215e-07,
}

In [10]:
# Main model: XGBoost regressor fitted on the log10(Y + 1) target.
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10000,  # upper bound; early stopping in fit() may end training sooner
    random_state=SEED,
    verbosity=0,
    **xgb_best_params,
)
# Validation RMSE is measured on the early-stopping split, on the same log10 scale.
# Kept as the last expression so the cell still shows the fitted estimator.
model_xgb.fit(
    X_train,
    np.log10(y_train + 1),
    eval_set=[(X_early_stopping, np.log10(y_early_stopping + 1))],
    eval_metric='rmse',
    early_stopping_rounds=50,
)

[0]	validation_0-rmse:3.19415
[1]	validation_0-rmse:3.05154
[2]	validation_0-rmse:2.91495
[3]	validation_0-rmse:2.78471
[4]	validation_0-rmse:2.66016
[5]	validation_0-rmse:2.54097


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[6]	validation_0-rmse:2.42728
[7]	validation_0-rmse:2.31896
[8]	validation_0-rmse:2.21574
[9]	validation_0-rmse:2.11685
[10]	validation_0-rmse:2.02232
[11]	validation_0-rmse:1.93205
[12]	validation_0-rmse:1.84585
[13]	validation_0-rmse:1.76426
[14]	validation_0-rmse:1.68575
[15]	validation_0-rmse:1.61139
[16]	validation_0-rmse:1.54032
[17]	validation_0-rmse:1.47222
[18]	validation_0-rmse:1.40725
[19]	validation_0-rmse:1.34509
[20]	validation_0-rmse:1.28636
[21]	validation_0-rmse:1.22977
[22]	validation_0-rmse:1.17565
[23]	validation_0-rmse:1.12455
[24]	validation_0-rmse:1.07602
[25]	validation_0-rmse:1.02982
[26]	validation_0-rmse:0.98562
[27]	validation_0-rmse:0.94342
[28]	validation_0-rmse:0.90295
[29]	validation_0-rmse:0.86451
[30]	validation_0-rmse:0.82826
[31]	validation_0-rmse:0.79375
[32]	validation_0-rmse:0.76081
[33]	validation_0-rmse:0.72948
[34]	validation_0-rmse:0.69964
[35]	validation_0-rmse:0.67090
[36]	validation_0-rmse:0.64402
[37]	validation_0-rmse:0.61829
[38]	validat

XGBRegressor(alpha=0.0524693644585006, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.5095502611913834, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', lambda=4.5586068865713215e-07,
             learning_rate=0.0449682801162507, max_delta_step=0, max_depth=23,
             min_child_weight=2.404712074481848, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=357,
             reg_alpha=0.0524693653, reg_lambda=4.55860715e-07,
             scale_pos_weight=1, subsample=0.9196040393707718,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [11]:
def predict(model, data):
    """Return ``model`` predictions mapped back to the original ``Y`` scale.

    The regressors in this notebook are fitted on ``log10(Y + 1)``, so the raw
    output of ``model.predict`` is inverted with ``10 ** p - 1``.

    Args:
        model: Object exposing ``predict(data)`` that returns log10-scale values.
        data: Feature matrix passed through to ``model.predict`` unchanged.

    Returns:
        ``10 ** model.predict(data) - 1``.
    """
    return 10 ** model.predict(data) - 1

In [12]:
# dalex explainer for the main model; `predict` converts the log10-scale output
# back to the scale of y_train, so residuals are computed on the original scale.
explainer_xgb = dx.Explainer(
    model_xgb,
    data=X_train,
    y=y_train,
    predict_function=predict,
    label='XGBoost',
)

Preparation of a new explainer is initiated

  -> data              : 7200 rows 295 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 7200 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : XGBoost
  -> predict function  : <function <lambda> at 0x00000121839C4C10> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 1.4e+02, mean = 8.91e+03, max = 7.06e+05
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -5.92e+04, mean = 5.81e+02, max = 1.36e+06
  -> model_info        : package xgboost

A new explainer has been created!


In [63]:
# Map a group name to the feature columns that are permuted together.
# 'X1' forms its own group; X2..X295 are taken as 42 consecutive blocks of 7 columns.
variable_groups = {'wiek': ['X1']}
for first_idx in range(2, 2 + 7 * 42, 7):
    members = [f'X{first_idx + offset}' for offset in range(7)]

    # The group name comes from the description of the block's first column.
    is_first_member = data_cols_description['NAZWA'] == members[0]
    full_text = data_cols_description.loc[is_first_member, 'OPIS'].values[0]

    # NOTE(review): [11:-27] drops a fixed-length prefix and suffix — presumably
    # boilerplate shared by every description; verify against the spreadsheet.
    variable_groups[full_text[11:-27]] = members

In [67]:
# Grouped permutation importance of the main model, measured with RMSE.
# N=None is passed so that dalex does not subsample the explainer's data.
mp = explainer_xgb.model_parts(N=None,
                               variable_groups=variable_groups,
                               loss_function='rmse',
                               random_state=SEED)
# show=False returns the figure object instead of rendering it inline.
plot = mp.plot(show=False)
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
# NOTE(review): the 'orca' engine is marked as legacy in recent plotly releases
# in favour of kaleido — confirm which one is installed before switching.
plot.write_image(file='../reports/figures/all.png', engine='orca', scale=4.0)

In [68]:
# Segment thresholds, kept in one place so the train and early-stopping masks cannot drift apart.
SEGMENT_AGE_THRESHOLD = 28              # applied to column 'X1' (the 'wiek' variable)
SEGMENT_YOUNG_X292_THRESHOLD = 4306.5   # 'X292' cut-off used when X1 <= 28
SEGMENT_OLD_X292_THRESHOLD = 5756.5     # 'X292' cut-off used when X1 > 28


def build_segment_masks(X):
    """Return the four boolean row masks that partition ``X`` by ``'X1'`` and ``'X292'``.

    Args:
        X: DataFrame with columns ``'X1'`` and ``'X292'``.

    Returns:
        Tuple ``(segment_1, segment_2, segment_3, segment_4)`` of boolean Series:
        X1 <= 28 with low / high X292, then X1 > 28 with low / high X292. Each
        age group uses its own X292 threshold.
    """
    x1, x292 = X['X1'], X['X292']
    # Explicit <= / > comparisons (not negation) so rows with NaN stay outside
    # every segment, exactly as with the original hand-written expressions.
    young, old = x1 <= SEGMENT_AGE_THRESHOLD, x1 > SEGMENT_AGE_THRESHOLD
    return (
        young & (x292 <= SEGMENT_YOUNG_X292_THRESHOLD),
        young & (x292 > SEGMENT_YOUNG_X292_THRESHOLD),
        old & (x292 <= SEGMENT_OLD_X292_THRESHOLD),
        old & (x292 > SEGMENT_OLD_X292_THRESHOLD),
    )


segment_1_train, segment_2_train, segment_3_train, segment_4_train = build_segment_masks(X_train)

(segment_1_early_stopping,
 segment_2_early_stopping,
 segment_3_early_stopping,
 segment_4_early_stopping) = build_segment_masks(X_early_stopping)

In [70]:
# Fit one XGBoost model per population segment and keep a dalex explainer for each,
# keyed by the segment label.
explainers = {}
for idx_train, idx_early_stopping, label in [
        (segment_1_train, segment_1_early_stopping, 'young low-income'),
        (segment_2_train, segment_2_early_stopping, 'young high-income'),
        (segment_3_train, segment_3_early_stopping, 'old low-income'),
        (segment_4_train, segment_4_early_stopping, 'old high-income'),
]:
    # Same hyper-parameters and log10(Y + 1) target as the main model, restricted
    # to the rows of the current segment for both training and early stopping.
    regressor = xgb.XGBRegressor(verbosity=0, objective='reg:squarederror', n_estimators=10000, **xgb_best_params, random_state=SEED)
    regressor.fit(X_train[idx_train], np.log10(y_train[idx_train] + 1), eval_set=[(X_early_stopping[idx_early_stopping], np.log10(y_early_stopping[idx_early_stopping] + 1))], eval_metric='rmse', early_stopping_rounds=50)

    # Store the explainer directly instead of rebinding the global `explainer_xgb`:
    # that name belongs to the main-model explainer, and overwriting it here made
    # re-running the global importance cell silently explain the last segment model.
    # NOTE(review): the explainer is given the full X_train / y_train although the
    # regressor only saw X_train[idx_train] — confirm this is intended; if not,
    # pass X_train[idx_train] and y_train[idx_train] here.
    explainers[label] = dx.Explainer(regressor, X_train, y_train, predict_function=predict, label=f'XGBoost ({label})')

[0]	validation_0-rmse:2.99422
[1]	validation_0-rmse:2.85900
[2]	validation_0-rmse:2.73083
[3]	validation_0-rmse:2.60842
[4]	validation_0-rmse:2.49044
[5]	validation_0-rmse:2.37770
[6]	validation_0-rmse:2.27082
[7]	validation_0-rmse:2.16929
[8]	validation_0-rmse:2.07128
[9]	validation_0-rmse:1.97759
[10]	validation_0-rmse:1.88824
[11]	validation_0-rmse:1.80329
[12]	validation_0-rmse:1.72231
[13]	validation_0-rmse:1.64502
[14]	validation_0-rmse:1.57016
[15]	validation_0-rmse:1.49954
[16]	validation_0-rmse:1.43202
[17]	validation_0-rmse:1.36778
[18]	validation_0-rmse:1.30667
[19]	validation_0-rmse:1.24810
[20]	validation_0-rmse:1.19217
[21]	validation_0-rmse:1.13860
[22]	validation_0-rmse:1.08815
[23]	validation_0-rmse:1.03993
[24]	validation_0-rmse:0.99450
[25]	validation_0-rmse:0.94969
[26]	validation_0-rmse:0.90784
[27]	validation_0-rmse:0.86820
[28]	validation_0-rmse:0.82954
[29]	validation_0-rmse:0.79199
[30]	validation_0-rmse:0.75642
[31]	validation_0-rmse:0.72281
[32]	validation_0-

In [71]:
# Export one grouped permutation-importance figure per segment model, named after its label.
for label, exp in explainers.items():
    figure_path = f'../reports/figures/{label}.png'

    # Same settings as the global importance plot: all rows, RMSE loss, fixed seed.
    mp = exp.model_parts(
        N=None,
        variable_groups=variable_groups,
        loss_function='rmse',
        random_state=SEED,
    )
    plot = mp.plot(show=False)
    plot.write_image(file=figure_path, engine='orca', scale=4.0)