In [1]:
import dalex as dx

import warnings
# Silence every warning category for the rest of the session so that the
# recorded cell outputs stay compact.
warnings.filterwarnings('ignore')

# Last expression of the cell: shows the dalex version used for this run.
dx.__version__

'1.0.1.9000'

## data

In [2]:
# Load the apartments training and test frames from dalex's datasets module.
train = dx.datasets.load_apartments()
test = dx.datasets.load_apartments_test()

# Separate each frame into its feature columns and the target column.
target = 'm2_price'
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

In [3]:
# Display the training frame (target column included) as the cell output.
train

Unnamed: 0,m2_price,construction_year,surface,floor,no_rooms,district
1,5897,1953,25,3,1,Srodmiescie
2,1818,1992,143,9,5,Bielany
3,3643,1937,56,1,2,Praga
4,3517,1995,93,7,3,Ochota
5,3013,1992,144,6,5,Mokotow
...,...,...,...,...,...,...
996,6355,1921,44,2,2,Srodmiescie
997,3422,1921,48,10,2,Bemowo
998,3098,1980,85,3,3,Bemowo
999,4192,1942,36,7,1,Zoliborz


## preprocessing

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numerical.
categorical_features = X_train.select_dtypes(include=[object]).columns
numerical_features = X_train.select_dtypes(exclude=[object]).columns

# Numerical columns are standardised.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

## model

In [5]:
from sklearn.base import clone
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor

# Pipeline.fit fits its steps in place, so each pipeline receives its own
# unfitted clone of the preprocessor. Passing the same `preprocessor` object to
# several pipelines would make every later fit overwrite the state of the one
# instance all of them hold.
model_elastic_net = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', ElasticNet())
    ]
)
model_elastic_net.fit(X=X_train, y=y_train)

# NOTE(review): DecisionTreeRegressor is created without random_state, so the
# fitted tree may differ between runs -- consider fixing a seed.
model_decision_tree = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', DecisionTreeRegressor())
    ]
)
# Last expression of the cell: the fitted pipeline is shown as the cell output.
model_decision_tree.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['construction_year', 'surface', 'floor', 'no_rooms'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['district'], dtype='object'))])),
                ('model', DecisionTreeRegressor())])

## explainer

In [6]:
# Wrap each fitted pipeline in a dalex Explainer, evaluated on the test
# features and test target. The ElasticNet explainer is created first, then
# the decision-tree one.
exp_elastic_net, exp_decision_tree = (
    dx.Explainer(fitted_model, data=X_test, y=y_test)
    for fitted_model in (model_elastic_net, model_decision_tree)
)

Preparation of a new explainer is initiated

  -> data              : 9000 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 9000 values
  -> model_class       : sklearn.linear_model._coordinate_descent.ElasticNet (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x0000013DA1982700> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 2.46e+03, mean = 3.5e+03, max = 4.66e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -9.47e+02, mean = 11.4, max = 2.16e+03
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 9000 rows 5 cols


In [7]:
# Show the performance summary computed by the decision-tree explainer.
exp_decision_tree.model_performance()

Unnamed: 0,mse,rmse,r2,mae,mad
DecisionTreeRegressor,53643.160778,231.609932,0.93384,149.771889,82.0


## arena

In [8]:
# create an empty Arena instance
arena=dx.Arena()
# register the ElasticNet explainer as the first model in the Arena
arena.push_model(exp_elastic_net)
# push the whole test dataset (including the target column) as observations
arena.push_observations(test)
# start the Arena server on port 9294
arena.run_server(port=9294)

https://arena.drwhy.ai/?data=http://127.0.0.1:9294/


In [9]:
# Add the decision-tree explainer to the same Arena.
arena.push_model(exp_decision_tree)

In [10]:
from lightgbm import LGBMRegressor
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the preprocessor: Pipeline.fit
# fits its steps in place, so passing the shared `preprocessor` object would
# refit the very instance that other pipelines may also hold.
model_gbm = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', LGBMRegressor())
    ]
)
model_gbm.fit(X=X_train, y=y_train)

# Wrap the fitted LightGBM pipeline in an explainer evaluated on the test set.
exp_gbm = dx.Explainer(model_gbm, data=X_test, y=y_test)

Preparation of a new explainer is initiated

  -> data              : 9000 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 9000 values
  -> model_class       : lightgbm.sklearn.LGBMRegressor (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x0000013DA1982700> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 1.63e+03, mean = 3.5e+03, max = 6.43e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -5.02e+02, mean = 8.94, max = 7.27e+02
  -> model_info        : package sklearn

A new explainer has been created!


In [11]:
# Add the LightGBM explainer to the same Arena.
arena.push_model(exp_gbm)

In [12]:
# Print the options of each Arena chart type together with their current values.
arena.print_options()


[1mSHAPValues[0m
---------------------------------
B: 10   #Number of random paths

[1mFeatureImportance[0m
---------------------------------
N: None   #Number of observations to use. None for all.
B: 10   #Number of permutation rounds to perform each variable

[1mPartialDependence[0m
---------------------------------
grid_type: quantile   #grid type "quantile" or "uniform"
grid_points: 101   #Maximum number of points for profile
N: 500   #Number of observations to use. None for all.

[1mAccumulatedDependence[0m
---------------------------------
grid_type: quantile   #grid type "quantile" or "uniform"
grid_points: 101   #Maximum number of points for profile
N: 500   #Number of observations to use. None for all.

[1mCeterisParibus[0m
---------------------------------
grid_points: 101   #Maximum number of points for profile
grid_type: quantile   #grid type "quantile" or "uniform"

[1mBreakdown[0m
---------------------------------

[1mMetrics[0m
----------------------------

In [13]:
# Chart-specific option: set 'B' of the SHAPValues chart to 25
# (the option listing describes B as the number of random paths).
arena.set_option('SHAPValues', 'B', 25)

In [14]:
# Shut down the Arena server that was started with run_server.
arena.stop_server()