In [2]:
import dalex as dx
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived

print(X)

      gender   age             class     embarked   fare  sibsp  parch
0       male  42.0               3rd  Southampton   7.11      0      0
1       male  13.0               3rd  Southampton  20.05      0      2
2       male  16.0               3rd  Southampton  20.05      1      1
3     female  39.0               3rd  Southampton  20.05      1      1
4     female  16.0               3rd  Southampton   7.13      0      0
...      ...   ...               ...          ...    ...    ...    ...
2202    male  41.0         deck crew      Belfast   0.00      0      0
2203    male  40.0  victualling crew  Southampton   0.00      0      0
2204    male  32.0  engineering crew  Southampton   0.00      0      0
2205    male  20.0  restaurant staff  Southampton   0.00      0      0
2206    male  26.0  restaurant staff  Southampton   0.00      0      0

[2207 rows x 7 columns]


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']))

In [4]:
from sklearn.ensemble import RandomForestClassifier

titanic_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(max_depth = 3, n_estimators = 500))
titanic_rf.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, n_estimators=500))])

In [5]:
import pandas as pd
henry = pd.DataFrame({'gender'   : ['male'],
                       'age'     : [47],
                       'class'   : ['1st'],
                       'embarked': ['Cherbourg'],
                       'fare'    : [25],
                       'sibsp'   : [0],
                       'parch'   : [0]},
                      index = ['Henry'])

import dalex as dx
titanic_rf_exp = dx.Explainer(titanic_rf, X, y, 
                  label = "Titanic RF Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x000002884E4C5040> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.165, mean = 0.322, max = 0.894
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.826, mean = 0.000118, max = 0.832
  -> model_info        : package sklearn

A new explainer has been created!


In [6]:
bd_henry = titanic_rf_exp.predict_parts(henry, 
             type = 'break_down')
bd_henry.result

Unnamed: 0,variable_name,variable_value,variable,cumulative,contribution,sign,position,label
0,intercept,,intercept,0.322038,0.322038,1.0,8,Titanic RF Pipeline
1,class,1st,class = 1st,0.38973,0.067692,1.0,7,Titanic RF Pipeline
2,embarked,Cherbourg,embarked = Cherbourg,0.416254,0.026524,1.0,6,Titanic RF Pipeline
3,fare,25.0,fare = 25.0,0.428857,0.012603,1.0,5,Titanic RF Pipeline
4,sibsp,0.0,sibsp = 0.0,0.428587,-0.00027,-1.0,4,Titanic RF Pipeline
5,parch,0.0,parch = 0.0,0.425201,-0.003385,-1.0,3,Titanic RF Pipeline
6,age,47.0,age = 47.0,0.417551,-0.00765,-1.0,2,Titanic RF Pipeline
7,gender,male,gender = male,0.304733,-0.112818,-1.0,1,Titanic RF Pipeline
8,,,prediction,0.304733,0.304733,1.0,0,Titanic RF Pipeline


In [7]:
bd_henry.plot()

In [8]:
import numpy as np

bd_henry = titanic_rf_exp.predict_parts(henry,
        type = 'break_down',
        order = np.array(['gender', 'class', 'age',
            'embarked', 'fare', 'sibsp', 'parch']))
bd_henry.plot(max_vars = 5)

# Interactions BDP

In [9]:
import pandas as pd
henry = pd.DataFrame({'gender': ['male'], 'age': [47],
           'class': ['1st'],
           'embarked': ['Cherbourg'], 'fare': [25],
           'sibsp': [0], 'parch': [0]},
           index = ['Henry'])
import dalex as dx
titanic_rf_exp = dx.Explainer(titanic_rf, X, y, 
           label = "Titanic RF Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x000002884E4C5040> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.165, mean = 0.322, max = 0.894
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.826, mean = 0.000118, max = 0.832
  -> model_info        : package sklearn

A new explainer has been created!


In [10]:
bd_henry = titanic_rf_exp.predict_parts(henry, 
                type = 'break_down_interactions', 
                interaction_preference = 10)
bd_henry.result

Unnamed: 0,variable_name,variable_value,variable,cumulative,contribution,sign,position,label
0,intercept,,intercept,0.322038,0.322038,1.0,5,Titanic RF Pipeline
1,class:gender,1st:male,class:gender = 1st:male,0.285648,-0.03639,-1.0,4,Titanic RF Pipeline
2,fare:embarked,25.0:Cherbourg,fare:embarked = 25.0:Cherbourg,0.321926,0.036278,1.0,3,Titanic RF Pipeline
3,parch:sibsp,0.0:0.0,parch:sibsp = 0.0:0.0,0.315724,-0.006202,-1.0,2,Titanic RF Pipeline
4,age,47.0,age = 47.0,0.304733,-0.010991,-1.0,1,Titanic RF Pipeline
5,,,prediction,0.304733,0.304733,1.0,0,Titanic RF Pipeline
