In [2]:
import dalex as dx
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived

print(X)

      gender   age             class     embarked   fare  sibsp  parch
0       male  42.0               3rd  Southampton   7.11      0      0
1       male  13.0               3rd  Southampton  20.05      0      2
2       male  16.0               3rd  Southampton  20.05      1      1
3     female  39.0               3rd  Southampton  20.05      1      1
4     female  16.0               3rd  Southampton   7.13      0      0
...      ...   ...               ...          ...    ...    ...    ...
2202    male  41.0         deck crew      Belfast   0.00      0      0
2203    male  40.0  victualling crew  Southampton   0.00      0      0
2204    male  32.0  engineering crew  Southampton   0.00      0      0
2205    male  20.0  restaurant staff  Southampton   0.00      0      0
2206    male  26.0  restaurant staff  Southampton   0.00      0      0

[2207 rows x 7 columns]


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']))

In [4]:
from sklearn.ensemble import RandomForestClassifier

titanic_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(max_depth = 3, n_estimators = 500))
titanic_rf.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, n_estimators=500))])

In [5]:
import pandas as pd
henry = pd.DataFrame({'gender'   : ['male'],
                       'age'     : [47],
                       'class'   : ['1st'],
                       'embarked': ['Cherbourg'],
                       'fare'    : [25],
                       'sibsp'   : [0],
                       'parch'   : [0]},
                      index = ['Henry'])

import dalex as dx
titanic_rf_exp = dx.Explainer(titanic_rf, X, y, 
                    label = "Titanic RF Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x000001F4EAD7D0D0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.166, mean = 0.322, max = 0.892
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.838, mean = 0.000314, max = 0.832
  -> model_info        : package sklearn

A new explainer has been created!


In [6]:
cp_henry = titanic_rf_exp.predict_profile(henry)
cp_henry.result

Calculating ceteris paribus: 100%|██████████| 7/7 [00:00<00:00, 65.18it/s]


Unnamed: 0,gender,age,class,embarked,fare,sibsp,parch,_original_,_yhat_,_vname_,_ids_,_label_
Henry,male,47.000000,1st,Cherbourg,25.0,0.0,0.00,male,0.306725,gender,Henry,Titanic RF Pipeline
Henry,female,47.000000,1st,Cherbourg,25.0,0.0,0.00,male,0.810013,gender,Henry,Titanic RF Pipeline
Henry,male,0.166667,1st,Cherbourg,25.0,0.0,0.00,47,0.441484,age,Henry,Titanic RF Pipeline
Henry,male,0.905000,1st,Cherbourg,25.0,0.0,0.00,47,0.445878,age,Henry,Titanic RF Pipeline
Henry,male,1.643333,1st,Cherbourg,25.0,0.0,0.00,47,0.444929,age,Henry,Titanic RF Pipeline
...,...,...,...,...,...,...,...,...,...,...,...,...
Henry,male,47.000000,1st,Cherbourg,25.0,0.0,8.64,0,0.351609,parch,Henry,Titanic RF Pipeline
Henry,male,47.000000,1st,Cherbourg,25.0,0.0,8.73,0,0.351609,parch,Henry,Titanic RF Pipeline
Henry,male,47.000000,1st,Cherbourg,25.0,0.0,8.82,0,0.351609,parch,Henry,Titanic RF Pipeline
Henry,male,47.000000,1st,Cherbourg,25.0,0.0,8.91,0,0.351609,parch,Henry,Titanic RF Pipeline


In [7]:
cp_henry.plot(variables = ['age', 'fare'])

In [8]:
cp_henry.plot(variables = ['class', 'embarked'],
               variable_type = 'categorical')

In [9]:
from sklearn.linear_model import LogisticRegression

titanic_lr = make_pipeline(
    preprocess,
    LogisticRegression(penalty = 'l2'))
titanic_lr.fit(X, y)

titanic_lr_exp = dx.Explainer(titanic_lr, 
                    X, y, label = "Titanic LR Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : Titanic LR Pipeline
  -> predict function  : <function yhat_proba_default at 0x000001F4EAD7D0D0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.009, mean = 0.322, max = 0.97
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.96, mean = -5.83e-07, max = 0.964
  -> model_info        : package sklearn

A new explainer has been created!


In [10]:
cp_henry2 = titanic_lr_exp.predict_profile(henry)
cp_henry.plot(cp_henry2, variables = ['age', 'fare'])

Calculating ceteris paribus: 100%|██████████| 7/7 [00:00<00:00, 419.45it/s]
