# Models for RMS Titanic, snippets for Python

In [1]:
import dalex as dx
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived

In [2]:
print(X)

      gender   age             class     embarked   fare  sibsp  parch
0       male  42.0               3rd  Southampton   7.11      0      0
1       male  13.0               3rd  Southampton  20.05      0      2
2       male  16.0               3rd  Southampton  20.05      1      1
3     female  39.0               3rd  Southampton  20.05      1      1
4     female  16.0               3rd  Southampton   7.13      0      0
...      ...   ...               ...          ...    ...    ...    ...
2202    male  41.0         deck crew      Belfast   0.00      0      0
2203    male  40.0  victualling crew  Southampton   0.00      0      0
2204    male  32.0  engineering crew  Southampton   0.00      0      0
2205    male  20.0  restaurant staff  Southampton   0.00      0      0
2206    male  26.0  restaurant staff  Southampton   0.00      0      0

[2207 rows x 7 columns]


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']))

## Logistic regression model

In [4]:
from sklearn.linear_model import LogisticRegression

titanic_lr = make_pipeline(
    preprocess,
    LogisticRegression(penalty = 'l2'))
titanic_lr.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('logisticregression', LogisticRegression())])

## Random forest model

In [5]:
from sklearn.ensemble import RandomForestClassifier

titanic_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(max_depth = 3, n_estimators = 500))
titanic_rf.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, n_estimators=500))])

## Gradient boosting model

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

titanic_gbc = make_pipeline(
    preprocess,
    GradientBoostingClassifier(n_estimators = 100))
titanic_gbc.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('gradientboostingclassifier', GradientBoostingClassifier())])

## Support vector machine model

In [7]:
from sklearn.svm import SVC

titanic_svm = make_pipeline(
    preprocess,
    SVC(probability = True))
titanic_svm.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('svc', SVC(probability=True))])

## Model Prediction

In [12]:
import pandas as pd

johnny_d = pd.DataFrame({'gender': ['male'],
                       'age'     : [8],
                       'class'   : ['1st'],
                       'embarked': ['Southampton'],
                       'fare'    : [72],
                       'sibsp'   : [0],
                       'parch'   : [0]},
                      index = ['JohnnyD'])

titanic_lr.predict_proba(johnny_d)
# array([[0.35884528, 0.64115472]])
titanic_rf.predict_proba(johnny_d)
# array([[0.63028556, 0.36971444]])
titanic_gbc.predict_proba(johnny_d)
# array([[0.1567194, 0.8432806]])
titanic_svm.predict_proba(johnny_d)
# array([[0.78308146, 0.21691854]])

array([[0.78328913, 0.21671087]])

In [11]:
henry = pd.DataFrame({'gender'   : ['male'],
                       'age'     : [47],
                       'class'   : ['1st'],
                       'embarked': ['Cherbourg'],
                       'fare'    : [25],
                       'sibsp'   : [0],
                       'parch'   : [0]},
                      index = ['Henry'])
titanic_lr.predict_proba(henry)
# array([[0.56798421 0.43201579]])
titanic_rf.predict_proba(henry)
# array([[0.69917845 0.30082155]])
titanic_gbc.predict_proba(henry)
# array([[0.78542886 0.21457114]])
titanic_svm.predict_proba(henry)
# array([[0.81725832 0.18274168]])

array([[0.81543577, 0.18456423]])

## Models' explainers

In [13]:
titanic_rf_exp = dx.Explainer(titanic_rf, 
                    X, y, label = "Titanic RF Pipeline")
titanic_lr_exp = dx.Explainer(titanic_lr, 
                    X, y, label = "Titanic LR Pipeline")
titanic_gbc_exp = dx.Explainer(titanic_gbc, 
                    X, y, label = "Titanic GBC Pipeline")
titanic_svm_exp = dx.Explainer(titanic_svm, 
                    X, y, label = "Titanic SVM Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x0000025374087040> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.164, mean = 0.322, max = 0.885
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.828, mean = 0.000653, max = 0.832
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pan

# Apartment prices

In [14]:
import dalex as dx
apartments = dx.datasets.load_apartments()
X = apartments.drop(columns='m2_price')
y = apartments['m2_price']

In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['construction_year', 'surface', 'floor', 'no_rooms']),
    (OneHotEncoder(), ['district']))

In [16]:
from sklearn.linear_model import LinearRegression

apartments_lm = make_pipeline(
    preprocess,
    LinearRegression())
apartments_lm.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['construction_year',
                                                   'surface', 'floor',
                                                   'no_rooms']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['district'])])),
                ('linearregression', LinearRegression())])

In [17]:
from sklearn.ensemble import RandomForestRegressor

apartments_rf = make_pipeline(
    preprocess,
    RandomForestRegressor(max_depth = 3, n_estimators = 500))
apartments_rf.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['construction_year',
                                                   'surface', 'floor',
                                                   'no_rooms']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['district'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=3, n_estimators=500))])

In [18]:
from sklearn.svm import SVR

apartments_svm = make_pipeline(
    preprocess,
    SVR())
apartments_svm.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['construction_year',
                                                   'surface', 'floor',
                                                   'no_rooms']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['district'])])),
                ('svr', SVR())])

In [19]:
apartments_test = dx.datasets.load_apartments_test()
apartments_test = apartments_test.drop(columns='m2_price')

apartments_lm.predict(apartments_test)

array([4820.00943156, 3292.67756996, 2717.90972101, ..., 4836.44370353,
       3191.69063189, 5157.93680175])

In [20]:
apartments_lm_exp = dx.Explainer(apartments_lm, X, y, 
                      label = "Apartments LM Pipeline")
apartments_rf_exp = dx.Explainer(apartments_rf, X, y, 
                      label = "Apartments RF Pipeline")
apartments_svm_exp = dx.Explainer(apartments_svm, X, y, 
                      label = "Apartments SVM Pipeline")

Preparation of a new explainer is initiated

  -> data              : 1000 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1000 values
  -> model_class       : sklearn.linear_model._base.LinearRegression (default)
  -> label             : Apartments LM Pipeline
  -> predict function  : <function yhat_default at 0x000002537407CF70> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 1.78e+03, mean = 3.49e+03, max = 6.18e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -2.47e+02, mean = 3.67e-13, max = 4.69e+02
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 1000 rows 5 cols
  -> target variable   : Parameter 'y' was a