![Data definition](data_dict.png "Data definition")

To calculate exact shapley values we decided to remove some features and leave 8.

In [1]:
# !pip install -e ../
# !pip install shap

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import dalex as dx
import time
import numpy as np
import shap
from pathlib import Path

In [3]:
DATA_DIR = Path.cwd().parent / "data"
ESTIMATES_DIR = Path.cwd().parent / "estimates"

In [4]:
df_titanic_train = pd.read_csv(DATA_DIR / "train.csv")
df_titanic_test_x = pd.read_csv(DATA_DIR / "test.csv")
df_test_target_unordered = pd.read_csv(DATA_DIR / "gender_submission.csv")

In [5]:
df_titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df_titanic_train = pd.read_csv(DATA_DIR / "train.csv")
df_titanic_test_x = pd.read_csv(DATA_DIR / "test.csv")
df_test_target_unordered = pd.read_csv(DATA_DIR / "gender_submission.csv")

# irrelevant features
df_titanic_train = df_titanic_train.drop(
    columns=["Name", "Ticket", "Cabin", "Embarked"]
).dropna()
df_titanic_test_x = df_titanic_test_x.drop(
    columns=["Name", "Ticket", "Cabin", "Embarked"]
).dropna()

df_titanic_train_y = df_titanic_train["Survived"]
df_titanic_train_x = df_titanic_train.drop(columns=["PassengerId", "Survived"])

df_titanic_train_x["Sex"] = df_titanic_train_x["Sex"].map({"female": 0, "male": 1})
df_titanic_test_x["Sex"] = df_titanic_test_x["Sex"].map({"female": 0, "male": 1})

# one-hot-encoding of Pclass
for class_nb in range(1, 4):
    df_titanic_train_x[f"Pclass_{class_nb}"] = (
        df_titanic_train_x["Pclass"] == class_nb
    ) * 1
    df_titanic_test_x[f"Pclass_{class_nb}"] = (
        df_titanic_test_x["Pclass"] == class_nb
    ) * 1

df_titanic_train_x = df_titanic_train_x.drop(columns="Pclass")
df_titanic_test_x = df_titanic_test_x.drop(columns="Pclass")

df_titanic_test = df_test_target_unordered.merge(
    df_titanic_test_x, how="inner", on="PassengerId"
)
df_titanic_test_y = df_titanic_test["Survived"]
df_titanic_test_x = df_titanic_test.drop(columns=["PassengerId", "Survived"])

In [7]:
df_titanic_train_x.columns == df_titanic_test_x.columns

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [18]:
df_titanic_train_x.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2',
       'Pclass_3'],
      dtype='object')

In [8]:
df_titanic_test_y

0      0
1      1
2      0
3      0
4      1
      ..
326    1
327    1
328    1
329    1
330    0
Name: Survived, Length: 331, dtype: int64

In [9]:
model_predicting_method = lambda m, d: m.predict_proba(d)[:, 1]

In [10]:
rf = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=446519)
rf.fit(df_titanic_train_x, df_titanic_train_y)
explainer_rf = dx.Explainer(
    rf,
    df_titanic_test_x,
    df_titanic_test_y,
    predict_function=model_predicting_method,
    label="RF",
)

Preparation of a new explainer is initiated

  -> data              : 331 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 331 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : RF
  -> predict function  : <function <lambda> at 0x7ff17ec8d950> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.107, mean = 0.427, max = 0.945
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.758, mean = -0.0433, max = 0.812
  -> model_info        : package sklearn

A new explainer has been created!


X does not have valid feature names, but RandomForestClassifier was fitted with feature names


In [11]:
svc = SVC(random_state=446519, probability=True)
svc.fit(df_titanic_train_x, df_titanic_train_y)
explainer_svc = dx.Explainer(
    svc,
    df_titanic_test_x,
    df_titanic_test_y,
    predict_function=model_predicting_method,
    label="SVC",
)

Preparation of a new explainer is initiated

  -> data              : 331 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 331 values
  -> model_class       : sklearn.svm._classes.SVC (default)
  -> label             : SVC
  -> predict function  : <function <lambda> at 0x7ff17ec8d950> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.281, mean = 0.419, max = 0.779
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.778, mean = -0.0354, max = 0.719
  -> model_info        : package sklearn

A new explainer has been created!


X does not have valid feature names, but SVC was fitted with feature names


In [12]:
lr = LogisticRegression(random_state=446519)
lr.fit(df_titanic_train_x, df_titanic_train_y)
explainer_lr = dx.Explainer(
    lr,
    df_titanic_test_x,
    df_titanic_test_y,
    predict_function=model_predicting_method,
    label="LR",
)

Preparation of a new explainer is initiated

  -> data              : 331 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 331 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : LR
  -> predict function  : <function <lambda> at 0x7ff17ec8d950> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.014, mean = 0.433, max = 0.962
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.795, mean = -0.0497, max = 0.821
  -> model_info        : package sklearn

A new explainer has been created!


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
X does not have valid feature names, but LogisticRegression was fitted with feature names


In [13]:
type(df_titanic_test_x.iloc[0])

pandas.core.series.Series

In [14]:
explainer_rf.predict_parts(
    df_titanic_test_x.iloc[0:1], type="unbiased_kernel_shap", label=f"sample id: 0"
).plot()

X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names


# UWAGA! Poniższa komórka była odpalona raz a jej wyniki zostały zapisane do plików. Wystarczy zmienne załadować z pliku :)

In [15]:
# dict_of_explainers = {"random_forrest":explainer_rf,
#                       "support_vector_machine": explainer_svc,
#                       "logistic_regression": explainer_lr}
# list_of_n_samples = [10, 50, 100, 500, 1000]

# times_uks = np.zeros((len(dict_of_explainers), len(list_of_n_samples)))

# for e_id, (exp_name, explainer) in enumerate(dict_of_explainers.items()):
#     for s_id, n_samples in enumerate(list_of_n_samples):

#         start = time.time()

#         explained = df_titanic_test_x.apply(lambda x: explainer.predict_parts(x.to_frame().transpose(),
#                                                              type="unbiased_kernel_shap",
#                                                              n_samples = n_samples), axis=1)

#         end = time.time()

#         result = explained.apply(lambda x: x.result.contribution.to_numpy()).to_numpy()

#         np.save(EXTIMATES_DIR / f"{exp_name}_{n_samples}.npy", result)

#         process_time = end - start
#         times_uks[e_id][s_id] = process_time

# np.save(EXTIMATES_DIR / "times_uks.npy", times_uks)

In [16]:
# exact_explainer_rf = shap.explainers.Exact(rf.predict_proba, df_titanic_test_x)
# exact_explainer_svc = shap.explainers.Exact(svc.predict_proba, df_titanic_test_x)
# exact_explainer_lr = shap.explainers.Exact(lr.predict_proba, df_titanic_test_x)

# dict_of_explainers = {"random_forrest":exact_explainer_rf,
#                       "support_vector_machine": exact_explainer_svc,
#                       "logistic_regression": exact_explainer_lr}

# times_uks = np.zeros((len(dict_of_explainers),))

# for e_id, (exp_name, explainer) in enumerate(dict_of_explainers.items()):

#         start = time.time()

#         shap_values = explainer(df_titanic_test_x)

#         end = time.time()

#         result = shap_values.values[...,1]

#         np.save(EXTIMATES_DIR / f"{exp_name}_exact.npy", result)

#         process_time = end - start
#         times_uks[e_id] = process_time

# np.save(EXTIMATES_DIR / "times_exact.npy", times_uks)

In [17]:
# exact_explainer_rf = shap.KernelExplainer(rf.predict_proba, df_titanic_test_x)
# exact_explainer_svc = shap.KernelExplainer(svc.predict_proba, df_titanic_test_x)
# exact_explainer_lr = shap.KernelExplainer(lr.predict_proba, df_titanic_test_x)

# dict_of_explainers = {"random_forrest":exact_explainer_rf,
#                       "support_vector_machine": exact_explainer_svc,
#                       "logistic_regression": exact_explainer_lr}

# times_uks = np.zeros((len(dict_of_explainers),))

# for e_id, (exp_name, explainer) in enumerate(dict_of_explainers.items()):

#         start = time.time()

#         shap_values = explainer.shap_values(df_titanic_test_x)

#         end = time.time()

#         result = shap_values[1]

#         np.save(EXTIMATES_DIR / f"{exp_name}_kernel.npy", result)

#         process_time = end - start
#         times_uks[e_id] = process_time

# np.save(EXTIMATES_DIR / "times_kernel.npy", times_uks)