In [None]:
#! conda install -y shap

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Read just the first 1,000 rows of the housing data: computing SHAP values
# can be slow, so this demo works on a subset.
housing = pd.read_csv(
    '../data/kc_house_data.csv',
    nrows=1000,
)

In [None]:
# Model inputs: four columns of the housing frame.
X = housing.loc[:, ['sqft_living', 'grade', 'waterfront', 'bedrooms']]
# Target: the `price` column.
y = housing.loc[:, 'price']

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fix the seed so the fitted forest -- and therefore every SHAP value and plot
# below -- is identical on each Restart & Run All.
rf = RandomForestRegressor(random_state=42).fit(X, y)

In [None]:
import shap

# Load SHAP's JavaScript into the notebook so that the interactive force plots
# further down can be displayed.
shap.initjs()

Since we are using a random forest model (tree-based), we can use the TreeExplainer, which will compute exact SHAP values.

In [None]:
# Build a tree explainer for the fitted random forest, then compute SHAP values
# for every row of X (indexed below as shap_values[row, feature]).
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X)

In [None]:
# Position of the row to explain.
observation_idx = 5

# Force plot for that single row: base value, the row's SHAP values, and the
# row's feature values.
shap.force_plot(
    explainer.expected_value,
    shap_values[observation_idx],
    X.iloc[observation_idx],
)

In [None]:
# SHAP dependence plot for the `grade` feature, using the SHAP values computed
# for all rows of X.
shap.dependence_plot("grade", shap_values, X)

In [None]:
# SHAP summary plot over all rows and features of X.
# NOTE(review): summary_plot draws the figure as a side effect; its return value
# is presumably None, so `sum_plot` likely holds nothing useful -- confirm.
sum_plot = shap.summary_plot(shap_values, X, plot_size = (14,6))

In [None]:
# Same summary, drawn as a bar chart (one bar per feature).
shap.summary_plot(shap_values, X, plot_type="bar", plot_size = (14,6))

In [None]:
# A second TreeExplainer for the housing model `rf`, kept under its own name:
# `explainer` is rebound to the titanic classifier later in the notebook, while
# the helper functions below keep using `explainer_rf`.
explainer_rf = shap.TreeExplainer(rf)

In [None]:
# SHAP values for a single row: the observation at position `observation_idx`.
shap_rf = explainer_rf.shap_values(X.iloc[observation_idx, :])

In [None]:
# Display the SHAP values computed for the selected observation.
shap_rf

In [None]:
# Feature names in the column order of X (presumably the same order as the
# entries of `shap_rf` -- confirm).
X.columns

In [None]:
# Feature values of the observation explained above. Use positional indexing
# (.iloc) so this is always the same row that was passed to
# explainer_rf.shap_values, even if X's index is not the default 0..n-1 range.
X.iloc[observation_idx, :]

In [None]:
# Mean of the target, reported as the baseline in make_explanations_plot.
avg_price = round(y.mean(),2)


def make_explanations_plot(row_num):
    """Print a price summary and plot the SHAP values for one row of ``X``.

    Relies on the notebook globals ``X``, ``rf``, ``explainer_rf`` and
    ``avg_price`` as they are bound at call time.

    Parameters
    ----------
    row_num : int
        Position (``.iloc``) of the row in ``X`` to explain.

    Returns
    -------
    None
        Prints the average price, the predicted price and their difference,
        and draws a horizontal bar chart of the row's SHAP values sorted in
        ascending order.
    """
    row = X.iloc[row_num, :]

    # Predict from a one-row DataFrame so the model receives the same column
    # names it was fitted with, instead of a bare ndarray.
    predicted_price = round(rf.predict(X.iloc[[row_num]])[0], 2)

    shap_rf = explainer_rf.shap_values(row)
    shap_rf_idx = shap_rf.argsort()

    # One "name:\nvalue" label per feature.
    labels = np.array([str(name) + ':\n' + str(value) for name, value in zip(X.columns, row)])

    # Bars are centred at 0.5, 1.5, ...
    feature_indices = np.arange(0, len(X.columns)) + 0.5

    print(f'Average home price: {avg_price}')
    print(f'Predicted home price: {predicted_price}')
    print(f'Difference from Average: {round(predicted_price - avg_price,2)}')

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.barh(feature_indices, shap_rf[shap_rf_idx], height=0.7, edgecolor='black')
    # Fix the tick positions first and only then attach the labels, so each
    # label is bound to its own bar.
    ax.set_yticks(feature_indices)
    ax.set_yticklabels(labels[shap_rf_idx], fontsize=18)
    ax.set_xlabel('SHAP value')
    # Reference line at zero contribution.
    ax.vlines(x=0, ymin=0, ymax=len(X.columns))

In [None]:
# Print the price summary and draw the SHAP bar chart for the row at position 10.
make_explanations_plot(10)

In [None]:
def make_force(row_num):
    """Return the SHAP force plot for the row at position ``row_num`` of ``X``.

    Uses the notebook globals ``X`` and ``explainer_rf``.
    """
    observation = X.iloc[row_num]
    contributions = explainer_rf.shap_values(observation)
    return shap.force_plot(explainer_rf.expected_value, contributions, observation)

In [None]:
# Render the SHAP force plot for the row at position 10.
make_force(10)

In [None]:
# Load the titanic data. Unlike the housing example, no `nrows` limit is given,
# so the whole file is read.
titanic = pd.read_csv('../data/titanic.csv')

In [None]:
# Features for the classifier. `Sex` is one-hot encoded and its first level is
# dropped, leaving a single indicator column.
# NOTE: this rebinds `X` and `y`, which referred to the housing data until here.
X = pd.get_dummies(
    titanic.loc[:, ['Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']],
    columns=['Sex'],
    drop_first=True,
)
y = titanic.loc[:, 'Survived']

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the fitted classifier -- and every explanation derived from
# it below -- is identical on each Restart & Run All.
rf_titanic = RandomForestClassifier(random_state=42).fit(X, y)

In [None]:
# Tree explainer and SHAP values for the titanic classifier.
# NOTE: this rebinds `explainer` and `shap_values`, which referred to the
# housing regressor `rf` earlier in the notebook.
explainer = shap.TreeExplainer(rf_titanic)
shap_values = explainer.shap_values(X)

In [None]:
# Position of the passenger to explain.
observation_idx = 10

# Pick out the SHAP values for class 1 in a way that works across shap
# versions: older releases return a list with one (n_samples, n_features) array
# per class, newer ones a single (n_samples, n_features, n_classes) array.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
else:
    class1_shap_values = np.asarray(shap_values)[:, :, 1]

shap.force_plot(explainer.expected_value[1], class1_shap_values[observation_idx, :], X.iloc[observation_idx, :])

## Using KernelSHAP

To use KernelSHAP, we need to pass in a prediction function for our model (such as its `predict` or `predict_proba` method) as well as a background dataset.

Note that KernelSHAP is quite a bit slower than TreeSHAP, so it is often better to use only a subset of your data (for the background dataset, the rows being explained, or both).

In [None]:
def _predict_survival_proba(data):
    """Return column 1 of ``rf_titanic.predict_proba`` for each row of ``data``.

    The rest of the notebook reads that column as the probability of surviving.
    """
    return rf_titanic.predict_proba(data)[:, 1]


# Explain the predicted probability rather than the hard 0/1 label from
# `predict`, so the SHAP values decompose the same quantity that
# kernel_explanations_plot prints ("Predicted probability of surviving").
kernel_explainer = shap.KernelExplainer(_predict_survival_proba, X)

In [None]:
# KernelSHAP values for the passenger at position `observation_idx`. Positional
# indexing (.iloc) matches how rows are selected everywhere else in the notebook.
ke_shap = kernel_explainer.shap_values(X.iloc[observation_idx, :])

In [None]:
# Mean of the target `y`, printed below as the overall probability of surviving.
# NOTE: despite its name this is not a price; it rebinds the `avg_price` global
# that make_explanations_plot reads, so that function reports this value if it
# is called again after this cell has run.
avg_price = round(y.mean(),2)


def kernel_explanations_plot(row_num):
    """Print a survival summary and plot the KernelSHAP values for one row of ``X``.

    Relies on the notebook globals ``X``, ``rf_titanic``, ``kernel_explainer``
    and ``avg_price`` as they are bound at call time.

    Parameters
    ----------
    row_num : int
        Position (``.iloc``) of the row in ``X`` to explain.

    Returns
    -------
    None
        Prints the overall survival rate, the predicted probability and their
        difference, and draws a horizontal bar chart of the row's SHAP values
        sorted in ascending order.
    """
    row = X.iloc[row_num, :]

    # Predict from a one-row DataFrame so the model receives the same column
    # names and dtypes it was fitted with, instead of a bare ndarray.
    predicted_prob = round(rf_titanic.predict_proba(X.iloc[[row_num]])[0, 1], 2)

    shap_kernel = kernel_explainer.shap_values(row)
    shap_kernel_idx = shap_kernel.argsort()

    # One "name:\nvalue" label per feature.
    labels = np.array([str(name) + ':\n' + str(value) for name, value in zip(X.columns, row)])

    # Bars are centred at 0.5, 1.5, ...
    feature_indices = np.arange(0, len(X.columns)) + 0.5

    print(f'Overall probability of surviving: {avg_price}')
    print(f'Predicted probability of surviving: {predicted_prob}')
    print(f'Difference from Overall: {round(predicted_prob - avg_price,2)}')

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.barh(feature_indices, shap_kernel[shap_kernel_idx], height=0.7, edgecolor='black')
    # Fix the tick positions first and only then attach the labels, so each
    # label is bound to its own bar.
    ax.set_yticks(feature_indices)
    ax.set_yticklabels(labels[shap_kernel_idx], fontsize=18)
    ax.set_xlabel('SHAP value')
    # Reference line at zero contribution.
    ax.vlines(x=0, ymin=0, ymax=len(X.columns))

In [None]:
# Print the survival summary and draw the KernelSHAP bar chart for the first
# row (position 0).
kernel_explanations_plot(0)