### Install Dependencies

In [None]:
!pip install shap

### Load Dataset



#### Diabetes Dataset

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

- Samples total: 442
- Dimensionality: 10
- Features: real, -.2 < x < .2
  - age age in years
  - sex
  - bmi body mass index
  - bp average blood pressure
  - s1 tc, total serum cholesterol
  - s2 ldl, low-density lipoproteins
  - s3 hdl, high-density lipoproteins
  - s4 tch, total cholesterol / HDL
  - s5 ltg, possibly log of serum triglycerides level
  - s6 glu, blood sugar level
- Targets: integer 25 - 346

Note: Each of these 10 feature variables has been mean centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1).

Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see: Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) “Least Angle Regression,” Annals of Statistics (with discussion), 407-499. (https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import shap
import time

# Fetch the diabetes data shipped with shap: X is the feature table, y the response.
X, y = shap.datasets.diabetes()

# Show the first few rows of the features.
display(X.head())

### Train Linear Regression Model

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the dimensions of each partition, with a blank line between train and test.
print('Training Features Shape', X_train.shape)
print('Training Target Shape', y_train.shape, end='\n\n')
print('Testing Features Shape', X_test.shape)
print('Testing Target Shape', y_test.shape)

In [None]:
# Instead of using the full training set to estimate expected values, condense it
# into 10 weighted k-means centers; each center's weight is the number of points
# it represents.
X_train_summary = shap.kmeans(X_train, k=10)

def print_accuracy(f):
    """Print the root mean squared error of predictor ``f`` on the test set.

    ``f`` is called with the notebook-level ``X_test`` and its output is
    compared against the notebook-level ``y_test``.
    """
    residuals = f(X_test) - y_test
    rmse = np.sqrt(np.mean(residuals**2))
    print("Root mean squared test error = {0}".format(rmse))
    # Short pause so the printed line appears before any progress bars.
    time.sleep(0.5)

In [None]:
from sklearn import linear_model

# Fit a linear regression model on the training split (fit returns the estimator itself).
lin_regr = linear_model.LinearRegression().fit(X_train, y_train)

# Report the fitted model's error on the test split.
print_accuracy(lin_regr.predict)

### Explain Linear Regression Predictions

In [None]:
# Explain one prediction: the first row of the test set.

shap.initjs()

# Kernel explainer over the linear model, using the k-means summary as background data.
ex = shap.KernelExplainer(lin_regr.predict, X_train_summary)
shap_values = ex.shap_values(X_test.iloc[0])
shap.force_plot(ex.expected_value, shap_values, features=X_test.iloc[0])

In [None]:
# Explain another single prediction: the test row at position 45.

shap.initjs()
shap_values = ex.shap_values(X_test.iloc[45])
shap.force_plot(ex.expected_value, shap_values, features=X_test.iloc[45])

In [None]:
shap.initjs()

# Kernel-explainer SHAP values for every test row, then a summary plot across features.
shap_values = ex.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test)

In [None]:
# Dependence plot for the "bmi" feature, using the current shap_values over the test set.
shap.dependence_plot("bmi", shap_values, features=X_test)


### Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 1000 trees and a fixed seed, fitted on the training split
# (fit returns the estimator itself).
rforest = RandomForestRegressor(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
).fit(X_train, y_train)

# Report the forest's error on the test split.
print_accuracy(rforest.predict)

### Explain Random Forest Predictions

In [None]:
# SHAP values for the whole test set via the tree explainer, summarized in one plot.
explainer = shap.TreeExplainer(rforest)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test)

In [None]:
# Dependence plot for the "bmi" feature, using the current shap_values over the test set.
shap.dependence_plot("bmi", shap_values, features=X_test)

In [None]:
shap.initjs()

# shap_values here were produced by the random-forest TreeExplainer (`explainer`),
# so the base value must be that explainer's expected value rather than the one
# from the linear model's KernelExplainer (`ex`).
shap.force_plot(explainer.expected_value, shap_values, X_test)