In [1]:
## Our standard import
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


## Models & evaluation metrics
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import joblib

# ## setting random state for reproducibility
SEED = 321
np.random.seed(SEED)
plt.style.use(('ggplot','tableau-colorblind10'))

In [2]:
import joblib
loaded_joblib = joblib.load('Data/best-models.joblib')
loaded_joblib.keys()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/best-models.joblib'

In [None]:
loaded_joblib = joblib.load('random_forest.joblib')
loaded_joblib.keys()

In [None]:
X_train_df = loaded_joblib['X_train']
y_train = loaded_joblib['y_train']
X_test_df = loaded_joblib['X_test']
y_test = loaded_joblib['y_test']
preprocessor = loaded_joblib['preprocessor']
reg = loaded_joblib['RandomForestRegressor']

# Explain your tree-based model with shap:

In [None]:
import shap
shap.initjs()

In [None]:
len(X_train_df)

In [None]:
X_shap = shap.sample(X_train_df,nsamples=400,random_state=321)
X_shap.head()

In [None]:
## get the corresponding y-values
y_shap = y_train.loc[X_shap.index]
y_shap 

In [None]:
explainer = shap.Explainer(reg)
explainer

In [None]:
## Getting the shap values
shap_values = explainer(X_shap,y_shap)
type(shap_values)

In [None]:
X_shap.shape

They are not exactly the same, the first and the second are the same. In shap, the supermarket type 3 go up in to third place. The item visibility went down to the last. However, we have new feature - Establishment Year.

In [None]:
fig, ax = plt.subplots()
shap.summary_plot(shap_values,features= X_shap, plot_type='bar')
#saving the summary plot
fig.savefig('Data/summary_plot_rf.png', bbox_inches="tight")

In [None]:
fig, ax = plt.subplots()
shap.summary_plot(shap_values, features = X_shap,plot_type='dot')
#saving
fig.savefig('Data/summary_plot_dot.png');

Top 3 most important features

Item MRP:

The Sales will be higher if the items prices are higher

Outlet type Grocery Store
it will affect sales.

Outlet type Supermarket Type 3
more supermarket type 3, it will increasing more sales.

# Local Explanations

In [None]:
X_shap_local = X_shap.reset_index(drop=True)
y_shap_local = y_shap.reset_index(drop=True)
X_shap_local.head()

In [None]:
sns.histplot(X_shap['Item_MRP']);

In [None]:
high_mrp = X_shap_local['Item_MRP'].idxmax()
high_mrp

In [None]:
X_shap_local.iloc[high_mrp]

In [None]:
y_shap_local.iloc[high_mrp]

In [None]:
low_mrp = X_shap_local['Item_MRP'].idxmin()
low_mrp

In [None]:
X_shap_local.iloc[low_mrp]

In [None]:
shap.force_plot(explainer.expected_value, 
                shap_values=shap_values[high_mrp].values,
               features=X_shap_local.iloc[high_mrp],show=False,matplotlib=True)

In [None]:
shap.force_plot(explainer.expected_value, 
                shap_values=shap_values[low_mrp].values,
               features=X_shap_local.iloc[low_mrp],show=False,matplotlib=True)

In [None]:
# LimeTabularExplainer
from lime.lime_tabular import LimeTabularExplainer
lime_explainer =LimeTabularExplainer(
    training_data=X_shap.values,
    feature_names=X_shap.columns,
    mode='regression')
lime_explainer

In [None]:
## Use the lime explainer to explain the selected example used above 

exp = lime_explainer.explain_instance(X_shap_local.loc[high_mrp],
                                      reg.predict)


exp.show_in_notebook()

In [None]:
## Use the lime explainer to explain the selected example used above 

exp = lime_explainer.explain_instance(X_shap_local.loc[low_mrp],
                                      reg.predict)


exp.show_in_notebook()