In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header = None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
pd.set_option('max_columns', 1000)

In [3]:
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]

In [4]:
df.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Some data housekeeping!

train_cols = df.columns[0:-1]
label = df.columns[-1]

x = df[train_cols]
y = df[label].apply(lambda x: 0 if x == " <=50K" else 1) # Making the target a binary output for classification

x_enc = pd.get_dummies(x, prefix_sep = '-')
feature_names = list(x_enc.columns)

seed = 1
x_train, x_test, y_train, y_test = train_test_split(x_enc, y, test_size = 0.20, random_state = seed)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
rf = RandomForestClassifier(n_estimators = 100)

blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                  

In [8]:
import interpret

In [9]:
from interpret import show
from interpret.perf import ROC

blackbox_perf = ROC(blackbox_model.predict_proba).explain_perf(x_test, y_test, name = 'Blackbox')
show(blackbox_perf)

In [11]:
# LIME and SHAP
from interpret.blackbox import LimeTabular

# All blackbox explainers need a predict / predict_proba function, optionally a dataset
lime = LimeTabular(predict_fn = blackbox_model.predict_proba, data = x_train, random_state = 1)

# Chose your instances to explain
lime_local = lime.explain_local(x_test[:10], y_test[:10], name = 'LIME')

show(lime_local)

In [12]:
from interpret.blackbox import ShapKernel
import numpy as np

background_val = np.median(x_train, axis = 0).reshape(1, -1)
shap = ShapKernel(predict_fn = blackbox_model.predict_proba, data = background_val, feature_names = feature_names)
shap_local = shap.explain_local(x_test[:5], y_test[:5], name = 'SHAP')

show(shap_local)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!





l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


In [10]:
# Global interpretability

from interpret.blackbox import MorrisSensitivity

sensitivity = MorrisSensitivity(predict_fn = blackbox_model.predict_proba, data = x_train)
sensitivity_global = sensitivity.explain_global(name = "Global Sensitivity")

show(sensitivity_global)

In [13]:
from interpret.blackbox import PartialDependence

pdp = PartialDependence(predict_fn = blackbox_model.predict_proba, data = x_train)
pdp_global = pdp.explain_global(name = "Partial Dependence")

show(pdp_global)

In [12]:
show([blackbox_perf, lime_local, shap_local, sensitivity_global, pdp_global])

In [13]:
# Let's look at an interpretable model
# Explainable Boosting Machine (EBM)

from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier(random_state = seed)
ebm.fit(x_train, y_train)

ExplainableBoostingClassifier(binning_strategy='quantile',
               data_n_episodes=2000, early_stopping_run_length=50,
               early_stopping_tolerance=1e-05,
               feature_names=['Age', 'fnlwgt', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'WorkClass- ?', 'WorkClass- Federal-gov', 'WorkClass- Local-gov', 'WorkClass- Never-worked', 'WorkClass- Private', 'WorkClass- Self-emp-inc', 'WorkClass- Self-emp-not-inc', 'WorkClass- State-gov', 'WorkClas...adad&Tobago', 'NativeCountry- United-States', 'NativeCountry- Vietnam', 'NativeCountry- Yugoslavia'],
               feature_step_n_inner_bags=0,
               feature_types=['continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categ...egorical', 'categorical', 'categorical', 'cat

In [14]:
ebm_perf = ROC(ebm.predict_proba).explain_perf(x_test, y_test, name = 'EBM')
show(ebm_perf)

In [15]:
ebm_global = ebm.explain_global(name = 'EBM')
show(ebm_global)

In [16]:
ebm_local = ebm.explain_local(x_test[:5], y_test[:5], name = 'EBM')
show(ebm_local)