In [None]:
#!pip install lime
#!pip install shap
#!pip install anchor-exp
#!pip install hyperopt

import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from hyperopt import hp
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.kernel_ridge import KernelRidge
import sklearn

from sklearn.metrics import f1_score


import os
import joblib

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed in 3.8; use whichever name this install has.
plt.style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep')

import statistics
import scipy as scp
import math

import lime
import lime.lime_tabular

import shap

from anchor import anchor_tabular

import seaborn as sns

import random

In [None]:
def get_tree_features(cls, instance):
    """Return the set of feature indices tested along an instance's decision path.

    Walks the fitted tree stored in ``cls.tree_`` from the root (node 0) down to
    the leaf that ``instance`` falls into and records the feature index used by
    every split node visited on the way.

    Parameters
    ----------
    cls : object
        Fitted estimator exposing a ``tree_`` attribute with the arrays
        ``children_left``, ``children_right``, ``feature`` and ``threshold``
        (the layout used by scikit-learn decision trees).
    instance : indexable
        Feature values of one sample; ``instance[f]`` is the value of feature ``f``.

    Returns
    -------
    set
        Feature indices used on the path (empty when the root is a leaf).
    """
    tree = cls.tree_
    # scikit-learn documents -1 as the child id of a leaf node; comparing with
    # the literal avoids depending on the private ``sklearn.tree._tree`` module.
    leaf = -1

    node = 0
    used = set()
    while tree.children_left[node] != leaf and tree.children_right[node] != leaf:
        feature = tree.feature[node]
        used.add(feature)

        # scikit-learn routes samples with ``value <= threshold`` to the LEFT
        # child, so a value equal to the threshold must go left, not right.
        if instance[feature] <= tree.threshold[node]:
            node = tree.children_left[node]
        else:
            node = tree.children_right[node]

    return used

In [None]:
def get_path_depths(tree, feat_list, cur_depth = 0, lvl = 0, depths = None):
    """Collect the depth of every leaf reachable from node ``lvl``.

    Parameters
    ----------
    tree : object
        Tree structure exposing ``children_left`` and ``children_right``
        arrays in which a leaf is marked by the child id -1 (scikit-learn layout).
    feat_list : list
        Unused; kept so existing callers keep working.
    cur_depth : int, optional
        Depth assigned to the starting node.
    lvl : int, optional
        Index of the node to start from (0 is the root).
    depths : list or None, optional
        List to append the leaf depths to.  A new list is created when omitted,
        so separate calls no longer share (and keep growing) one default list.

    Returns
    -------
    list
        ``depths`` with one entry per leaf, in left-to-right order.
    """
    if depths is None:
        depths = []

    leaf = -1  # child id scikit-learn stores for leaf nodes

    # Explicit stack instead of recursion; the right child is pushed first so
    # the left subtree is fully visited before the right one.
    pending = [(lvl, cur_depth)]
    while pending:
        node, depth = pending.pop()
        left_child = tree.children_left[node]
        if left_child == leaf:
            depths.append(depth)
        else:
            pending.append((tree.children_right[node], depth + 1))
            pending.append((left_child, depth + 1))
    return depths

In [None]:
# Root of the project; defaults to the current working directory.
# Point this at your own project folder if the notebook lives elsewhere.
PATH = os.getcwd()

# --- experiment settings -------------------------------------------------
dataset = "breast_cancer"
cls_method = "decision_tree"
classification = True

random_state = 22
exp_iter = 10

# --- folder layout -------------------------------------------------------
# The trailing slashes are intentional: file names are appended to these
# strings by plain concatenation further down.
save_to = "%s/%s/" % (PATH, dataset)
dataset_folder = "%s/datasets/" % (save_to)
final_folder = "%s/%s/" % (save_to, cls_method)

# --- load data -----------------------------------------------------------
# All CSV files are semicolon separated.
_csv_options = dict(index_col=False, sep=";")
X_train = pd.read_csv(dataset_folder + dataset + "_Xtrain.csv", **_csv_options)
test_x = pd.read_csv(final_folder + "test_sample.csv", **_csv_options).values
results = pd.read_csv(os.path.join(final_folder, "results.csv"), **_csv_options)

# Column names with spaces replaced by underscores.
feat_list = [column.replace(' ', '_') for column in X_train.columns]

In [None]:
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# model files that come from a trusted source.
cls = joblib.load(save_to+cls_method+"/cls.joblib")

# Depth of every root-to-leaf path of the fitted tree.  A fresh list is passed
# explicitly so re-running this cell never appends to results of an earlier call.
path_lengths = get_path_depths(cls.tree_, feat_list, depths=[])
num_retrieve = max(path_lengths)

# The explainer cells look up `sorted_values[-num_retrieve-1]`, which only exists
# while num_retrieve < len(feat_list).  Fall back to two thirds of the features
# as soon as the longest path is at least as long as the feature list
# (the previous `>` let the equal case through and caused an IndexError later).
if num_retrieve >= len(feat_list):
    num_retrieve = math.ceil(len(feat_list)*(2/3))

In [None]:
# Display the results table that was read from results.csv.
results

In [None]:
# Per-instance precision / recall of the averaged SHAP explanation.
shap_precision = []
shap_recall = []

# xgboost models are explained without background data; every other model
# gets X_train passed to the explainer.
if cls_method == "xgboost":
    shap_explainer = shap.Explainer(cls)
else:
    shap_explainer = shap.Explainer(cls, X_train)

for instance in test_x:
    # Explain the same instance exp_iter times; the runs are averaged below.
    full_exp = [shap_explainer(instance, check_additivity = False).values for i in range(exp_iter)]

    if classification==True:
        # Keep only entry 0 of every feature's values
        # (presumably the class-0 column - TODO confirm against the SHAP output shape).
        shap_exp = [[feat[0] for feat in each] for each in full_exp]
    else:
        shap_exp = full_exp

    # Absolute value of the mean attribution per feature.
    abs_val = np.abs(np.average(shap_exp, axis = 0))

    # Ground truth: the features the tree actually tests for this instance.
    if cls_method == "decision_tree":
        true_features = {feat_list[i] for i in get_tree_features(cls, instance)}
    else:
        # Previously this silently reused a stale `true_features` (or raised a
        # NameError on a fresh kernel); fail with an explicit message instead.
        raise NotImplementedError(
            "Ground-truth features are only available for cls_method == 'decision_tree'")

    # Precision threshold: lower edge of the highest of 4 equal-width bins.
    bins = pd.cut(abs_val, 4, retbins = True, duplicates = "drop")
    q1_min = bins[1][-2]

    # Recall threshold: the (num_retrieve+1)-th largest value, so that `>` keeps
    # the top num_retrieve features.  When num_retrieve covers every feature
    # there is no such value; keep all features instead of raising IndexError.
    sorted_val = np.sort(abs_val)
    if num_retrieve < len(sorted_val):
        path_min = sorted_val[-num_retrieve-1]
    else:
        path_min = -np.inf

    shap_recall_features = set([feat_list[i] for i in range(len(feat_list)) if abs_val[i] > path_min])
    shap_precision_features = set([feat_list[i] for i in range(len(feat_list)) if abs_val[i] > q1_min])

    # An empty denominator is reported as 0.0 rather than raising ZeroDivisionError.
    if true_features:
        recall = len(true_features.intersection(shap_recall_features))/len(true_features)
    else:
        recall = 0.0
    if shap_precision_features:
        precision = len(true_features.intersection(shap_precision_features))/len(shap_precision_features)
    else:
        precision = 0.0

    shap_precision.append(precision)
    shap_recall.append(recall)

results["SHAP Precision"] = shap_precision
results["SHAP Recall"] = shap_recall

In [None]:
# Mean precision and recall of the averaged SHAP explanations over all test instances.
# The prints of ind_shap_precision / ind_shap_recall were removed: those names
# are never assigned in this notebook, so the cell raised a NameError.
print(np.mean(shap_precision))
print(np.mean(shap_recall))

In [None]:
# Per-instance precision / recall of the averaged LIME explanation.
lime_recall = []
lime_precision = []

if classification==True:
    class_names=['Negative','Positive']# negative is 0, positive is 1, 0 is left, 1 is right
    lime_mode = "classification"
    predict_fn = cls.predict_proba
else:
    class_names = ['Final Value']
    lime_mode = "regression"
    predict_fn = cls.predict

lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names = feat_list,
                                                        class_names=class_names, discretize_continuous=True,
                                                        mode = lime_mode)

for instance in test_x:
    # weights[j] collects the weight of feature j from each of the exp_iter runs.
    weights = [[] for _ in feat_list]
    for _ in range(exp_iter):
        explanation = lime_explainer.explain_instance(instance, predict_fn,
                                                      num_features=len(feat_list), labels=[0,1])
        # as_map()[1] holds (feature index, weight) pairs for label 1 - the same
        # entries as_list() renders as text.  Reading the indices directly
        # replaces the former parsing of strings such as "a < feat <= b", which
        # failed for negative / scientific-notation thresholds and could reuse a
        # stale `parts` when no '<' or '>' was present.
        for feat_idx, weight in explanation.as_map()[1]:
            weights[feat_idx].append(weight)

    # Absolute value of the mean weight per feature.
    abs_weight = np.abs(np.mean(weights, axis = 1))

    # Ground truth: the features the tree actually tests for this instance.
    if cls_method == "decision_tree":
        true_features = {feat_list[i] for i in get_tree_features(cls, instance)}
    else:
        # Previously this silently reused a stale `true_features` (or raised a
        # NameError on a fresh kernel); fail with an explicit message instead.
        raise NotImplementedError(
            "Ground-truth features are only available for cls_method == 'decision_tree'")

    # Precision threshold: lower edge of the highest of 4 equal-width bins.
    bins = pd.cut(abs_weight, 4, retbins = True, duplicates = "drop")
    q1_min = bins[1][-2]

    # Recall threshold: the (num_retrieve+1)-th largest value.  When num_retrieve
    # covers every feature there is no such value; keep all features instead of
    # raising IndexError.
    sorted_weight = np.sort(abs_weight)
    if num_retrieve < len(sorted_weight):
        path_min = sorted_weight[-num_retrieve-1]
    else:
        path_min = -np.inf

    # NOTE(review): these comparisons used `>=`, which kept num_retrieve+1 features
    # and differed from the SHAP cell (strict `>`).  Both explainers now use the
    # same strict rule, so LIME numbers can differ slightly from earlier runs.
    lime_recall_features = set([feat_list[i] for i in range(len(feat_list)) if abs_weight[i] > path_min])
    lime_precision_features = set([feat_list[i] for i in range(len(feat_list)) if abs_weight[i] > q1_min])

    # An empty denominator is reported as 0.0 rather than raising ZeroDivisionError.
    if true_features:
        recall = len(true_features.intersection(lime_recall_features))/len(true_features)
    else:
        recall = 0.0
    if lime_precision_features:
        precision = len(true_features.intersection(lime_precision_features))/len(lime_precision_features)
    else:
        precision = 0.0

    lime_recall.append(recall)
    lime_precision.append(precision)

results["LIME Precision"] = lime_precision
results["LIME Recall"] = lime_recall


In [None]:
# Mean precision and recall of the averaged LIME explanations over all test instances.
# The prints of ind_lime_precision / ind_lime_recall were removed: those names
# are never assigned in this notebook, so the cell raised a NameError.
print(np.mean(lime_precision))
print(np.mean(lime_recall))

In [None]:
# Write the results table (semicolon separated, without the index column)
# to results.csv inside the classifier's folder.
results_path = os.path.join(save_to, cls_method, "results.csv")
results.to_csv(results_path, sep=";", index=False)

In [None]:
# Display the results table, which now carries the SHAP and LIME precision/recall columns.
results