In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local source are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from calibrated_explanations import CalibratedExplainer
from sklearn.utils import shuffle
import copy
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer
from shap import Explainer
from VennAbers import VennAbers
import time


In [None]:

# Experiment settings: dataset name, CSV delimiter, number of held-out test
# instances, and the key of the model to use.
dataSet = 'pc1req'
delimiter = ';'
num_to_test = 4
model = 'xGB'
print(dataSet)

# The CSV lives in ../data and every column is parsed as float64.
fileName = f'../data/{dataSet}.csv'
df = pd.read_csv(fileName, sep=delimiter, dtype=np.float64)
df.head()

In [None]:
# Separate the features from the target column.
target = 'Y'
X = df.drop(target, axis=1)
y = df[target]
no_of_classes = np.unique(y).size
no_of_instances, no_of_features = X.shape

# A feature column with fewer than 10 distinct values is treated as categorical.
categorical_features = [
    col for col in range(no_of_features)
    if np.unique(X.iloc[:, col]).size < 10
]

# Order the instances by target value, so that taking test instances from both
# ends of the arrays picks the lowest- and the highest-labelled rows.
idx = np.argsort(y.values).astype(int)
X = X.values[idx, :]
y = y.values[idx]

# Test set: int(num_to_test/2) rows from the start plus the same number from the end.
n_per_end = int(num_to_test / 2)
test_index = np.array(
    list(range(n_per_end))
    + list(range(no_of_instances - 1, no_of_instances - n_per_end - 1, -1))
)
# Everything that is not a test row is available for training and calibration.
train_index = np.setdiff1d(np.arange(no_of_instances), test_index)

trainCalX, trainCalY = X[train_index, :], y[train_index]
testX, testY = X[test_index, :], y[test_index]

# Stratified split of the remaining rows: 67% proper training, 33% calibration.
trainX, calX, trainY, calY = train_test_split(
    trainCalX,
    trainCalY,
    test_size=0.33,
    random_state=42,
    stratify=trainCalY,
)
print(testY)
print(categorical_features)

In [None]:
# Candidate classifiers. The tree-based scikit-learn models are seeded (with the
# same value the train/calibration split uses) so repeated runs are reproducible.
t1 = DecisionTreeClassifier(random_state=42)
r1 = RandomForestClassifier(n_estimators=100, random_state=42)
g1 = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

model_dict = {'xGB':(g1,"xGB"),'RF':(r1,"RF"),'DT': (t1,"DT")}
# `model` enters this cell as the key string but is rebound to the estimator
# below. When the cell is re-run, `model` is no longer a string, so fall back to
# the key remembered in `model_name` instead of failing the dictionary lookup.
model_key = model if isinstance(model, str) else model_name
model, model_name = model_dict[model_key]
model.fit(trainX,trainY)

# Calibrate the fitted model with Venn-Abers on the calibration set and print,
# per test instance: (low, calibrated probability in column 1, high, prediction, true label).
va = VennAbers(calX, calY, model)
va_preds = va.predict(testX)
va_proba, low, high = va.predict_proba(testX, output_interval=True)
print(*zip(low, va_proba[:,1],high, va_preds,testY),sep='\n')

In [None]:
# Build the explainer on private copies of the calibration data.
# The feature names must exclude the target column: the feature matrix was
# created by dropping `target` from `df`, so passing all of `df.columns` would
# supply one name too many (and misalign names if the target is not last).
ce = CalibratedExplainer(
    model,
    copy.deepcopy(calX),
    copy.deepcopy(calY),
    feature_names=df.columns.drop(target),
    categorical_features=categorical_features,
)


In [None]:

for num_neighbors in [1.0]:
    ce.set_num_neighbors(num_neighbors)

    # Counterfactual plot with the 'entropy' discretizer; the explanation call is timed.
    discretizer = 'entropy'
    ce.set_discretizer(discretizer)
    tic = time.time()
    exp = ce(copy.deepcopy(testX))
    toc = time.time()
    print(toc-tic)
    exp.plot_counterfactuals(
        title=f'{dataSet} {num_neighbors} counterfactuals {discretizer}',
        num_to_show=10,
    )

    # Uncertainty and regular plots with the 'binaryEntropy' discretizer.
    discretizer = 'binaryEntropy'
    ce.set_discretizer(discretizer)
    exp = ce(copy.deepcopy(testX))
    exp.plot_uncertainty(
        title=f'{dataSet} {num_neighbors} uncertainty {discretizer}',
        num_to_show=10,
    )
    exp.plot_regular(
        title=f'{dataSet} {num_neighbors} regular {discretizer}',
        num_to_show=10,
    )

In [None]:

# Repeat the explanation 20 times, each with its own random state, and record the weights.

reg, l, h = [], [], []

# One list per weight type; every repetition appends one entry to each list.
stability = {weight_kind: [] for weight_kind in ('predict', 'low', 'high')}

for seed in range(20):
    ce.set_random_state(seed)
    exp = ce(copy.deepcopy(testX))
    # Keep the entry at index 1 of each weight collection
    # (presumably the second test instance — TODO confirm).
    for weight_kind, runs in stability.items():
        runs.append(exp.feature_weights[weight_kind][1][:])


In [None]:

# Show the 'predict' weights of the first repetition ...
print(np.array(stability['predict'][0]))
# ... followed by one (min, max) pair per position, taken across all repetitions.
predict_runs = np.array(stability['predict'])
print(*zip(predict_runs.min(axis=0), predict_runs.max(axis=0)), sep='\n')

In [None]:
# Convert the current explanations with as_lime() and render each one in the notebook.
ce_as_lime = exp.as_lime()
for lime_view in ce_as_lime:
    lime_view.show_in_notebook(show_table=True)

In [None]:
# Optional LIME comparison; disabled by default.
run_lime=False
if run_lime:
    # Feature names exclude the target column, which is not part of calX.
    # NOTE(review): upstream lime documents only 'quartile', 'decile', 'entropy'
    # or a BaseDiscretizer instance for `discretizer`; confirm 'binaryEntropy'
    # is accepted by the installed version before enabling this cell.
    lime = LimeTabularExplainer(calX,training_labels=calY, feature_names=df.columns.drop(target), class_names=['0','1'], mode='classification',discretizer='binaryEntropy')
    lime_weights = []
    for x in testX:
        # Use a dedicated name so the global `exp` (still needed by the cells
        # that follow) is not replaced by a LIME explanation object.
        lime_exp = lime.explain_instance(x, va.predict_proba, num_features=no_of_features)
        lime_exp.show_in_notebook(show_table=True, show_all=False)
        # local_exp[1] is a list of (feature index, weight) pairs; scatter the
        # weights into a vector indexed by feature so rows are comparable.
        lime_values = np.zeros(no_of_features)
        for feature_idx, weight in lime_exp.local_exp[1]:
            lime_values[feature_idx] = weight
        lime_weights.append(lime_values)
    # Unpack so that `sep` takes effect: one weight vector per line.
    print(*lime_weights, sep='\n')

In [None]:
from shap.plots import waterfall, force, scatter, heatmap, bar, violin, beeswarm

# Convert the current explanations with as_shap_values() so the SHAP plots can draw them.
ce_as_shap = exp.as_shap_values()

# Whole-set plots, in this order: beeswarm, heatmap, bar.
for summary_plot in (beeswarm, heatmap, bar):
    summary_plot(ce_as_shap)

# One waterfall per instance; the feature names are copied from the parent
# object onto each per-instance slice before plotting.
for instance_values in ce_as_shap:
    instance_values.feature_names = ce_as_shap.feature_names
    waterfall(instance_values)

In [None]:
def f(x):
    """Return column 1 of the Venn-Abers calibrated probabilities for the rows of x."""
    return va.predict_proba(x)[:,1]

# Explain the calibrated probability with SHAP, using the calibration set as
# background data. Feature names exclude the target column, which is not part
# of calX/testX.
shap = Explainer(f, calX, feature_names=df.columns.drop(target))
shap_exp = shap(testX)
from shap.plots import waterfall, force, scatter, heatmap, bar, violin, beeswarm
# Whole-set plots first, then one waterfall per test instance, then the scatter plot.
beeswarm(shap_exp)
heatmap(shap_exp)
bar(shap_exp)
for e in shap_exp:
    waterfall(e)
scatter(shap_exp)
