In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from tensorflow.keras.models import model_from_json

import os
import joblib
import pickle
import json

import time
import warnings
warnings.filterwarnings(action = 'once')

import matplotlib.pyplot as plt

from matplotlib.pyplot import figure
import matplotlib.image as mpimg

import stability as st

import statistics
import scipy as scp
import math

import lime
from lime import lime_tabular

import shap

numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


In [2]:
#PATH = "C:/Users/mythr/Documents/GitHub/Stability-Experiments/Diabetes/"
PATH = "C:/Users/velmurug/Documents/Stability Experiments/Diabetes/"
model_filename = os.path.join(PATH, "models/model_h5_N12_DUO.json")
weights_filename = os.path.join(PATH, "models/model_h5_N12_DUO.h5")
dataset_path = 'datasets/'

json_file = open(model_filename, 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights(weights_filename)
print("Loaded model from disk")

Loaded model from disk


In [3]:
tn_file = os.path.join(PATH, "datasets/dicts/true_neg.pkl")
with open (tn_file, 'rb') as f:
    results = pickle.load(f)
    
f.close()

In [4]:
X_train = pd.read_csv(PATH + "datasets/diabetes_Xtrain.csv", index_col=False).values
X_test = pd.read_csv(PATH + "datasets/diabetes_Xtest.csv", index_col=False).values
X_validation =pd.read_csv(PATH + "datasets/diabetes_Xvalidation.csv",index_col=False).values
y_train = pd.read_csv(PATH + "datasets/diabetes_Ytrain.csv",index_col=False).values
y_test =pd.read_csv(PATH + "datasets/diabetes_Ytest.csv", index_col=False).values
y_validation = pd.read_csv(PATH + "datasets/diabetes_Yvalidation.csv", index_col=False).values

original_data = pd.read_csv(PATH + "datasets/diabetes.csv")
feature_names = original_data.drop(["Outcome"], axis = 1).columns

In [5]:
MAX_FEAT = 5

start_time = time.time()

# LIME has one explainer for all the models
explainer = lime_tabular.LimeTabularExplainer(X_train, feature_names= feature_names, 
                                                  class_names=["No Diabetes", "Diabetes"], verbose=False, mode='classification')

elapsed = time.time() - start_time

print ("Time taken to create explainer:", round(elapsed, 2), "seconds")

Time taken to create explainer: 0.01 seconds


In [None]:
stability_scores = []
probas = []

feat_list = list(feature_names)

for instance in results:
    print ("Running instance", results.index(instance))
    
    feat_pres = []

    print("INDEX = %d" %instance['index'])
    patients_feat = np.array(instance['scaled_vector'])
    prediction = instance['predictions']
    probas.append(instance['prediction_probability'][1])
    
    start_time = time.time()
    # explain instance
    
    for iteration in range(100):
        print("Run", iteration)
        lime_exp = explainer.explain_instance(patients_feat, model.predict_proba, num_features = 30)
        
        #For stability by index
        presence_list = [0]*len(feat_list)
    
        for each in feat_list:
            list_idx = feat_list.index(each)
            #print ("Feature", list_idx)
            for explanation in lime_exp.as_list():
                if each in explanation[0]:
                    if lime_exp.as_list().index(explanation) < MAX_FEAT:
                        presence_list[list_idx] = 1
                        
        feat_pres.append(presence_list)
        
        #method 1 - Nogueira, Sechidis, Brown.
    stability = st.getStability(feat_pres[:10])
    print ("Stability:", round(stability,2))
    stability_scores.append(stability)
 
    print("\n--------------------------------------------------------------------------------------------\n")

Running instance 0
INDEX = 14
Run 0
Instructions for updating:
Please use `model.predict()` instead.
Run 1
Run 2
Run 3
Run 4
Run 5
Run 6
Run 7
Run 8
Run 9
Run 10
Run 11
Run 12
Run 13
Run 14
Run 15
Run 16
Run 17
Run 18
Run 19
Run 20
Run 21
Run 22
Run 23
Run 24
Run 25
Run 26
Run 27
Run 28
Run 29
Run 30
Run 31
Run 32
Run 33
Run 34
Run 35
Run 36
Run 37
Run 38
Run 39
Run 40
Run 41
Run 42
Run 43
Run 44
Run 45
Run 46
Run 47
Run 48
Run 49
Run 50
Run 51
Run 52
Run 53
Run 54
Run 55
Run 56
Run 57
Run 58
Run 59
Run 60
Run 61
Run 62
Run 63
Run 64
Run 65
Run 66
Run 67
Run 68
Run 69
Run 70
Run 71
Run 72
Run 73
Run 74
Run 75
Run 76
Run 77
Run 78
Run 79
Run 80
Run 81
Run 82
Run 83
Run 84
Run 85
Run 86
Run 87
Run 88
Run 89
Run 90
Run 91
Run 92
Run 93
Run 94
Run 95
Run 96
Run 97
Run 98
Run 99
Stability: 1.0

--------------------------------------------------------------------------------------------

Running instance 1
INDEX = 44
Run 0
Run 1
Run 2
Run 3
Run 4
Run 5
Run 6
Run 7
Run 8
Run 9
Run 10
Run 11
R

# SHAP

In [None]:
X_train_frame = pd.DataFrame(data = X_train, columns = feature_names)
X_test_frame = pd.DataFrame(data = X_test, columns = feature_names)

data_sample = shap.kmeans(X_train_frame, 100)
shap_explainer = shap.KernelExplainer(model.predict, data_sample)#, link = 'identity')

shap.initjs()

In [None]:
def create_samples(iterations, row, features, top = 10):
    length = len(features)
    
    exp = []
    rel_exp = []
    
    for j in range(iterations):
        #print(X_test_frame.loc[row])
        shap_values = shap_explainer.shap_values(row)
       # print(shap_values)

        importances = []

        for i in range(length):
            #print(i)
            feat = features[i]
            shap_val = shap_values[1][i]
            abs_val = abs(shap_values[1][i])
            entry = (feat, shap_val, abs_val)
            importances.append(entry)
            #print(entry)

        importances.sort(key=lambda tup: tup[2], reverse = True)
        
        exp.append(importances)

        rel_feat = []

        for i in range(top):
            feat = importances[i]
            if feat[2] > 0:
                rel_feat.append(feat)
                
        rel_exp.append(rel_feat)
        
    return exp, rel_exp

In [None]:
shap_stability_scores = []
probas = []

for instance in results:
    
    print("INDEX = %d" %instance['index'])
    patients_feat = pd.Series(instance['scaled_vector'])
    prediction = instance['predictions']
    probas.append(instance['prediction_probability'][1])
    
    loc = results.index(instance)
    
    exp, rel_exp = create_samples(50, patients_feat.values.reshape(-1, 8), feature_names, top = 5)
    
    #print(rel_exp)
    
    feat_pres = []
    
    for iteration in rel_exp:
        print("Iteration", rel_exp.index(iteration))
        #print(iteration)

        #Stability by index
        presence_list = [0]*len(feat_list)
        
        for each in feat_list:
            list_idx = feat_list.index(each)
            #print ("Feature", list_idx)
            for explanation in iteration:
                if each in explanation[0]:
                    #by index
                    presence_list[list_idx] = 1
                    
        feat_pres.append(presence_list)
        
    stability = st.getStability(feat_pres)
    print ("Stability:", round(stability,2))
    shap_stability_scores.append(stability)

In [None]:
plt.plot(probas, stability_scores, 'bo', label = 'LIME')
plt.plot(probas, shap_stability_scores, 'ro', label = 'SHAP')
plt.show()