In [1]:
import numpy as np
import pandas as pd

import pydotplus
from IPython.display import Image
import seaborn as sns

import sklearn
from sklearn import tree
from sklearn.metrics import f1_score, accuracy_score, classification_report, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB

import random

from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

import scipy.stats as ss
from scipy.spatial.distance import cdist

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll


In [2]:
def get_tree_characteristics(tree, feat_list, cur_depth = 0, lvl = 0, depths = [], split_nodes = [], leaf_nodes = []):

    left_child = tree.children_left[lvl]
    right_child = tree.children_right[lvl]
    
    if left_child == sklearn.tree._tree.TREE_LEAF:
        depths.append(cur_depth)
        leaf_nodes.append(lvl)
        
    else:
        split_nodes.append(lvl)
        depths, split_nodes, leaf_nodes = get_tree_characteristics(tree, feat_list, cur_depth+1, left_child, depths, split_nodes, leaf_nodes)
        depths, split_noes, leaf_nodes = get_tree_characteristics(tree, feat_list, cur_depth+1, right_child, depths, split_nodes, leaf_nodes)
        
    return depths, split_nodes, leaf_nodes


In [3]:
def get_tree_features(cls, instance):
    tree = cls.tree_
    lvl = 0
    left_child = tree.children_left[lvl]
    right_child = tree.children_right[lvl]

    feats = []
    
    while left_child != sklearn.tree._tree.TREE_LEAF and right_child != sklearn.tree._tree.TREE_LEAF:
        feature = tree.feature[lvl]
        feats.append(feature)
        
        if instance[feature] < tree.threshold[lvl]:
            lvl = left_child
        else:
            lvl = right_child
            
        left_child = tree.children_left[lvl]
        right_child = tree.children_right[lvl]
            
            
    feat_pos = np.zeros(len(instance))
    n = len(feats)
    for i in feats:
        feat_pos[i]+=n
        n=n-1
    
    return feat_pos

def get_reg_features(cls):

    og_coef = cls.coef_
    if len(og_coef.shape) > 1:
        og_coef = og_coef[0]
    
    #coef = [abs(val) for val in og_coef]
    coef = og_coef.copy()
        
    return coef

def get_nb_features(cls, instance):
    pred = cls.predict(instance.reshape(1, -1)).astype(int)
    means = cls.theta_[pred][0]
    std = np.sqrt(cls.var_[pred])[0]
    
    alt = 1-pred
    alt_means = cls.theta_[alt][0]
    alt_std = np.sqrt(cls.var_[alt])[0]

    likelihoods = []
    
    for i in range(len(means)):
        lk = ss.norm(means[i], std[i]).logpdf(instance[i])
        alt_lk = ss.norm(alt_means[i], alt_std[i]).logpdf(instance[i])
        lkhood = lk-alt_lk
        likelihoods.append(lkhood)
        
    return np.abs(likelihoods)

def get_true_rankings(cls, instance, cls_method, X_train, feat_list):
    if cls_method == "decision_tree":
        feat_pos = get_tree_features(cls, instance)
        
    elif cls_method == "logit" or cls_method == "lin_reg":
        feat_pos = get_reg_features(cls)
        
    elif cls_method == "nb":
        feat_pos = get_nb_features(cls, instance)
        
    return feat_pos

In [4]:
def permute_instance(instance, i, perm_iter = 100, min_i = [0], max_i=[1], mean_i=[0], unique_values=[[0,1]], mode="permutation"):

    permutations = np.array([instance]*perm_iter).transpose()

    for j in range(len(i)):
        if mode=="baseline_max":
            n_val = [max_i[j]]*perm_iter
        elif mode=="baseline" or mode=="baseline_mean":
            n_val = [mean_i[j]]*perm_iter
        elif mode=="baseline_min":
            n_val = [min_i[j]]*perm_iter
        elif mode=="baseline_0":
            n_val = [0]*perm_iter
        else:
            n_val = np.random.choice(unique_values[j], perm_iter)

        permutations[i[j]] = n_val
        
    permutations = permutations.transpose()

    return permutations

In [5]:
def generate_data(n_samples: int = 1000, intercept = 1, sigma = 0.2, num_features = 3):
    
    #weights = np.random.uniform(-1, 1, num_features)#.reshape(-1, 1)  # Arbitrarily chosen coefficients
    weights = [0.55, 0.503, 0.551, 0.49, 0.459]
    noise = np.random.randn(n_samples)*sigma # Gaussian noise with standard deviation 0.2

    ##CONTINUOUS FEATURES ONLY
    # sample X2 from a normal distribution with center 1 and std 2
    cont_features = [np.random.randn(n_samples) for i in range(num_features)]

    # Generate X1 as a linear combination of the features plus some Gaussian noise
    full_weights = [[weights[i]]*n_samples for i in range(num_features)]
    cls = intercept + np.sum(np.multiply(full_weights,cont_features), axis=0) + noise
    
    threshold = np.mean(cls)
    X1 = np.zeros(len(cls))
    X1[np.where(cls>threshold)] = 1    
    
    # Combine into a dataframe
    cont_df = pd.DataFrame(np.transpose(cont_features), columns = ["X%s"%(i) for i in range(2, num_features+2)])
    cont_df["X1"] = X1  
        
    ##CATEGORICAL FEATURES ONLY
    cat_features = np.zeros(np.array(cont_features).shape)
    cat_features[np.where(np.array(cont_features)>np.mean(cont_features, axis=0))]=1
    ## Generate X1 as a linear combination of the features plus some Gaussian noise
    full_weights = [[weights[i]]*n_samples for i in range(num_features)]
    cls = intercept + np.sum(np.multiply(full_weights,cat_features), axis=0) + noise
    
    threshold = np.mean(cls)
    X1 = np.zeros(len(cls))
    X1[np.where(cls>threshold)] = 1    
    # Combine into a dataframe
    cat_df = pd.DataFrame(np.transpose(cat_features), columns = ["X%s"%(i) for i in range(2, num_features+2)])
    cat_df["X1"] = X1
        
    ##CONTINUOUS AND CATEGORICAL FEATURES
    cats = num_features//2
    mixed_features = np.vstack((cat_features[:cats],np.array(cont_features)[cats:]))
 
    # Generate X1 as a linear combination of the features plus some Gaussian noise
    full_weights = [[weights[i]]*n_samples for i in range(num_features)]
    cls = intercept + np.sum(np.multiply(full_weights,mixed_features), axis=0) + noise
    
    threshold = np.mean(cls)
    X1 = np.zeros(len(cls))
    X1[np.where(cls>threshold)] = 1    
    
   # Combine into a dataframe
    mix_df = pd.DataFrame(np.transpose(mixed_features), columns = ["X%s"%(i) for i in range(2, num_features+2)])
    mix_df["X1"] = X1
    
    return cont_df, cat_df, mix_df, weights


In [6]:
# Generate a sample dataset
cont_data, cat_data, mixed_data,weights = generate_data(n_samples = 2000, intercept=1, sigma=0.05, num_features = 5)
print(weights)
cont_data.head()

[0.55, 0.503, 0.551, 0.49, 0.459]


Unnamed: 0,X2,X3,X4,X5,X6,X1
0,-1.1249,-1.799708,0.447009,-0.938291,0.019894,0.0
1,2.260155,0.109919,0.554466,1.43346,0.046705,1.0
2,-0.17445,-0.236026,-0.8001,1.466358,1.690151,1.0
3,-0.731989,0.624614,0.339062,0.586095,1.261578,1.0
4,0.864385,-0.134838,-0.338043,-1.102814,1.535411,1.0


In [7]:
dt = True
lr = True
nb = True

random_state = 32

# Continuous Data

In [8]:
X = cont_data[cont_data.columns.drop("X1")]
y = cont_data["X1"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [36]:
#decision tree
if dt:
    space = {"splitter": ["best", "random"],
             "min_samples_split": [random.uniform(0, 1) for i in range (50)],
             "max_features": [random.uniform(0,1) for i in range (50)],
             "criterion":["gini"]}#, "entropy"]}
    estimator = DecisionTreeClassifier(random_state = random_state)

    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_df = cls.best_estimator_
    print(classification_report(y_test, cls.predict(X_test)))
    
    split_nodes = []
    leaf_nodes = []

    depths, split_nodes, leaf_nodes = get_tree_characteristics(cls_df.tree_, X_train.columns, split_nodes = split_nodes, leaf_nodes = leaf_nodes)

    max_depth = cls_df.get_depth()
    splits = len(split_nodes)
    leaves = len(leaf_nodes)

    print("Max depth: %s \t Num splits: %s \t Num leaves: %s" %(max_depth, splits, leaves))

              precision    recall  f1-score   support

         0.0       0.80      0.86      0.83       227
         1.0       0.88      0.82      0.85       273

    accuracy                           0.84       500
   macro avg       0.84      0.84      0.84       500
weighted avg       0.84      0.84      0.84       500

Max depth: 12 	 Num splits: 182 	 Num leaves: 183


In [37]:
if dt:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    dt_global = []
    exp_global = []
    
#     print("Weights", cls_df.feature_importances_)
#     print("Global model correctness:", ss.kendalltau(np.abs(weights), cls_df.feature_importances_.reshape(1, -1), variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_df, instance, "decision_tree", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        
        dt_global.append(tr)
        
        pred = cls_df.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_df.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_df.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))#/np.mean(cdist(permutations, [instance])))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(dt_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(dt_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.6
Global explanation fidelity: 0.7999999999999999
Global data-explanation fidelity: 0.7999999999999999
---------------------------------------------------------------
Local model correctness: 0.42004572636427373
Local explanation fidelity: 0.13259591739845158
Local data-explanation fidelity: 0.3156886217434191


In [38]:
if dt:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    dt_global = []
    exp_global = []
    
#     print("Weights", cls_df.feature_importances_)
#     print("Global model correctness:", ss.kendalltau(np.abs(weights), cls_df.feature_importances_.reshape(1, -1), variant="b").statistic)

    for instance in tqdm_notebook(X_test.values):
        tr = get_true_rankings(cls_df, instance, "decision_tree", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        
        dt_global.append(tr)
        
        pred = cls_df.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_df.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_df.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(np.mean([(p1_list[i] - p2_list[i])/cdist(instance, permutations[i]) for i in range(len(p1_list))]))
            #change.append(mean_absolute_percentage_error(p1_list, p2_list)/np.mean([MAPE(instance, permutation) for permutation in permutations]))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(dt_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(dt_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

  0%|          | 0/500 [00:00<?, ?it/s]

ValueError: XA must be a 2-dimensional array.

In [12]:
np.mean([(p1_list[i] - p2_list[i])/mean_absolute_percentage_error(instance, permutations[i]) for i in range(len(p1_list))])

0.0

In [13]:
[(p1_list[i] - p2_list[i]) for i in range(len(p1_list))]

[array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array

In [14]:
instance[4], permutations[:, 4]

(-0.32915568000061424,
 array([ 5.09412461e-01,  1.07207775e+00, -1.75187029e+00, -5.93779275e-01,
         1.41957807e+00, -1.66455264e-01,  1.61131101e+00, -7.54839171e-01,
        -1.17296891e+00, -2.41727574e+00,  5.78933764e-02, -1.29342911e+00,
         2.07322239e-02, -1.03532264e+00, -1.66770510e-02,  6.63002323e-01,
         1.10423725e+00, -9.54573230e-01, -3.82129289e-01, -8.88498272e-01,
        -1.45760586e+00,  6.16822694e-02, -5.09863546e-01, -1.22949382e+00,
        -2.30921735e-01,  1.25999955e+00,  8.21085704e-01, -5.19448052e-01,
         2.58797336e-01, -4.02520099e-01,  4.97040921e-01, -8.36120164e-02,
        -8.20727015e-01, -1.88295262e-01, -1.03109742e+00,  3.36586858e-01,
         2.12697708e+00,  5.07240911e-01,  5.21627624e-01,  2.82504430e-02,
         1.18751227e+00, -1.45779521e+00,  6.06153724e-02, -1.86427693e+00,
         9.12249770e-02,  3.53900284e-01,  4.77226348e-01, -1.11266236e+00,
         4.57485582e-01, -7.74546916e-01,  1.09605141e-01, -1.369

In [15]:
#logit
if lr:
    space = {"fit_intercept": [True, False],
             "penalty": ['l2', 'elasticnet', 'none'],
             "max_iter": [random.uniform(5,200) for i in range (20)],
             "tol": np.logspace(-4, 4, 50)}
    estimator = LogisticRegression(random_state = random_state)
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_lr = cls.best_estimator_
    print(classification_report(y_test, cls_lr.predict(X_test)))
    print("Model weights:", cls_lr.coef_[0])

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       265
         1.0       0.99      0.99      0.99       235

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500

Model weights: [4.66025499 4.36957995 4.85593973 4.15542653 3.89963206]


In [16]:
if lr:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    exp_global = []

    
    #print("Weights", cls_lr.coef_[0])
    #print("Global model correctness:", ss.kendalltau(weights, cls_lr.coef_[0], variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_lr, instance, "logit", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(weights, tr, variant="b").statistic)
        
        pred = cls_lr.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_lr.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_lr.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
        
        exp_global.append(change)
        mx_corr.append(ss.kendalltau(np.abs(tr), change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.abs(cls_lr.coef_[0]), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.abs(cls_lr.coef_[0]), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.mean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.9999999999999999
Global explanation fidelity: 0.9999999999999999
Global data-explanation fidelity: 0.9999999999999999
---------------------------------------------------------------
Local model correctness: 0.9999999999999998
Local explanation fidelity: 0.15
Local data-explanation fidelity: 0.3670666666666666


In [17]:
if nb:
    space = {'var_smoothing': np.logspace(0, -9, 100)}
    estimator = GaussianNB()
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_nb = cls.best_estimator_
    print(classification_report(y_test, cls_nb.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97       265
         1.0       0.95      1.00      0.97       235

    accuracy                           0.97       500
   macro avg       0.97      0.98      0.97       500
weighted avg       0.97      0.97      0.97       500



In [18]:
if nb:
#     nb_global = [ss.kstest(np.random.normal(cls_nb.theta_[0][ind], np.sqrt(cls_nb.var_)[0][ind],1000), 
#           np.random.normal(cls_nb.theta_[1][ind], np.sqrt(cls_nb.var_)[1][ind],1000)).statistic for ind in range(len(weights))]

    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    nb_global = []
    exp_global = []
    
    #print("Weights", nb_global)
    #print("Global model correctness:", ss.kendalltau(np.abs(weights), nb_global, variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_nb, instance, "nb", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        nb_global.append(tr)
        
        pred = cls_nb.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_nb.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_nb.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)
        
    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(nb_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(nb_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.mean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.7999999999999999
Global explanation fidelity: 0.9999999999999999
Global data-explanation fidelity: 0.7999999999999999
---------------------------------------------------------------
Local model correctness: 0.08519999999999998
Local explanation fidelity: 0.6876
Local data-explanation fidelity: 0.4145333333333333


In [19]:
np.sum(exp_global, axis=0)

array([ 94.29819103,  86.56189099, 111.20045041,  90.18544013,
        83.49338862])

In [20]:
weights

[0.55, 0.503, 0.551, 0.49, 0.459]

# Categorical Data

In [21]:
X = cat_data[cat_data.columns.drop("X1")]
y = cat_data["X1"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
#decision tree
if dt:
    space = {"splitter": ["best", "random"],
             "min_samples_split": [random.uniform(0, 1) for i in range (50)],
             "max_features": [random.uniform(0,1) for i in range (50)],
             "criterion":["gini"]}#, "entropy"]}
    estimator = DecisionTreeClassifier(random_state = random_state)

    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_df = cls.best_estimator_
    print(classification_report(y_test, cls.predict(X_test)))
    
    split_nodes = []
    leaf_nodes = []

    depths, split_nodes, leaf_nodes = get_tree_characteristics(cls_df.tree_, X_train.columns, split_nodes = split_nodes, leaf_nodes = leaf_nodes)

    max_depth = cls_df.get_depth()
    splits = len(split_nodes)
    leaves = len(leaf_nodes)

    print("Max depth: %s \t Num splits: %s \t Num leaves: %s" %(max_depth, splits, leaves))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       266
         1.0       1.00      1.00      1.00       234

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

Max depth: 5 	 Num splits: 19 	 Num leaves: 20


In [23]:
if dt:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    dt_global = []
    exp_global = []
    
#     print("Weights", cls_df.feature_importances_)
#     print("Global model correctness:", ss.kendalltau(np.abs(weights), cPerfls_df.feature_importances_.reshape(1, -1), variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_df, instance, "decision_tree", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        
        dt_global.append(tr)
        
        pred = cls_df.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_df.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_df.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(dt_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(dt_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.0
Global explanation fidelity: -0.19999999999999998
Global data-explanation fidelity: 0.0
---------------------------------------------------------------
Local model correctness: -0.06888156617270719
Local explanation fidelity: -0.003911099180066295
Local data-explanation fidelity: 0.2358947510377595


In [24]:
#logit
if lr:
    space = {"fit_intercept": [True, False],
             "penalty": ['l2', 'elasticnet', 'none'],
             "max_iter": [random.uniform(5,200) for i in range (20)],
             "tol": np.logspace(-4, 4, 50)}
    estimator = LogisticRegression(random_state = random_state)
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_lr = cls.best_estimator_
    print(classification_report(y_test, cls_lr.predict(X_test)))
    print("Model weights:", cls_lr.coef_[0])

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       266
         1.0       1.00      1.00      1.00       234

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

Model weights: [6.05633197 6.21009034 6.06533825 6.10242977 6.0974812 ]


In [25]:
if lr:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    exp_global = []

    
    #print("Weights", cls_lr.coef_[0])
    #print("Global model correctness:", ss.kendalltau(weights, cls_lr.coef_[0], variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_lr, instance, "logit", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(weights, tr, variant="b").statistic)
        
        pred = cls_lr.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_lr.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_lr.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
        
        exp_global.append(change)
        mx_corr.append(ss.kendalltau(np.abs(tr), change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.abs(cls_lr.coef_[0]), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.abs(cls_lr.coef_[0]), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: -0.19999999999999998
Global explanation fidelity: 0.0
Global data-explanation fidelity: 0.0
---------------------------------------------------------------
Local model correctness: -0.2
Local explanation fidelity: 0.055999999999999994
Local data-explanation fidelity: 0.3118666666666667


In [26]:
if nb:
    space = {'var_smoothing': np.logspace(0, -9, 100)}
    estimator = GaussianNB()
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_nb = cls.best_estimator_
    print(classification_report(y_test, cls_nb.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       266
         1.0       1.00      1.00      1.00       234

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



In [27]:
if nb:
#     nb_global = [ss.kstest(np.random.normal(cls_nb.theta_[0][ind], np.sqrt(cls_nb.var_)[0][ind],1000), 
#           np.random.normal(cls_nb.theta_[1][ind], np.sqrt(cls_nb.var_)[1][ind],1000)).statistic for ind in range(len(weights))]

    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    nb_global = []
    exp_global = []
    
    #print("Weights", nb_global)
    #print("Global model correctness:", ss.kendalltau(np.abs(weights), nb_global, variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_nb, instance, "nb", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        nb_global.append(tr)
        
        pred = cls_nb.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_nb.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_nb.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)
        
    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(nb_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(nb_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: -0.39999999999999997
Global explanation fidelity: 0.9999999999999999
Global data-explanation fidelity: -0.39999999999999997
---------------------------------------------------------------
Local model correctness: -0.4612
Local explanation fidelity: 0.808
Local data-explanation fidelity: 0.05234999999999997


# Mixed Data

In [28]:
X = cont_data[cont_data.columns.drop("X1")]
y = cont_data["X1"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [29]:
#decision tree
if dt:
    space = {"splitter": ["best", "random"],
             "min_samples_split": [random.uniform(0, 1) for i in range (50)],
             "max_features": [random.uniform(0,1) for i in range (50)],
             "criterion":["gini"]}#, "entropy"]}
    estimator = DecisionTreeClassifier(random_state = random_state)

    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_df = cls.best_estimator_
    print(classification_report(y_test, cls.predict(X_test)))
    
    split_nodes = []
    leaf_nodes = []

    depths, split_nodes, leaf_nodes = get_tree_characteristics(cls_df.tree_, X_train.columns, split_nodes = split_nodes, leaf_nodes = leaf_nodes)

    max_depth = cls_df.get_depth()
    splits = len(split_nodes)
    leaves = len(leaf_nodes)

    print("Max depth: %s \t Num splits: %s \t Num leaves: %s" %(max_depth, splits, leaves))

              precision    recall  f1-score   support

         0.0       0.77      0.84      0.80       227
         1.0       0.86      0.79      0.82       273

    accuracy                           0.81       500
   macro avg       0.81      0.82      0.81       500
weighted avg       0.82      0.81      0.81       500

Max depth: 12 	 Num splits: 126 	 Num leaves: 127


In [30]:
if dt:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    dt_global = []
    exp_global = []
    
#     print("Weights", cls_df.feature_importances_)
#     print("Global model correctness:", ss.kendalltau(np.abs(weights), cls_df.feature_importances_.reshape(1, -1), variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_df, instance, "decision_tree", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        
        dt_global.append(tr)
        
        pred = cls_df.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_df.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_df.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(dt_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(dt_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.9999999999999999
Global explanation fidelity: 0.7999999999999999
Global data-explanation fidelity: 0.7999999999999999
---------------------------------------------------------------
Local model correctness: 0.5696294231491453
Local explanation fidelity: 0.13718746148496735
Local data-explanation fidelity: 0.3490408861908746


In [31]:
#logit
if lr:
    space = {"fit_intercept": [True, False],
             "penalty": ['l2', 'elasticnet', 'none'],
             "max_iter": [random.uniform(5,200) for i in range (20)],
             "tol": np.logspace(-4, 4, 50)}
    estimator = LogisticRegression(random_state = random_state)
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_lr = cls.best_estimator_
    print(classification_report(y_test, cls_lr.predict(X_test)))
    print("Model weights:", cls_lr.coef_[0])

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       227
         1.0       1.00      0.99      0.99       273

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500

Model weights: [4.76514275 4.54992483 4.89709404 4.14464018 3.95259274]


In [32]:
if lr:
    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    exp_global = []

    
    #print("Weights", cls_lr.coef_[0])
    #print("Global model correctness:", ss.kendalltau(weights, cls_lr.coef_[0], variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_lr, instance, "logit", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(weights, tr, variant="b").statistic)
        
        pred = cls_lr.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_lr.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_lr.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
        
        exp_global.append(change)
        mx_corr.append(ss.kendalltau(np.abs(tr), change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))

    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.abs(cls_lr.coef_[0]), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.abs(cls_lr.coef_[0]), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.9999999999999999
Global explanation fidelity: 0.9999999999999999
Global data-explanation fidelity: 0.9999999999999999
---------------------------------------------------------------
Local model correctness: 0.9999999999999998
Local explanation fidelity: 0.1184
Local data-explanation fidelity: 0.3568666666666666


In [33]:
if nb:
    space = {'var_smoothing': np.logspace(0, -9, 100)}
    estimator = GaussianNB()
    cls = GridSearchCV(estimator, space, verbose = 0)
    cls.fit(X_train, y_train)

    cls_nb = cls.best_estimator_
    print(classification_report(y_test, cls_nb.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97       227
         1.0       1.00      0.95      0.97       273

    accuracy                           0.97       500
   macro avg       0.97      0.97      0.97       500
weighted avg       0.97      0.97      0.97       500



In [34]:
if nb:
#     nb_global = [ss.kstest(np.random.normal(cls_nb.theta_[0][ind], np.sqrt(cls_nb.var_)[0][ind],1000), 
#           np.random.normal(cls_nb.theta_[1][ind], np.sqrt(cls_nb.var_)[1][ind],1000)).statistic for ind in range(len(weights))]

    dm_corr = []
    mx_corr = []
    dx_corr = []
    
    nb_global = []
    exp_global = []
    
    #print("Weights", nb_global)
    #print("Global model correctness:", ss.kendalltau(np.abs(weights), nb_global, variant="b").statistic)

    for instance in X_test.values:
        tr = get_true_rankings(cls_nb, instance, "nb", X_train, X_train.columns)
        dm_corr.append(ss.kendalltau(np.abs(weights), tr, variant="b").statistic)
        nb_global.append(tr)
        
        pred = cls_nb.predict(instance.reshape(1, -1)).astype(int)
        p1_list = [cls_nb.predict_proba(instance.reshape(1, -1)).reshape(2)[pred]]*1000
        change = []
        for i in range(len(instance)):
            permutations = permute_instance(instance, [i], 1000, [X_train.iloc[:, i].min()], [X_train.iloc[:, i].max()],
                                           [X_train.iloc[:, i].mean()], [X_train.iloc[:,i].unique()], mode="permutation")
            p2_list = cls_nb.predict_proba(permutations).transpose()[pred].reshape(1000)
            change.append(mean_absolute_percentage_error(p1_list, p2_list))
            
        mx_corr.append(ss.kendalltau(tr, change, variant="b").statistic)
        dx_corr.append(ss.kendalltau(np.abs(weights), change, variant="b"))
        exp_global.append(change)
        
    print("Global model correctness:", ss.kendalltau(np.abs(weights), np.nanmean(nb_global, axis=0), variant="b").statistic)
    print("Global explanation fidelity:", ss.kendalltau(np.nanmean(nb_global, axis=0), np.nanmean(exp_global, axis=0), variant="b").statistic)
    print("Global data-explanation fidelity:", ss.kendalltau(np.abs(weights), np.nanmean(exp_global, axis=0), variant="b").statistic)
    
    print("---------------------------------------------------------------")
    
    print("Local model correctness:", np.nanmean(dm_corr))
    print("Local explanation fidelity:", np.nanmean(mx_corr))
    print("Local data-explanation fidelity:", np.nanmean(dx_corr))

Global model correctness: 0.7999999999999999
Global explanation fidelity: 0.9999999999999999
Global data-explanation fidelity: 0.7999999999999999
---------------------------------------------------------------
Local model correctness: 0.12279999999999999
Local explanation fidelity: 0.6915999999999999
Local data-explanation fidelity: 0.4256


In [35]:
dm_corr

[0.0,
 0.19999999999999998,
 0.7999999999999999,
 0.19999999999999998,
 -0.6,
 -0.39999999999999997,
 0.6,
 0.39999999999999997,
 0.39999999999999997,
 0.39999999999999997,
 0.39999999999999997,
 -0.9999999999999999,
 0.6,
 -0.39999999999999997,
 0.39999999999999997,
 0.9999999999999999,
 0.19999999999999998,
 0.19999999999999998,
 0.0,
 0.39999999999999997,
 0.39999999999999997,
 -0.7999999999999999,
 0.0,
 0.19999999999999998,
 0.0,
 0.6,
 0.39999999999999997,
 0.6,
 -0.39999999999999997,
 0.39999999999999997,
 0.39999999999999997,
 -0.19999999999999998,
 -0.19999999999999998,
 -0.19999999999999998,
 0.7999999999999999,
 -0.7999999999999999,
 -0.7999999999999999,
 -0.6,
 -0.19999999999999998,
 -0.19999999999999998,
 0.7999999999999999,
 -0.19999999999999998,
 0.39999999999999997,
 0.39999999999999997,
 -0.19999999999999998,
 0.19999999999999998,
 -0.19999999999999998,
 -0.19999999999999998,
 0.7999999999999999,
 -0.19999999999999998,
 0.19999999999999998,
 0.19999999999999998,
 -0.79