In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train =  pd.read_csv('./archive/train.csv')
df_test = pd.read_csv('./archive/test.csv')

In [3]:
df_train.income = df_train.income.map({'<=50K':0, '>50K':1})
df_test.income = df_test.income.map({'<=50K':0, '>50K':1})

In [4]:
import sklearn.preprocessing as preprocessing
df_train.drop(['fnlwgt', 'education', 'native-country'], axis=1, inplace=True)
df_test.drop(['fnlwgt', 'education', 'native-country'], axis=1, inplace=True)

In [5]:
df_train['capital_gain_loss'] = df_train['capital-gain'] - df_train['capital-loss']
df_test['capital_gain_loss'] = df_test['capital-gain'] - df_test['capital-loss']

In [6]:
df_train.drop(['capital-gain', 'capital-loss'], axis=1, inplace=True)
df_test.drop(['capital-gain', 'capital-loss'], axis=1, inplace=True)

In [7]:
df_train.drop(['workclass', 'marital-status'], axis=1, inplace=True)
df_test.drop(['workclass', 'marital-status'], axis=1, inplace=True)

In [8]:
label_encoder = preprocessing.LabelEncoder()
df_train['occupation'] = label_encoder.fit_transform(df_train['occupation'])

In [9]:
df_test['occupation'] = label_encoder.fit_transform(df_test['occupation'])

In [10]:
cat_col = ['relationship', 'race', 'sex']
train = pd.get_dummies(df_train, columns=cat_col, drop_first=True)

In [11]:
test = pd.get_dummies(df_test, columns=cat_col, drop_first=True)

In [12]:
train.income = train.income.astype('int')
test.income = test.income.astype('int')

In [13]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train = train.drop(['income'], axis=1).to_numpy()
y_train = train['income'].to_numpy()
X_test = test.drop(['income'], axis=1).to_numpy()
y_test = test['income'].to_numpy()

clf = DecisionTreeClassifier(max_leaf_nodes=40, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8570726613844358


In [14]:
import lime
from lime import lime_tabular

In [15]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=train.drop(['income'], axis=1).columns.to_list(), class_names=[0,1])

In [16]:
feature_names=train.drop(['income'], axis=1).columns.to_list()

In [17]:
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)]
while len(stack) > 0:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth
    is_split_node = children_left[node_id] != children_right[node_id]
    if is_split_node:
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True
    
node_indicator = clf.decision_path(X_train)
leaf_id = clf.apply(X_train)

true_features = []

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]
for i in node_index:
    if not is_leaves[i]:
        true_features.append(feature_names[feature[i]])

exp_fn = lambda i: explainer.explain_instance(X_train[i, :], clf.predict_proba)
inst_expl = exp_fn(sample_id)
explanation_features = inst_expl.as_list(label=inst_expl.available_labels()[0])
expl_feat2 = inst_expl.as_map()

In [18]:
expl_feat2 = list(expl_feat2.values())[0]

In [19]:
expl_feat2_l = [feature_names[i[0]] for i in expl_feat2]

In [20]:
expl_feat2_l, explanation_features

(['capital_gain_loss',
  'education-num',
  'relationship_Own-child',
  'relationship_Not-in-family',
  'relationship_Unmarried',
  'relationship_Other-relative',
  'age',
  'hours-per-week',
  'relationship_Wife',
  'race_Asian-Pac-Islander'],
 [('capital_gain_loss > 0.00', 0.70899522525278),
  ('education-num > 12.00', 0.2014496135439733),
  ('relationship_Own-child <= 0.00', 0.09969081829760053),
  ('0.00 < relationship_Not-in-family <= 1.00', -0.09874729695227612),
  ('relationship_Unmarried <= 0.00', 0.08525945630719363),
  ('relationship_Other-relative <= 0.00', 0.05247914293676825),
  ('37.00 < age <= 48.00', 0.04781548394176994),
  ('hours-per-week <= 40.00', -0.03514981869715259),
  ('relationship_Wife <= 0.00', -0.032593510369492026),
  ('race_Asian-Pac-Islander <= 0.00', -0.0074040482250413275)])

In [21]:
true_features, explanation_features, max(node_depth)

(['capital_gain_loss',
  'education-num',
  'relationship_Not-in-family',
  'age',
  'education-num',
  'hours-per-week'],
 [('capital_gain_loss > 0.00', 0.70899522525278),
  ('education-num > 12.00', 0.2014496135439733),
  ('relationship_Own-child <= 0.00', 0.09969081829760053),
  ('0.00 < relationship_Not-in-family <= 1.00', -0.09874729695227612),
  ('relationship_Unmarried <= 0.00', 0.08525945630719363),
  ('relationship_Other-relative <= 0.00', 0.05247914293676825),
  ('37.00 < age <= 48.00', 0.04781548394176994),
  ('hours-per-week <= 40.00', -0.03514981869715259),
  ('relationship_Wife <= 0.00', -0.032593510369492026),
  ('race_Asian-Pac-Islander <= 0.00', -0.0074040482250413275)],
 14)

In [22]:
#Visualize
from sklearn import tree
import graphviz

dot_data = tree.export_graphviz(clf, out_file='x.gv',
                                feature_names=train.drop(['income'], axis=1).columns.to_list(),
                                class_names=['less 50K', 'great 50K'],
                                filled=True, rounded=True,
                                special_characters=True)

In [23]:
X = X_test
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)]
while len(stack) > 0:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth
    is_split_node = children_left[node_id] != children_right[node_id]
    if is_split_node:
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True
    
node_indicator = clf.decision_path(X)
leaf_id = clf.apply(X)
max_depth = max(node_depth)

true_features = []

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]
for i in node_index:
    if not is_leaves[i]:
        true_features.append(feature[i])

exp_fn = lambda i: explainer.explain_instance(X[i, :], clf.predict_proba)
inst_expl = exp_fn(sample_id)
# explanation_features = inst_expl.as_list(label=inst_expl.available_labels()[0])
explanation_features = inst_expl.as_map()[inst_expl.available_labels()[0]]
explanation_features = sorted(explanation_features, key=lambda x: x[1], reverse=True)
if len(explanation_features) > max_depth:
    recall_explanation_features = [x[0] for x in explanation_features[:max_depth]]
elif len(explanation_features) > (2*max_depth)/3:
    recall_explanation_features = [x[0] for x in explanation_features[:(2*max_depth)//3]]
else:
    recall_explanation_features = [x[0] for x in explanation_features]
explanation_features_weights = np.array([x[1] for x in explanation_features])
top_quartile = np.percentile(explanation_features_weights, 75)
precision_explanation_features = [x[0] for x in explanation_features if x[1] >= top_quartile]

true_features_set = set(true_features)
precision_explanation_features_set = set(precision_explanation_features)
recall_explanation_features_set = set(recall_explanation_features)

recall = len(true_features_set.intersection(recall_explanation_features_set)) / len(true_features_set)
precision = len(true_features_set.intersection(precision_explanation_features_set)) / len(precision_explanation_features_set)

In [24]:
true_features_set, recall_explanation_features, precision_explanation_features, recall, precision, feature

({0, 1, 4},
 [5, 8, 6, 10, 9, 3, 7, 0, 1],
 [5, 8, 6],
 0.6666666666666666,
 0.0,
 array([ 4,  1,  4,  0,  5,  7,  0,  8, -2,  0,  5,  4, -2,  8,  4,  1, -2,
        -2,  7,  4, -2,  4,  3, -2,  6,  3, -2,  9,  1, -2, -2, 14,  4,  6,
         4, -2,  1, -2,  4,  4, -2, -2,  0,  3, -2,  0, -2, -2,  2, -2, -2,
        -2, -2, -2,  1,  4,  9, -2, -2, -2, -2, -2, -2,  2, -2, 14, -2, -2,
        -2, -2, -2, -2,  2, -2, -2, -2, -2, -2, -2], dtype=int64))

In [25]:
import fidelity
fid = fidelity.Fidelity(clf, explainer, X_test, feature_names)

In [26]:
fid.phase_one()

([0.6666666666666666, 0.875, 0.6, 1.0, 0.6666666666666666],
 [0.0, 1.0, 0.0, 0.3333333333333333, 0.0])

In [27]:
fid.phase_two_util(0)

(0.3333333333333333, 10)