## Prepare the dataset and Machine Learning Model

In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import print_function
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
from competition_methods_explanation.active_methods_bottom_up.anchor import utils
from competition_methods_explanation.active_methods_bottom_up.anchor import anchor_tabular

import pandas as pd

df = pd.read_csv("datasets/adult/adult.csv")
# handling space in the column name
df = df.rename(columns=lambda x: x.strip())
df.head()
target_class = "class"

from competition_methods_explanation.active_methods_bottom_up.utils import load_dataset_dataframe, encoder_from_dataset

adult_dataset = load_dataset_dataframe(df, target_class="class",feature_to_use=None, categorical_features=None, balance=False,discretize=True)
new_encoder = encoder_from_dataset(adult_dataset)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
print(adult_dataset.train[0])


[ 2.  6.  2.  1.  0.  2. 14.  0.  4.  1.  0.  0.  0. 39.]


## prepare the blackbox

In [3]:
c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
c.fit(new_encoder.transform(adult_dataset.train), adult_dataset.labels_train)

predict_fn = lambda x: c.predict(new_encoder.transform(x))
print('Train', sklearn.metrics.accuracy_score(adult_dataset.labels_train, predict_fn(adult_dataset.train)))
print('Test', sklearn.metrics.accuracy_score(adult_dataset.labels_test, predict_fn(adult_dataset.test)))

Train 0.9469824938574939
Test 0.8345102855388394


## local explanation (only one instance)

In [4]:
%load_ext autoreload
%autoreload 2
idx = 0
random_seed = 42

from competition_methods_explanation.active_methods_bottom_up.sp_anchor import explain_one_instance_anchor

result = explain_one_instance_anchor(idx, adult_dataset, new_encoder, c.predict, random_seed=42)
print(result)
print('Precision: %.2f' % result.precision())
print('Coverage: %.2f' % result.coverage())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
education-num <= 9.00 AND education =  11th AND capital-gain <= 0.00
Precision: 0.98
Coverage: 0.04


In [5]:
string_exp= str(result)
print(string_exp)
conditions = [t.split(' ') for t in string_exp.split(' AND ')]
print(conditions)

education-num <= 9.00 AND education =  11th AND capital-gain <= 0.00
[['education-num', '<=', '9.00'], ['education', '=', '', '11th'], ['capital-gain', '<=', '0.00']]


## Submodular-Pick-Anchor Sanity Check

In [6]:
from competition_methods_explanation.active_methods_bottom_up.sp_anchor import submodular_pick_anchor

# run this cell first to compute all the anchors for every instance in the train set

# submodular_pick_anchor(adult_dataset, new_encoder, c.predict, precompute_explanation=False,file_name = "adult_anchor.pickle",random_seed=42)

In [8]:
# run this cell, with only `precompute_explanation` set to True
# Use the precomputed anchor for each explanation to do the submodular picking for the largest coverage

# sp_anchor_explanations = submodular_pick_anchor(adult_dataset, new_encoder, c.predict, precompute_explanation=True,file_name = "adult_anchor.pickle",random_seed=42)
sp_anchor_explanations = submodular_pick_anchor(adult_dataset, new_encoder, c.predict, precompute_explanation=False,random_seed=42)

for e in sp_anchor_explanations:
    print(e.names())


  4%|▍         | 15/400 [01:25<26:39,  4.15s/it]

KeyboardInterrupt: 

note that, without pre-computing a local explanation, this will take a long time to compute local explanation. In experiments we will set `precompute_explanation` to true.