In [1]:
# Used to normalize some features of the adult dataset
# Taken from the orignal anchor implementation

def map_array_values(array, value_map):
    ret = array.copy()
    for src, target in value_map.items():
        ret[ret == src] = target
    return ret

def cap_gains_fn(x):
    x = x.astype(float)
    d = np.digitize(x, [0, np.median(x[x > 0]), float('inf')],
                    right=True).astype('|S128')
    return map_array_values(d, {'0': 'None', '1': 'Low', '2': 'High'})

education_map = {
    '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th':
    'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th':
    'Dropout', 'Preschool': 'Dropout', 'HS-grad': 'High School grad',
    'Some-college': 'High School grad', 'Masters': 'Masters',
    'Prof-school': 'Prof-School', 'Assoc-acdm': 'Associates',
    'Assoc-voc': 'Associates',
}

occupation_map = {
    "Adm-clerical": "Admin", "Armed-Forces": "Military",
    "Craft-repair": "Blue-Collar", "Exec-managerial": "White-Collar",
    "Farming-fishing": "Blue-Collar", "Handlers-cleaners":
    "Blue-Collar", "Machine-op-inspct": "Blue-Collar", "Other-service":
    "Service", "Priv-house-serv": "Service", "Prof-specialty":
    "Professional", "Protective-serv": "Other", "Sales":
    "Sales", "Tech-support": "Other", "Transport-moving":
    "Blue-Collar",
}

country_map = {
    'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China':
    'China', 'Columbia': 'South-America', 'Cuba': 'Other',
    'Dominican-Republic': 'Latin-America', 'Ecuador': 'South-America',
    'El-Salvador': 'South-America', 'England': 'British-Commonwealth',
    'France': 'Euro_1', 'Germany': 'Euro_1', 'Greece': 'Euro_2',
    'Guatemala': 'Latin-America', 'Haiti': 'Latin-America',
    'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',
    'Hong': 'China', 'Hungary': 'Euro_2', 'India':
    'British-Commonwealth', 'Iran': 'Other', 'Ireland':
    'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica':
    'Latin-America', 'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico':
    'Latin-America', 'Nicaragua': 'Latin-America',
    'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru':
    'South-America', 'Philippines': 'SE-Asia', 'Poland': 'Euro_2',
    'Portugal': 'Euro_2', 'Puerto-Rico': 'Latin-America', 'Scotland':
    'British-Commonwealth', 'South': 'Euro_2', 'Taiwan': 'China',
    'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',
    'United-States': 'United-States', 'Vietnam': 'SE-Asia'
}

married_map = {
    'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married', 'Married-spouse-absent':
    'Separated', 'Separated': 'Separated', 'Divorced':
    'Separated', 'Widowed': 'Widowed'
}

label_map = {'<=50K': 'Less than $50,000', '>50K': 'More than $50,000'}

In [2]:
import numpy as np

data = np.genfromtxt('../datasets/adult/adult.data', delimiter=',', dtype='|S128')

all_column_names = [
    "Age", "Workclass", "fnlwgt", "Education",
    "Education-Num", "Marital Status", "Occupation",
    "Relationship", "Race", "Sex", "Capital Gain",
    "Capital Loss", "Hours per week", "Country", 'Income'
]

categorical_columns = [
    1, 3, 5, 6, 7, 8, 9, 10, 11, 13
]

used_columns = [
    0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13
]

# names of the columns after dropping unused columns
column_names_after_dropping = [
    x for i, x in enumerate(all_column_names) if i in used_columns
]

# idx of the categorical_columns after dropping unused columns
categorical_features_after_dropping = [
    used_columns.index(x) for x in categorical_columns
]

In [3]:
import sklearn
import sklearn.ensemble

# transformize some cols

transformations = {
            3: lambda x: map_array_values(x, education_map),
            5: lambda x: map_array_values(x, married_map),
            6: lambda x: map_array_values(x, occupation_map),
            10: cap_gains_fn,
            11: cap_gains_fn,
            13: lambda x: map_array_values(x, country_map),
            14: lambda x: map_array_values(x, label_map),
}

for feature, transformation in transformations.items():
    data[:, feature] = transformation(data[:, feature])

# encode categorical features
for feature in categorical_columns:
    le = sklearn.preprocessing.LabelEncoder()
    data[:, feature] = le.fit_transform(data[:, feature])

# encode label
le = sklearn.preprocessing.LabelEncoder()
data[:, -1] = le.fit_transform(data[:, -1])

# drop unused columns and split into data and labels
X = data[:, used_columns]
y = data[:, -1]


In [4]:
# discretize data

import lime.lime_tabular

X = X.astype(float)
disc = lime.lime_tabular.QuartileDiscretizer(X, categorical_features_after_dropping, column_names_after_dropping)
X = disc.discretize(X)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
import sklearn.ensemble

c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
c.fit(X_train, y_train)
print('Train', sklearn.metrics.accuracy_score(y_train, c.predict(X_train)))
print('Test', sklearn.metrics.accuracy_score(y_test, c.predict(X_test)))

Train 0.9292688517075407
Test 0.8389168062534896


In [7]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Anchor.anchor import Anchor, Tasktype
import time

params = {
    "batch_size": [4, 8, 12, 16, 20, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 1024, 2048, 4092],
    "delta": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    "epsilon": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
}

agg_data = []
for key, values in params.items():
    for value in values:
        explainer = Anchor(Tasktype.TABULAR)
        task_paras = {"dataset": X_train, "column_names": column_names_after_dropping}
        method_paras = {"beam_size": 4, "desired_confidence": 0.9}

        start_time = time.time()

        anchor = explainer.explain_instance(
            input=X_test[1].reshape(1, -1),
            predict_fn=c.predict,
            method="beam",
            epsilon = value if key == "epsilon" else 0.1,
            delta = value if key == "delta" else 0.1,
            batch_size = value if key == "batch_size" else 16,
            task_specific=task_paras,
            method_specific=method_paras,
            num_coverage_samples=100,
        )

        end_time = time.time()

        agg_data.append([
            key, value, 
            anchor.feature_mask, anchor.precision, 
            anchor.n_samples, anchor.positive_samples, 
            anchor.coverage, end_time - start_time
        ])


INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root: Start Sampling
INFO:root: Start Beam Search
INFO:root:

In [None]:
import csv

header = [
    'parameter', 'value', 'feature_mask', 
    'precision', 'n_samples', 'positive_samples', 'coverage', 'search_duration']


with open('hyperparams.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(agg_data)

In [16]:
# Analysis Part
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

df = pd.read_csv("hyperparams.csv")
print(df.head())

fig = make_subplots(
    rows=3, cols=3,
)

df_batch = df[df['parameter']=='batch_size']
df_delta = df[df['parameter']=='delta']
df_eps = df[df['parameter']=='epsilon']

trace_batch_cov = go.Scatter(
    x=df_batch.value,
    y=df_batch.coverage,
    mode='markers',
    name='Coverage x Batch Size'
)
trace_delta_cov = go.Scatter(
    x=df_delta.value,
    y=df_delta.coverage,
    mode='markers',
    name='Coverage x Delta'
)
trace_eps_cov = go.Scatter(
    x=df_eps.value,
    y=df_eps.coverage,
    mode='markers',
    name='Coverage x Epsilon'
)
trace_batch_dur = go.Scatter(
    x=df_batch.value,
    y=df_batch.search_duration,
    mode='markers',
    name='Search Time x Batch Size'
)
trace_delta_dur = go.Scatter(
    x=df_delta.value,
    y=df_delta.search_duration,
    mode='markers',
    name='Search Time x Delta'
)
trace_eps_dur = go.Scatter(
    x=df_eps.value,
    y=df_eps.search_duration,
    mode='markers',
    name='Search Time x Epsilon'
)

trace_batch_nsamples = go.Scatter(
    x=df_batch.value,
    y=df_batch.n_samples,
    mode='markers',
    name='Number of Samples x Batch Size'
)
trace_delta_nsamples = go.Scatter(
    x=df_delta.value,
    y=df_delta.n_samples,
    mode='markers',
    name='Number of Samples x Delta',
)
trace_eps_nsamples = go.Scatter(
    x=df_eps.value,
    y=df_eps.n_samples,
    mode='markers',
    name='Number of Samples x Epsilon'
)


fig.add_trace(trace_batch_cov, row=1, col=1)
fig.add_trace(trace_delta_cov, row=2, col=1)
fig.add_trace(trace_eps_cov, row=3, col=1)
fig.add_trace(trace_batch_dur, row=1, col=2)
fig.add_trace(trace_delta_dur, row=2, col=2)
fig.add_trace(trace_eps_dur, row=3, col=2)
fig.add_trace(trace_batch_nsamples, row=1, col=3)
fig.add_trace(trace_delta_nsamples, row=2, col=3)
fig.add_trace(trace_eps_nsamples, row=3, col=3)



fig.update_layout(
    title_text='Impact of different values for batch_size, delta and epsilon parameters (Adult dataset)',
    height=800
)

fig['layout']['xaxis']['title'] = 'Batch Size'
fig['layout']['xaxis2']['title'] = 'Batch Size'
fig['layout']['xaxis3']['title'] = 'Batch Size'
fig['layout']['xaxis4']['title'] = 'Delta'
fig['layout']['xaxis5']['title'] = 'Delta'
fig['layout']['xaxis6']['title'] = 'Delta'
fig['layout']['xaxis7']['title'] = 'Epsilon'
fig['layout']['xaxis8']['title'] = 'Epsilon'
fig['layout']['xaxis9']['title'] = 'Epsilon'
fig['layout']['yaxis']['title'] = 'Coverage'
fig['layout']['yaxis2']['title'] = 'Search Time (sec)'
fig['layout']['yaxis3']['title'] = 'Number of samples'
fig['layout']['yaxis4']['title'] = 'Coverage'
fig['layout']['yaxis5']['title'] = 'Search Time (sec)'
fig['layout']['yaxis6']['title'] = 'Number of samples'
fig['layout']['yaxis7']['title'] = 'Coverage'
fig['layout']['yaxis8']['title'] = 'Search Time (sec)'
fig['layout']['yaxis9']['title'] = 'Number of samples'

fig.show()

fig = px.scatter_3d(df, x='precision', y='coverage', z='search_duration',
              color='parameter')
fig.show()


    parameter  value feature_mask  precision  n_samples  positive_samples  \
0  batch_size    4.0   [2, 7, 10]   0.911100       2036              1855   
1  batch_size    8.0   [8, 2, 10]   0.913158       2280              2082   
2  batch_size   12.0   [2, 7, 10]   0.904531       2472              2236   
3  batch_size   16.0    [7, 2, 8]   0.919730       1632              1501   
4  batch_size   20.0   [10, 2, 7]   0.909091       1980              1800   

   coverage  search_duration  
0      0.17       244.245227  
1      0.07        92.903987  
2      0.17        60.543440  
3      0.06        48.367267  
4      0.17        38.109655  
