In [12]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import lime
import lime.lime_tabular
import random
from skrules import SkopeRules

In [2]:
# Read the pre-split train/test CSV files.
df_train = pd.read_csv('./archive/train.csv')
df_test = pd.read_csv('./archive/test.csv')

# Recode the income label as 0/1 in both frames.
# Series.map turns any value missing from the mapping into NaN.
for _frame in (df_train, df_test):
    _frame.income = _frame.income.map({'<=50K':0, '>50K':1})

In [3]:
# A single LabelEncoder is reused: for every object-typed training column it is
# refit on the training values, and that same fitted mapping is then applied to
# the matching test column.
le = sklearn.preprocessing.LabelEncoder()
object_columns = [name for name in df_train.columns if df_train[name].dtype == 'object']
for col in object_columns:
    df_train[col] = le.fit(df_train[col]).transform(df_train[col])
    # transform() raises if the test column holds a value not seen during fit.
    df_test[col] = le.transform(df_test[col])

In [4]:
# --- Experiment configuration ---
random_state = 39
exp_iter = 10  # not referenced by any cell visible in this notebook section

# Seed Python's and NumPy's global generators. scikit-learn estimators and LIME
# draw from NumPy's global RNG when they are not given their own random_state,
# so seeding only `random` leaves the results irreproducible.
random.seed(random_state)
np.random.seed(random_state)

# --- Datasets ---
# Features are every column except the 0/1 `income` label.
X_train = df_train.drop('income', axis=1)
y_train = df_train.income
X_test = df_test.drop('income', axis=1)
y_test = df_test.income
test_x = X_test.values
n_classes = len(np.unique(y_train))
# Feature names with spaces replaced by underscores.
feat_list = [each.replace(' ','_') for each in X_train.columns]
# Train rows stacked on top of test rows.
X = np.vstack((X_train.values, test_x))

# LIME

In [5]:
# Class names listed in label order: position 0 -> '<=50K', position 1 -> '>50K'.
class_names = ['<=50K', '>50K']

# random_state makes the perturbation sampling reproducible across kernel
# restarts. Successive explain_instance calls still draw different samples,
# so comparing two runs remains meaningful.
# NOTE(review): no categorical_features are passed, so the label-encoded
# categorical columns are presumably treated as continuous; confirm this is intended.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=feat_list,
    class_names=class_names,
    discretize_continuous=True,
    random_state=random_state,
)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Black-box model that LIME explains. The notebook-wide seed is passed so the
# fitted forest, and every metric derived from it, is the same on each run.
model = RandomForestClassifier(n_estimators=100, random_state=random_state)
model.fit(X_train, y_train)

In [7]:
def calc_identity(exp1, exp2):
    """Compare two explanation runs position by position.

    Position ``k`` of ``exp1`` is checked against position ``k`` of ``exp2``
    with ``np.array_equal``; ``len(exp1)`` positions are visited.

    Returns
    -------
    tuple
        ``(pct_different, n_identical, n_compared)`` where ``pct_different``
        is the fraction of positions that differ, multiplied by 100.
    """
    verdicts = []
    for pos in range(len(exp1)):
        verdicts.append(np.array_equal(exp1[pos], exp2[pos]))
    verdicts = np.array(verdicts)

    n_compared = verdicts.shape[0]
    n_identical = np.sum(verdicts)
    differing_fraction = (n_compared - n_identical) / n_compared
    return differing_fraction * 100, n_identical, n_compared

def calc_separability(exp):
    """Count ordered pairs of distinct rows of ``exp`` that are identical.

    Every pair ``(a, b)`` with ``a != b`` is compared with ``np.array_equal``,
    so a pair of equal rows is counted twice (once in each order).

    Returns
    -------
    tuple
        ``(n_equal_pairs, n_rows, n_rows ** 2, pct)`` where ``pct`` is
        ``100 * n_equal_pairs / n_rows ** 2``.
    """
    n_rows = exp.shape[0]
    equal_pairs = sum(
        1
        for a in range(n_rows)
        for b in range(n_rows)
        if a != b and np.array_equal(exp[a], exp[b])
    )
    n_pairs = n_rows ** 2
    pct = 100 * abs(equal_pairs) / n_pairs
    return equal_pairs, n_rows, n_pairs, pct

def calc_stability(exp, labels):
    """Check how well label-seeded k-means clusters of explanations match the labels.

    One cluster per distinct label is seeded at the mean explanation of the
    instances carrying that label. After fitting KMeans, each instance's
    cluster is translated back to the label that seeded it and compared with
    the instance's own label.

    Parameters
    ----------
    exp : array-like, 2-D
        One row of explanation values per instance.
    labels : array-like, 1-D
        Label of each instance, aligned with the rows of ``exp`` (a NumPy
        array or a pandas Series both work).

    Returns
    -------
    tuple
        ``(error, total)``: the number of instances whose cluster disagrees
        with their label, and the number of instances. When more than half
        disagree the count is folded to ``total - error``.
    """
    exp = np.asarray(exp)
    label_arr = np.asarray(labels)
    total = label_arr.shape[0]
    label_values = np.unique(label_arr)
    n_clusters = label_values.shape[0]

    # Row k is the centroid of the instances labelled label_values[k]. Building
    # the (n_clusters, n_features) array directly keeps both axes even when
    # there is a single cluster or a single feature.
    init = np.array([exp[label_arr == value].mean(axis=0) for value in label_values])

    # With explicit initial centres scikit-learn performs one initialisation
    # regardless of n_init, so ask for exactly one.
    ct = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=1, n_init=1, init=init)
    ct.fit(exp)

    # Cluster k was seeded from label_values[k]; map cluster ids back to label
    # values so labels need not be exactly 0..k-1, then count mismatches.
    predicted = label_values[ct.labels_]
    error = np.sum(predicted != label_arr)

    # Fold an error above 50% (clusters swapped relative to the labels).
    # NOTE(review): this folding is only meaningful for two classes.
    if error / total > 0.5:
        error = total - error
    return error, total

In [8]:
def exp_fn(i):
    """Return the LIME explanation of row ``i`` of the module-level ``test_x``."""
    return lime_explainer.explain_instance(test_x[i], model.predict_proba)


def exp_fn_blk(xtest, exp_fn):
    """Collect explanations for positions ``0 .. len(xtest) - 1``.

    Parameters
    ----------
    xtest : sequence
        Only its length is used: ``exp_fn`` receives the position, not the row.
        NOTE(review): with the module-level ``exp_fn`` this means the rows
        explained always come from ``test_x``; results match ``xtest`` only
        when ``xtest`` is a prefix of ``test_x``.
    exp_fn : callable
        Called as ``exp_fn(position)``; must return an object exposing
        ``as_map()`` and ``available_labels()``.

    Returns
    -------
    numpy.ndarray
        Array built from, for each position, the ``as_map()`` entry of the
        first label in ``available_labels()``.
    """
    per_instance = []
    for position in range(len(xtest)):
        explanation = exp_fn(position)
        first_label = explanation.available_labels()[0]
        per_instance.append(explanation.as_map()[first_label])
    return np.array(per_instance)


def exp_fn_wrap(x):
    """Run ``exp_fn_blk`` on ``x`` with the module-level ``exp_fn``."""
    return exp_fn_blk(x, exp_fn)

In [9]:
# Explain the first 100 test rows twice; both runs are kept so they can be
# compared with each other afterwards.
first_rows = test_x[:100]
exp1 = exp_fn_blk(first_rows, exp_fn)
exp2 = exp_fn_blk(first_rows, exp_fn)

In [13]:
# Identity: compare the two LIME runs over the same instances.
i = calc_identity(exp1,exp2)
print(i)

# Separability is measured on the explanations, not on the raw inputs: it
# counts pairs of different instances that received an identical explanation.
s = calc_separability(exp1)
print(s)

def enc_exp(exp, feature_num):
    """Spread (feature index, weight) pairs into a dense weight matrix.

    Parameters
    ----------
    exp : numpy.ndarray, 3-D
        ``exp[r, s]`` is a pair whose first entry is a feature index and whose
        second entry is that feature's weight for instance ``r``.
    feature_num : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray
        ``(len(exp), feature_num)`` array, zero wherever an instance has no
        pair for a feature. If an index repeats within an instance, the later
        pair wins.
    """
    n_rows = len(exp)
    dense = np.zeros((n_rows, feature_num))
    for row in range(n_rows):
        for slot in range(len(exp[row])):
            feature = int(exp[row, slot, 0])
            dense[row, feature] = exp[row, slot, 1]
    return dense

# Dense per-feature encoding of the first LIME run, followed by the stability
# check against the labels of the same 100 test rows.
n_features = len(feat_list)
enc1 = enc_exp(exp1, n_features)
sb = calc_stability(enc1, y_test[:100])
print(sb)

(100.0, 0, 100)
(0, 100, 10000, 0.0)
(11, 100)


In [15]:
from feat_atr import FeatureAttribution

In [16]:
# Per-instance scores returned by FeatureAttribution, one entry per explained row.
list_monotonicity = []
list_non_sensitivity = []
list_effective_complexity = []

for i in range(len(test_x[:100])):
    # Pairs stored for instance i by the first LIME run.
    atr = exp1[i]
    # Second entry of every pair, kept in the order in which exp1 stores them.
    # (The comprehension's own `i` is local to it and does not touch the loop index.)
    sorted_atr = [j for i,j in atr]
    # First entry of every pair, in the same order.
    sorted_feat = [i for i,j in atr]
    # One-hot vector of length n_classes with a 1 at the true label of instance i.
    y = np.zeros(n_classes, dtype=int)
    np.put(y, y_test[i], 1)
    # NOTE(review): sorted_atr follows the order of exp1[i], not the column order
    # of test_x[i] -- confirm FeatureAttribution expects that ordering (enc1
    # holds a column-aligned encoding of the same explanations).
    example = FeatureAttribution(model, test_x[i], y, sorted_atr)
    list_monotonicity.append(example.monotonicity())
    list_non_sensitivity.append(example.non_sensitivity())
    # 0.1 is passed as the second argument; presumably a threshold -- verify in feat_atr.
    list_effective_complexity.append(example.effective_complexity(sorted_feat, 0.1))

In [17]:
# Print the mean of each metric, then the median of each, one value per line
# (order: monotonicity, non-sensitivity, effective complexity).
metric_lists = (list_monotonicity, list_non_sensitivity, list_effective_complexity)
for summarize in (np.mean, np.median):
    for values in metric_lists:
        print(summarize(values))

0.002930962721776251
0.0
6.99
-0.03577742131738436
0.0
10.0


# Rule-based explanations (SkopeRules)

In [18]:
# Rule learner used for the rule-based explanations. The notebook-wide seed is
# passed so the extracted rules are the same on every run.
clf = SkopeRules(
    max_depth_duplication=2,
    n_estimators=100,
    precision_min=0.3,
    recall_min=0.1,
    feature_names=feat_list,
    random_state=random_state,
)

In [19]:
# Fit the rule learner on the training features and the 0/1 income labels.
clf.fit(X_train, y_train)

In [20]:
# Score the first 100 test rows twice with the fitted rule learner; both results
# are kept so they can be compared with each other afterwards.
rule_rows = test_x[:100]
top_rules1 = clf.score_top_rules(rule_rows)
top_rules2 = clf.score_top_rules(rule_rows)

In [21]:
def calc_identity_rules(top_rules1, top_rules2):
    """Compare two rule-score runs position by position.

    Entry ``p`` of ``top_rules1`` is checked against entry ``p`` of
    ``top_rules2`` with ``np.array_equal``; ``len(top_rules1)`` positions are
    visited.

    Returns
    -------
    tuple
        ``(pct_different, n_identical, n_compared)`` where ``pct_different``
        is the fraction of positions that differ, multiplied by 100.
    """
    positions = range(len(top_rules1))
    same = np.array([np.array_equal(top_rules1[p], top_rules2[p]) for p in positions])
    n_compared = same.shape[0]
    n_identical = same.sum()
    pct_different = (n_compared - n_identical) / n_compared * 100
    return pct_different, n_identical, n_compared

def calc_separability_rules(top_rules):
    """Count ordered pairs of distinct entries of ``top_rules`` that are identical.

    Every pair ``(a, b)`` with ``a != b`` is compared with ``np.array_equal``,
    so two equal entries contribute two to the count.

    Returns
    -------
    tuple
        ``(n_equal_pairs, n_entries, n_entries ** 2, pct)`` where ``pct`` is
        ``100 * n_equal_pairs / n_entries ** 2``.
    """
    n_entries = top_rules.shape[0]
    clashes = 0
    for a in range(n_entries):
        for b in range(n_entries):
            if a != b and np.array_equal(top_rules[a], top_rules[b]):
                clashes += 1
    n_pairs = n_entries ** 2
    return clashes, n_entries, n_pairs, 100 * abs(clashes) / n_pairs

def exp_enc(exp, feature_num):
    """One-hot encode each entry of ``exp``.

    Parameters
    ----------
    exp : sequence of numbers
        Entry ``r`` is truncated with ``int()`` and used as the column to set
        in row ``r``.
    feature_num : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray
        ``(len(exp), feature_num)`` array of zeros with a single 1 per row.
    """
    n_rows = len(exp)
    one_hot = np.zeros((n_rows, feature_num))
    for row in range(n_rows):
        column = int(exp[row])
        one_hot[row, column] = 1
    return one_hot

def calc_stability_rules(top_rules, labels):
    """Check how well label-seeded k-means clusters of rule encodings match the labels.

    One cluster per distinct label is seeded at the mean encoding of the
    instances carrying that label. After fitting KMeans, each instance's
    cluster is translated back to the label that seeded it and compared with
    the instance's own label.

    Parameters
    ----------
    top_rules : array-like, 2-D
        One encoded row per instance.
    labels : array-like, 1-D
        Label of each instance, aligned with the rows of ``top_rules`` (a
        NumPy array or a pandas Series both work).

    Returns
    -------
    tuple
        ``(error, total)``: the number of instances whose cluster disagrees
        with their label, and the number of instances. When more than half
        disagree the count is folded to ``total - error``.
    """
    top_rules = np.asarray(top_rules)
    label_arr = np.asarray(labels)
    total = label_arr.shape[0]
    label_values = np.unique(label_arr)
    n_clusters = label_values.shape[0]

    # Row k is the centroid of the instances labelled label_values[k]. Building
    # the (n_clusters, n_columns) array directly keeps both axes even when
    # there is a single cluster or a single column.
    init = np.array([top_rules[label_arr == value].mean(axis=0) for value in label_values])

    # With explicit initial centres scikit-learn performs one initialisation
    # regardless of n_init, so ask for exactly one.
    ct = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=1, n_init=1, init=init)
    ct.fit(top_rules)

    # Cluster k was seeded from label_values[k]; map cluster ids back to label
    # values so labels need not be exactly 0..k-1, then count mismatches.
    predicted = label_values[ct.labels_]
    error = np.sum(predicted != label_arr)

    # Fold an error above 50% (clusters swapped relative to the labels).
    # NOTE(review): this folding is only meaningful for two classes.
    if error / total > 0.5:
        error = total - error
    return error, total

In [23]:
# Identity between the two scoring runs, then separability within the first run.
i = calc_identity_rules(top_rules1, top_rules2)
print(i)
s = calc_separability_rules(top_rules1)
print(s)

# One-hot encode the scores; the width is the largest observed score plus one
# so that every score maps to a valid column.
n_score_columns = int(max(top_rules1)) + 1
enc_rules = exp_enc(top_rules1, n_score_columns)

(0.0, 100, 100)
(4806, 100, 10000, 48.06)


In [24]:
# Stability of the one-hot rule encoding against the labels of the same 100 rows.
rule_labels = y_test[:100]
sb = calc_stability_rules(enc_rules, rule_labels)
print(sb)

(36, 100)
