In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random
from skrules import SkopeRules

In [2]:
df_train =  pd.read_csv('./archive/train.csv')
df_test = pd.read_csv('./archive/test.csv')

df_train.income = df_train.income.map({'<=50K':0, '>50K':1})
df_test.income = df_test.income.map({'<=50K':0, '>50K':1})

In [3]:
le = sklearn.preprocessing.LabelEncoder()
for col in df_train.columns:
    if df_train[col].dtype == 'object':
        le.fit(df_train[col])
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])

In [4]:
random_state = 39
exp_iter = 10
random.seed(random_state)

#Get datasets
X_train = df_train.drop('income', axis=1)
y_train = df_train.income
X_test = df_test.drop('income', axis=1)
y_test = df_test.income
test_x = X_test.values
n_classes = len(np.unique(y_train))
feat_list = [each.replace(' ','_') for each in X_train.columns]
X = np.vstack((X_train.values, test_x))

In [5]:
clf = SkopeRules(max_depth_duplication=2,
                    n_estimators=100,
                    precision_min=0.3,
                    recall_min=0.1,
                    feature_names=feat_list)

In [6]:
clf.fit(X_train, y_train)

SkopeRules(feature_names=['age', 'workclass', 'fnlwgt', 'education',
                          'education-num', 'marital-status', 'occupation',
                          'relationship', 'race', 'sex', 'capital-gain',
                          'capital-loss', 'hours-per-week', 'native-country'],
           max_depth_duplication=2, n_estimators=100, precision_min=0.3,
           recall_min=0.1)

In [7]:
top_rules1 = clf.score_top_rules(test_x)
top_rules2 = clf.score_top_rules(test_x)

In [8]:
top_rules1

array([0., 1., 2., ..., 2., 0., 2.])

In [8]:
def calc_identity_rules(top_rules1, top_rules2):
    dis = np.array([np.array_equal(top_rules1[i],top_rules2[i]) for i in range(len(top_rules1))])
    total = dis.shape[0]
    true = np.sum(dis)
    score = (total-true)/total
    return score*100, true, total

def calc_separability_rules(top_rules):
    wrong = 0
    for i in range(top_rules.shape[0]):
        for j in range(top_rules.shape[0]):
            if i == j:
                continue
            eq = np.array_equal(top_rules[i],top_rules[j])
            if eq:
                wrong = wrong + 1
    total = top_rules.shape[0]
    score = 100*abs(wrong)/total**2
    return wrong,total,total**2,score

def exp_enc(exp, feature_num):
    enc_exp = np.zeros((len(exp),feature_num))
    for i in range(len(exp)):
        enc_exp[i][int(exp[i])] = 1
    return enc_exp

def calc_stability_rules(top_rules, labels):
    total = labels.shape[0]
    label_values = np.unique(labels)
    n_clusters = label_values.shape[0]
    init = np.array([[np.average(top_rules[np.where(labels == i)], axis = 0)] for i in label_values]).squeeze()
    ct = sklearn.cluster.KMeans(n_clusters = n_clusters, random_state=1, n_init=10, init = init)
    ct.fit(top_rules)
    error = np.sum(np.abs(labels-ct.labels_))
    if error/total > 0.5:
        error = total-error
    return error, total

In [77]:
# i = calc_identity_rules(top_rules1, top_rules2)
# print(i)

# s = calc_separability_rules(top_rules1)
# print(s)

enc_rules = exp_enc(top_rules1)
sb = calc_stability_rules(enc_rules, y_test)
print(sb)

(3311, 16281)


In [10]:
def normalize_test(X_train, X_test):
    X_test_norm = X_test.copy()
    for i in X_train.columns:
        scaler = sklearn.preprocessing.MinMaxScaler()
        scaler.fit(X_train[i].values.reshape(-1,1))
        X_test_norm[i] = scaler.transform(X_test[i].values.reshape(-1,1))

    return X_test_norm

def calc_similarity(exp, X_test_norm):
    dbscan = sklearn.cluster.DBSCAN(eps=0.5, min_samples=10)
    dbscan.fit(X_test_norm[:400])
    labels = dbscan.labels_
    mean_dist = []
    for i in np.unique(labels):
        mean_dist.append(np.mean(sklearn.metrics.pairwise_distances(exp[np.where(labels == i), :].squeeze(), metric='euclidean')))
    return np.min(mean_dist)

In [11]:
X_test_norm = normalize_test(X_train, X_test)
sim = calc_similarity(enc_rules, X_test_norm[:100])

print(sim)

0.34677520724698857


In [17]:
clf.score_top_rules(X_test[0:1])

array([0.])

In [31]:
clf.rules_without_feature_names_

<bound method SkopeRules._tree_to_rules of SkopeRules(feature_names=['age', 'workclass', 'fnlwgt', 'education',
                          'education-num', 'marital-status', 'occupation',
                          'relationship', 'race', 'sex', 'capital-gain',
                          'capital-loss', 'hours-per-week', 'native-country'],
           max_depth_duplication=2, n_estimators=100, precision_min=0.3,
           recall_min=0.1)>

In [69]:
top_rules1.shape

(16281,)

In [49]:
x = np.zeros(len(clf.feature_names_))
for i in range(len(clf.feature_names_)):
    try:
        if clf.feature_names_[i] in clf.rules_without_feature_names_[int(clf.score_top_rules(X_test[0:1])[0])][0].split():
            x[i] = 1
    ex

In [70]:
def exp_enc(exp):
    enc_exp = np.zeros((len(exp),len(clf.feature_names_)))
    for i in range(len(exp)):
        try:
            for j in range(len(clf.feature_names_)):
                if clf.feature_names_[j] in clf.rules_without_feature_names_[int(exp[i])][0].split():
                    enc_exp[i][j] = 1                    
        except:
            pass
    return enc_exp

In [None]:
def permute(x, x_dash):
    x = x.copy()
    x_dash = x_dash.copy()
    x_rand = np.random.random(x.shape[0])
    x_new = [x[i] if x_rand[i] > 0.5 else x_dash[i] for i in range(len(x))]
    x_dash_new = [x_dash[i] if x_rand[i] > 0.5 else x[i] for i in range(len(x))]
    return x_new, x_dash_new

def calc_trust_score(test_x, exp, m, feat_list):
    total_recalls = []
    for i in range(len(test_x)):
        feat_score = np.zeros((len(feat_list)))
        for _ in range(m):
            x = test_x[i].copy()
            x_dash = test_x[np.random.randint(0,len(test_x))].copy()
            x_perm, x_dash_perm = permute(x, x_dash)
            for j in range(len(feat_list)):
                z = np.concatenate((x_perm[:j+1], x_dash_perm[j+1:]))
                z_dash = np.concatenate((x_dash_perm[:j], x_perm[j:]))
                p_z = model.predict_proba(z.reshape(1,-1))
                p_z_dash = model.predict_proba(z_dash.reshape(1,-1))
                feat_score[j] = feat_score[j] + np.linalg.norm(p_z-p_z_dash)
        feat_score = feat_score/m
        gold_feat_fs = np.argpartition(feat_score, -6)[-6:]
        recall = len(set(exp[i][:6, 0]).intersection(set(gold_feat_fs)))/6
        total_recalls.append(recall)
    return np.mean(total_recalls)