In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random
from skrules import SkopeRules

In [2]:
# Read the pre-split train/test CSVs from the local archive directory.
df_train = pd.read_csv('./archive/train.csv')
df_test = pd.read_csv('./archive/test.csv')

# Binarise the target column in both frames: '<=50K' -> 0, '>50K' -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
for _frame in (df_train, df_test):
    _frame['income'] = _frame['income'].map({'<=50K':0, '>50K':1})
del _frame

In [3]:
# Integer-encode every object-typed column. A single encoder object is refit on
# the training values of each column and then applied to the test column, so
# both frames share the same category -> integer mapping per column.
# NOTE(review): transform() on the test column presumably raises if it holds a
# category that never occurs in the training column -- confirm for this data.
le = sklearn.preprocessing.LabelEncoder()
for col in df_train.columns:
    if df_train[col].dtype != 'object':
        continue  # non-object columns are left as they are
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

In [4]:
# Experiment constants. Only Python's built-in `random` module is seeded here.
random_state = 39
exp_iter = 10
random.seed(random_state)

# Split each frame into its feature columns and the `income` target.
X_train, y_train = df_train.drop(columns='income'), df_train['income']
X_test, y_test = df_test.drop(columns='income'), df_test['income']

# Plain ndarray view of the test features.
test_x = X_test.values

# Number of distinct target values seen in training.
n_classes = np.unique(y_train).shape[0]

# Feature names with spaces replaced by underscores.
feat_list = [name.replace(' ','_') for name in X_train.columns]

# Train rows followed by test rows, stacked into one matrix.
X = np.vstack([X_train.values, test_x])

In [5]:
# Construct the (not yet fitted) SkopeRules model, passing the
# underscore-normalised column names from `feat_list` as feature names.
# NOTE(review): no random_state is passed, so the fitted rules may differ
# between runs -- consider forwarding `random_state` if reproducibility matters.
clf = SkopeRules(max_depth_duplication=2,
                    n_estimators=100,
                    precision_min=0.3,
                    recall_min=0.1,
                    feature_names=feat_list)

In [6]:
# Fit the rule model on the encoded training split. The call is the cell's last
# expression, so its return value is what the notebook displays below.
clf.fit(X_train, y_train)

SkopeRules(feature_names=['age', 'workclass', 'fnlwgt', 'education',
                          'education-num', 'marital-status', 'occupation',
                          'relationship', 'race', 'sex', 'capital-gain',
                          'capital-loss', 'hours-per-week', 'native-country'],
           max_depth_duplication=2, n_estimators=100, precision_min=0.3,
           recall_min=0.1)

In [7]:
# Score the first 100 test rows twice with the same fitted model. The two calls
# are intentionally identical: their results are compared afterwards by
# calc_identity_rules to check that repeated scoring gives the same output.
top_rules1 = clf.score_top_rules(test_x[:100])
top_rules2 = clf.score_top_rules(test_x[:100])

In [8]:
def calc_identity_rules(top_rules1, top_rules2):
    """Measure how often two explanation runs disagree position by position.

    Entry ``k`` of ``top_rules1`` is compared with entry ``k`` of
    ``top_rules2`` via ``np.array_equal``; ``len(top_rules1)`` positions are
    compared.

    Returns:
        Tuple ``(pct_different, n_equal, n_total)``: the percentage of
        positions whose entries differ, the count of equal positions, and the
        number of positions compared.
    """
    n_rows = len(top_rules1)
    same = np.array([np.array_equal(top_rules1[k], top_rules2[k]) for k in range(n_rows)])
    total = same.shape[0]
    n_equal = same.sum()
    return (total - n_equal) / total * 100, n_equal, total

def calc_separability_rules(top_rules):
    """Count ordered pairs of distinct rows that carry the same explanation.

    Every ordered pair ``(a, b)`` with ``a != b`` is checked with
    ``np.array_equal``, so an unordered duplicate pair is counted twice.

    Returns:
        Tuple ``(n_duplicate_pairs, n_rows, n_rows ** 2, score)`` where
        ``score`` is ``100 * n_duplicate_pairs / n_rows ** 2``.
    """
    n_rows = top_rules.shape[0]
    duplicates = sum(
        1
        for a in range(n_rows)
        for b in range(n_rows)
        if a != b and np.array_equal(top_rules[a], top_rules[b])
    )
    n_pairs = n_rows ** 2
    pct = 100 * abs(duplicates) / n_pairs
    return duplicates, n_rows, n_pairs, pct

def exp_enc(exp, feature_num):
    """One-hot encode a sequence of integer-valued codes.

    Row ``r`` of the result is all zeros except for a 1 in column
    ``int(exp[r])``.

    Args:
        exp: indexable sequence of codes; each entry is passed through ``int``.
        feature_num: number of columns in the encoded matrix.

    Returns:
        Float array of shape ``(len(exp), feature_num)``.
    """
    n_rows = len(exp)
    one_hot = np.zeros((n_rows, feature_num))
    for row in range(n_rows):
        one_hot[row, int(exp[row])] = 1
    return one_hot

def calc_stability_rules(top_rules, labels):
    """Count instances whose explanation cluster disagrees with their label.

    The encoded explanations are clustered with k-means using one cluster per
    distinct label. The starting centroid of cluster ``k`` is the mean
    explanation of the instances carrying ``label_values[k]``, so cluster ``k``
    is read as "label ``label_values[k]``" and an instance is an error when
    that label differs from its own.

    Args:
        top_rules: 2-D array ``(n_samples, n_features)`` of encoded explanations.
        labels: 1-D array-like of class labels, one per row of ``top_rules``.

    Returns:
        Tuple ``(error, total)``: number of mismatching instances and number
        of instances.
    """
    top_rules = np.asarray(top_rules)
    labels = np.asarray(labels)
    total = labels.shape[0]
    label_values = np.unique(labels)
    n_clusters = label_values.shape[0]

    # Boolean-mask selection + mean(axis=0) always stacks to
    # (n_clusters, n_features); the previous squeeze() dropped an axis when
    # there was a single cluster or a single feature column.
    init = np.array([top_rules[labels == value].mean(axis=0) for value in label_values])

    # With explicit initial centroids only one initialisation is meaningful.
    ct = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=1, n_init=1, init=init)
    ct.fit(top_rules)

    # Translate cluster indices back to label values before comparing, so the
    # count is a true mismatch count for any label coding (not just 0/1).
    predicted = label_values[ct.labels_]
    error = np.sum(predicted != labels)

    # Two-cluster case only: if more than half disagree, the two cluster ids
    # are treated as swapped and the complement is reported instead.
    if n_clusters == 2 and error / total > 0.5:
        error = total - error
    return error, total

In [9]:
# Identity metric: compare the two scoring passes over the same 100 test rows.
i = calc_identity_rules(top_rules1, top_rules2)
print(i)

# Separability metric: count distinct rows that received the same score.
s = calc_separability_rules(top_rules1)
print(s)

# Stability metric: one-hot encode each score (width = largest score + 1), then
# cluster the encodings and compare cluster membership with the true labels of
# the same 100 test rows.
enc_rules = exp_enc(top_rules1, int(max(top_rules1))+1)
sb = calc_stability_rules(enc_rules, y_test[:100])
print(sb)

(0.0, 100, 100)
(4806, 100, 10000, 48.06)
(36, 100)


In [18]:
def normalize_test(X_train, X_test):
    """Min-max scale the test features using per-column training statistics.

    For every column of ``X_train`` a fresh ``MinMaxScaler`` is fitted on the
    training values and applied to the same-named column of ``X_test``.

    Args:
        X_train: DataFrame whose columns supply the scaling ranges.
        X_test: DataFrame to scale; it is copied, not modified.

    Returns:
        A copy of ``X_test`` with each column of ``X_train`` replaced by its
        scaled values.
    """
    scaled = X_test.copy()
    for column in X_train.columns:
        train_values = X_train[column].values.reshape(-1,1)
        test_values = X_test[column].values.reshape(-1,1)
        fitted = sklearn.preprocessing.MinMaxScaler().fit(train_values)
        scaled[column] = fitted.transform(test_values)

    return scaled

def calc_similarity(exp, X_test_norm, eps=0.5, min_samples=10, max_samples=400):
    """Smallest mean pairwise explanation distance over DBSCAN feature groups.

    The (normalised) feature rows are grouped with DBSCAN; for every distinct
    DBSCAN label the mean of the full Euclidean pairwise-distance matrix of the
    corresponding explanation rows is computed, and the minimum of those means
    is returned.

    Args:
        exp: 2-D array-like ``(n_samples, n_features)`` of encoded explanations.
        X_test_norm: feature rows aligned with ``exp`` (row k describes the
            same instance as ``exp[k]``).
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples``.
        max_samples: upper bound on the number of leading rows used.

    Returns:
        The minimum, over DBSCAN labels, of the mean pairwise distance.
    """
    # Use the same number of leading rows from both inputs so that every
    # DBSCAN label indexes an existing explanation row.
    n_rows = min(max_samples, len(exp), len(X_test_norm))
    exp = np.asarray(exp)[:n_rows]

    dbscan = sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X_test_norm[:n_rows])
    labels = dbscan.labels_

    mean_dist = []
    # NOTE(review): np.unique(labels) includes DBSCAN's noise label (-1) when
    # present, so noise points are treated as one more group -- confirm intended.
    for label in np.unique(labels):
        # A boolean mask keeps the selection 2-D even for a one-member group;
        # the previous np.where(...)+squeeze() collapsed that case to 1-D.
        members = exp[labels == label]
        mean_dist.append(np.mean(sklearn.metrics.pairwise_distances(members, metric='euclidean')))
    return np.min(mean_dist)

In [19]:
# Scale the test features with per-column training min/max, then compute the
# similarity metric on the first 100 rows, the same rows that `enc_rules`
# (built from the first 100 test rows) describes.
X_test_norm = normalize_test(X_train, X_test)
sim = calc_similarity(enc_rules, X_test_norm[:100])

print(sim)

0.34677520724698857
