In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random

In [2]:
df_train =  pd.read_csv('./archive/train.csv')
df_test = pd.read_csv('./archive/test.csv')

df_train.income = df_train.income.map({'<=50K':0, '>50K':1})
df_test.income = df_test.income.map({'<=50K':0, '>50K':1})

In [3]:
le = sklearn.preprocessing.LabelEncoder()
for col in df_train.columns:
    if df_train[col].dtype == 'object':
        le.fit(df_train[col])
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])

In [4]:
random_state = 39
exp_iter = 10
random.seed(random_state)

#Get datasets
X_train = df_train.drop('income', axis=1)
y_train = df_train.income
X_test = df_test.drop('income', axis=1)
y_test = df_test.income
test_x = X_test.values
n_classes = len(np.unique(y_train))
feat_list = [each.replace(' ','_') for each in X_train.columns]
X = np.vstack((X_train.values, test_x))

In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier()

# RULEMATRIX

In [6]:
import rulematrix

In [7]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,0,38,39
32557,40,4,154374,11,9,2,7,0,4,1,0,0,40,39
32558,58,4,151910,11,9,6,1,4,4,0,0,0,40,39
32559,22,4,201490,11,9,4,1,3,4,1,0,0,20,39


In [8]:
is_continuous = np.zeros(X_train.shape[1], dtype=bool)
is_continuous[[0,10,11,12]] = True

is_categorical = ~is_continuous

surrogate = rulematrix.surrogate.rule_surrogate(
    model.predict,
    X_train,
    sampling_rate=4,
    is_continuous=is_continuous,
    is_categorical=is_categorical,
    seed=random_state
)

In [9]:
print(surrogate.student)

The rule list contains 122 of rules:

     IF (X10 in (7699.9326171875, 32101.0234375)) THEN prob: [0.0013, 0.9987]

ELSE IF (X10 in (5096.892578125, 6878.70751953125)) AND (X7 = 0) THEN prob: [0.0033, 0.9967]

ELSE IF (X10 in (42629.7421875, inf)) THEN prob: [0.0016, 0.9984]

ELSE IF (X11 in (1799.3743896484375, 1987.9354248046875)) AND (X5 = 2) THEN prob: [0.0182, 0.9818]

ELSE IF (X4 = 9) AND (X7 = 4) THEN prob: [0.9985, 0.0015]

ELSE IF (X0 in (28.62492561340332, 31.15561866760254)) AND (X5 = 4) THEN prob: [0.9977, 0.0023]

ELSE IF (X10 in (-inf, 3169.612060546875)) AND (X11 in (1799.3743896484375, 1987.9354248046875)) THEN prob: [0.9928, 0.0072]

ELSE IF (X0 in (-inf, 20.041353225708008)) AND (X10 in (-inf, 3169.612060546875)) THEN prob: [0.9996, 0.0004]

ELSE IF (X3 = 10) AND (X5 = 2) THEN prob: [0.1040, 0.8960]

ELSE IF (X5 = 6) AND (X9 = 0) THEN prob: [0.9920, 0.0080]

ELSE IF (X10 in (-inf, 3169.612060546875)) AND (X4 = 4) THEN prob: [0.9950, 0.0050]

ELSE IF (X0 in (20.041353

In [10]:
surrogate.score(X_train)

0.8569147139215626

In [11]:
surrogate.score(X_test)

0.914624408820097

In [110]:
def exp_fn_blk(xtest, exp_fn):
    exp1 = []
    for i in range(len(xtest)):
        queried_rules = np.arange(surrogate.student.n_rules)[surrogate.student.decision_path(test_x[i].reshape(1,-1)).reshape(-1)]
        exp = np.zeros(surrogate.student.n_rules)
        exp[queried_rules[-1]] = 1
        exp1.append(exp)
    return np.array(exp1)
exp_fn_wrap = lambda x: np.array(exp_fn_blk(x, exp_fn))

In [111]:
exp1 = exp_fn_blk(test_x[:100], exp_fn)
exp2 = exp_fn_blk(test_x[:100], exp_fn)

In [112]:
def calc_identity(exp1, exp2):
    dis = np.array([np.array_equal(exp1[i],exp2[i]) for i in range(len(exp1))])
    total = dis.shape[0]
    true = np.sum(dis)
    score = (total-true)/total
    return score*100, true, total

def calc_separability(exp):
    wrong = 0
    for i in range(exp.shape[0]):
        for j in range(exp.shape[0]):
            if i == j:
                continue
            eq = np.array_equal(exp[i],exp[j])
            if eq:
                wrong = wrong + 1
    total = exp.shape[0]
    score = 100*abs(wrong)/total**2
    return wrong,total,total**2,score

def calc_stability(exp, labels):
    total = labels.shape[0]
    label_values = np.unique(labels)
    n_clusters = label_values.shape[0]
    init = np.array([[np.average(exp[np.where(labels == i)], axis = 0)] for i in label_values]).squeeze()
    ct = sklearn.cluster.KMeans(n_clusters = n_clusters, random_state=1, n_init=10, init = init)
    ct.fit(exp)
    error = np.sum(np.abs(labels-ct.labels_))
    if error/total > 0.5:
        error = total-error
    return error, total

In [113]:
i = calc_identity(exp1, exp2)
print(i)

s = calc_separability(exp1)
print(s)

sb = calc_stability(exp1, y_test[:100])
print(sb)

(0.0, 100, 100)
(304, 100, 10000, 3.04)
(3, 100)


In [114]:
def normalize_test(X_train, X_test):
    X_test_norm = X_test.copy()
    for i in X_train.columns:
        scaler = sklearn.preprocessing.MinMaxScaler()
        scaler.fit(X_train[i].values.reshape(-1,1))
        X_test_norm[i] = scaler.transform(X_test[i].values.reshape(-1,1))

    return X_test_norm

def calc_similarity(exp, X_test_norm):
    dbscan = sklearn.cluster.DBSCAN(eps=0.5, min_samples=10)
    dbscan.fit(X_test_norm[:400])
    labels = dbscan.labels_
    mean_dist = []
    for i in np.unique(labels):
        mean_dist.append(np.mean(sklearn.metrics.pairwise_distances(exp[np.where(labels == i), :].squeeze(), metric='euclidean')))
    return np.min(mean_dist)

In [115]:
X_test_norm = normalize_test(X_train, X_test)
sim = calc_similarity(exp1, X_test_norm[:100])

print(sim)

1.3060973539360516
