In [1]:
import autosklearn.classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import sklearn.metrics

In [2]:
# Load the raw echocardiogram table from the shared datasets folder.
dataset = pd.read_csv(
    filepath_or_buffer='../Datasets/Echocardiogram/echocardiogram.csv',
)

In [3]:
# Remove the columns that are not kept for modelling.
dataset.drop(
    columns=['name', 'group', 'aliveat1'],
    inplace=True,
)

In [4]:
# Discard rows whose 'alive' label (the prediction target used later) is missing.
dataset.dropna(axis=0, how='any', subset=['alive'], inplace=True)

In [5]:
# Per-column count of the missing values that still need imputing.
dataset.isna().sum(axis=0)

survival                 1
alive                    0
age                      5
pericardialeffusion      0
fractionalshortening     7
epss                    14
lvdd                    10
wallmotion-score         3
wallmotion-index         1
mult                     3
dtype: int64

In [6]:
# All columns other than 'pericardialeffusion' and the 'alive' label are
# treated as continuous features.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# is deprecated and raises TypeError on pandas >= 2.0.
continuous_features = dataset.drop(columns=['pericardialeffusion', 'alive']).columns

  """Entry point for launching an IPython kernel.


In [7]:
# Display the selected continuous feature names.
continuous_features

Index(['survival', 'age', 'fractionalshortening', 'epss', 'lvdd',
       'wallmotion-score', 'wallmotion-index', 'mult'],
      dtype='object')

In [8]:
# Continuous columns that are imputed with the median instead of the mean.
features_with_outliers = [
    'wallmotion-score',
    'wallmotion-index',
    'mult',
]

In [9]:
# Impute missing values in every continuous column: median for the columns
# listed in `features_with_outliers`, mean for the rest.
#
# The filled column is assigned back to `dataset` rather than calling
# `dataset[feature].fillna(..., inplace=True)`: that chained in-place form acts
# on an intermediate Series, emits a FutureWarning on pandas 2.2 and does not
# update `dataset` when Copy-on-Write is enabled.
#
# NOTE(review): the fill statistics are computed on the full dataset, before
# the train/test split — confirm this is intended.
for feature in continuous_features:
    column = dataset[feature]
    if feature in features_with_outliers:
        fill_value = column.median()
    else:
        fill_value = column.mean()
    dataset[feature] = column.fillna(fill_value)

In [10]:
# Separate the predictors from the 'alive' label.
# `columns=` replaces the positional axis argument (`drop('alive', 1)`), which
# is deprecated and raises TypeError on pandas >= 2.0.
X = dataset.drop(columns='alive')
y = dataset['alive']

# Hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

  """Entry point for launching an IPython kernel.


# AUTOML

In [26]:
# Fit auto-sklearn with its default settings on the training split, then
# predict labels for the held-out test rows.
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)

In [27]:
# Accuracy of the auto-sklearn predictions on the test split.
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat)

0.8484848484848485

In [28]:
import pickle

# Persist the fitted auto-sklearn estimator to disk.
with open('./models/echocardiogram_automl.pkl', 'wb') as model_file:
    pickle.dump(automl, model_file)

In [1]:
import pickle

# Reload the saved auto-sklearn estimator. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open('./models/echocardiogram_automl.pkl', 'rb') as model_file:
    automl = pickle.load(model_file)

# Number of entries returned by show_models().
len(automl.show_models())

28

# RANDOM FOREST

In [13]:
import sklearn.ensemble

# Random forest trained on the same split; this is the model explained in the
# CIU section.
model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=5,
)
model.fit(X_train, y_train)

RandomForestClassifier(n_jobs=5, random_state=42)

In [14]:
# Accuracy of the random forest predictions on the test split.
y_hat = model.predict(X_test)
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat)

0.9090909090909091

In [15]:
import pickle

# Persist the fitted random forest to disk.
with open('./models/echocardiogram_rf.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
import pickle

# Reload the saved random forest. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open('./models/echocardiogram_rf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# CIU

In [25]:
from ciu import determine_ciu
import tqdm
import metrics

In [26]:
def enc_exp(exp, feature_num):
    """Scatter (feature index, value) pairs into a dense per-sample matrix.

    `exp` is indexed as ``exp[sample, pair, 0]`` for the feature index and
    ``exp[sample, pair, 1]`` for the value to store.

    Returns an array of shape ``(len(exp), feature_num)``; positions that no
    pair refers to stay 0. If a feature index repeats within one sample, the
    last pair wins.
    """
    n_samples = len(exp)
    dense = np.zeros((n_samples, feature_num))
    for row in range(n_samples):
        for pair in range(len(exp[row])):
            feature_idx = int(exp[row, pair, 0])
            dense[row][feature_idx] = exp[row, pair, 1]
    return dense

In [18]:
# Ordered training column names; explanations refer to features by their
# position in this list.
feat_list = list(X_train.columns)

In [21]:
def exp_fn_blk(xtest):
    """Compute CIU explanations for every row of `xtest`.

    For each row, `determine_ciu` is called with that single-row slice,
    `model.predict_proba`, `X_train.to_dict('list')`, 1000 samples and
    prediction index 1. The `ci` mapping of the result is converted into
    ``[position of the feature in feat_list, ci value]`` pairs.

    Parameters
    ----------
    xtest : pandas.DataFrame
        Rows to explain; sliced positionally with ``.iloc``.

    Returns
    -------
    numpy.ndarray
        One list of ``[feature index, ci value]`` pairs per row of `xtest`.
    """
    explanations = []
    for row in tqdm.tqdm(range(len(xtest))):
        # Explain the row of the *argument*. The previous version always read
        # from the global X_test, so `xtest` only controlled the loop length.
        case = xtest.iloc[row:row + 1]
        ciu = determine_ciu(
            case,
            model.predict_proba,
            X_train.to_dict('list'),
            samples=1000,
            prediction_index=1,
        )
        # Distinct comprehension name so the row counter is not shadowed.
        explanations.append(
            [[feat_list.index(name), ciu.ci[name]] for name in ciu.ci]
        )
    return np.array(explanations)

In [22]:
# Run the CIU explainer twice over the same test rows; the two result sets
# are later passed together to the identity metric.
exp1, exp2 = exp_fn_blk(X_test), exp_fn_blk(X_test)

100%|██████████| 33/33 [00:11<00:00,  2.87it/s]
100%|██████████| 33/33 [00:11<00:00,  2.85it/s]


In [23]:
# Store both explanation runs as .npy files.
np.save(file='./explanations/echocardiogram_ciu1.npy', arr=exp1)
np.save(file='./explanations/echocardiogram_ciu2.npy', arr=exp2)

In [27]:
# Score the CIU explanations with the project-local `metrics` helpers.
# NOTE(review): the layout of each returned tuple is defined in `metrics`,
# which is not shown here — confirm before interpreting the numbers.
i = metrics.calc_identity(exp1, exp2)  # compares the two explanation runs
s = metrics.calc_separability(exp1)
# Dense (sample x feature) encoding of the explanations built by enc_exp.
enc1 = enc_exp(exp1, len(feat_list))
sb = metrics.calc_stability(enc1, y_test)

  self._check_params(X)


In [28]:
# Display the identity, separability and stability results together.
i, s, sb

((12.121212121212121, 29, 33), (0, 33, 1089, 0.0), (6.0, 33))

In [29]:
# Similarity metric for the CIU explanations, computed against the test rows
# as normalised by metrics.normalize_test(X_train, X_test).
# NOTE(review): the normalisation scheme lives in `metrics` — confirm there.
X_test_norm = metrics.normalize_test(X_train, X_test)
sim = metrics.calc_similarity(exp1, X_test_norm)

In [30]:
# Display the CIU similarity score.
sim

0.2559266087901965

In [32]:
# Per-sample attribution metrics for the CIU explanations of the random forest.
list_monotonicity = []
list_non_sensitivity = []
list_effective_complexity = []

# Loop-invariant: convert the test frame to an array once.
X_test_values = X_test.to_numpy()

# The loop uses its own names (`sample_idx`, `label_onehot`) so that it no
# longer overwrites the notebook-level `i` (identity result) and `y` (label
# Series) defined in earlier cells.
for sample_idx in tqdm.tqdm(range(len(X_test))):
    atr = exp1[sample_idx]
    # Each entry of `atr` is a (feature index, attribution value) pair; split
    # them into two parallel lists, keeping the original order.
    sorted_atr = [value for _, value in atr]
    sorted_feat = [feat for feat, _ in atr]
    # One-hot encode the true label over the two classes.
    label_onehot = np.zeros(2, dtype=int)
    np.put(label_onehot, y_test.iloc[sample_idx], 1)
    example = metrics.FeatureAttribution(model, X_test_values[sample_idx], label_onehot, sorted_atr)
    list_monotonicity.append(example.monotonicity())
    list_non_sensitivity.append(example.non_sensitivity())
    # NOTE(review): 0.1 is passed as the second argument of
    # effective_complexity; its meaning is defined in `metrics` — confirm.
    list_effective_complexity.append(example.effective_complexity(sorted_feat, 0.1))

100%|██████████| 33/33 [01:02<00:00,  1.89s/it]


In [33]:
# Print the mean of each metric list, then the median of each, in the order
# monotonicity, non-sensitivity, effective complexity.
for aggregate in (np.mean, np.median):
    for values in (list_monotonicity, list_non_sensitivity, list_effective_complexity):
        print(aggregate(values))

0.2426352301891032
0.42424242424242425
4.545454545454546
0.29130434782608694
0.0
6.0


In [34]:
# Trust score of the CIU explanations for the random forest.
# NOTE(review): the meaning of the literal 3 (fourth argument) is defined in
# metrics.calc_trust_score, which is not shown here — confirm.
metrics.calc_trust_score(model, X_test.to_numpy(), exp1, 3, X_train.columns.to_list())

100%|██████████| 33/33 [03:05<00:00,  5.61s/it]


0.6212121212121213

# RULEFIT (skope-rules `SkopeRules`)

In [11]:
from skrules import SkopeRules
import metrics_rules
import time

In [12]:
# SkopeRules classifier, given the training column names as feature names.
clf = SkopeRules(
    feature_names=X_train.columns.tolist(),
    n_estimators=512,
    max_depth_duplication=2,
    precision_min=0.3,
    recall_min=0.1,
)

In [13]:
# Time the SkopeRules fit on the training split.
start_time = time.time()
clf.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 8.621065616607666 seconds ---


In [14]:
# Score the test rows twice (the two results feed the identity metric) and
# time both calls together.
start_time = time.time()
top_rules1, top_rules2 = clf.score_top_rules(X_test), clf.score_top_rules(X_test)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 0.048004150390625 seconds ---


In [15]:
# Score the rule-based explanations with the project-local `metrics_rules`
# helpers, printing each result as it is computed.
# NOTE(review): the layout of each returned tuple is defined in
# `metrics_rules`, which is not shown here — confirm before interpreting.
i = metrics_rules.calc_identity_rules(top_rules1, top_rules2)  # compares the two scoring runs
print(i)

s = metrics_rules.calc_separability_rules(top_rules1)
print(s)

# Encode the first scoring run via metrics_rules.exp_enc before the stability metric.
enc_rules = metrics_rules.exp_enc(clf, top_rules1)
sb = metrics_rules.calc_stability_rules(enc_rules, y_test)
print(sb)

(0.0, 33, 33)
(270, 33, 1089, 24.793388429752067)
(2.0, 33)


  self._check_params(X)


In [16]:
# Similarity metric for the encoded rule explanations, computed against the
# test rows as normalised by metrics_rules.normalize_test(X_train, X_test).
# NOTE(review): the normalisation scheme lives in `metrics_rules` — confirm.
X_test_norm = metrics_rules.normalize_test(X_train, X_test)
sim = metrics_rules.calc_similarity(enc_rules, X_test_norm)
print(sim)

0.9080987582996202
