# This notebook contains the experiments on the Heart (Statlog) dataset with LionForests

In [1]:
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from LionForests import LionForests

Firstly, we load the dataset and we set the feature and class names

In [2]:
url="http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
# Download the table and parse it while the connection is open; the context
# manager closes the HTTP response as soon as genfromtxt has consumed it.
with urllib.request.urlopen(url) as raw_data:
    # NOTE(review): the name `credit` does not match this heart dataset; it is
    # kept unchanged so that any cell referring to it keeps working.
    credit = np.genfromtxt(raw_data)
# Every column except the last is a feature; the last column is the class label.
X, y = credit[:, :-1], credit[:, -1]
# One display name per feature column, in column order.
feature_names = [
    'age',
    'sex',
    'chest pain',
    'resting blood pressure',
    'serum cholestoral',
    'fasting blood sugar',
    'resting electrocardiographic results',
    'maximum heart rate achieved',
    'exercise induced angina',
    'oldpeak',
    'the slope of the peak exercise',
    'number of major vessels',
    'reversable defect',
]
class_names = ['absence', 'presence']

This dataset contains few instances: only 270.

In [3]:
# Number of rows (instances) in the feature matrix.
X.shape[0]

270

We can explore the features of this dataset

In [4]:
# Wrap the feature matrix in a labelled frame and show per-column summary statistics.
feature_frame = pd.DataFrame(data=X, columns=feature_names)
feature_frame.describe()

Unnamed: 0,age,sex,chest pain,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak,the slope of the peak exercise,number of major vessels,reversable defect
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


Then, we can train our random forest model using LionForests

In [5]:
# Build the labels from the raw table (last column, shifted down by one) instead of
# from `y` itself, so re-running this cell does not shift already-shifted labels again.
# NOTE(review): presumably the raw labels are 1/2 and become 0/1 to index class_names - confirm.
y = [int(label - 1) for label in credit[:, -1]]
# Each hyper-parameter lists a single value, so there is exactly one candidate configuration.
parameters = [{
    'max_depth': [5],
    'max_features': ['sqrt'],
    'bootstrap': [False],
    'min_samples_leaf' : [5],
    'n_estimators': [500]
}]
lf = LionForests(class_names=class_names)
# The scaler is handed to LionForests together with the unscaled feature matrix.
scaler = MinMaxScaler(feature_range=(-1,1))
lf.train(X, y, scaler, feature_names, parameters)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.7s finished


And we can see the number of estimators and the best set of parameters

In [6]:
# Print the accuracy and estimator count exposed by LionForests, then the fitted model itself.
number_of_estimators = lf.model.n_estimators
summary_fields = (
    "Accuracy:", lf.accuracy,
    ", Number of estimators:", lf.number_of_estimators,
)
print(*summary_fields)
print(lf.model)

Accuracy: 0.8188916011524707 , Number of estimators: 500
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


Now, we are ready to produce explanations using LionForests

In [7]:
# Explain the prediction for the instance at row 25 of the feature matrix.
explained_instance = X[25]
rule = lf.following_breadcrumbs(
    explained_instance, False, True, False, complexity=2
)
print(rule)

['if 3.0<=reversable defect<=4.5 & 2.5<=chest pain<=3.49 & 0.0<=number of major vessels<=0.51 & 0.124<=oldpeak<=0.248 & 0.0<=exercise induced angina<=0.5 & 137.81<=maximum heart rate achieved<=140.43 & 1.0<=the slope of the peak exercise<=1.5 & 47.72<=age<=49.64 & 272.73<=serum cholestoral<=290.25 & 0.0<=sex<=0.5 then absence', 490, 13, 251, 10, {'reversable defect': [3.0, 4.5], 'chest pain': [2.5, 3.49], 'number of major vessels': [0.0, 0.51], 'oldpeak': [0.12400000000000011, 0.2479999999999999], 'exercise induced angina': [0.0, 0.5], 'maximum heart rate achieved': [137.81000000000003, 140.43000000000004], 'the slope of the peak exercise': [1.0, 1.5], 'age': [47.72, 49.63999999999999], 'serum cholestoral': [272.73, 290.25000000000006], 'sex': [0.0, 0.5]}]


In [8]:
# NOTE(review): features named here are presumably treated as integer-valued when
# candidate values are generated - confirm against the LionForests implementation.
discrete_features = ['age']
explained_instance = X[25]
lf.check_changes_in_prediction(explained_instance, rule, discrete_features)

---------FEATURES THAT GOT REDUCED FROM LF BELOW---------
resting blood pressure
fasting blood sugar
resting electrocardiographic results
---------NEW FEATURE VALUES BELOW [left,middle,right]---------
age [46, 48, 50]
sex ['0.0000', '0.3008', '0.8008']
chest pain ['1.2741', '3.4086', '3.9804']
serum cholestoral ['173.5021', '289.8842', '439.0919']
maximum heart rate achieved ['85.4716', '140.4274', '192.7684']
exercise induced angina ['0.0000', '0.3281', '0.8281']
oldpeak ['0.0061', '0.2479', '1.7804']
the slope of the peak exercise ['1.0000', '1.3724', '2.2448']
number of major vessels ['0.0000', '0.4758', '1.4615']
reversable defect ['3.0000', '4.3623', '6.3164']
---------FEATURE VALUES THAT MAY CHANGE THE CLASSIFICATION BELOW---------


{0: [],
 1: ['0.8008',
  0,
  0,
  0.1042513007628005,
  0.16528190783353963,
  0.06103060707073914],
 2: ['3.9804',
  0,
  0,
  0.1042513007628005,
  0.18509948384983768,
  0.08084818308703719],
 4: [],
 7: ['192.7684',
  0,
  0,
  0.1042513007628005,
  0.04729645389044345,
  0.056954846872357044],
 8: ['0.8281',
  0,
  0,
  0.1042513007628005,
  0.17007089941060877,
  0.06581959864780827],
 9: ['1.7804',
  0,
  0,
  0.1042513007628005,
  0.19836487230125005,
  0.09411357153844956],
 10: [],
 11: [],
 12: ['6.3164',
  0,
  0,
  0.1042513007628005,
  0.24472100542570493,
  0.14046970466290443]}

In [9]:
# Manual what-if checks, left disabled. Each pair overwrites one feature of X[25]
# and re-runs the explanation for the modified instance.
# NOTE: the assignments write into X itself, so restore the original value
# (or work on a copy of the row) before trying the next pair.

#X[25][2] = 1.5696
#lf.following_breadcrumbs(X[25], False, True, False, complexity=4)

#X[25][0] = 46
#lf.following_breadcrumbs(X[25], False, True, False, complexity=4)

#X[25][9] = 0.0197
#lf.following_breadcrumbs(X[25], False, True, False, complexity=4)

#X[25][12] = 6.7361
#lf.following_breadcrumbs(X[25], False, True, False, complexity=4)
