In [1]:
#Importing all necessary libraries
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations
from scipy.stats import friedmanchisquare

In [2]:
X = pd.read_csv('heart_failure_clinical_records_dataset.csv') #Reading the csv file into a pandas DataFrame

In [3]:
X.isna().any() #Checking if we're having any NaN values

age                         False
anaemia                     False
creatinine_phosphokinase    False
diabetes                    False
ejection_fraction           False
high_blood_pressure         False
platelets                   False
serum_creatinine            False
serum_sodium                False
sex                         False
smoking                     False
time                        False
DEATH_EVENT                 False
dtype: bool

In [4]:
X.head(10) #Let's see our data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10,1
8,65.0,0,157,0,65,0,263358.03,1.5,138,0,0,10,1
9,80.0,1,123,0,35,1,388000.0,9.4,133,1,1,10,1


In [5]:
#Splitting data and target
y = X.DEATH_EVENT 
X.drop(columns='DEATH_EVENT',inplace=True)

In [6]:
# create loocv procedure
cv = LeaveOneOut()
# enumerate splits
y_true, y_pred = list(), list()
for train_ix, test_ix in cv.split(X.values):
	# split data
	X_train, X_test = X.values[train_ix, :], X.values[test_ix, :]
	y_train, y_test = y.values[train_ix], y.values[test_ix]
	# fit model
	model = RandomForestClassifier(random_state=1)
	model.fit(X_train, y_train)
	# evaluate model
	yhat = model.predict(X_test)
	# store
	y_true.append(y_test[0])
	y_pred.append(yhat[0])
# calculate accuracy
acc = accuracy_score(y_true, y_pred)
# creating confusion matrix in order to get the TP,FP,TN,FN
cm=confusion_matrix(y_true, y_pred)

print(f'True Positives: {cm[1][1]}')
print(f'False Positives: {cm[0][1]}')
print(f'True Negatives: {cm[0][0]}')
print(f'False Negatives: {cm[1][0]}')
print('Accuracy: %.3f' % acc)

True Positives: 69
False Positives: 17
True Negatives: 186
False Negatives: 27
Accuracy: 0.853


In [7]:
df = pd.read_csv('algo_performance.csv') # getting the second csv
# df_transposed = df.T

In [8]:
print('''The Friedman Test uses the following null and alternative hypotheses:

> The null hypothesis (H0): The mean for each population is equal.
> The alternative hypothesis: (Ha): At least one population mean is different from the rest.\n''')
a,b = friedmanchisquare(df['1-NN'],df['CN2'],df['Kernel'],df['NaiveBayes'],df['C4.5'])
print("The hypothesis we accept for our 5 algorithms is",end=' ')
print("H0" if b > 0.05 else "H1")

The Friedman Test uses the following null and alternative hypotheses:

> The null hypothesis (H0): The mean for each population is equal.
> The alternative hypothesis: (Ha): At least one population mean is different from the rest.

The hypothesis we accept for our 5 algorithms is H1
