In [56]:
import io
import pandas as pd
import joblib as jb
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_excel('heart_failure_clinical_records_dataset.xls')

df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10,1
8,65.0,0,157,0,65,0,263358.03,1.5,138,0,0,10,1
9,80.0,1,123,0,35,1,388000.0,9.4,133,1,1,10,1


In [57]:
df. rename(columns = {'creatinine_phosphokinase':'cr_ph', 'ejection_fraction':'ej_fr', 'high_blood_pressure':'hbp','serum_creatinine':'ser_cr', 'serum_sodium':'ser_na','DEATH_EVENT':'status'}, inplace = True)
# df('creatinine_phosphokinase') =
print(df)

      age  anaemia  cr_ph  diabetes  ej_fr  hbp  platelets  ser_cr  ser_na  \
0    75.0        0    582         0     20    1  265000.00     1.9     130   
1    55.0        0   7861         0     38    0  263358.03     1.1     136   
2    65.0        0    146         0     20    0  162000.00     1.3     129   
3    50.0        1    111         0     20    0  210000.00     1.9     137   
4    65.0        1    160         1     20    0  327000.00     2.7     116   
..    ...      ...    ...       ...    ...  ...        ...     ...     ...   
294  62.0        0     61         1     38    1  155000.00     1.1     143   
295  55.0        0   1820         0     38    0  270000.00     1.2     139   
296  45.0        0   2060         1     60    0  742000.00     0.8     138   
297  45.0        0   2413         0     38    0  140000.00     1.4     140   
298  50.0        0    196         0     45    0  395000.00     1.6     136   

     sex  smoking  time  status  
0      1        0     4      

In [58]:
from numpy import set_printoptions
from sklearn.preprocessing import Normalizer

# #Splitting into dependent and independent data
X_data = df[['age', 'ej_fr', 'ser_cr', 'ser_na','time']] # USING THE SELECTED FEATURES BASED ON FORWARD FEATURE SELECTION
# X_data = df.drop(['status'], axis=1) #independent variables
Y_data = df.pop('status')

#normalised the independent data
#l2 leverages more features and distances between points remain the same
df_normaliser = Normalizer(norm='l2').fit(X_data)
my_normalized_data = df_normaliser.transform(X_data)
set_printoptions(precision=2)
print('\n My Normalised data:\n', my_normalized_data[200:204])
#dataset are now within same ranges


 My Normalised data:
 [[0.26 0.18 0.   0.56 0.76]
 [0.19 0.25 0.   0.56 0.77]
 [0.28 0.24 0.   0.55 0.75]
 [0.25 0.1  0.01 0.57 0.78]]


In [59]:
# Spliiting dataset into train and test
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                    Y_data,
                                                    test_size=0.3,  # 70% training and 30% test
                                                    random_state=1)
#Checking the  number of test and train data
print('\n The total of training dataset:', X_train.shape)
print('\n The total of test dataset:', X_test.shape)
print(Y_test.shape)


 The total of training dataset: (209, 5)

 The total of test dataset: (90, 5)
(90,)


In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#importing the models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

svm =SVC(kernel='poly', max_iter=-1, degree=3, probability=True)
dt = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=0.05, min_samples_split=2,random_state=1)
kn =KNeighborsClassifier(n_neighbors=8,leaf_size=30,metric='minkowski', p=2)
lr =LogisticRegression(intercept_scaling= '1',max_iter=1000,multi_class= 'auto', penalty='l2',random_state=1, solver='newton-cg',tol=0.0001, verbose= 0, warm_start=False)

class_list = [('DecisionTreeClassifier:',dt),('Supportvector:',svm),('LogisticRegression:',lr),('KNeighborsClassifier:',kn)]

#iteration
for clsf_name,clsf in class_list:
    clsf.fit(X_train,Y_train)
    Y_pred = clsf.predict(X_test)
    print('\n{:s} :{:.3f}'.format(clsf_name, accuracy_score(Y_test, Y_pred)))

votingc = VotingClassifier(estimators=class_list, voting='soft')
votingc.fit(X_train, Y_train)

#predict test label
Y_pred_vc = clsf.predict(X_test)



DecisionTreeClassifier: :0.844

Supportvector: :0.856

LogisticRegression: :0.833

KNeighborsClassifier: :0.878


In [61]:

print('\n voting classifier {:.3f}'. format(accuracy_score(Y_test, Y_pred_vc)))


 voting classifier 0.878


In [62]:

#confusion matrix
matrix_info = confusion_matrix(Y_test, Y_pred_vc)
print('\n Confusion matrix on heart failure data\n', matrix_info, '\n')

class_report = classification_report(Y_test, Y_pred_vc)
print('Classification report:\n ', class_report)




 Confusion matrix on heart failure data
 [[63  1]
 [10 16]] 

Classification report:
                precision    recall  f1-score   support

           0       0.86      0.98      0.92        64
           1       0.94      0.62      0.74        26

    accuracy                           0.88        90
   macro avg       0.90      0.80      0.83        90
weighted avg       0.89      0.88      0.87        90



In [63]:
#MAKING MODEL PERSISTENT FOR USE

#VIEWING THE TEST DATASET
print(X_test.tail(5).join(Y_test.tail(5)))

      age  ej_fr  ser_cr  ser_na  time  status
122  60.0     38    0.75     140    95       0
246  55.0     25    1.10     138   214       1
278  50.0     30    0.70     136   246       0
251  55.0     35    0.80     143   215       0
19   48.0     55    1.90     121    15       1


In [64]:
#SAVING MODEL
my_model = 'dissert_model.sav'
jb.dump(votingc, my_model)


['dissert_model.sav']

In [65]:

#TESTING THE MODEL EFFECT

load_my_model = jb.load(my_model)
results = load_my_model.score(X_test, Y_test)
print('\n This is the result of the persistent model\n', results)


 This is the result of the persistent model
 0.8666666666666667


In [66]:
x= 60.0,38,0.75,140,95
client_data = np.array(x).reshape(1,-1)
forte =load_my_model.predict(client_data)
print('This is the prediction')
print(forte)

This is the prediction
[0]


