# Heart Disease Prediction
Mengyang He and Milo Yen-Goossens

In [2]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
data = pd.read_csv('./cardio_train.csv',sep=";")
df = pd.DataFrame(data)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [4]:
df.drop(columns=['id'],inplace=True)
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [5]:
# check if the dataset has null or missing value
df.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [6]:
# add bmi feature
bmi = df['weight'].div(df['height'].multiply(df['height'])) * 100
df['bmi'] = bmi

In [7]:
# make normalized dataframe
df_normalized = df.copy()
for var in ['age', 'height', 'weight', 'ap_hi', 'ap_lo']:
  print(var + " mean, avg: " + str(df[var].mean()) + ", " + str(df[var].std()))
  df_normalized[var] = df[var] - df[var].mean()
  df_normalized[var] = df_normalized[var].div(df[var].std())

df_normalized

age mean, avg: 19468.865814285713, 2467.2516672413913
height mean, avg: 164.35922857142856, 8.210126364538139
weight mean, avg: 74.20569, 14.39575667851056
ap_hi mean, avg: 128.8172857142857, 154.01141945605565
ap_lo mean, avg: 96.63041428571428, 188.47253029643605


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,-0.436058,2,0.443449,-0.847867,-0.122181,-0.088238,1,1,0,0,1,0,0.219671
1,0.307684,1,-1.018161,0.749826,0.072610,-0.035180,3,1,0,0,1,1,0.349277
2,-0.247995,1,0.078046,-0.708937,0.007679,-0.141296,3,1,0,0,0,1,0.235078
3,-0.748147,2,0.565250,0.541431,0.137540,0.017878,1,1,0,0,1,1,0.287105
4,-0.808538,1,-1.018161,-1.264657,-0.187111,-0.194354,1,1,0,0,0,0,0.230112
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,-0.092761,2,0.443449,0.124642,-0.057251,-0.088238,1,1,1,0,1,0,0.269274
69996,1.269483,1,-0.774559,3.597887,0.072610,-0.035180,2,2,0,0,1,1,0.504727
69997,-0.163285,2,2.270461,2.139124,0.332331,-0.035180,3,1,0,1,0,1,0.313536
69998,1.200580,1,-0.165555,-0.153218,0.040145,-0.088238,1,2,0,0,0,1,0.270993


In [21]:
# logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_log = df_normalized.drop(columns = ["cardio"])
Y_log = df_normalized["cardio"]

X_log_train , X_log_test , Y_log_train , Y_log_test = train_test_split(X_log,Y_log,test_size=0.3,random_state=0)
acc_log = 0
# best_C = float('-inf')
# C_vals = 10**np.arange(-8.,20.)
# for C in C_vals:
logreg = LogisticRegression()
logreg.fit(X_log_train, Y_log_train)
acc_log = max(acc_log, round(logreg.score(X_log_train, Y_log_train) * 100, 5))
# best_C = max(best_C, C)
print("Accureacy of Logistic Regression training set:" ,acc_log)

acc_test_log = round(logreg.score(X_log_test, Y_log_test) * 100, 5)
print("Accureacy of Logistic Regression validation set:" ,acc_test_log)


Accureacy of Logistic Regression training set: 72.26939
Accureacy of Logistic Regression validation set: 72.10476


In [None]:
# Correlation of Logisctic Regression
coeff_df = pd.DataFrame(X_log_train.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

In [20]:
# ANN
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers

def build_ann(optimizer='adam'):
    # Initializing the ANN
    ann = Sequential()
    
    # Adding the input layer and the first hidden layer of the ANN with dropout
    ann.add(Dense(units=32, kernel_initializer='glorot_uniform', activation='relu', input_shape=(len(X_ann_train.columns),)))
    
    # Add other layers, it is not necessary to pass the shape because there is a layer before
    ann.add(Dense(units=64, kernel_initializer='glorot_uniform', activation='relu'))
    ann.add(Dropout(rate=0.5))
    ann.add(Dense(units=64, kernel_initializer='glorot_uniform', activation='relu'))
    ann.add(Dropout(rate=0.5))
    
    # Adding the output layer
    ann.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))
    
    # Compiling the ANN
    ann.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return ann

#train, test, target, target_test = train_test_split(df_normalized, df_nomalized["cardio"], test_size=0.2, random_state=0)

X_ann = df_normalized.drop(columns = ["cardio"])
Y_ann = df_normalized["cardio"]
X_ann_train , X_ann_test , Y_ann_train , Y_ann_test = train_test_split(X_ann,Y_ann,test_size=0.3,random_state=0)

opt = optimizers.Adam(lr=0.001)
ann = build_ann(opt)
# Training the ANN
history = ann.fit(X_ann_train, Y_ann_train, batch_size=16, epochs=100, validation_data=(X_ann_test, Y_ann_test))

# Predicting the Train set results
ann_prediction = ann.predict(X_ann_train)
ann_prediction = (ann_prediction > 0.5)*1 # convert probabilities to binary output



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
# Training set results
ann_prediction = ann.predict(X_ann_train)
ann_prediction = (ann_prediction > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response and display it in confusion matrix
acc_ann = round(metrics.accuracy_score(Y_ann_train, ann_prediction) * 100, 2)
print("Accureacy of ANN training set:", acc_ann)

Accureacy of ANN training set: 74.1


In [23]:
# Predicting the Test set results
ann_prediction_test = ann.predict(X_ann_test)
ann_prediction_test = (ann_prediction_test > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response and display it in confusion matrix
acc_test_ann = round(metrics.accuracy_score(Y_ann_test, ann_prediction_test) * 100, 2)
print("Accureacy of ANN validation set:",acc_test_ann)

Accureacy of ANN validation set: 73.48


In [111]:
from keras.utils import plot_model
from tensorflow.keras.utils import plot_model

keras.utils.plot_model(
    ann,
    to_file="model.png",
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=200,
    show_layer_activations=True,
    show_trainable=True,
)
#plot_model(ann, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [24]:
print(ann.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                416       
                                                                 
 dense_1 (Dense)             (None, 64)                2112      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6753 (26.38 KB)
Trainable params: 6753 (26

In [25]:
# SVM
from sklearn.svm import SVC, LinearSVC
#train, test, target, target_test = train_test_split(df_normalized, df_nomalized["cardio"], test_size=0.2, random_state=0)

X_svm = df_normalized.drop(columns = ["cardio"])
Y_svm = df_normalized["cardio"]
X_svm_train , X_svm_test , Y_svm_train , Y_svm_test = train_test_split(X_svm,Y_svm,test_size=0.3,random_state=0)

svc = SVC(kernel = 'rbf')
svc.fit(X_svm_train, Y_svm_train)
acc_svc = round(svc.score(X_svm_train, Y_svm_train) * 100, 5)
print("Accureacy of SVM training set:" ,acc_svc)

acc_test_svc = round(svc.score(X_svm_test, Y_svm_test) * 100, 5)
print("Accureacy of SVM validation set:" ,acc_test_svc)

Accureacy of SVM training set: 73.5102
Accureacy of SVM validation set: 73.11905
