# Import Dependencies

In [None]:
#Import Dependencies
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from pathlib import Path

# Create CSVs - Cleveland and VA Files

In [None]:
# Load the Cleveland records. header=None tells pandas the file has no header
# row, so the columns get integer labels.
df = pd.read_csv(Path("resources/cleveland.txt"), header=None)
df.head(30)

In [None]:
# Load the VA records the same way: no header row, integer column labels.
df2 = pd.read_csv(Path("resources/va.txt"), header=None)
df2.head(30)

# Clean Data and Pre-Process Heart Disease Dataset

In [None]:
# Stack the Cleveland and VA frames row-wise into one frame.
# Each source keeps its own row labels (ignore_index is not set), so labels repeat.
df3 = pd.concat(objs=[df, df2], axis="index")
df3

In [None]:
# Give the positional columns descriptive names.
# Keys are looked up in df.columns by position, so the labels below map onto
# the first 14 columns in order.
renamed_df3 = df3.rename(
    columns={
        df.columns[position]: label
        for position, label in enumerate((
            'Age',
            'Sex',
            'Chest Pain Type',
            'Resting Blood Pressure',
            'Cholesterol',
            'Fasting Blood Sugar',
            'Resting EKG',
            'Max Heart Rate',
            'Exercise Enduced Chest Pain',
            'STDep Induced by Exercise',
            'Slope of Peak Exercise ST',
            'Major Vessels Colored',
            'Defect Type',
            'Diagnosis',
        ))
    }
)

renamed_df3

In [None]:
# Discard the two columns the author flagged as mostly missing.
cleaned_df3 = renamed_df3.drop(
    ['Major Vessels Colored', 'Defect Type'],
    axis="columns",
)
cleaned_df3

In [None]:
# Swap the '?' placeholder for 0 everywhere in the frame.
# NOTE(review): 0 then looks like a real reading in numeric columns
# (e.g. a cholesterol of 0) — confirm this is the intended missing-value handling.
df4 = cleaned_df3.replace('?', 0)
df4

In [None]:
# Persist the cleaned frame to CSV.
# The row index is written too (to_csv default), which is why the file gains an
# 'Unnamed: 0' column when it is read back.
df4.to_csv(path_or_buf='resources/heartdisease.csv')

In [None]:
# Tally how many rows fall under each Diagnosis value before preprocessing.
Diagnosis_counts = df4.loc[:, 'Diagnosis'].value_counts()
Diagnosis_counts

In [2]:
# Reload the cleaned data set from the CSV written earlier and preview it.
heart_data = pd.read_csv(Path("resources/heartdisease.csv"))
heart_data.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,2
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [3]:
# Further clean data - remove the "Unnamed: 0" column (the saved row index that
# came back as a regular column when the CSV was read).
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
heart_data = heart_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Verify
heart_data.columns

Index(['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure',
       'Cholesterol', 'Fasting Blood Sugar', 'Resting EKG', 'Max Heart Rate',
       'Exercise Enduced Chest Pain', 'STDep Induced by Exercise',
       'Slope of Peak Exercise ST', 'Diagnosis'],
      dtype='object')

In [4]:
# Collapse every positive Diagnosis value to 1 so the target is binary
# (0 = absence, 1 = presence).
# Assign through .loc instead of writing into `Series.values`: that array is
# not guaranteed to be a writable view of the frame, and under pandas
# Copy-on-Write it is read-only, so the write would raise.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1

# Verify
heart_data['Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [5]:
# Take an independent snapshot of the frame. Diagnosis is expected to hold only
# 0 and 1 (absence / presence) by this point; this cell only copies.
heart_new = heart_data.copy(deep=True)

heart_new['Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [6]:
# Create a data frame for the yes/no diagnosis.
# Make sure Diagnosis is binary first (a no-op when it already is); .loc is used
# because writing into `Series.values` is not guaranteed to reach the frame.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1

# Bind dgHeartData to an actual DataFrame. A chained `a = b[...] = 1`
# assignment would instead bind it to the scalar 1.
dgHeartData = heart_data.copy()

# Verify
heart_data.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


# Identify 2 features for use - Cholesterol and Resting Blood Pressure based on CDC risk factors (https://www.cdc.gov/heart-disease/risk-factors/index.html)

In [7]:
# Count the distinct values held by each column.
uniqueValues = heart_data.nunique(axis=0)
uniqueValues

Age                             43
Sex                              2
Chest Pain Type                  4
Resting Blood Pressure          55
Cholesterol                    180
Fasting Blood Sugar              2
Resting EKG                      3
Max Heart Rate                 106
Exercise Enduced Chest Pain      2
STDep Induced by Exercise       42
Slope of Peak Exercise ST        4
Diagnosis                        2
dtype: int64

In [8]:
# Frequency of each Cholesterol reading, most common first, to spot values
# that should be treated specially.
Cholesterol_Type = heart_data.loc[:, 'Cholesterol'].value_counts()
Sorted_Cholesterol = Cholesterol_Type.sort_values(ascending=False)
Sorted_Cholesterol.head(5)

Cholesterol
0.0      56
254.0     8
204.0     8
258.0     7
240.0     7
Name: count, dtype: int64

In [None]:
# Keep only readings seen fewer than 56 times. 56 is the count of the 0.0
# placeholder in the previous output, so this view leaves that value out.
Sorted_Cholesterol56 = Sorted_Cholesterol.loc[Sorted_Cholesterol.lt(56)]
Sorted_Cholesterol56.sort_values(ascending=False)

In [14]:
# Bucket over-represented Cholesterol readings into a single "Other" label.
# Every value that occurs more than `cholesterol_cutoff` times is replaced.
# NOTE(review): writing the string "Other" into this column makes it
# non-numeric, so a later pd.get_dummies call will expand each distinct reading
# into its own indicator column — confirm that is intended.
cholesterol_cutoff = 9
cl8 = heart_data['Cholesterol'].value_counts()
chol_to_replace = cl8[cl8 > cholesterol_cutoff].index.tolist()

# A single replace call handles every value in the list
heart_data['Cholesterol'] = heart_data['Cholesterol'].replace(chol_to_replace, "Other")

# Confirm the replacement took effect
heart_data['Cholesterol'].value_counts()

Cholesterol
Other    56
204.0     8
254.0     8
258.0     7
240.0     7
         ..
409.0     1
184.0     1
394.0     1
293.0     1
385.0     1
Name: count, Length: 180, dtype: int64

In [16]:
# Frequency of each Resting Blood Pressure reading, most common first, to spot
# values that should be treated specially.
Resting_Blood_Pressure_Type = heart_data.loc[:, 'Resting Blood Pressure'].value_counts()
Sorted_Resting_Blood_Pressure_Type = Resting_Blood_Pressure_Type.sort_values(ascending=False)
Sorted_Resting_Blood_Pressure_Type.head(5)

Resting Blood Pressure
0.0      57
120.0    53
130.0    51
140.0    42
110.0    28
Name: count, dtype: int64

In [20]:
# Bucket over-represented Resting Blood Pressure readings into a single "Other"
# label. Every value that occurs more than `BP_cutoff` times is replaced.
# NOTE(review): writing the string "Other" into this column makes it
# non-numeric, so a later pd.get_dummies call will expand each distinct reading
# into its own indicator column — confirm that is intended.
BP_cutoff = 54
BPcutoff = heart_data['Resting Blood Pressure'].value_counts()
BP_to_replace = BPcutoff[BPcutoff > BP_cutoff].index.tolist()

# A single replace call handles every value in the list
heart_data['Resting Blood Pressure'] = heart_data['Resting Blood Pressure'].replace(BP_to_replace, "Other")

# Confirm the replacement took effect
heart_data['Resting Blood Pressure'].value_counts()

Resting Blood Pressure
Other    57
120.0    53
130.0    51
140.0    42
110.0    28
150.0    26
160.0    19
128.0    16
138.0    14
125.0    13
112.0    11
132.0    11
124.0    10
122.0    10
134.0     9
118.0     8
142.0     8
170.0     8
135.0     7
136.0     7
126.0     7
152.0     7
144.0     6
108.0     6
145.0     5
100.0     5
180.0     5
158.0     4
115.0     3
178.0     3
146.0     3
154.0     3
102.0     3
104.0     3
105.0     3
155.0     3
114.0     2
148.0     2
156.0     2
172.0     2
116.0     2
94.0      2
106.0     2
164.0     1
96.0      1
190.0     1
129.0     1
117.0     1
123.0     1
192.0     1
174.0     1
101.0     1
165.0     1
200.0     1
127.0     1
Name: count, dtype: int64

In [25]:
# One-hot encode the non-numeric columns with pd.get_dummies; columns that are
# already numeric pass through unchanged.
converted_heart_data = pd.get_dummies(data=heart_data)
converted_heart_data.head(5)

Unnamed: 0,Age,Sex,Chest Pain Type,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis,...,Cholesterol_369.0,Cholesterol_384.0,Cholesterol_385.0,Cholesterol_394.0,Cholesterol_407.0,Cholesterol_409.0,Cholesterol_417.0,Cholesterol_458.0,Cholesterol_564.0,Cholesterol_Other
0,63.0,1.0,1.0,1.0,2.0,150.0,0.0,2.3,3.0,0,...,False,False,False,False,False,False,False,False,False,False
1,67.0,1.0,4.0,0.0,2.0,108.0,1.0,1.5,2.0,1,...,False,False,False,False,False,False,False,False,False,False
2,67.0,1.0,4.0,0.0,2.0,129.0,1.0,2.6,2.0,1,...,False,False,False,False,False,False,False,False,False,False
3,37.0,1.0,3.0,0.0,0.0,187.0,0.0,3.5,3.0,0,...,False,False,False,False,False,False,False,False,False,False
4,41.0,0.0,2.0,0.0,2.0,172.0,0.0,1.4,1.0,0,...,False,False,False,False,False,False,False,False,False,False


In [27]:
# Separate the encoded frame into a feature matrix (everything except
# Diagnosis) and the Diagnosis target vector, both as NumPy arrays.
X = converted_heart_data.drop(columns='Diagnosis').to_numpy()
y = converted_heart_data['Diagnosis'].to_numpy()

# Hold out a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [28]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so test data does not influence the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Neural Net with ReLU (non-linear) and Sigmoid (non-linear) Activation Functions

In [29]:
# Define the model - a deep neural net with two hidden ReLU layers and a
# single sigmoid output unit for the binary Diagnosis target.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input layer. Passing `input_dim`
# to the first Dense layer is deprecated in recent Keras and emits a warning.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [30]:
# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [31]:
# Fit the network on the scaled training split for 300 epochs; the returned
# History object is kept in fitModel.
fitModel = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=300,
)

Epoch 1/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5088 - loss: 0.7121
Epoch 2/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6873 - loss: 0.5909 
Epoch 3/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8235 - loss: 0.5082 
Epoch 4/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8277 - loss: 0.4489 
Epoch 5/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8680 - loss: 0.3821 
Epoch 6/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9067 - loss: 0.3190 
Epoch 7/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9323 - loss: 0.2825 
Epoch 8/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9247 - loss: 0.2501 
Epoch 9/300
[1m12/12[0m [32m━━━━━━━━━━

# Accuracy Score - 75.4%

In [32]:
# Score the trained network on the held-out, scaled test split and report
# the resulting loss and accuracy.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - 52ms/step - accuracy: 0.7540 - loss: 3.1112
Loss: 3.1112425327301025, Accuracy: 0.7539682388305664
