In [1]:
#preprocessing - impoort dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

2024-07-29 21:15:13.071580: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Import cleaned preprocessed data
heart_data = pd.read_csv("./heartdisease.csv")
heart_data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,2
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [3]:
# Further clean data - remove "Unnamed: 0" column
heart_data = heart_data.drop(columns=['Unnamed: 0'])

# Verify
heart_data.columns

Index(['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure',
       'Cholesterol', 'Fasting Blood Sugar', 'Resting EKG', 'Max Heart Rate',
       'Exercise Enduced Chest Pain', 'STDep Induced by Exercise',
       'Slope of Peak Exercise ST', 'Diagnosis'],
      dtype='object')

In [4]:
# Change values
heart_data['Diagnosis'].values[heart_data['Diagnosis'] > 0] = 1

# Verify
heart_data['Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [5]:
# Change Diagnosis column so we only have 0 and 1 (absense and presence)
# Copy data
heart_new = heart_data.copy()

heart_new['Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [6]:
#create data frame for yes/no diagnosis
dgHeartData = heart_data['Diagnosis'].values[heart_data['Diagnosis'] > 0] = 1

# Verify
heart_data.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [7]:
# Determine the number of unique values in each column.
uniqueValues = heart_data.nunique()
uniqueValues

Age                             43
Sex                              2
Chest Pain Type                  4
Resting Blood Pressure          55
Cholesterol                    180
Fasting Blood Sugar              2
Resting EKG                      3
Max Heart Rate                 106
Exercise Enduced Chest Pain      2
STDep Induced by Exercise       42
Slope of Peak Exercise ST        4
Diagnosis                        2
dtype: int64

In [8]:
# Look at  value counts to identify and delete.
MaxRate_Type = heart_data['Max Heart Rate'].value_counts()
Sorted_MaxRate = MaxRate_Type.sort_values(ascending=False)
Sorted_MaxRate.head()

Max Heart Rate
0.0      53
140.0    18
120.0    15
160.0    13
162.0    12
Name: count, dtype: int64

In [9]:
#Look at Max Rate frequency less than 53 to remove 0 values. 
Sorted_MaxRate53 = Sorted_MaxRate[Sorted_MaxRate < 53]
Sorted_MaxRate53.sort_values(ascending=False)

Max Heart Rate
140.0    18
120.0    15
160.0    13
162.0    12
150.0    11
         ..
188.0     1
137.0     1
194.0     1
184.0     1
93.0      1
Name: count, Length: 105, dtype: int64

In [10]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
MaxRate_cutoff = 19
mr19 = heart_data['Max Heart Rate'].value_counts()
mr_to_replace = list(mr19[mr19 > MaxRate_cutoff].index)

# Replace in dataframe
for mr in mr_to_replace:
    heart_data['Max Heart Rate'] = heart_data['Max Heart Rate'].replace(mr,"Other")

# Check to make sure replacement was successful
heart_data['Max Heart Rate'].value_counts()

Max Heart Rate
Other    53
140.0    18
120.0    15
160.0    13
162.0    12
         ..
202.0     1
190.0     1
177.0     1
185.0     1
93.0      1
Name: count, Length: 106, dtype: int64

In [11]:
# Look at  value counts to identify and delete.
ages = heart_data['Age'].value_counts()
Sorted_Ages = ages.sort_values(ascending=False)
Sorted_Ages.head()

Age
58.0    31
57.0    27
62.0    26
63.0    24
60.0    24
Name: count, dtype: int64

In [12]:
#Convert categorical Data to numeric with 'pd.get_dummies'
converted_heart_data = pd.get_dummies(heart_data)
converted_heart_data.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,...,Max Heart Rate_185.0,Max Heart Rate_186.0,Max Heart Rate_187.0,Max Heart Rate_188.0,Max Heart Rate_190.0,Max Heart Rate_192.0,Max Heart Rate_194.0,Max Heart Rate_195.0,Max Heart Rate_202.0,Max Heart Rate_Other
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,0.0,2.3,3.0,...,False,False,False,False,False,False,False,False,False,False
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,1.0,1.5,2.0,...,False,False,False,False,False,False,False,False,False,False
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,1.0,2.6,2.0,...,False,False,False,False,False,False,False,False,False,False
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,0.0,3.5,3.0,...,False,False,True,False,False,False,False,False,False,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,0.0,1.4,1.0,...,False,False,False,False,False,False,False,False,False,False


In [13]:
## Split our preprocessed data into our features and target arrays
X = converted_heart_data.drop('Diagnosis', axis=1).values
y = converted_heart_data['Diagnosis'].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 90
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation = 'relu')
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='leaky_relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

In [41]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
# Train the model
fitModel = nn.fit(X_train_scaled, y_train, epochs=300)

Epoch 1/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5800 - loss: 0.7011
Epoch 2/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6751 - loss: 0.5891
Epoch 3/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7424 - loss: 0.5120
Epoch 4/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8306 - loss: 0.4254
Epoch 5/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8671 - loss: 0.4006
Epoch 6/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8663 - loss: 0.3634
Epoch 7/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8632 - loss: 0.3586 
Epoch 8/300
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8801 - loss: 0.3336
Epoch 9/300
[1m12/12[0m [32m━━━━━━━━━━━━━━

In [43]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - 50ms/step - accuracy: 0.7540 - loss: 4.3024
Loss: 4.3023681640625, Accuracy: 0.7539682388305664
