In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
input_file = 'heart_attack_prediction_dataset.csv'
output_file = 'output.csv'
df = pd.read_csv(input_file)

In [3]:
df[['Systolic', 'Diastolic']] = df['Blood Pressure'].str.split('/', expand=True)
df = df.drop('Blood Pressure', axis=1)

In [4]:
df['Diet'] = pd.factorize(df['Diet'])[0]

In [5]:
df['Continent'] = pd.factorize(df['Continent'])[0]

In [6]:
df['Hemisphere'] = pd.factorize(df['Hemisphere'])[0]

In [7]:
df['Sex'] = pd.factorize(df['Sex'])[0]

In [8]:
df['Country'] = pd.factorize(df['Country'])[0]

In [9]:
df.to_csv(output_file, index=False)

In [10]:
df.head(10)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Systolic,Diastolic
0,BMW7812,67,0,208,72,0,0,1,0,0,...,31.251233,286,0,6,0,0,0,0,158,88
1,CZE1114,21,0,389,98,1,1,1,1,1,...,27.194973,235,1,7,1,1,1,0,165,93
2,BNI9906,21,1,324,72,1,0,0,0,0,...,28.176571,587,4,4,2,2,1,0,174,99
3,JLN3497,84,0,383,73,1,1,1,0,1,...,36.464704,378,3,4,1,1,1,0,163,100
4,GFO8847,66,0,318,93,1,1,1,1,0,...,21.809144,231,1,5,3,3,1,0,91,88
5,ZOO7941,54,1,297,48,1,1,1,0,1,...,20.14684,795,5,10,4,2,1,1,172,86
6,WYV0966,90,0,358,84,0,0,1,0,1,...,28.885811,284,4,10,1,1,1,1,102,73
7,XXM0972,84,0,220,107,0,0,1,1,1,...,22.221862,370,6,7,5,3,1,1,131,68
8,XCQ5937,20,0,145,68,1,0,1,1,0,...,35.809901,790,7,4,6,0,0,0,144,105
9,FTJ5456,43,1,248,55,0,1,1,1,1,...,22.558917,232,7,7,5,3,1,0,160,70


In [11]:
heart_data_scaled = StandardScaler().fit_transform(df[["Age","Sex","Cholesterol","Heart Rate",
                                                       "Diabetes","Family History","Smoking","Obesity",
                                                       "Alcohol Consumption","Exercise Hours Per Week",
                                                       "Diet","Previous Heart Problems","Medication Use",
                                                       "Stress Level","Sedentary Hours Per Day",
                                                       "Income","BMI","Triglycerides","Physical Activity Days Per Week",
                                                       "Sleep Hours Per Day",
                                                       "Systolic","Diastolic"]])

In [12]:
df_heart_scaled = pd.DataFrame(heart_data_scaled,columns=["Age","Sex","Cholesterol","Heart Rate","Diabetes","Family History","Smoking","Obesity",
                                                       "Alcohol Consumption","Exercise Hours Per Week",
                                                       "Diet","Previous Heart Problems","Medication Use",
                                                       "Stress Level","Sedentary Hours Per Day",
                                                       "Income","BMI","Triglycerides","Physical Activity Days Per Week",
                                                       "Sleep Hours Per Day",
                                                       "Systolic","Diastolic"])
df_heart_scaled.head(10)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Systolic,Diastolic
0,0.625557,-0.658765,-0.641579,-0.147042,-1.369651,-0.986061,0.339157,-1.002857,-1.219867,-1.010838,...,-0.996696,1.234604,0.179251,1.28013,0.373454,-0.588539,-1.528843,-0.51475,0.870044,0.193782
1,-1.539322,-0.658765,1.596895,1.118179,0.730113,1.014136,0.339157,0.997151,0.819762,-1.418027,...,-0.996696,-1.563129,-0.297225,1.582523,-0.268479,-0.816487,-1.090738,-0.011823,1.135714,0.53448
2,-1.539322,1.517992,0.793023,-0.147042,0.730113,-0.986061,-2.948488,-1.002857,-1.219867,-1.372188,...,1.003315,1.234604,1.001031,0.955917,-0.113134,0.7568,0.223577,-1.520604,1.47729,0.943319
3,1.425621,-0.658765,1.522691,-0.09838,0.730113,1.014136,0.339157,-1.002857,0.819762,-0.032188,...,-0.996696,1.234604,0.477557,-0.404902,1.198524,-0.177339,-0.214528,-1.520604,1.059809,1.011458
4,0.578495,-0.658765,0.71882,0.874867,0.730113,1.014136,0.339157,0.997151,-1.219867,-0.727941,...,-0.996696,0.185454,-1.29217,0.028445,-1.120826,-0.834365,-1.090738,-1.017677,-1.672797,0.193782
5,0.013743,1.517992,0.459107,-1.314938,0.730113,1.014136,0.339157,-1.002857,0.819762,-1.623483,...,1.003315,-1.213412,0.520767,1.031094,-1.383898,1.68647,0.661682,1.496959,1.401384,0.057502
6,1.707997,-0.658765,1.21351,0.436906,-1.369651,-0.986061,0.339157,-1.002857,0.819762,-1.022943,...,-0.996696,0.53517,-1.548207,0.399486,-0.000892,-0.597478,0.223577,1.496959,-1.255316,-0.828314
7,1.425621,-0.658765,-0.493172,1.55614,-1.369651,-0.986061,0.339157,0.997151,0.819762,-1.138835,...,1.003315,-0.513979,1.312717,-0.448925,-1.055511,-0.213096,1.099787,-0.011823,-0.154683,-1.169013
8,-1.586385,-0.658765,-1.420716,-0.341691,0.730113,-0.986061,0.339157,0.997151,-1.219867,1.185116,...,-0.996696,-0.164263,1.544965,-1.652925,1.094897,1.664122,1.537893,-1.520604,0.338704,1.352157
9,-0.503945,1.517992,-0.146889,-0.974302,-1.369651,1.014136,0.339157,0.997151,0.819762,-1.697919,...,-0.996696,-0.513979,-0.559286,0.638444,-1.002169,-0.829896,1.537893,-0.011823,0.94595,-1.032733


## Unsupervised (most accurate results)

In [13]:
model = KMeans(n_clusters=2)

In [14]:
model.fit(df_heart_scaled)



In [15]:
heart_clusters = model.predict(df_heart_scaled)

In [16]:
df_heart_scaled_predictions = df_heart_scaled.copy()
df_heart_scaled_predictions["HeartRisk"] = heart_clusters

df_heart_scaled_predictions.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Systolic,Diastolic,HeartRisk
0,0.625557,-0.658765,-0.641579,-0.147042,-1.369651,-0.986061,0.339157,-1.002857,-1.219867,-1.010838,...,1.234604,0.179251,1.28013,0.373454,-0.588539,-1.528843,-0.51475,0.870044,0.193782,0
1,-1.539322,-0.658765,1.596895,1.118179,0.730113,1.014136,0.339157,0.997151,0.819762,-1.418027,...,-1.563129,-0.297225,1.582523,-0.268479,-0.816487,-1.090738,-0.011823,1.135714,0.53448,0
2,-1.539322,1.517992,0.793023,-0.147042,0.730113,-0.986061,-2.948488,-1.002857,-1.219867,-1.372188,...,1.234604,1.001031,0.955917,-0.113134,0.7568,0.223577,-1.520604,1.47729,0.943319,1
3,1.425621,-0.658765,1.522691,-0.09838,0.730113,1.014136,0.339157,-1.002857,0.819762,-0.032188,...,1.234604,0.477557,-0.404902,1.198524,-0.177339,-0.214528,-1.520604,1.059809,1.011458,0
4,0.578495,-0.658765,0.71882,0.874867,0.730113,1.014136,0.339157,0.997151,-1.219867,-0.727941,...,0.185454,-1.29217,0.028445,-1.120826,-0.834365,-1.090738,-1.017677,-1.672797,0.193782,0


## Supervised -- Logistic Regression

In [17]:
df_heart_scaled2 = df_heart_scaled.copy()
df_heart_scaled2["Heart Attack Risk"] = df["Heart Attack Risk"]
df_heart_scaled2.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Systolic,Diastolic,Heart Attack Risk
0,0.625557,-0.658765,-0.641579,-0.147042,-1.369651,-0.986061,0.339157,-1.002857,-1.219867,-1.010838,...,1.234604,0.179251,1.28013,0.373454,-0.588539,-1.528843,-0.51475,0.870044,0.193782,0
1,-1.539322,-0.658765,1.596895,1.118179,0.730113,1.014136,0.339157,0.997151,0.819762,-1.418027,...,-1.563129,-0.297225,1.582523,-0.268479,-0.816487,-1.090738,-0.011823,1.135714,0.53448,0
2,-1.539322,1.517992,0.793023,-0.147042,0.730113,-0.986061,-2.948488,-1.002857,-1.219867,-1.372188,...,1.234604,1.001031,0.955917,-0.113134,0.7568,0.223577,-1.520604,1.47729,0.943319,0
3,1.425621,-0.658765,1.522691,-0.09838,0.730113,1.014136,0.339157,-1.002857,0.819762,-0.032188,...,1.234604,0.477557,-0.404902,1.198524,-0.177339,-0.214528,-1.520604,1.059809,1.011458,0
4,0.578495,-0.658765,0.71882,0.874867,0.730113,1.014136,0.339157,0.997151,-1.219867,-0.727941,...,0.185454,-1.29217,0.028445,-1.120826,-0.834365,-1.090738,-1.017677,-1.672797,0.193782,0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [19]:
y = df_heart_scaled2["Heart Attack Risk"]
X = df_heart_scaled2.drop(columns=["Heart Attack Risk"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [20]:
logistic_regression_model = LogisticRegression(random_state=9)
lr_model = logistic_regression_model.fit(X_train, y_train)
training_predictions = lr_model.predict(X_train)
testing_predictions = logistic_regression_model.predict(X_test)

In [21]:
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      4218
           1       0.00      0.00      0.00      2354

    accuracy                           0.64      6572
   macro avg       0.32      0.50      0.39      6572
weighted avg       0.41      0.64      0.50      6572



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1406
           1       0.00      0.00      0.00       785

    accuracy                           0.64      2191
   macro avg       0.32      0.50      0.39      2191
weighted avg       0.41      0.64      0.50      2191



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Supervised -- SVC

In [23]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.642


In [24]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1406
           1       0.00      0.00      0.00       785

    accuracy                           0.64      2191
   macro avg       0.32      0.50      0.39      2191
weighted avg       0.41      0.64      0.50      2191



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Supervised -- Decision Trees

In [25]:
y = df_heart_scaled2["Heart Attack Risk"].values.reshape(-1, 1)
X = df_heart_scaled2.drop(columns=["Heart Attack Risk"])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)

In [28]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.64      0.63      0.63      1409
           1       0.35      0.36      0.35       782

    accuracy                           0.53      2191
   macro avg       0.49      0.49      0.49      2191
weighted avg       0.53      0.53      0.53      2191



## Supervised -- Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
y = df_heart_scaled2["Heart Attack Risk"]
X = df_heart_scaled2.drop(columns=["Heart Attack Risk"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [35]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)
rf_model = rf_model.fit(X_train, y_train)
predictions = rf_model.predict(X_test)

In [36]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1409
           1       0.40      0.01      0.01       782

    accuracy                           0.64      2191
   macro avg       0.52      0.50      0.40      2191
weighted avg       0.56      0.64      0.51      2191



In [37]:
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.0822650464130001, 'BMI'),
 (0.08160813939628872, 'Sedentary Hours Per Day'),
 (0.08134381410903704, 'Income'),
 (0.08126686725662448, 'Exercise Hours Per Week'),
 (0.08033056711413102, 'Triglycerides'),
 (0.07754317158210133, 'Cholesterol'),
 (0.07299243394210482, 'Systolic'),
 (0.07158706104430276, 'Heart Rate'),
 (0.07113410077492625, 'Age'),
 (0.0670401566463874, 'Diastolic')]

## Supervised -- KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [33]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.69      0.64      0.67      1519
           1       0.31      0.36      0.33       672

    accuracy                           0.56      2191
   macro avg       0.50      0.50      0.50      2191
weighted avg       0.57      0.56      0.56      2191



## Neural Networks

In [38]:
import sklearn as skl
import tensorflow as tf

In [63]:
y = df_heart_scaled2["Heart Attack Risk"]
X = df_heart_scaled2.drop(columns=["Heart Attack Risk"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [64]:
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=48, activation="relu", input_dim=22))

nn_model.add(tf.keras.layers.Dense(units=48, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=48, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

nn_model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_30 (Dense)            (None, 48)                1104      
                                                                 
 dense_31 (Dense)            (None, 48)                2352      
                                                                 
 dense_32 (Dense)            (None, 48)                2352      
                                                                 
 dense_33 (Dense)            (None, 1)                 49        
                                                                 
Total params: 5857 (22.88 KB)
Trainable params: 5857 (22.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [62]:
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

69/69 - 0s - loss: 2.9727 - accuracy: 0.5322 - 363ms/epoch - 5ms/step
Loss: 2.972691297531128, Accuracy: 0.5321770906448364
