In [1]:
# Import dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Import CSV
input_file = 'Resources/heart_disease_health_indicators_BRFSS2015.csv'
df = pd.read_csv(input_file)

In [4]:
df.head(5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [5]:
# Scale CSV data
heart_data_scaled = StandardScaler().fit_transform(df[["HighBP","HighChol","CholCheck","BMI","Smoker","Stroke","Diabetes","PhysActivity",
                                                       "Fruits","Veggies","HvyAlcoholConsump","AnyHealthcare","NoDocbcCost","GenHlth","MentHlth",
                                                       "PhysHlth","DiffWalk","Sex","Age","Education","Income"]])

In [6]:
#Save scaled data in a dataframe
df_heart_scaled = pd.DataFrame(heart_data_scaled,columns=["HighBP","HighChol","CholCheck","BMI","Smoker","Stroke","Diabetes","PhysActivity",
                                                       "Fruits","Veggies","HvyAlcoholConsump","AnyHealthcare","NoDocbcCost","GenHlth","MentHlth",
                                                       "PhysHlth","DiffWalk","Sex","Age","Education","Income"])
df_heart_scaled.head(5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.153688,1.165254,0.196922,1.757936,1.120927,-0.205637,-0.425292,-1.762814,-1.316872,0.482087,...,0.226863,-0.303173,2.329121,1.998592,1.233999,2.223615,-0.887021,0.3169,-1.065595,-1.474487
1,-0.866785,-0.858182,-5.078164,-0.511806,1.120927,-0.205637,-0.425292,0.567275,-1.316872,-2.074316,...,-4.407954,3.298445,0.457294,-0.42963,-0.486592,-0.449718,-0.887021,-0.337933,0.963272,-2.440138
2,1.153688,1.165254,0.196922,-0.057858,-0.892119,-0.205637,-0.425292,-1.762814,0.759375,-2.074316,...,0.226863,3.298445,2.329121,3.617407,2.95459,2.223615,-0.887021,0.3169,-1.065595,0.939638
3,1.153688,-0.858182,0.196922,-0.209174,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,0.226863,-0.303173,-0.478619,-0.42963,-0.486592,-0.449718,-0.887021,0.971733,-2.080028,-0.026012
4,1.153688,1.165254,0.196922,-0.663122,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,0.226863,-0.303173,-0.478619,-0.024926,-0.486592,-0.449718,-0.887021,0.971733,-0.051162,-0.991662


## Unsupervised - KMeans

In [7]:
# Define the model
model = KMeans(n_clusters=2)

In [8]:
#Fit the model
model.fit(df_heart_scaled)



In [9]:
# Make predictions
heart_clusters = model.predict(df_heart_scaled)

In [10]:
# Add predicitons to copied dataframe
df_heart_scaled_predictions = df_heart_scaled.copy()
df_heart_scaled_predictions["HeartRisk"] = heart_clusters

df_heart_scaled_predictions.head(5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,HeartRisk
0,1.153688,1.165254,0.196922,1.757936,1.120927,-0.205637,-0.425292,-1.762814,-1.316872,0.482087,...,-0.303173,2.329121,1.998592,1.233999,2.223615,-0.887021,0.3169,-1.065595,-1.474487,0
1,-0.866785,-0.858182,-5.078164,-0.511806,1.120927,-0.205637,-0.425292,0.567275,-1.316872,-2.074316,...,3.298445,0.457294,-0.42963,-0.486592,-0.449718,-0.887021,-0.337933,0.963272,-2.440138,1
2,1.153688,1.165254,0.196922,-0.057858,-0.892119,-0.205637,-0.425292,-1.762814,0.759375,-2.074316,...,3.298445,2.329121,3.617407,2.95459,2.223615,-0.887021,0.3169,-1.065595,0.939638,0
3,1.153688,-0.858182,0.196922,-0.209174,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,-0.303173,-0.478619,-0.42963,-0.486592,-0.449718,-0.887021,0.971733,-2.080028,-0.026012,1
4,1.153688,1.165254,0.196922,-0.663122,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,-0.303173,-0.478619,-0.024926,-0.486592,-0.449718,-0.887021,0.971733,-0.051162,-0.991662,1


## Supervised -- Logistic Regression

In [11]:
# Create copy of scaled dataframe 
df_heart_scaled2 = df_heart_scaled.copy()
df_heart_scaled2["HeartDiseaseorAttack"] = df["HeartDiseaseorAttack"]
df_heart_scaled2.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,HeartDiseaseorAttack
0,1.153688,1.165254,0.196922,1.757936,1.120927,-0.205637,-0.425292,-1.762814,-1.316872,0.482087,...,-0.303173,2.329121,1.998592,1.233999,2.223615,-0.887021,0.3169,-1.065595,-1.474487,0.0
1,-0.866785,-0.858182,-5.078164,-0.511806,1.120927,-0.205637,-0.425292,0.567275,-1.316872,-2.074316,...,3.298445,0.457294,-0.42963,-0.486592,-0.449718,-0.887021,-0.337933,0.963272,-2.440138,0.0
2,1.153688,1.165254,0.196922,-0.057858,-0.892119,-0.205637,-0.425292,-1.762814,0.759375,-2.074316,...,3.298445,2.329121,3.617407,2.95459,2.223615,-0.887021,0.3169,-1.065595,0.939638,0.0
3,1.153688,-0.858182,0.196922,-0.209174,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,-0.303173,-0.478619,-0.42963,-0.486592,-0.449718,-0.887021,0.971733,-2.080028,-0.026012,0.0
4,1.153688,1.165254,0.196922,-0.663122,-0.892119,-0.205637,-0.425292,0.567275,0.759375,0.482087,...,-0.303173,-0.478619,-0.024926,-0.486592,-0.449718,-0.887021,0.971733,-0.051162,-0.991662,0.0


In [12]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [13]:
# Define and split training and testing data
y = df_heart_scaled2["HeartDiseaseorAttack"]
X = df_heart_scaled2.drop(columns=["HeartDiseaseorAttack"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [14]:
# Create logistic regression model and make predictions
logistic_regression_model = LogisticRegression(random_state=9)
lr_model = logistic_regression_model.fit(X_train, y_train)
training_predictions = lr_model.predict(X_train)
testing_predictions = logistic_regression_model.predict(X_test)

In [15]:
# Training classification report
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95    172413
         1.0       0.55      0.13      0.21     17847

    accuracy                           0.91    190260
   macro avg       0.73      0.56      0.58    190260
weighted avg       0.88      0.91      0.88    190260



In [16]:
# Testing classification report
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     57374
         1.0       0.54      0.12      0.20      6046

    accuracy                           0.91     63420
   macro avg       0.73      0.56      0.58     63420
weighted avg       0.88      0.91      0.88     63420



## Supervised -- SVC

In [17]:
# Import dependencies, define model, fit training data, and print model score
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.905


In [18]:
# Make predictions and print classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,))

              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95     57374
         1.0       0.00      0.00      0.00      6046

    accuracy                           0.90     63420
   macro avg       0.45      0.50      0.47     63420
weighted avg       0.82      0.90      0.86     63420



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Supervised -- Decision Trees

In [19]:
# Define X and y
y = df_heart_scaled2["HeartDiseaseorAttack"].values.reshape(-1, 1)
X = df_heart_scaled2.drop(columns=["HeartDiseaseorAttack"])

In [20]:
# Split testing and training data and fit data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
# Import dependency, define model, and make predictions
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)

In [22]:
# Print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92     57374
         1.0       0.24      0.27      0.26      6046

    accuracy                           0.85     63420
   macro avg       0.58      0.59      0.59     63420
weighted avg       0.86      0.85      0.85     63420



## Supervised -- Random Forest

In [23]:
# Import dependencies, define X and y, and split training/testing data
from sklearn.ensemble import RandomForestClassifier
y = df_heart_scaled2["HeartDiseaseorAttack"]
X = df_heart_scaled2.drop(columns=["HeartDiseaseorAttack"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [24]:
# Define model, fit data, and make predictions
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)
rf_model = rf_model.fit(X_train, y_train)
predictions = rf_model.predict(X_test)

In [25]:
# Print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     57374
         1.0       0.45      0.11      0.18      6046

    accuracy                           0.90     63420
   macro avg       0.68      0.55      0.56     63420
weighted avg       0.87      0.90      0.87     63420



In [26]:
# List feature importances
importances = rf_model.feature_importances_
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.19204344334827295, 'BMI'),
 (0.11059921843565504, 'Age'),
 (0.10271723885234146, 'Income'),
 (0.08475586289077425, 'PhysHlth'),
 (0.07157163714066175, 'Education'),
 (0.06991583193088348, 'GenHlth'),
 (0.06395824129488618, 'MentHlth'),
 (0.033161146699945174, 'Fruits'),
 (0.028705904338478187, 'PhysActivity'),
 (0.028521423090090182, 'Diabetes')]

## Supervised -- KNN

In [27]:
# Define dependency, define model, fit data, and make predictions
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [28]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.96      0.92      0.94     59997
         1.0       0.19      0.34      0.25      3423

    accuracy                           0.89     63420
   macro avg       0.58      0.63      0.59     63420
weighted avg       0.92      0.89      0.90     63420



## Neural Networks

In [29]:
# Import dependencies
import sklearn as skl
import tensorflow as tf

In [30]:
# Define X and y and split training/testing data
y = df_heart_scaled2["HeartDiseaseorAttack"]
X = df_heart_scaled2.drop(columns=["HeartDiseaseorAttack"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [31]:
# Define neural network model and add layers
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=46, activation="relu", input_dim=21))

nn_model.add(tf.keras.layers.Dense(units=22, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 46)                1012      
                                                                 
 dense_1 (Dense)             (None, 22)                1034      
                                                                 
 dense_2 (Dense)             (None, 6)                 138       
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 2191 (8.56 KB)
Trainable params: 2191 (8.56 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
# Compile model and fit data
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [33]:
# Print model loss and accuracy
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1982/1982 - 4s - loss: 0.2440 - accuracy: 0.9051 - 4s/epoch - 2ms/step
Loss: 0.2439565360546112, Accuracy: 0.9051088094711304
