In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
heart_df = pd.read_csv('./heart_df_cleaned.csv')
ori_df = pd.read_csv('./heart_disease_health_indicators_BRFSS2015.csv')

## Correlation Matrix

In [3]:
cor_matrix = heart_df.corr()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))
upper_tri

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
HeartDiseaseorAttack,,0.201271,0.176279,0.049995,0.039926,0.105154,0.198863,0.170816,-0.073267,-0.007128,...,0.025899,0.022076,0.246411,0.052756,0.170473,0.202779,0.089717,0.223626,-0.082466,-0.122908
HighBP,,,0.284186,0.11122,0.194126,0.074237,0.124426,0.261976,-0.104131,-0.019329,...,0.052084,0.002216,0.272562,0.037374,0.144413,0.211498,0.047155,0.339808,-0.112676,-0.139782
HighChol,,,,0.094753,0.089615,0.074627,0.089258,0.203327,-0.063266,-0.026125,...,0.052412,0.0029,0.187929,0.050212,0.110801,0.135826,0.022894,0.263866,-0.049838,-0.061871
CholCheck,,,,,0.04242,-0.003776,0.027894,0.075701,-0.004409,0.017973,...,0.115539,-0.054198,0.062782,-0.001549,0.040612,0.048969,-0.024255,0.095996,-0.009758,0.002161
BMI,,,,,,-0.009196,0.011062,0.212027,-0.127864,-0.067528,...,-0.00856,0.045837,0.208411,0.068653,0.102844,0.182604,0.030989,-0.049347,-0.074568,-0.069192
Smoker,,,,,,,0.054438,0.046774,-0.066981,-0.061947,...,-0.013983,0.037353,0.134979,0.077715,0.100514,0.108179,0.096709,0.107653,-0.135793,-0.095418
Stroke,,,,,,,,0.100276,-0.05944,-0.004613,...,0.013565,0.028691,0.16987,0.062111,0.140919,0.169442,0.00356,0.12804,-0.064319,-0.117232
Diabetes,,,,,,,,,-0.103408,-0.025462,...,0.024911,0.023568,0.284881,0.057698,0.160485,0.210638,0.032243,0.184642,-0.107742,-0.147102
PhysActivity,,,,,,,,,,0.125283,...,0.024095,-0.04662,-0.237676,-0.106175,-0.199562,-0.235943,0.033586,-0.087539,0.171224,0.166171
Fruits,,,,,,,,,,,...,0.022756,-0.032521,-0.071433,-0.052409,-0.024744,-0.030188,-0.088628,0.073726,0.085174,0.05123


In [28]:
#drop both highly correlated columns
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7) or any(upper_tri[column] < 0.01)]
to_drop

['Smoker',
 'PhysActivity',
 'Fruits',
 'Veggies',
 'HvyAlcoholConsump',
 'AnyHealthcare',
 'NoDocbcCost',
 'GenHlth',
 'MentHlth',
 'PhysHlth',
 'DiffWalk',
 'Sex',
 'Age',
 'Education',
 'Income']

# Data Modelling

In [5]:
target = heart_df['HeartDiseaseorAttack']
features = heart_df.drop('HeartDiseaseorAttack',axis=1)
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229781 entries, 0 to 229780
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   HighBP             229781 non-null  float64
 1   HighChol           229781 non-null  float64
 2   CholCheck          229781 non-null  float64
 3   BMI                229781 non-null  float64
 4   Smoker             229781 non-null  float64
 5   Stroke             229781 non-null  float64
 6   Diabetes           229781 non-null  float64
 7   PhysActivity       229781 non-null  float64
 8   Fruits             229781 non-null  float64
 9   Veggies            229781 non-null  float64
 10  HvyAlcoholConsump  229781 non-null  float64
 11  AnyHealthcare      229781 non-null  float64
 12  NoDocbcCost        229781 non-null  float64
 13  GenHlth            229781 non-null  float64
 14  MentHlth           229781 non-null  float64
 15  PhysHlth           229781 non-null  float64
 16  Di

In [6]:
#Data Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
#X_en_scaled =  pd.DataFrame(scaled_features, columns = features.columns)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, target, test_size=0.20, random_state = 41)

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_test, y_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_test, y_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_test, y_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_lr))

Logistic Regression Classifier Accuracy:  0.9008638509911439
Logistic Regression Classifier Precision:  0.5484739676840216
Logistic Regression Classifier Recall:  0.1310034305317324
Logistic Regression Classifier AUC Score:  0.5594110945795513


In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_test, y_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_test, y_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_test, y_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_rf))

Random Forest Classifier Accuracy:  0.8925299736710404
Random Forest Classifier Precision:  0.39807264640474427
Random Forest Classifier Recall:  0.11513722126929674
Random Forest Classifier AUC Score:  0.5477364356897424


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_test, y_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_test, y_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_test, y_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_knn))

KNN Classifier Accuracy:  0.8901364318819767
KNN Classifier Precision:  0.3358908780903666
KNN Classifier Recall:  0.08447684391080618
KNN Classifier AUC Score:  0.5328058304750068


In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_test, y_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.8062536719106991
Gaussian Naive Bayes Classifier Precision:  0.2709098768100281
Gaussian Naive Bayes Classifier Recall:  0.5375214408233276
Gaussian Naive Bayes Classifier F1 Score:  0.6870640648041758


In [12]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_test, y_lr),
                                        metrics.accuracy_score(y_test, y_rf),
                                        metrics.accuracy_score(y_test, y_knn),
                                        metrics.accuracy_score(y_test, y_gnb)],
                         'Precision': [metrics.precision_score(y_test, y_lr),
                                        metrics.precision_score(y_test, y_rf),
                                        metrics.precision_score(y_test, y_knn),
                                        metrics.precision_score(y_test, y_gnb)],
                         'Recall': [metrics.recall_score(y_test, y_lr),
                                        metrics.recall_score(y_test, y_rf),
                                        metrics.recall_score(y_test, y_knn),
                                        metrics.recall_score(y_test, y_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_test, y_lr),
                                        metrics.roc_auc_score(y_test, y_rf),
                                        metrics.roc_auc_score(y_test, y_knn),
                                        metrics.roc_auc_score(y_test, y_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using All Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.900864,0.548474,0.131003,0.559411
1,Random Forest Classifier,0.89253,0.398073,0.115137,0.547736
2,K-NN Classifier,0.890136,0.335891,0.084477,0.532806
3,Gaussian NB Classifier,0.806254,0.27091,0.537521,0.687064


In [29]:
uncorr_features = features.drop(to_drop, axis=1)
uncorr_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229781 entries, 0 to 229780
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   HighBP     229781 non-null  float64
 1   HighChol   229781 non-null  float64
 2   CholCheck  229781 non-null  float64
 3   BMI        229781 non-null  float64
 4   Stroke     229781 non-null  float64
 5   Diabetes   229781 non-null  float64
dtypes: float64(6)
memory usage: 10.5 MB


In [14]:
scaled_uncorr = scaler.fit_transform(uncorr_features)

In [15]:
X_trn, X_tst, y_trn, y_tst = train_test_split(scaled_uncorr, target, test_size = 0.2, random_state = 42)

In [16]:
lr.fit(X_trn, y_trn)
yh_lr = lr.predict(X_tst)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_tst, yh_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_tst, yh_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_lr))

Logistic Regression Classifier Accuracy:  0.8982091955523641
Logistic Regression Classifier Precision:  0.5111607142857143
Logistic Regression Classifier Recall:  0.048848122866894196
Logistic Regression Classifier AUC Score:  0.5217707381157026


In [17]:
rf.fit(X_trn, y_trn)
yh_rf = rf.predict(X_tst)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_tst, yh_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_tst, yh_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_rf))

Random Forest Classifier Accuracy:  0.897686968252932
Random Forest Classifier Precision:  0.4838709677419355
Random Forest Classifier Recall:  0.04479522184300341
Random Forest Classifier AUC Score:  0.5196837094458178


In [18]:
knn.fit(X_trn, y_trn)
yh_knn = knn.predict(X_tst)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_tst, yh_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_tst, yh_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_knn))

KNN Classifier Accuracy:  0.8864808407859521
KNN Classifier Precision:  0.24542829643888353
KNN Classifier Recall:  0.05439419795221843
KNN Classifier AUC Score:  0.5176984438112155


In [19]:
gnb.fit(X_trn, y_trn)
yh_gnb = gnb.predict(X_tst)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_tst, yh_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.8534499640968731
Gaussian Naive Bayes Classifier Precision:  0.305750616815335
Gaussian Naive Bayes Classifier Recall:  0.3436433447098976
Gaussian Naive Bayes Classifier F1 Score:  0.6275026920065032


In [20]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_tst, yh_lr),
                                           metrics.accuracy_score(y_tst, yh_rf),
                                           metrics.accuracy_score(y_tst, yh_knn),
                                     metrics.accuracy_score(y_tst, yh_gnb)],
                         'Precision': [metrics.precision_score(y_tst, yh_lr),
                                            metrics.precision_score(y_tst, yh_rf),
                                            metrics.precision_score(y_tst, yh_knn),
                                      metrics.precision_score(y_tst, yh_gnb)],
                         'Recall': [metrics.recall_score(y_tst, yh_lr),
                                         metrics.recall_score(y_tst, yh_rf),
                                         metrics.recall_score(y_tst, yh_knn),
                                   metrics.recall_score(y_tst, yh_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_tst, yh_lr),
                                     metrics.roc_auc_score(y_tst, yh_rf),
                                     metrics.roc_auc_score(y_tst, yh_knn),
                                      metrics.roc_auc_score(y_tst, yh_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using Un-Correlated Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.898209,0.511161,0.048848,0.521771
1,Random Forest Classifier,0.897687,0.483871,0.044795,0.519684
2,K-NN Classifier,0.886481,0.245428,0.054394,0.517698
3,Gaussian NB Classifier,0.85345,0.305751,0.343643,0.627503
