In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
heart_df = pd.read_csv('./heart_df_cleaned.csv')
ori_df = pd.read_csv('./heart_disease_health_indicators_BRFSS2015.csv')

In [3]:
print('Target distribution(%):') 
print(np.round(heart_df['HeartDiseaseorAttack'].value_counts(normalize=True)*100, 2))

Target distribution(%):
0.0    89.68
1.0    10.32
Name: HeartDiseaseorAttack, dtype: float64


In [4]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#https://www.kaggle.com/code/hadeerismail/heart-disease-prediction#Dealing-with-the-imbalance-problem-with-multiple-ways
under_sampler = RandomUnderSampler(sampling_strategy=0.6)
x = heart_df.iloc[:, 1:]
y = heart_df['HeartDiseaseorAttack']
x_under, y_under = under_sampler.fit_resample(x, y)

print(f'Before undersampling: {Counter(heart_df["HeartDiseaseorAttack"])}')
print(f'After undersampling: {Counter(y_under)}')

Before undersampling: Counter({0.0: 206064, 1.0: 23717})
After undersampling: Counter({0.0: 39528, 1.0: 23717})


In [5]:
cor_matrix = x_under.corr()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7) or any(upper_tri[column] < 0.01)]

In [6]:
#Data Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(x_under)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y_under, test_size=0.30, random_state = 41)

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_test, y_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_test, y_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_test, y_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_lr))

Logistic Regression Classifier Accuracy:  0.7606197955096448
Logistic Regression Classifier Precision:  0.6962602614776527
Logistic Regression Classifier Recall:  0.6428972487366648
Logistic Regression Classifier AUC Score:  0.737144826899978


In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_test, y_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_test, y_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_test, y_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_rf))

Random Forest Classifier Accuracy:  0.7444924633709287
Random Forest Classifier Precision:  0.6629905471211687
Random Forest Classifier Recall:  0.6497754070746772
Random Forest Classifier AUC Score:  0.7256050031153977


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_test, y_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_test, y_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_test, y_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_knn))

KNN Classifier Accuracy:  0.7228312427532413
KNN Classifier Precision:  0.6345793043729254
KNN Classifier Recall:  0.6172094329028636
KNN Classifier AUC Score:  0.7017692734134572


In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_test, y_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.7256245388426268
Gaussian Naive Bayes Classifier Precision:  0.6287939833467634
Gaussian Naive Bayes Classifier Recall:  0.6572150477259966
Gaussian Naive Bayes Classifier F1 Score:  0.7119830512891586


In [12]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_test, y_lr),
                                        metrics.accuracy_score(y_test, y_rf),
                                        metrics.accuracy_score(y_test, y_knn),
                                        metrics.accuracy_score(y_test, y_gnb)],
                         'Precision': [metrics.precision_score(y_test, y_lr),
                                        metrics.precision_score(y_test, y_rf),
                                        metrics.precision_score(y_test, y_knn),
                                        metrics.precision_score(y_test, y_gnb)],
                         'Recall': [metrics.recall_score(y_test, y_lr),
                                        metrics.recall_score(y_test, y_rf),
                                        metrics.recall_score(y_test, y_knn),
                                        metrics.recall_score(y_test, y_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_test, y_lr),
                                        metrics.roc_auc_score(y_test, y_rf),
                                        metrics.roc_auc_score(y_test, y_knn),
                                        metrics.roc_auc_score(y_test, y_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using All Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.76062,0.69626,0.642897,0.737145
1,Random Forest Classifier,0.744492,0.662991,0.649775,0.725605
2,K-NN Classifier,0.722831,0.634579,0.617209,0.701769
3,Gaussian NB Classifier,0.725625,0.628794,0.657215,0.711983


In [13]:
uncorr_features = x_under.drop(to_drop, axis=1)
uncorr_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63245 entries, 0 to 63244
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   HighBP     63245 non-null  float64
 1   HighChol   63245 non-null  float64
 2   CholCheck  63245 non-null  float64
 3   BMI        63245 non-null  float64
 4   Stroke     63245 non-null  float64
 5   Diabetes   63245 non-null  float64
dtypes: float64(6)
memory usage: 2.9 MB


In [14]:
scaled_selected = scaler.fit_transform(uncorr_features)

In [15]:
X_trn, X_tst, y_trn, y_tst = train_test_split(scaled_selected, y_under, test_size=0.30, random_state = 41)

In [16]:
lr.fit(X_trn, y_trn)
yh_lr = lr.predict(X_tst)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_tst, yh_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_tst, yh_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_lr))

Logistic Regression Classifier Accuracy:  0.7007483925371562
Logistic Regression Classifier Precision:  0.6547517123287672
Logistic Regression Classifier Recall:  0.4293935991016283
Logistic Regression Classifier AUC Score:  0.6466377278208564


In [17]:
rf.fit(X_trn, y_trn)
yh_rf = rf.predict(X_tst)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_tst, yh_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_tst, yh_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_rf))

Random Forest Classifier Accuracy:  0.6985875408453673
Random Forest Classifier Precision:  0.59410582719357
Random Forest Classifier Recall:  0.6225435148792813
Random Forest Classifier AUC Score:  0.6834236561738178


In [18]:
knn.fit(X_trn, y_trn)
yh_knn = knn.predict(X_tst)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_tst, yh_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_tst, yh_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_knn))

KNN Classifier Accuracy:  0.6783493201222726
KNN Classifier Precision:  0.5782615361030201
KNN Classifier Recall:  0.5294778214486243
KNN Classifier AUC Score:  0.6486629613572236


In [19]:
gnb.fit(X_trn, y_trn)
yh_gnb = gnb.predict(X_tst)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_tst, yh_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.6937915041635923
Gaussian Naive Bayes Classifier Precision:  0.599065138721351
Gaussian Naive Bayes Classifier Recall:  0.5576923076923077
Gaussian Naive Bayes Classifier F1 Score:  0.6666520610191496


In [20]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_tst, yh_lr),
                                           metrics.accuracy_score(y_tst, yh_rf),
                                           metrics.accuracy_score(y_tst, yh_knn),
                                     metrics.accuracy_score(y_tst, yh_gnb)],
                         'Precision': [metrics.precision_score(y_tst, yh_lr),
                                            metrics.precision_score(y_tst, yh_rf),
                                            metrics.precision_score(y_tst, yh_knn),
                                      metrics.precision_score(y_tst, yh_gnb)],
                         'Recall': [metrics.recall_score(y_tst, yh_lr),
                                         metrics.recall_score(y_tst, yh_rf),
                                         metrics.recall_score(y_tst, yh_knn),
                                   metrics.recall_score(y_tst, yh_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_tst, yh_lr),
                                     metrics.roc_auc_score(y_tst, yh_rf),
                                     metrics.roc_auc_score(y_tst, yh_knn),
                                      metrics.roc_auc_score(y_tst, yh_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using Un-Correlated Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.700748,0.654752,0.429394,0.646638
1,Random Forest Classifier,0.698588,0.594106,0.622544,0.683424
2,K-NN Classifier,0.678349,0.578262,0.529478,0.648663
3,Gaussian NB Classifier,0.693792,0.599065,0.557692,0.666652
