In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
heart_df = pd.read_csv('./heart_df_cleaned.csv')
ori_df = pd.read_csv('./heart_disease_health_indicators_BRFSS2015.csv')

In [3]:
print('Target distribution(%):') 
print(np.round(heart_df['HeartDiseaseorAttack'].value_counts(normalize=True)*100, 2))

Target distribution(%):
0.0    89.68
1.0    10.32
Name: HeartDiseaseorAttack, dtype: float64


In [4]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#https://www.kaggle.com/code/hadeerismail/heart-disease-prediction#Dealing-with-the-imbalance-problem-with-multiple-ways
under_sampler = RandomUnderSampler(sampling_strategy=0.5)
x = heart_df.iloc[:, 1:]
y = heart_df['HeartDiseaseorAttack']
x_under, y_under = under_sampler.fit_resample(x, y)

print(f'Before undersampling: {Counter(heart_df["HeartDiseaseorAttack"])}')
print(f'After undersampling: {Counter(y_under)}')

Before undersampling: Counter({0.0: 206064, 1.0: 23717})
After undersampling: Counter({0.0: 47434, 1.0: 23717})


In [5]:
cor_matrix = x_under.corr()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))

#drop both highly correlated columns
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7) or any(upper_tri[column] < 0.01)]

In [6]:
#Data Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(x_under)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y_under, test_size=0.20, random_state = 41)

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_test, y_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_test, y_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_test, y_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_lr))

Logistic Regression Classifier Accuracy:  0.7693767128100626
Logistic Regression Classifier Precision:  0.6827568404749613
Logistic Regression Classifier Recall:  0.5630055342699021
Logistic Regression Classifier AUC Score:  0.7170424713204121


In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_test, y_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_test, y_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_test, y_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_rf))

Random Forest Classifier Accuracy:  0.754901271871267
Random Forest Classifier Precision:  0.6460647030420087
Random Forest Classifier Recall:  0.5696040868454662
Random Forest Classifier AUC Score:  0.7079112430450976


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_test, y_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_test, y_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_test, y_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_test, y_knn))

KNN Classifier Accuracy:  0.7097182207856089
KNN Classifier Precision:  0.613083366573594
KNN Classifier Recall:  0.3271604938271605
KNN Classifier AUC Score:  0.6127043421616659


In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_test, y_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_test, y_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.7352961843861991
Gaussian Naive Bayes Classifier Precision:  0.5904410336118127
Gaussian Naive Bayes Classifier Recall:  0.6468710089399745
Gaussian Naive Bayes Classifier F1 Score:  0.7128721980606723


In [12]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_test, y_lr),
                                        metrics.accuracy_score(y_test, y_rf),
                                        metrics.accuracy_score(y_test, y_knn),
                                        metrics.accuracy_score(y_test, y_gnb)],
                         'Precision': [metrics.precision_score(y_test, y_lr),
                                        metrics.precision_score(y_test, y_rf),
                                        metrics.precision_score(y_test, y_knn),
                                        metrics.precision_score(y_test, y_gnb)],
                         'Recall': [metrics.recall_score(y_test, y_lr),
                                        metrics.recall_score(y_test, y_rf),
                                        metrics.recall_score(y_test, y_knn),
                                        metrics.recall_score(y_test, y_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_test, y_lr),
                                        metrics.roc_auc_score(y_test, y_rf),
                                        metrics.roc_auc_score(y_test, y_knn),
                                        metrics.roc_auc_score(y_test, y_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using All Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.769377,0.682757,0.563006,0.717042
1,Random Forest Classifier,0.754901,0.646065,0.569604,0.707911
2,K-NN Classifier,0.709718,0.613083,0.32716,0.612704
3,Gaussian NB Classifier,0.735296,0.590441,0.646871,0.712872


In [13]:
uncorr_features = x_under.drop(to_drop, axis=1)
uncorr_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71151 entries, 0 to 71150
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   HighBP     71151 non-null  float64
 1   HighChol   71151 non-null  float64
 2   CholCheck  71151 non-null  float64
 3   BMI        71151 non-null  float64
 4   Stroke     71151 non-null  float64
 5   Diabetes   71151 non-null  float64
dtypes: float64(6)
memory usage: 3.3 MB


In [14]:
scaled_selected = scaler.fit_transform(uncorr_features)

In [15]:
X_trn, X_tst, y_trn, y_tst = train_test_split(scaled_selected, y_under, test_size=0.20, random_state = 41)

In [16]:
lr.fit(X_trn, y_trn)
yh_lr = lr.predict(X_tst)
print("Logistic Regression Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_lr))
print("Logistic Regression Classifier Precision: ", metrics.precision_score(y_tst, yh_lr))
print("Logistic Regression Classifier Recall: ", metrics.recall_score(y_tst, yh_lr))
print("Logistic Regression Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_lr))

Logistic Regression Classifier Accuracy:  0.7210315508397162
Logistic Regression Classifier Precision:  0.6476074614760746
Logistic Regression Classifier Recall:  0.3399318859088974
Logistic Regression Classifier AUC Score:  0.6243874262230944


In [17]:
rf.fit(X_trn, y_trn)
yh_rf = rf.predict(X_tst)
print("Random Forest Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_rf))
print("Random Forest Classifier Precision: ", metrics.precision_score(y_tst, yh_rf))
print("Random Forest Classifier Recall: ", metrics.recall_score(y_tst, yh_rf))
print("Random Forest Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_rf))

Random Forest Classifier Accuracy:  0.7159019042934439
Random Forest Classifier Precision:  0.6135181975736569
Random Forest Classifier Recall:  0.3767560664112388
Random Forest Classifier AUC Score:  0.629896967434089


In [18]:
knn.fit(X_trn, y_trn)
yh_knn = knn.predict(X_tst)
print("KNN Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_knn))
print("KNN Classifier Precision: ", metrics.precision_score(y_tst, yh_knn))
print("KNN Classifier Recall: ", metrics.recall_score(y_tst, yh_knn))
print("KNN Classifier AUC Score: ", metrics.roc_auc_score(y_tst, yh_knn))

KNN Classifier Accuracy:  0.6867402150235402
KNN Classifier Precision:  0.5541516245487365
KNN Classifier Recall:  0.2613878246062154
KNN Classifier AUC Score:  0.5788739185970341


In [19]:
gnb.fit(X_trn, y_trn)
yh_gnb = gnb.predict(X_tst)
print("Gaussian Naive Bayes Classifier Accuracy: ", metrics.accuracy_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Precision: ", metrics.precision_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier Recall: ", metrics.recall_score(y_tst, yh_gnb))
print("Gaussian Naive Bayes Classifier F1 Score: ", metrics.roc_auc_score(y_tst, yh_gnb))

Gaussian Naive Bayes Classifier Accuracy:  0.7112641416625677
Gaussian Naive Bayes Classifier Precision:  0.5789332618600911
Gaussian Naive Bayes Classifier Recall:  0.45977011494252873
Gaussian Naive Bayes Classifier F1 Score:  0.6474870715276999


In [20]:
acc_table = pd.DataFrame({'Model': ['Logisitic Regression',
                                   'Random Forest Classifier',
                                   'K-NN Classifier',
                                   'Gaussian NB Classifier'],
                         'Accuracy': [metrics.accuracy_score(y_tst, yh_lr),
                                           metrics.accuracy_score(y_tst, yh_rf),
                                           metrics.accuracy_score(y_tst, yh_knn),
                                     metrics.accuracy_score(y_tst, yh_gnb)],
                         'Precision': [metrics.precision_score(y_tst, yh_lr),
                                            metrics.precision_score(y_tst, yh_rf),
                                            metrics.precision_score(y_tst, yh_knn),
                                      metrics.precision_score(y_tst, yh_gnb)],
                         'Recall': [metrics.recall_score(y_tst, yh_lr),
                                         metrics.recall_score(y_tst, yh_rf),
                                         metrics.recall_score(y_tst, yh_knn),
                                   metrics.recall_score(y_tst, yh_gnb)],
                         'AUC Score': [metrics.roc_auc_score(y_tst, yh_lr),
                                     metrics.roc_auc_score(y_tst, yh_rf),
                                     metrics.roc_auc_score(y_tst, yh_knn),
                                      metrics.roc_auc_score(y_tst, yh_gnb)]})
acc_table.style.set_caption("Model Evaluation Table Using Un-Correlated Features")

Unnamed: 0,Model,Accuracy,Precision,Recall,AUC Score
0,Logisitic Regression,0.721032,0.647607,0.339932,0.624387
1,Random Forest Classifier,0.715902,0.613518,0.376756,0.629897
2,K-NN Classifier,0.68674,0.554152,0.261388,0.578874
3,Gaussian NB Classifier,0.711264,0.578933,0.45977,0.647487
