In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler

In [52]:
df = pd.read_csv('CardiacPrediction.csv')
df

Unnamed: 0,SEQN,Gender,Age,Annual-Family-Income,Ratio-Family-Income-Poverty,X60-sec-pulse,Systolic,Diastolic,Weight,Height,...,Total-Cholesterol,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Health-Insurance,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke,CoronaryHeartDisease
0,2,1,77,8,5.00,68,98,56,75.4,174.0,...,5.56,1.39,4.7,3,3,1,2,2,2,0
1,5,1,49,11,5.00,66,122,83,92.5,178.3,...,7.21,1.08,5.5,1,1,1,2,2,2,0
2,12,1,37,11,4.93,64,174,99,99.2,180.0,...,4.03,0.98,5.2,2,1,1,2,1,1,0
3,13,1,70,3,1.07,102,130,66,63.6,157.7,...,8.12,1.28,7.6,3,3,1,1,1,2,0
4,14,1,81,5,2.67,72,136,61,75.5,166.2,...,4.50,1.04,5.8,1,1,1,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37074,93691,1,25,10,1.59,92,112,76,39.2,136.5,...,4.14,1.27,5.8,2,2,1,2,1,2,0
37075,93695,2,76,4,1.43,78,112,46,59.1,165.8,...,3.62,1.76,5.6,2,2,1,2,2,2,0
37076,93697,2,80,7,2.97,74,146,58,71.7,152.2,...,6.62,2.90,5.7,2,2,1,2,2,2,0
37077,93700,1,35,1,0.00,76,106,66,78.2,173.3,...,3.72,1.09,5.2,2,2,1,2,2,1,0


## Exploratory Data Analysis

In [53]:
#Removes unimportant variables as shown in Research Paper 2
df.drop(['SEQN','Annual-Family-Income','Height','Ratio-Family-Income-Poverty','X60-sec-pulse',
          'Health-Insurance','Lymphocyte','Monocyte','Eosinophils','Total-Cholesterol','Mean-Cell-Vol','Mean-Cell-Hgb-Conc.','Hematocrit','Segmented-Neutrophils'], axis = 1, inplace=True)

In [54]:
#All Columns in the dataset
print(list(df))

['Gender', 'Age', 'Systolic', 'Diastolic', 'Weight', 'Body-Mass-Index', 'White-Blood-Cells', 'Basophils', 'Red-Blood-Cells', 'Hemoglobin', 'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol', 'Red-Cell-Distribution-Width', 'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose', 'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid', 'Triglycerides', 'HDL', 'Glycohemoglobin', 'Vigorous-work', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease']


In [55]:
heartDiseaseCount = (df['CoronaryHeartDisease'] == 1).sum()
noHeartDiseaseCount = (df['CoronaryHeartDisease'] == 0).sum()
print(f'The number of people in this dataset with heart disease is {heartDiseaseCount} while the number of people without it is {noHeartDiseaseCount}')

The number of people in this dataset with heart disease is 1508 while the number of people without it is 35571


## Balancing the Imbalanced Data

In [56]:
X = df.drop('CoronaryHeartDisease', axis=1)
y = df['CoronaryHeartDisease']

# undersampling the data
rus = RandomUnderSampler()

X_resampled, y_resampled = rus.fit_resample(X, y)


## LASSO for Feature Selection

In [57]:
# split into training, validation, and testing
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4)

# split temporary set into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (1809, 36) (1809,)
Validation set shapes: (603, 36) (603,)
Testing set shapes: (604, 36) (604,)


In [58]:
#Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

In [59]:
# create the lasso model
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)

lasso_cv.fit(X_train, y_train)

print('The optimal alpha is', lasso_cv.alpha_)

# finding the predicted probability
y_pred_proba = lasso_cv.predict(X_valid)
# converting to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)   # might need to lower threshold
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

The optimal alpha is 0.01
Accuracy: 0.8009950248756219


In [60]:
#Feature selection in Lasso
lassoCoef = pd.Series(lasso_cv.coef_, index=X.columns)
selectedFeatures = lassoCoef[lassoCoef != 0]
print(f'The selected features are \n{selectedFeatures}')

The selected features are 
Gender                        -0.021216
Age                            0.234847
Systolic                      -0.003612
Diastolic                     -0.002927
White-Blood-Cells              0.019726
Mean-cell-Hemoglobin          -0.000732
Platelet-count                -0.034910
Red-Cell-Distribution-Width    0.013532
ALP                            0.013334
Cholesterol                   -0.064700
Creatinine                     0.005909
Protein                       -0.003603
Uric.Acid                      0.018828
Triglycerides                  0.013772
HDL                           -0.020338
Glycohemoglobin                0.026515
Vigorous-work                  0.002558
Moderate-work                 -0.001117
Diabetes                      -0.020292
Blood-Rel-Diabetes            -0.007630
Blood-Rel-Stroke              -0.049933
dtype: float64


## Machine Learning Experiments

In [61]:
df_reduced = df[[
    "CoronaryHeartDisease", "Gender", "Age", "Diastolic", "Red-Blood-Cells",
    "Platelet-count", "Mean-Platelet-Vol", "Red-Cell-Distribution-Width", "ALP",
    "Cholesterol", "Creatinine", "Iron", "LDH", "Uric.Acid", "Triglycerides",
    "HDL", "Glycohemoglobin", "Vigorous-work", "Diabetes", "Blood-Rel-Diabetes",
    "Blood-Rel-Stroke"
                 ]]
df_reduced

Unnamed: 0,CoronaryHeartDisease,Gender,Age,Diastolic,Red-Blood-Cells,Platelet-count,Mean-Platelet-Vol,Red-Cell-Distribution-Width,ALP,Cholesterol,...,Iron,LDH,Uric.Acid,Triglycerides,HDL,Glycohemoglobin,Vigorous-work,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke
0,0,1,77,56,4.73,214.0,7.7,13.7,62,5.250,...,11.28,140,362.8,1.298,1.39,4.7,3,2,2,2
1,0,1,49,83,5.13,209.0,10.4,13.1,63,7.160,...,24.54,133,404.5,3.850,1.08,5.5,1,2,2,2
2,0,1,37,99,5.76,357.0,7.9,13.6,63,3.900,...,11.28,131,339.0,1.581,0.98,5.2,2,2,1,1
3,0,1,70,66,5.53,228.0,8.8,14.4,103,7.940,...,12.18,181,410.4,3.635,1.28,7.6,3,1,1,2
4,0,1,81,61,5.32,160.0,9.0,12.4,110,4.420,...,11.82,150,368.8,0.756,1.04,5.8,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37074,0,1,25,76,5.50,253.0,7.5,13.3,67,4.189,...,13.10,144,428.3,1.264,1.27,5.8,2,2,1,2
37075,0,2,76,46,4.70,135.0,9.5,13.6,50,3.646,...,15.00,124,368.8,0.948,1.76,5.6,2,2,2,2
37076,0,2,80,58,4.44,172.0,8.1,16.9,54,6.801,...,8.40,120,273.6,1.095,2.90,5.7,2,2,2,2
37077,0,1,35,66,5.01,194.0,9.6,13.0,140,3.672,...,9.00,136,237.9,0.937,1.09,5.2,2,2,2,1


In [62]:
X = df.drop('CoronaryHeartDisease', axis=1)
y = df['CoronaryHeartDisease']

# undersampling the data
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)

# Splitting into training, validation, and testing
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)

# Split the temporary set into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets
print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (1809, 36) (1809,)
Validation set shapes: (603, 36) (603,)
Testing set shapes: (604, 36) (604,)


In [63]:
#Feature Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

In [64]:
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

### Logistic Regression

In [65]:
# param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

lr = LogisticRegression(C=10)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7860696517412935
Precision: 0.7491961414790996
Recall: 0.8204225352112676
F1-Score: 0.7831932773109244


In [66]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### Naive Bayes

In [67]:
nb = GaussianNB()

nb.fit(X_train, y_train)

y_pred = nb.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7180762852404643
Precision: 0.7142857142857143
Recall: 0.6690140845070423
F1-Score: 0.690909090909091


In [68]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### SVM

In [69]:
# param_grid = {'kernel': ('linear', 'rbf','poly') ,
#               'C':[5, 10, 100],
#               'gamma': [1,0.1,0.01,0.001],
#               'degree' : [1,2,3,4,5,6]}

sv = SVC(C=5, degree=1, gamma=0.001, kernel='poly')

sv.fit(X_train, y_train)

y_pred = sv.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
# print('Config: %s' % sv_search.best_params_)

Accuracy: 0.7893864013266998
Precision: 0.7476340694006309
Recall: 0.8345070422535211
F1-Score: 0.7886855241264558


In [70]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### K-Nearest Neighbors

In [71]:
# k = list(range(1, 25))
# param_grid = {'n_neighbors': k}

knn = KNeighborsClassifier(n_neighbors=17)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7446102819237148
Precision: 0.7272727272727273
Recall: 0.7323943661971831
F1-Score: 0.7298245614035087


In [72]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### Random Forest

In [73]:
# param_grid = {'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
#               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#               'max_features': ['sqrt', 'log2']}

rf = RandomForestClassifier(n_estimators=1800, max_depth=20)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7910447761194029
Precision: 0.7423312883435583
Recall: 0.852112676056338
F1-Score: 0.7934426229508196


In [74]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### XGBoost

In [75]:
xgb_model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=250, max_depth=9)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_valid)


accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7694859038142621
Precision: 0.7217125382262997
Recall: 0.8309859154929577
F1-Score: 0.7725040916530278


In [78]:
accuracy_list.append(accuracy)
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)

### Model Metric Comparison

In [79]:
# create a table/chart with the metrics of all the models and figure out which performs the best
data = [accuracy_list, precision_list, recall_list, f1_list]

df_metrics = pd.DataFrame(data=data, columns=['Logistic Regression', 'Naive Bayes', 'SVM', 'K-Nearest Neighbors', 'Random Forest', 'XGBoost'],
                          index=['Accuracy', 'Precision', 'Recall', 'F1-Score'])
df_metrics

Unnamed: 0,Logistic Regression,Naive Bayes,SVM,K-Nearest Neighbors,Random Forest,XGBoost
Accuracy,0.78607,0.718076,0.789386,0.74461,0.791045,0.769486
Precision,0.749196,0.714286,0.747634,0.727273,0.742331,0.721713
Recall,0.820423,0.669014,0.834507,0.732394,0.852113,0.830986
F1-Score,0.783193,0.690909,0.788686,0.729825,0.793443,0.772504
