In [105]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset is later read with a bare relative path
# ('CardiacPrediction.csv'); confirm whether this mount is actually required
# for that read or whether the file lives in the runtime's working directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [106]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [107]:
# Read the cardiac dataset from the working directory into `df`, then leave
# the frame as the cell's last expression so the notebook renders it.
df = pd.read_csv(filepath_or_buffer='CardiacPrediction.csv')
df

Unnamed: 0,SEQN,Gender,Age,Annual-Family-Income,Ratio-Family-Income-Poverty,X60-sec-pulse,Systolic,Diastolic,Weight,Height,...,Total-Cholesterol,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Health-Insurance,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke,CoronaryHeartDisease
0,2,1,77,8,5.00,68,98,56,75.4,174.0,...,5.56,1.39,4.7,3,3,1,2,2,2,0
1,5,1,49,11,5.00,66,122,83,92.5,178.3,...,7.21,1.08,5.5,1,1,1,2,2,2,0
2,12,1,37,11,4.93,64,174,99,99.2,180.0,...,4.03,0.98,5.2,2,1,1,2,1,1,0
3,13,1,70,3,1.07,102,130,66,63.6,157.7,...,8.12,1.28,7.6,3,3,1,1,1,2,0
4,14,1,81,5,2.67,72,136,61,75.5,166.2,...,4.50,1.04,5.8,1,1,1,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37074,93691,1,25,10,1.59,92,112,76,39.2,136.5,...,4.14,1.27,5.8,2,2,1,2,1,2,0
37075,93695,2,76,4,1.43,78,112,46,59.1,165.8,...,3.62,1.76,5.6,2,2,1,2,2,2,0
37076,93697,2,80,7,2.97,74,146,58,71.7,152.2,...,6.62,2.90,5.7,2,2,1,2,2,2,0
37077,93700,1,35,1,0.00,76,106,66,78.2,173.3,...,3.72,1.09,5.2,2,2,1,2,2,1,0


## Exploratory Data Analysis

In [108]:
# Columns excluded from modelling. Per the original note, these are the
# variables that "Research Paper 2" reports as unimportant.
UNIMPORTANT_COLUMNS = [
    'SEQN', 'Annual-Family-Income', 'Height', 'Ratio-Family-Income-Poverty',
    'X60-sec-pulse', 'Health-Insurance', 'Lymphocyte', 'Monocyte',
    'Eosinophils', 'Total-Cholesterol', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
    'Hematocrit', 'Segmented-Neutrophils',
]

# Rebind `df` instead of mutating it in place, and tolerate columns that are
# already absent, so re-running this cell is a no-op rather than a KeyError.
# Trade-off: errors='ignore' also hides a misspelled name, so check
# `list(df)` afterwards if this list is ever edited.
df = df.drop(columns=UNIMPORTANT_COLUMNS, errors='ignore')

In [109]:
# Show the names of every column currently in df.
print(df.columns.tolist())

['Gender', 'Age', 'Systolic', 'Diastolic', 'Weight', 'Body-Mass-Index', 'White-Blood-Cells', 'Basophils', 'Red-Blood-Cells', 'Hemoglobin', 'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol', 'Red-Cell-Distribution-Width', 'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose', 'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid', 'Triglycerides', 'HDL', 'Glycohemoglobin', 'Vigorous-work', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease']


In [110]:
# Tally how many rows carry each CoronaryHeartDisease label (1 vs 0) and
# report both counts in one sentence.
chd_labels = df['CoronaryHeartDisease']
heartDiseaseCount = chd_labels.eq(1).sum()
noHeartDiseaseCount = chd_labels.eq(0).sum()
print(f'The number of people in this dataset with heart disease is {heartDiseaseCount} while the number of people without it is {noHeartDiseaseCount}')

The number of people in this dataset with heart disease is 1508 while the number of people without it is 35571


## LASSO for Feature Selection

In [111]:
# Inputs for the LASSO step: `y` is the CoronaryHeartDisease label column,
# `X` is every remaining column of df.
y = df['CoronaryHeartDisease']
X = df.drop(columns='CoronaryHeartDisease')

In [112]:
# Two-stage 60/20/20 partition into training, validation and test sets.
# Stage 1: hold out 40% of the rows; what is left is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: cut the 40% hold-out in half, giving validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the shape of each feature matrix next to its label vector.
print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (22247, 36) (22247,)
Validation set shapes: (7416, 36) (7416,)
Testing set shapes: (7416, 36) (7416,)


In [113]:
# Standardize features. The scaler learns its statistics from the training
# rows only and is then applied to both the training and validation sets.
# NOTE(review): X_test is not transformed in this cell; confirm it is scaled
# with this same fitted scaler before it is ever passed to a model.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

In [114]:
# Fit a cross-validated LASSO on the standardized training features; 5-fold
# CV chooses alpha from the explicit grid below.
# NOTE(review): if the reported alpha is the smallest grid value, the optimum
# may lie below the grid. Extending the grid would change which features are
# selected, so it is left untouched here.
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)

lasso_cv.fit(X_train, y_train)

print('The optimal alpha is', lasso_cv.alpha_)

# LassoCV is a linear *regressor*: predict() returns an unbounded real-valued
# score, not a probability. The variable name is kept unchanged in case other
# cells reference it.
y_pred_proba = lasso_cv.predict(X_valid)
# Convert the scores to hard 0/1 labels with a fixed cut-off.
y_pred = (y_pred_proba > 0.5).astype(int)   # might need to lower threshold
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

# Put the accuracy in context. With a heavily imbalanced label, a model that
# always predicts the majority class already scores the baseline below, so an
# accuracy close to it means the 0.5 cut-off is flagging almost nobody.
positive_rate = float(np.mean(np.asarray(y_valid)))
baseline_accuracy = max(positive_rate, 1 - positive_rate)
print("Majority-class baseline accuracy:", baseline_accuracy)
print("Validation rows predicted positive:", int(np.sum(y_pred == 1)), "of", len(y_pred))

The optimal alpha is 0.001
Accuracy: 0.959277238403452


In [115]:
# Label each fitted LASSO coefficient with its column name from X, then keep
# only the coefficients that are not exactly zero.
lassoCoef = pd.Series(data=lasso_cv.coef_, index=X.columns)
selectedFeatures = lassoCoef.loc[lassoCoef.ne(0)]
print(f'The selected features are \n{selectedFeatures}')

The selected features are 
Gender                        -0.008132
Age                            0.036720
Systolic                      -0.003595
Diastolic                     -0.005334
Body-Mass-Index               -0.001301
White-Blood-Cells              0.002993
Red-Blood-Cells               -0.003281
Platelet-count                -0.007384
Red-Cell-Distribution-Width    0.005400
Albumin                       -0.001744
AST                           -0.002343
Cholesterol                   -0.017123
Creatinine                     0.006187
LDH                            0.006178
Bilirubin                      0.002525
Protein                       -0.001026
Uric.Acid                      0.005591
Triglycerides                  0.005399
HDL                           -0.003142
Glycohemoglobin                0.002057
Vigorous-work                  0.001763
Moderate-work                 -0.000550
Diabetes                      -0.009231
Blood-Rel-Stroke              -0.010817
dtype: float64

## Machine Learning Experiments

In [119]:
# Keep the label plus exactly the features whose LASSO coefficient is
# non-zero. The column list is derived from `selectedFeatures` (computed in
# the LASSO feature-selection cell) rather than typed out by hand, so it
# cannot drift out of sync if the data, alpha grid or split ever changes.
# selectedFeatures preserves the column order of the LASSO input matrix.
reduced_columns = ['CoronaryHeartDisease', *selectedFeatures.index]
df_reduced = df[reduced_columns]
df_reduced

Unnamed: 0,CoronaryHeartDisease,Gender,Age,Systolic,Diastolic,Body-Mass-Index,White-Blood-Cells,Red-Blood-Cells,Platelet-count,Red-Cell-Distribution-Width,...,Bilirubin,Protein,Uric.Acid,Triglycerides,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Diabetes,Blood-Rel-Stroke
0,0,1,77,98,56,24.90,7.6,4.73,214.0,13.7,...,12.00,72.0,362.8,1.298,1.39,4.7,3,3,2,2
1,0,1,49,122,83,29.10,5.9,5.13,209.0,13.1,...,8.60,73.0,404.5,3.850,1.08,5.5,1,1,2,2
2,0,1,37,174,99,30.62,10.2,5.76,357.0,13.6,...,6.80,72.0,339.0,1.581,0.98,5.2,2,1,2,1
3,0,1,70,130,66,25.57,11.6,5.53,228.0,14.4,...,8.60,66.0,410.4,3.635,1.28,7.6,3,3,1,2
4,0,1,81,136,61,27.33,9.1,5.32,160.0,12.4,...,10.30,79.0,368.8,0.756,1.04,5.8,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37074,0,1,25,112,76,21.00,7.1,5.50,253.0,13.3,...,11.97,76.0,428.3,1.264,1.27,5.8,2,2,2,2
37075,0,2,76,112,46,21.50,6.4,4.70,135.0,13.6,...,18.81,69.0,368.8,0.948,1.76,5.6,2,2,2,2
37076,0,2,80,146,58,31.00,4.7,4.44,172.0,16.9,...,5.13,66.0,273.6,1.095,2.90,5.7,2,2,2,2
37077,0,1,35,106,66,26.00,7.6,5.01,194.0,13.0,...,3.42,73.0,237.9,0.937,1.09,5.2,2,2,2,1


In [120]:
# Rebuild the model inputs from the reduced frame: `y` is the label column,
# `X` is everything else.
y = df_reduced['CoronaryHeartDisease']
X = df_reduced.drop(columns='CoronaryHeartDisease')

# Two-stage 60/20/20 partition, using the same test_size and random_state
# values as the split performed before the LASSO fit.
# Stage 1: hold out 40% of the rows; what is left is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: cut the 40% hold-out in half, giving validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the shape of each feature matrix next to its label vector.
print("Training set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_valid.shape, y_valid.shape)
print("Testing set shapes:", X_test.shape, y_test.shape)

Training set shapes: (22247, 24) (22247,)
Validation set shapes: (7416, 24) (7416,)
Testing set shapes: (7416, 24) (7416,)


In [122]:
# Standardize the reduced feature set. The scaler learns its statistics from
# the training rows only and is then applied to training and validation sets.
# NOTE(review): X_test is not transformed in this cell; confirm it is scaled
# with this same fitted scaler before it is ever passed to a model.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

### Logistic Regression

In [127]:
# Logistic regression with scikit-learn's default settings, trained on the
# standardized training features and scored on the validation set.
lr = LogisticRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)

# Put the accuracy in context. With a heavily imbalanced label, a model that
# always predicts the majority class already scores the baseline below, so
# accuracy alone cannot show whether any positive cases are being detected.
positive_rate = float(np.mean(np.asarray(y_valid)))
baseline_accuracy = max(positive_rate, 1 - positive_rate)
print("Majority-class baseline accuracy:", baseline_accuracy)
print("Validation rows predicted positive:", int(np.sum(y_pred == 1)), "of", len(y_pred))

Accuracy: 0.9594120819848975
