The dataset (https://www.kaggle.com/dileep070/heart-disease-prediction-using-logistic-regression) contains data for 4,238 patients. Each record contains 16 features.

The models outlined here are trained to predict whether the patient has a 10-year risk of future coronary heart disease (CHD).

In [1]:
# print everything from a given cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
# Read the data into Google Colab using .read_csv() function.
# import numpy and pandas
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/framingham.csv')
df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0


In [26]:
# Since the feature "education" shouldn't influence the risk of developing a coronary heart disease, remove it from the dataframe.

df = df.drop('education', axis=1)
df

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1,50,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0


In [27]:
# Get a short description of the dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   currentSmoker    4238 non-null   int64  
 3   cigsPerDay       4209 non-null   float64
 4   BPMeds           4185 non-null   float64
 5   prevalentStroke  4238 non-null   int64  
 6   prevalentHyp     4238 non-null   int64  
 7   diabetes         4238 non-null   int64  
 8   totChol          4188 non-null   float64
 9   sysBP            4238 non-null   float64
 10  diaBP            4238 non-null   float64
 11  BMI              4219 non-null   float64
 12  heartRate        4237 non-null   float64
 13  glucose          3850 non-null   float64
 14  TenYearCHD       4238 non-null   int64  
dtypes: float64(8), int64(7)
memory usage: 496.8 KB


In [28]:
# Some of the feuatures are clearly wrong type - TenYearCHD, male, currentSmoker, BPMeds, prevalentStroke, prevalentHyp, diabetes should all be categorical.
# Correct variable type

df['TenYearCHD'] = pd.Categorical(df.TenYearCHD)
df['male'] = pd.Categorical(df.male)
df['currentSmoker'] = pd.Categorical(df.currentSmoker)
df['BPMeds'] = pd.Categorical(df.BPMeds)
df['prevalentStroke'] = pd.Categorical(df.prevalentStroke)
df['prevalentHyp'] = pd.Categorical(df.prevalentHyp)
df['diabetes'] = pd.Categorical(df.diabetes)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   male             4238 non-null   category
 1   age              4238 non-null   int64   
 2   currentSmoker    4238 non-null   category
 3   cigsPerDay       4209 non-null   float64 
 4   BPMeds           4185 non-null   category
 5   prevalentStroke  4238 non-null   category
 6   prevalentHyp     4238 non-null   category
 7   diabetes         4238 non-null   category
 8   totChol          4188 non-null   float64 
 9   sysBP            4238 non-null   float64 
 10  diaBP            4238 non-null   float64 
 11  BMI              4219 non-null   float64 
 12  heartRate        4237 non-null   float64 
 13  glucose          3850 non-null   float64 
 14  TenYearCHD       4238 non-null   category
dtypes: category(7), float64(7), int64(1)
memory usage: 294.8 KB


In [29]:
# Check the 10-year incidence rate of CHD.

df['TenYearCHD'].value_counts()
print()
print()
print('0 = no CHD risk; 1 = there is CHD risk.')

TenYearCHD
0    3594
1     644
Name: count, dtype: int64



0 = no CHD risk; 1 = there is CHD risk.


In [30]:
# Explore the categorical data by checking their descriptive statistics.
df['male'].value_counts()
print()
print()
df['currentSmoker'].value_counts()
print()
print()
df['BPMeds'].value_counts()
print()
print()
df['prevalentStroke'].value_counts()
print()
print()
df['prevalentHyp'].value_counts()
print()
print()
df['diabetes'].value_counts()

male
0    2419
1    1819
Name: count, dtype: int64





currentSmoker
0    2144
1    2094
Name: count, dtype: int64





BPMeds
0.0    4061
1.0     124
Name: count, dtype: int64





prevalentStroke
0    4213
1      25
Name: count, dtype: int64





prevalentHyp
0    2922
1    1316
Name: count, dtype: int64





diabetes
0    4129
1     109
Name: count, dtype: int64

In [31]:
# Create a pipeline to process numerical features.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

numerical_pipeline = Pipeline([
    # replace missing values of a feature with its median
    ('imputer', SimpleImputer(strategy='median')),
    # standardize the feature
    ('scaler', StandardScaler())
])

In [32]:
# Create a final pipeline to process both numerical and categorical features.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

numerical_features = df.select_dtypes(exclude='category').columns.to_list()
categorical_features = df.select_dtypes(include='category').columns.to_list()
categorical_features.remove('TenYearCHD')

pipe = ColumnTransformer([
    # Use numerical_pipeline to process numerical features.
    ('num', numerical_pipeline, numerical_features),
    # Use .OneHotEncoder() function to one hot encode the categorical features.
    ('cat', OneHotEncoder(), categorical_features)
])

In [33]:
# Separate the target from the features and make them two separate NumPy arrays.

X = df.drop('TenYearCHD', axis=1)
y = np.c_[df['TenYearCHD']]

# Train/test split the dataframe with 20% of the data being the test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
# Fit and transform the train set using the full pipeline.

X_train = pipe.fit_transform(X_train)

In [35]:
# Transform the test set using the full pipeline.

X_test = pipe.transform(X_test)

In [36]:
# Build and train a voting classifier consisting of three different classifiers on the train set:
# logistic regression, random-forest classifier, and SVC classifier.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

logistic = LogisticRegression()
svc = SVC()
RndmForest = RandomForestClassifier()

voter = VotingClassifier(
    estimators= [('logistic', logistic), ('svc', svc), ('RndmForest', RndmForest)]
)

In [37]:
# Calculate the accuracy of each separate classifier as well as the voting classifier on the test set to check which performs the best.
from sklearn.metrics import accuracy_score

logistic.fit(X_train, y_train.ravel())
y_pred_logistic = logistic.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_logistic

svc.fit(X_train, y_train.ravel())
y_pred_svc = svc.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
accuracy_svc

RndmForest.fit(X_train, y_train.ravel())
y_pred_RndmForest = RndmForest.predict(X_test)
accuracy_RndmForrest = accuracy_score(y_test, y_pred_RndmForest)
accuracy_RndmForrest

voter.fit(X_train, y_train.ravel())
y_pred_voter = voter.predict(X_test)
accuracy_voter = accuracy_score(y_test, y_pred_voter)
accuracy_voter

0.8726415094339622

0.8679245283018868

0.8691037735849056

0.8738207547169812

In [38]:
# Try other models. Fit an Extremely Randomized Trees (Extra-Trees) classifier on the train set. Calculate the accuracy based on the test set.
from sklearn.ensemble import ExtraTreesClassifier

ExtraTrees = ExtraTreesClassifier()
ExtraTrees.fit(X_train, y_train.ravel())
y_pred_ExtraTrees = ExtraTrees.predict(X_test)
accuracy_ExtraTrees = accuracy_score(y_test, y_pred_ExtraTrees)
accuracy_ExtraTrees

0.8643867924528302

In [39]:
# Fit an AdaBoost classifier on the train set. Calculate the accuracy based on the test set.
from sklearn.ensemble import AdaBoostClassifier

ABC = AdaBoostClassifier()
ABC.fit(X_train, y_train.ravel())
y_pred_ABC = ABC.predict(X_test)
accuracy_ABC = accuracy_score(y_test, y_pred_ABC)
accuracy_ABC

0.8655660377358491

In [40]:
# Fit a Gradient Boosting classifier on the train set. Calculate the accuracy base on the test set.
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier()
GBC.fit(X_train, y_train.ravel())
y_pred_GBC = GBC.predict(X_test)
accuracy_GBC = accuracy_score(y_test, y_pred_GBC)
accuracy_GBC

0.8714622641509434

In [41]:
# Use the early stopping method to identify the optimal number of trees for the Gradient Boosting classifier.

# set early stopping after five iterations
GBC = GradientBoostingClassifier(n_iter_no_change=5)

# Use the optimal number of trees to re-train the Gradient Boosting classifier and report accuracy based on the test set.
from sklearn.model_selection import GridSearchCV
Grid = GridSearchCV(estimator=GBC, param_grid={'n_estimators': [10,20,30,40,50,60,70,80,90,100]})
Grid.fit(X_train, y_train.ravel())
# use the best parameters
GBC_early = Grid.best_estimator_

GBC_early.fit(X_train, y_train.ravel())
y_pred_GBC_early = GBC_early.predict(X_test)
accuracy_GBC_early = accuracy_score(y_test, y_pred_GBC_early)
accuracy_GBC_early

0.8691037735849056

In [42]:
# Fit a Histogram-based Gradient Boosting classifier on the train set. Calculate the accuracy based on the test set.
from sklearn.ensemble import HistGradientBoostingClassifier

HGBC = HistGradientBoostingClassifier()
HGBC.fit(X_train, y_train.ravel())
y_pred_HGBC = HGBC.predict(X_test)
accuracy_HGBC = accuracy_score(y_test, y_pred_HGBC)
accuracy_HGBC

0.8561320754716981

In [43]:
# Fit an Extreme Gradient Boosting (XGBoost) classifier on the train set. Calculate the accuracy based on the test set.
import xgboost

XGBC = xgboost.XGBClassifier()
XGBC.fit(X_train, y_train.ravel())
y_pred_XGBC = XGBC.predict(X_test)
accuracy_XGBC = accuracy_score(y_test, y_pred_XGBC)
accuracy_XGBC

0.8490566037735849

In [44]:
# Build a stacking classifier using a random-forest classifier and a linear SVC classifier. Use logistic regression as the final estimator.
# Calculate the accuracy based on the test set.
from sklearn.ensemble import StackingClassifier

Stacking = StackingClassifier(estimators=[('RndmForest', RandomForestClassifier()), ('svc', SVC(kernel='linear'))], final_estimator=LogisticRegression())
Stacking.fit(X_train, y_train.ravel())
y_pred_Stacking = Stacking.predict(X_test)
accuracy_Stacking = accuracy_score(y_test, y_pred_Stacking)
accuracy_Stacking

0.8691037735849056

In [45]:
# Based on all ensemble models applied above, what are the top three models that offer the highest accuracy?
accuracies = {'AdaBoostC': accuracy_ABC, 'ExtraTrees': accuracy_ExtraTrees, 'GBC': accuracy_GBC, 'RndmForrest': accuracy_RndmForrest, 'Stacking': accuracy_Stacking,
              'XGBC': accuracy_XGBC, 'logistic': accuracy_logistic, 'svc': accuracy_svc, 'voter': accuracy_voter, 'HGBC': accuracy_HGBC, 'GBC_early': accuracy_GBC_early}
accuracies = sorted(accuracies.items(), key=lambda x:x[1], reverse=True)


print(f'The top model is {accuracies[0]}, the second best is {accuracies[1]}, and third best is {accuracies[2]}')

The top model is ('voter', 0.8738207547169812), the second best is ('logistic', 0.8726415094339622), and third best is ('GBC', 0.8714622641509434)
