# REPORT: Predicting Risk of Heart Disease from Accessible Health Metrics

## Introduction:

According to the Public Health Agency of Canada, heart disease is the second leading cause of death in Canada, with approximately 1 in 12 Canadian adults over 20 living with a diagnosis. These metrics highlight the importance of knowing the risk factors and having access to medical advice. However, a shortage of physicians in Canada is causing a lack of available health care (Flood et al., 2023). Non-healthcare professionals do not have the means to properly self-evaluate symptoms, therefore our project seeks to help the general population to make informative decisions about heart disease symptoms that are self-monitored or easily accessible.


Thus we ask, is it possible to classify individuals into levels of heart disease risk (low risk, moderate risk, or high risk) based on blood pressure, cholesterol, heart rate and chest pain?


Our analysis will use the Heart Disease dataset from the Cleveland database for heart disease (Andras et al., 1988). This database consists of 303 patients without history of heart disease, who were admitted to the Cleveland Clinic between 1981 and 1984. 



In [1]:
# Please uncomment the following cell to install the altair in case your package is not up-to-date

In [2]:
# pip install -U altair

In [3]:
import altair as alt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.utils import resample

In [4]:
# import dataset
heart_disease = pd.read_csv("https://archive.ics.uci.edu/static/public/45/data.csv")

# filter dataframe
heart_disease.rename(columns = {
                          "fbs" : "fasting_blood_sugar",
                          "chol" : "cholesterol", 
                          "cp":"type_chestpain",
                          "restecg" : "resting_ecg",
                          "thalach" : "max_heart_rate",
                          "exang" : "exercise_induced_angina",
                          "oldpeak" : "ST_depression", 
                          "slope" : "ST_segment_slope", 
                          "ca" : "num_major_vessels", 
                          "thal" : "thallium_stress_test", #not sure
                          "num" : "diagnosis",
                          "trestbps" : "resting_bp"
}, inplace = True)

heart_disease = heart_disease[["cholesterol","type_chestpain","max_heart_rate","resting_bp","diagnosis"]]


# A low-risk diagnosis is 0, 1
# A moderate-risk diagnosis is 2, 3
# A high-risk diagnosis is 4
heart_disease['diagnosis'] = heart_disease['diagnosis'].replace([0,1], "low-risk heart disease")
heart_disease['diagnosis'] = heart_disease['diagnosis'].replace([2,3], "moderate-risk heart disease")
heart_disease['diagnosis'] = heart_disease['diagnosis'].replace([4], "high-risk heart disease")

# chest pain type
heart_disease['type_chestpain'] = heart_disease['type_chestpain'].replace(
    [1,2,3,4],
    ["type1","type2","type3","type4"])


heart_disease

Unnamed: 0,cholesterol,type_chestpain,max_heart_rate,resting_bp,diagnosis
0,233,type1,150,145,low-risk heart disease
1,286,type4,108,160,moderate-risk heart disease
2,229,type4,129,120,low-risk heart disease
3,250,type3,187,130,low-risk heart disease
4,204,type2,172,130,low-risk heart disease
...,...,...,...,...,...
298,264,type1,132,110,low-risk heart disease
299,193,type4,141,144,moderate-risk heart disease
300,131,type4,115,130,moderate-risk heart disease
301,236,type2,174,130,low-risk heart disease


In [17]:
# split data into training and test sets
heart_disease_train, heart_disease_test = train_test_split(heart_disease, train_size = 0.75, random_state = 0)

low-risk heart disease         219
moderate-risk heart disease     71
high-risk heart disease         13
Name: diagnosis, dtype: int64

In [6]:
# Summary of the categorical variables
heart_disease_categorical = heart_disease_train.drop(columns = ["cholesterol","max_heart_rate","resting_bp"])
heart_disease_categorical.describe()

Unnamed: 0,type_chestpain,diagnosis
count,227,227
unique,4,3
top,type4,low-risk heart disease
freq,108,163


In [7]:
# Summary of the continuous variables
heart_disease_continuous = heart_disease_train.drop(columns = ["type_chestpain","diagnosis"])
heart_disease_continuous.describe()

Unnamed: 0,cholesterol,max_heart_rate,resting_bp
count,227.0,227.0,227.0
mean,245.810573,150.286344,132.277533
std,49.162043,21.961187,16.659197
min,126.0,96.0,94.0
25%,212.0,133.5,120.0
50%,240.0,152.0,130.0
75%,273.5,167.5,140.0
max,417.0,202.0,180.0


In [8]:
# blood pressure
bp_hist = alt.Chart(heart_disease_train).mark_bar().encode(
    x=alt.X("resting_bp:Q", bin = True).title("Blood Pressure"),
    y=alt.Y("count()").stack(False),
    color="diagnosis:N"
).properties(
    title = "Distribution of Blood Pressure"
)

bp_hist

In [9]:
# cholesterol
chol_hist = alt.Chart(heart_disease_train).mark_bar().encode(
    x=alt.X("cholesterol:Q", bin = True).title("Cholesterol"),
    y=alt.Y("count()").stack(False),
    color = "diagnosis:N"
).properties(
    title = "Distribution of Cholesterol"
)

chol_hist

In [10]:
# chest pain type
cp_hist = alt.Chart(heart_disease_train).mark_bar().encode(
    x=alt.X("type_chestpain").title("Chest Pain Type"),
    y=alt.Y("count()").stack(False),
    color = "diagnosis:N"
).properties(
    width=300,
    height=300,
    title = "Distribution of Chest Pain Type"
)

cp_hist

In [11]:
# heart rate
hr_hist = alt.Chart(heart_disease_train).mark_bar().encode(
    x=alt.X("max_heart_rate:Q", bin = True).title("Heart Rate"),
    y=alt.Y("count()").stack(False),
    color = "diagnosis:N"
).properties(
    title = "Distribution of Heart Rate"
)

hr_hist

In [12]:
# balancing heart disease
rare_diagnosis_1 = heart_disease[heart_disease["diagnosis"] == "high-risk heart disease"]
rare_diagnosis_2 = heart_disease[heart_disease["diagnosis"] == "moderate-risk heart disease"]
low_risk_diagnosis = heart_disease[heart_disease["diagnosis"] == "low-risk heart disease"]

rare_diagnosis_upsample_1 = resample(rare_diagnosis_1, n_samples = low_risk_diagnosis.shape[0])
rare_diagnosis_upsample_2 = resample(rare_diagnosis_2, n_samples = low_risk_diagnosis.shape[0])

heart_disease = pd.concat((rare_diagnosis_upsample_1, rare_diagnosis_upsample_2, low_risk_diagnosis))

heart_disease["diagnosis"].value_counts()

In [13]:
preprocessor=make_column_transformer(
    (StandardScaler(),
     ['cholesterol','max_heart_rate','resting_bp']
    ),remainder='drop',
    verbose_feature_names_out=False
)
preprocessor

In [14]:
heart_disease_standard=pd.DataFrame(
    preprocessor
    .fit(heart_disease_continuous)
    .transform(heart_disease_continuous)
)
heart_disease_standard.columns=['cholesterol','max_heart_rate','resting_bp']

In [15]:
np.random.seed(1234)
knn = KNeighborsClassifier()
parameter_grid = {
    "n_neighbors": range(5, 31)}
grid_search = GridSearchCV(
    estimator=knn, 
    param_grid=parameter_grid, 
    cv=5, 
    return_train_score=True, 
    n_jobs=-1)
X_heart_train=heart_disease_train[['cholesterol','max_heart_rate','resting_bp']]
y_heart_train=heart_disease_train['diagnosis']
grid_search.fit(X_heart_train, y_heart_train)

grid_results=pd.DataFrame(grid_search.cv_results_)

cross_val_plot = alt.Chart(grid_results).mark_line(point=True).encode(
    x=alt.X("param_n_neighbors").title("Values for K").scale(zero=True),
    y=alt.Y("mean_test_score").title("Acuracy of model").scale(zero=False)
)

cross_val_plot

In [16]:
knn=KNeighborsClassifier(n_neighbors=12)
heart_fit=knn.fit(X_heart_train,y_heart_train)
heart_predictions=pd.DataFrame(heart_fit.predict(heart_disease_test[['cholesterol','max_heart_rate','resting_bp']]))
hear

NameError: name 'hear' is not defined