In [None]:
library(tidyverse)
library(tidymodels)

In [None]:
# Load the processed Cleveland data, give the abbreviated columns descriptive
# names, then convert the listed columns to factors and the vessel count to
# an integer.
# NOTE(review): as.integer() turns any non-numeric entries into NA with a
# warning -- confirm how the CSV encodes missing values.
cleveland_data <- read_csv("data/processed_cleveland.csv") |>
  rename(
    chest_pain = cp,
    blood_pressure = trestbps,
    cholesterol = chol,
    blood_sugar = fbs,
    rest_ecg = restecg,
    heart_rate = thalach,
    angina = exang,
    st_depression = oldpeak,
    num_vessels = ca,
    diagnosis = num
  ) |>
  mutate(
    across(
      c(sex, chest_pain, blood_sugar, rest_ecg, angina, slope, thal, diagnosis),
      as_factor
    ),
    num_vessels = as.integer(num_vessels)
  )

# Display the cleaned data.
cleveland_data

In [None]:
# Split the data into training (75%) and testing (25%) sets, stratified on
# diagnosis.
# initial_split() assigns rows at random, so the RNG seed is fixed first;
# without it every run yields a different split and the summaries and plots
# built from cleveland_train are not reproducible.
set.seed(2024)
cleveland_split <- initial_split(cleveland_data, prop = 0.75, strata = diagnosis)
cleveland_train <- training(cleveland_split)
cleveland_test <- testing(cleveland_split)

# Show the column types and leading values of the training set.
glimpse(cleveland_train)

In [None]:
# Tabulate how many training observations fall under each diagnosis level,
# along with the percentage of the training set each level accounts for.
heart_disease_proportions <- cleveland_train |>
  count(diagnosis) |>
  mutate(percent = 100 * n / sum(n)) # sum(n) is the training-set row count

heart_disease_proportions

In [None]:
# Scatter plot of blood_pressure against cholesterol for the training data,
# with points coloured by diagnosis.
rbp_chol_plot <- cleveland_train |>
  ggplot(aes(x = blood_pressure, y = cholesterol, color = diagnosis)) +
  geom_point(alpha = 0.6) +
  xlab("Resting blood pressure in mm Hg") +
  ylab("Serum cholesterol level in mg/dl") +
  labs(color = "Diagnosis of heart disease") +
  theme(text = element_text(size = 13))

rbp_chol_plot

In [None]:
# Scatter plot of blood_pressure against heart_rate for the training data,
# with points coloured by diagnosis.
rbp_hr_plot <- cleveland_train |>
  ggplot(aes(x = blood_pressure, y = heart_rate, color = diagnosis)) +
  geom_point(alpha = 0.6) +
  xlab("Resting blood pressure in mm Hg") +
  ylab("Maximum heart rate") +
  labs(color = "Diagnosis of heart disease") +
  theme(text = element_text(size = 13))

rbp_hr_plot

In [None]:
# Bar chart with one bar per blood_sugar level; position = "fill" stretches
# every bar to full height so the segments show the share of each diagnosis
# within that level rather than raw counts.
fbs_ratio_plot <- cleveland_train |>
  ggplot(aes(x = blood_sugar, fill = diagnosis)) +
  geom_bar(position = "fill") +
  xlab("Fasting blood sugar > 120 mg/dl") +
  ylab("Ratio") +
  labs(fill = "Diagnosis of heart disease") +
  theme(text = element_text(size = 12))

fbs_ratio_plot

In [None]:
# Bar chart with one bar per chest_pain level; position = "fill" stretches
# every bar to full height so the segments show the share of each diagnosis
# within that level rather than raw counts.
cp_ratio_plot <- cleveland_train |>
  ggplot(aes(x = chest_pain, fill = diagnosis)) +
  geom_bar(position = "fill") +
  xlab("Chest pain type") +
  ylab("Ratio") +
  labs(fill = "Diagnosis of heart disease") +
  theme(text = element_text(size = 12))

cp_ratio_plot