In [15]:
# Libraries/Packages to load
library(tidyverse)
library(tidymodels)
library(RColorBrewer)
#install.packages("themis")
#install.packages("kknn")
library(themis)

In [16]:
# Reading the data from the web, adding column names
heart_data <- read_csv("https://raw.githubusercontent.com/Mr-Slope/DSCI-100_Group_Project/main/processed.cleveland.data",
                       col_names=c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
                                   "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"))
# Preview the data set
head(heart_data)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): ca, thal
[32mdbl[39m (12): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0


In [17]:
# Cleaning & Wrangling
heart_tidy <- heart_data |>
    filter(ca != "?", thal != "?") |>
    mutate(across(c(ca, thal), as.numeric)) |>
    mutate(across(c(cp, fbs, restecg, exang, slope, ca, thal, num), as_factor)) |>
    mutate(cp = fct_recode(cp, "angina" = "1", "abnormal" = "2", "nonang" = "3", "asymp" = "4")) |>
    mutate(fbs = fct_recode(fbs, "high" = "1", "norm" = "0")) |>
    mutate(restecg = fct_recode(restecg, "norm" = "0", "abnorm" = "1", "damage" = "2")) |>
    mutate(exang = fct_recode(exang, "TRUE" = "1", "FALSE" = "0")) |>
    mutate(slope = fct_recode(slope, "up" = "1", "flat" = "2", "down" = "3")) |>
    mutate(thal = fct_recode(thal, "norm" = "3", "fixed" = "6", "reversible" = "7")) |>
    mutate(num = fct_recode(num, "healthy" = "0", "sick" = "1", "sick" = "2", "sick" = "3", "sick" = "4")) |> # in the data files, 1,2,3,4 are all sick
    tibble() 
head(heart_tidy)
     

age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<fct>,<fct>,<dbl>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>
63,1,angina,145,233,high,damage,150,False,2.3,down,0,fixed,healthy
67,1,asymp,160,286,norm,damage,108,True,1.5,flat,3,norm,sick
67,1,asymp,120,229,norm,damage,129,True,2.6,flat,2,reversible,sick
37,1,nonang,130,250,norm,norm,187,False,3.5,down,0,norm,healthy
41,0,abnormal,130,204,norm,damage,172,False,1.4,up,0,norm,healthy
56,1,abnormal,120,236,norm,norm,178,False,0.8,up,0,norm,healthy


In [20]:
# KNN Classification model with predictions for 
# Chol
# Age & Chol
# Age & thalach (Max heart rate)
# Age & Chol & thalach (Max heart rate)
set.seed(29)

# Splitting the data (separate this)
heart_split <- initial_split(heart_tidy, prop = 0.75, strata = num)
heart_training <- training(heart_split)
heart_testing <- testing(heart_split)

# model 
heart_tune <- nearest_neighbor(weight="rectangular", neighbors=tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_vfold <- vfold_cv(heart_training, v = 5, strata = num)
gridvals <- tibble(neighbors = seq(1,100,5))

# Chol
heart_recipe_chol <- recipe(num ~ chol, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_chol <- workflow() |>
    add_recipe(heart_recipe_chol) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

# Age & Chol
heart_recipe_age_chol <- recipe(num ~ age + chol, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_age_chol <- workflow() |>
    add_recipe(heart_recipe_age_chol) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

# Age & thalach (Max heart rate)
heart_recipe_age_thal <- recipe(num ~ age + thalach, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_age_thal <- workflow() |>
    add_recipe(heart_recipe_age_thal) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

# Age & Chol & thalach (Max heart rate)
heart_recipe_act <- recipe(num ~ age + chol + thalach, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_act <- workflow() |>
    add_recipe(heart_recipe_act) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

In [21]:
heart_fit_chol_acc <- heart_tune_chol |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice(5)
heart_fit_chol_acc

heart_fit_age_chol_acc <- heart_tune_age_chol |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice_max(order_by = mean, n = 1)
heart_fit_age_chol_acc

heart_fit_age_thal_acc <- heart_tune_age_thal |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice_max(order_by = mean, n = 1)
heart_fit_age_thal_acc

heart_fit_act_acc <- heart_tune_act |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice_max(order_by = mean, n = 1)
    
heart_fit_act_acc

neighbors,mean,.metric
<dbl>,<dbl>,<chr>
21,0.540303,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
66,0.599596,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
16,0.7027273,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
86,0.689596,accuracy


In [23]:
heart_spec_s <- nearest_neighbor(weight="rectangular", neighbors=5) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_spec_st <- nearest_neighbor(weight="rectangular", neighbors=35) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_spec_ast <- nearest_neighbor(weight="rectangular", neighbors=13) |>
    set_engine("kknn") |>
    set_mode("classification")
# chol
heart_fit_chol <- workflow() |>
    add_recipe(heart_recipe_chol) |>
    add_model(heart_spec_s) |>
    fit(heart_training) 

heart_pred_chol <- heart_fit_chol |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_chol_metrics <- heart_pred_chol |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_chol_metrics
heart_pred_chol_conf <- heart_pred_chol |>
    conf_mat(truth=num, estimate=.pred_class)
heart_pred_chol_conf

# chol and age
heart_fit_age_chol <- workflow() |>
    add_recipe(heart_recipe_age_chol) |>
    add_model(heart_spec_st) |>
    fit(heart_training) 

heart_pred_age_chol <- heart_fit_age_chol |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_age_chol_metrics <- heart_pred_age_chol |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_age_chol_metrics
heart_pred_age_chol_conf <- heart_pred_age_chol |>
    conf_mat(truth=num,estimate=.pred_class)
heart_pred_age_chol_conf

# age and thalach
heart_fit_age_thal <- workflow() |>
    add_recipe(heart_recipe_age_thal) |>
    add_model(heart_spec_st) |>
    fit(heart_training) 

heart_pred_age_thal <- heart_fit_age_thal |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_age_thal_metrics <- heart_pred_age_thal |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_age_thal_metrics
heart_pred_age_thal_conf <- heart_pred_age_thal |>
    conf_mat(truth=num,estimate=.pred_class)
heart_pred_age_thal_conf

# age, chol, and thalach
heart_fit_act <- workflow() |>
    add_recipe(heart_recipe_act) |>
    add_model(heart_spec_ast) |>
    fit(heart_training) 

heart_pred_act <- heart_fit_act |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_act_metrics <- heart_pred_act |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_act_metrics
heart_pred_act_conf <- heart_pred_act |>
    conf_mat(truth=num,estimate=.pred_class)
heart_pred_act_conf

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.5333333


          Truth
Prediction healthy sick
   healthy      25   20
   sick         15   15

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.6666667


          Truth
Prediction healthy sick
   healthy      30   15
   sick         10   20

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.7466667


          Truth
Prediction healthy sick
   healthy      31   10
   sick          9   25

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.6533333


          Truth
Prediction healthy sick
   healthy      29   15
   sick         11   20