In [1]:
library(tidyverse)
library(tidymodels)
# kknn is the engine used by the nearest_neighbor() models in this notebook.
# Install it only when it is missing, so re-running the notebook does not
# reinstall the package every time.
if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mdplyr  [39m 1.1.1
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 0.5.2
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.2     [32m✔[39m [34mrsample     [39m 1.1.1
[32m✔[39m [34mdials       [39m 1.1.0     [32m✔[39m [34mtune        [39m 1.0.1
[32m✔[39m [34minfer       [39m 1.0.4     [32m✔[39m [34mworkflows   [39m 1.1.2
[32m✔[39

In [2]:
# Read the processed Cleveland file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
heart_columns <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)
heart_url <- "https://raw.githubusercontent.com/Mr-Slope/DSCI-100_Group_Project/main/processed.cleveland.data"

heart_data <- read_csv(file = heart_url, col_names = heart_columns)
head(heart_data)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): ca, thal
[32mdbl[39m (12): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0


In [14]:
# Clean the raw data: drop rows with unknown values, turn the coded columns
# into factors, and replace the numeric codes with readable labels.
heart_tidy <- heart_data |>
  # ca and thal are read as text; rows holding "?" in either are removed so
  # both columns can be converted to numbers.
  filter(ca != "?", thal != "?") |>
  mutate(across(c(ca, thal), as.numeric)) |>
  mutate(across(c(cp, fbs, restecg, exang, slope, ca, thal, num), as_factor)) |>
  mutate(
    cp = fct_recode(
      cp,
      "angina" = "1", "abnormal" = "2", "nonang" = "3", "asymp" = "4"
    ),
    fbs = fct_recode(fbs, "high" = "1", "norm" = "0"),
    restecg = fct_recode(
      restecg,
      "norm" = "0", "abnorm" = "1", "damage" = "2"
    ),
    exang = fct_recode(exang, "TRUE" = "1", "FALSE" = "0"),
    slope = fct_recode(slope, "up" = "1", "flat" = "2", "down" = "3"),
    thal = fct_recode(thal, "norm" = "3", "fixed" = "6", "reversible" = "7"),
    # in the data files, 1,2,3,4 are all sick
    num = fct_recode(
      num,
      "healthy" = "0", "sick" = "1", "sick" = "2", "sick" = "3", "sick" = "4"
    )
  ) |>
  tibble()

In [15]:
# Fix the random seed so the train/test split below is reproducible.
set.seed(29)

# 75% of the rows go to training and the rest to testing, stratified on the
# outcome column num.
heart_split <- heart_tidy |>
  initial_split(prop = 0.75, strata = num)
heart_training <- heart_split |> training()
heart_testing <- heart_split |> testing()

In [16]:
# K-nearest-neighbours classifier with the number of neighbours left open
# for tuning. The argument is spelled out as weight_func instead of relying
# on partial matching of `weight`.
heart_tune <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# 5-fold cross-validation on the training set, stratified on the outcome.
heart_vfold <- vfold_cv(heart_training, v = 5, strata = num)

# Candidate numbers of neighbours: 1, 6, 11, ..., 96.
gridvals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

In [17]:
# Tune the number of neighbours for four nested predictor sets:
# age; age + sex; age + sex + chol; age + sex + chol + thalach.
# Each recipe scales and then centres its numeric predictors, and each
# workflow is tuned on the same folds with the same grid.

# Run the shared tuning workflow for one recipe and return its metrics.
tune_heart_knn <- function(heart_recipe) {
  workflow() |>
    add_recipe(heart_recipe) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()
}

heart_recipe_age <- recipe(num ~ age, data = heart_training) |>
  step_scale(all_numeric_predictors()) |>
  step_center(all_numeric_predictors())

heart_results_age <- tune_heart_knn(heart_recipe_age)

heart_recipe_a_s <- recipe(num ~ age + sex, data = heart_training) |>
  step_scale(all_numeric_predictors()) |>
  step_center(all_numeric_predictors())

# The trailing underscore in this name is kept because later cells use it.
heart_results_a_s_ <- tune_heart_knn(heart_recipe_a_s)

heart_recipe_a_s_c <- recipe(
  num ~ age + sex + chol,
  data = heart_training
) |>
  step_scale(all_numeric_predictors()) |>
  step_center(all_numeric_predictors())

# The trailing underscore in this name is kept because later cells use it.
heart_results_a_s_c_ <- tune_heart_knn(heart_recipe_a_s_c)

heart_recipe_a_s_c_th <- recipe(
  num ~ age + sex + chol + thalach,
  data = heart_training
) |>
  step_scale(all_numeric_predictors()) |>
  step_center(all_numeric_predictors())

heart_results_a_s_c_th <- tune_heart_knn(heart_recipe_a_s_c_th)

In [18]:
# For each predictor set, keep the row(s) with the highest mean
# cross-validated accuracy. Ties are kept, so more than one row may print.

# Reduce one table of tuning metrics to its best accuracy row(s).
pick_top_accuracy <- function(tuning_metrics) {
  tuning_metrics |>
    filter(.metric == "accuracy") |>
    select(neighbors, mean, .metric) |>
    slice_max(order_by = mean, n = 1)
}

heart_fit_age_acc <- pick_top_accuracy(heart_results_age)
heart_fit_age_acc

heart_fit_a_s_acc <- pick_top_accuracy(heart_results_a_s_)
heart_fit_a_s_acc

heart_fit_a_s_c_acc <- pick_top_accuracy(heart_results_a_s_c_)
heart_fit_a_s_c_acc

heart_fit_a_s_c_th_acc <- pick_top_accuracy(heart_results_a_s_c_th)
heart_fit_a_s_c_th_acc

neighbors,mean,.metric
<dbl>,<dbl>,<chr>
56,0.599697,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
56,0.6258586,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
6,0.6269697,accuracy


neighbors,mean,.metric
<dbl>,<dbl>,<chr>
31,0.7025253,accuracy
46,0.7025253,accuracy


In [30]:
# Final model specifications, one per predictor set. The numbers of
# neighbours are the values reported by the tuning step (56, 56, 6, 46);
# for the four-predictor set, 31 and 46 tied on cross-validated accuracy.
# The argument is spelled out as weight_func instead of relying on partial
# matching of `weight`.
heart_spec_age <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 56
) |>
  set_engine("kknn") |>
  set_mode("classification")

heart_spec_a_s <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 56
) |>
  set_engine("kknn") |>
  set_mode("classification")

heart_spec_a_s_c <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 6
) |>
  set_engine("kknn") |>
  set_mode("classification")

heart_spec_a_s_c_th <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 46
) |>
  set_engine("kknn") |>
  set_mode("classification")

In [48]:
# Fit the age-only model on the training set and evaluate it on the test set.
heart_fit_age <- workflow() |>
  add_recipe(heart_recipe_age) |>
  add_model(heart_spec_age) |>
  fit(heart_training)

# Attach the predicted class to the test rows.
heart_pred_age <- heart_fit_age |>
  predict(heart_testing) |>
  bind_cols(heart_testing)

heart_pred_metrics_age <- heart_pred_age |>
  metrics(truth = num, estimate = .pred_class) |>
  filter(.metric == "accuracy")

accuracy_age <- heart_pred_metrics_age |>
  mutate(predictor = c("Age"))
accuracy_age

heart_pred_conf_age <- heart_pred_age |>
  conf_mat(truth = num, estimate = .pred_class)
heart_pred_conf_age

# Precision and recall in percent, with the first level of num ("healthy")
# as the positive class. They are computed from the predictions instead of
# being typed in from the printed confusion matrix, so they stay correct if
# the data, seed, or model changes.
heart_pred_precision_age <- 100 * (
  heart_pred_age |>
    precision(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_precision_age

heart_pred_recall_age <- 100 * (
  heart_pred_age |>
    recall(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_recall_age

.metric,.estimator,.estimate,predictor
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.7466667,Age


          Truth
Prediction healthy sick
   healthy      31   10
   sick          9   25

In [49]:
# Fit the age + sex model on the training set and evaluate it on the test
# set.
heart_fit_a_s <- workflow() |>
  add_recipe(heart_recipe_a_s) |>
  add_model(heart_spec_a_s) |>
  fit(heart_training)

# Attach the predicted class to the test rows.
heart_pred_a_s <- heart_fit_a_s |>
  predict(heart_testing) |>
  bind_cols(heart_testing)

heart_pred_metrics_a_s <- heart_pred_a_s |>
  metrics(truth = num, estimate = .pred_class) |>
  filter(.metric == "accuracy")

accuracy_a_s <- heart_pred_metrics_a_s |>
  mutate(predictor = c("Age and Sex"))
accuracy_a_s

heart_pred_conf_a_s <- heart_pred_a_s |>
  conf_mat(truth = num, estimate = .pred_class)
heart_pred_conf_a_s

# Precision and recall in percent, with the first level of num ("healthy")
# as the positive class. They are computed from the predictions instead of
# being typed in from the printed confusion matrix, so they stay correct if
# the data, seed, or model changes.
heart_pred_precision_a_s <- 100 * (
  heart_pred_a_s |>
    precision(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_precision_a_s

heart_pred_recall_a_s <- 100 * (
  heart_pred_a_s |>
    recall(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_recall_a_s

.metric,.estimator,.estimate,predictor
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.7733333,Age and Sex


          Truth
Prediction healthy sick
   healthy      32    9
   sick          8   26

In [50]:
# Fit the age + sex + cholesterol model on the training set and evaluate it
# on the test set.
heart_fit_a_s_c <- workflow() |>
  add_recipe(heart_recipe_a_s_c) |>
  add_model(heart_spec_a_s_c) |>
  fit(heart_training)

# Attach the predicted class to the test rows.
heart_pred_a_s_c <- heart_fit_a_s_c |>
  predict(heart_testing) |>
  bind_cols(heart_testing)

heart_pred_metrics_a_s_c <- heart_pred_a_s_c |>
  metrics(truth = num, estimate = .pred_class) |>
  filter(.metric == "accuracy")

accuracy_a_s_c <- heart_pred_metrics_a_s_c |>
  mutate(predictor = c("Age, Sex and Cholesterol"))
accuracy_a_s_c

heart_pred_conf_a_s_c <- heart_pred_a_s_c |>
  conf_mat(truth = num, estimate = .pred_class)
heart_pred_conf_a_s_c

# Precision and recall in percent, with the first level of num ("healthy")
# as the positive class. They are computed from the predictions instead of
# being typed in from the printed confusion matrix, so they stay correct if
# the data, seed, or model changes.
heart_pred_precision_a_s_c <- 100 * (
  heart_pred_a_s_c |>
    precision(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_precision_a_s_c

heart_pred_recall_a_s_c <- 100 * (
  heart_pred_a_s_c |>
    recall(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_recall_a_s_c

.metric,.estimator,.estimate,predictor
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.6266667,"Age, Sex and Cholesterol"


          Truth
Prediction healthy sick
   healthy      28   16
   sick         12   19

In [52]:
# Fit the age + sex + cholesterol + max-heart-rate model on the training set
# and evaluate it on the test set.
heart_fit_a_s_c_th <- workflow() |>
  add_recipe(heart_recipe_a_s_c_th) |>
  add_model(heart_spec_a_s_c_th) |>
  fit(heart_training)

# Attach the predicted class to the test rows.
heart_pred_a_s_c_th <- heart_fit_a_s_c_th |>
  predict(heart_testing) |>
  bind_cols(heart_testing)

heart_pred_metrics_a_s_c_th <- heart_pred_a_s_c_th |>
  metrics(truth = num, estimate = .pred_class) |>
  filter(.metric == "accuracy")

accuracy_a_s_c_th <- heart_pred_metrics_a_s_c_th |>
  mutate(predictor = c("Age, Sex, Cholesterol and Max-heartrate"))

accuracy_a_s_c_th

heart_pred_conf_a_s_c_th <- heart_pred_a_s_c_th |>
  conf_mat(truth = num, estimate = .pred_class)
heart_pred_conf_a_s_c_th

# Precision and recall in percent, with the first level of num ("healthy")
# as the positive class. They are computed from the predictions instead of
# being typed in from the printed confusion matrix, so they stay correct if
# the data, seed, or model changes.
heart_pred_precision_a_s_c_th <- 100 * (
  heart_pred_a_s_c_th |>
    precision(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_precision_a_s_c_th

heart_pred_recall_a_s_c_th <- 100 * (
  heart_pred_a_s_c_th |>
    recall(truth = num, estimate = .pred_class, event_level = "first") |>
    pull(.estimate)
)

heart_pred_recall_a_s_c_th

.metric,.estimator,.estimate,predictor
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.7466667,"Age, Sex, Cholesterol and Max-heartrate"


          Truth
Prediction healthy sick
   healthy      34   13
   sick          6   22

In [55]:
# Stack the four test-set accuracy rows and keep the one(s) with the highest
# estimate.
# NOTE(review): this ranks the models by test-set accuracy, so the winning
# score is likely optimistic as an estimate of performance on new data.
best_accuracy <- accuracy_age |>
  bind_rows(accuracy_a_s, accuracy_a_s_c, accuracy_a_s_c_th) |>
  filter(.estimate == max(.estimate))
best_accuracy

.metric,.estimator,.estimate,predictor
<chr>,<chr>,<dbl>,<chr>
accuracy,binary,0.7733333,Age and Sex
