In [1]:
# Libraries/Packages to load
library(tidyverse)
library(tidymodels)
library(RColorBrewer)
#install.packages("themis")
#install.packages("kknn")
library(themis)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mdplyr  [39m 1.1.1
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 0.5.2
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.2     [32m✔[39m [34mrsample     [39m 1.1.1
[32m✔[39m [34mdials       [39m 1.1.0     [32m✔[39m [34mtune        [39m 1.0.1
[32m✔[39m [34minfer       [39m 1.0.4     [32m✔[39m [34mworkflows   [39m 1.1.2
[32m✔[39

In [2]:
# Reading the data from the web, adding column names
heart_data <- read_csv("https://raw.githubusercontent.com/Mr-Slope/DSCI-100_Group_Project/main/processed.cleveland.data",
                       col_names=c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
                                   "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"))
# Preview the data set
head(heart_data)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): ca, thal
[32mdbl[39m (12): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0


In [3]:
# Cleaning & Wrangling
heart_tidy <- heart_data |>
    filter(ca != "?", thal != "?") |>
    mutate(across(c(ca, thal), as.numeric)) |>
    mutate(across(c(cp, fbs, restecg, exang, slope, ca, thal, num), as_factor)) |>
    mutate(cp = fct_recode(cp, "angina" = "1", "abnormal" = "2", "nonang" = "3", "asymp" = "4")) |>
    mutate(fbs = fct_recode(fbs, "high" = "1", "norm" = "0")) |>
    mutate(restecg = fct_recode(restecg, "norm" = "0", "abnorm" = "1", "damage" = "2")) |>
    mutate(exang = fct_recode(exang, "TRUE" = "1", "FALSE" = "0")) |>
    mutate(slope = fct_recode(slope, "up" = "1", "flat" = "2", "down" = "3")) |>
    mutate(thal = fct_recode(thal, "norm" = "3", "fixed" = "6", "reversible" = "7")) |>
    mutate(num = fct_recode(num, "healthy" = "0", "sick" = "1", "sick" = "2", "sick" = "3", "sick" = "4")) |> # in the data files, 1,2,3,4 are all sick
    tibble() 

In [23]:
# KNN Classification model with predictions for sex, sex + MH, age + sex + mh
set.seed(29)

# Splitting the data (separate this)
heart_split <- initial_split(heart_tidy, prop = 0.75, strata = num)
heart_training <- training(heart_split)
heart_testing <- testing(heart_split)

# model 
heart_tune <- nearest_neighbor(weight="rectangular", neighbors=tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_vfold <- vfold_cv(heart_training, v = 5, strata = num)
gridvals <- tibble(neighbors = seq(1,100,5))

heart_recipe_s <- recipe(num ~ sex, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_s <- workflow() |>
    add_recipe(heart_recipe_s) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

heart_recipe_st <- recipe(num ~ sex + thalach, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_st <- workflow() |>
    add_recipe(heart_recipe_st) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

heart_recipe_ast <- recipe(num ~ age + sex + thalach, data = heart_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

heart_tune_ast <- workflow() |>
    add_recipe(heart_recipe_ast) |>
    add_model(heart_tune) |>
    tune_grid(resamples = heart_vfold, grid = gridvals) |>
    collect_metrics()

In [24]:
heart_fit_s_acc <- heart_fit_s |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice(5)
heart_fit_s_acc

heart_fit_st_acc <- heart_fit_st |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice_max(order_by = mean, n = 1)
heart_fit_st_acc

heart_fit_ast_acc <- heart_fit_ast |>
    select(neighbors, mean, .metric) |>
    filter(.metric == "accuracy") |>
    slice_max(order_by = mean, n = 1)
    
heart_fit_ast_acc

neighbors,mean,.metric
<int>,<dbl>,<chr>
5,0.5406061,accuracy


neighbors,mean,.metric
<int>,<dbl>,<chr>
35,0.7068687,accuracy


neighbors,mean,.metric
<int>,<dbl>,<chr>
13,0.7433333,accuracy
14,0.7433333,accuracy


In [35]:
heart_spec_s <- nearest_neighbor(weight="rectangular", neighbors=5) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_spec_st <- nearest_neighbor(weight="rectangular", neighbors=35) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_spec_ast <- nearest_neighbor(weight="rectangular", neighbors=13) |>
    set_engine("kknn") |>
    set_mode("classification")

heart_fit_s <- workflow() |>
    add_recipe(heart_recipe_s) |>
    add_model(heart_spec_s) |>
    fit(heart_training) 

heart_pred_s <- heart_fit_s |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_s_metrics <- heart_pred_s |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_s_metrics
heart_pred_s_conf <- heart_pred_s |>
    conf_mat(truth=num, estimate=.pred_class)
heart_pred_s_conf

heart_fit_st <- workflow() |>
    add_recipe(heart_recipe_st) |>
    add_model(heart_spec_st) |>
    fit(heart_training) 

heart_pred_st <- heart_fit_st |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_st_metrics <- heart_pred_st |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_st_metrics
heart_pred_st_conf <- heart_pred_st |>
    conf_mat(truth=num,estimate=.pred_class)
heart_pred_st_conf

heart_fit_ast <- workflow() |>
    add_recipe(heart_recipe_ast) |>
    add_model(heart_spec_ast) |>
    fit(heart_training) 

heart_pred_ast <- heart_fit_ast |>
    predict(heart_testing) |>
    bind_cols(heart_testing) 
heart_pred_ast_metrics <- heart_pred_ast |>
    metrics(truth = num, estimate=.pred_class) |>
    filter(.metric == "accuracy")
heart_pred_ast_metrics
heart_pred_ast_conf <- heart_pred_ast |>
    conf_mat(truth=num,estimate=.pred_class)
heart_pred_ast_conf

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.5333333


          Truth
Prediction healthy sick
   healthy      40   35
   sick          0    0

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.7733333


          Truth
Prediction healthy sick
   healthy      35   12
   sick          5   23

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.72


          Truth
Prediction healthy sick
   healthy      31   12
   sick          9   23