In [2]:
# Load the packages used throughout the analysis.
library(repr)
library(dplyr)
library(tidyverse)
library(tidymodels)
# Install themis only when it is missing, so that re-running this cell does
# not reinstall the package every time.
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}
library(themis)
# Cap the number of data-frame rows printed in notebook output.
options(repr.matrix.max.rows = 10)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [16]:
# Fetch the unscaled HTRU2 archive from the UCI repository and extract its
# contents into the current working directory.
pulsar_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip"
download.file(url = pulsar_url, destfile = "HTRU2.zip")
unzip(zipfile = "HTRU2.zip")

# Read the raw CSV (it has no header row) and tidy it:
#   * give the nine columns descriptive names,
#   * add an `id` for each individual pulsar star candidate,
#   * turn `class` into a labelled factor (0 = non_pulsar, 1 = pulsar),
#   * move `id` and `class` to the front for readability.
# `class` is deliberately kept as a factor (not a character vector): it is the
# categorical outcome of the classification model, and the untouched test set
# must carry a factor outcome when predictions are later scored against it.
pulsar_data <- read_csv("HTRU_2.csv", col_names = FALSE) |>
  rename(
    mean_intp = X1,
    std_dev_intp = X2,
    xs_kurtosis_intp = X3,
    skewness_intp = X4,
    mean_dmsnr = X5,
    std_dev_dmsnr = X6,
    xs_kurtosis_dmsnr = X7,
    skewness_dmsnr = X8,
    class = X9
  ) |>
  tibble::rowid_to_column("id") |>
  mutate(
    id = as_factor(id),
    # Explicit levels/labels: any value other than 0 or 1 becomes NA.
    class = factor(
      class,
      levels = c(0, 1),
      labels = c("non_pulsar", "pulsar")
    )
  ) |>
  select(id, class, everything())

# Split the data into training (75%) and testing (25%) sets, stratified on
# `class` so both sets have roughly the same pulsar / non-pulsar proportions.
# The split is random, so fix the seed to make the analysis reproducible.
set.seed(1234)

pulsar_data_split <- initial_split(pulsar_data, prop = 0.75, strata = class)
pulsar_data_train <- training(pulsar_data_split)
pulsar_data_testing <- testing(pulsar_data_split)

# Balance the two classes in the training set by upsampling the minority
# class until both classes have the same number of rows (over_ratio = 1).
# skip = FALSE makes the upsampling step run when the recipe is baked too.
# NOTE(review): the folds created later are drawn from this upsampled data, so
# copies of the same row may land in both the analysis and assessment parts of
# a fold and inflate the cross-validated accuracy -- confirm this is intended.
ups_recipe <- recipe(class ~ ., data = pulsar_data_train) |>
  step_upsample(class, over_ratio = 1, skip = FALSE) |>
  prep()

pulsar_data_training <- bake(ups_recipe, new_data = pulsar_data_train)

# Preprocessing recipe built on the balanced training data: predict `class`
# from the eight measured features (`id` is left out), scaling and then
# centring every predictor.
pulsar_data_recipe <- recipe(
  class ~ mean_intp + std_dev_intp + xs_kurtosis_intp + skewness_intp +
    mean_dmsnr + std_dev_dmsnr + xs_kurtosis_dmsnr + skewness_dmsnr,
  data = pulsar_data_training
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# K-nearest-neighbours classification spec; the number of neighbours is left
# as tune() so it can be chosen by cross-validation.
pulsar_data_tune <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

# 5-fold cross-validation folds over the balanced training data, stratified
# on `class`.
pulsar_data_vfold <- vfold_cv(
  data = pulsar_data_training,
  v = 5,
  strata = class
)

# Tune the number of neighbours: combine the recipe and the model spec in a
# workflow, evaluate 10 candidate values on the cross-validation folds and
# collect the resulting metrics.
pulsar_data_results <- workflow() |>
  add_recipe(pulsar_data_recipe) |>
  add_model(pulsar_data_tune) |>
  tune_grid(resamples = pulsar_data_vfold, grid = 10) |>
  collect_metrics()

# Keep only the row with the highest mean accuracy. Ties are broken towards
# the smallest number of neighbours so that exactly one row is returned.
pulsar_data_results <- pulsar_data_results |>
  filter(.metric == "accuracy") |>
  arrange(desc(mean), neighbors) |>
  slice_head(n = 1)

# The tuned value lives in the `neighbors` column; the `n` column of the
# collected metrics is the number of resamples, not K.
pulsar_data_optimal_k <- pulsar_data_results |>
  pull(neighbors)

pulsar_data_optimal_k

Rows: 17898 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (9): X1, X2, X3, X4, X5, X6, X7, X8, X9

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
