# Model Construction Document

In [1]:
#Read packages into R
library(tidyverse)
library(repr)
library(tidymodels)
library(GGally)
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
#reading the dataset and shortening the column names
pulsar_base_data = read_csv("https://raw.githubusercontent.com/NicoRui/DSCI_100_Project/main/pulsar_data_train.csv", skip = 1,
                            col_names = c("mean_prof", "sd_prof", 
                                          "kurt_prof", "skew_prof", "mean_DMSNR",
                                          "sd_DMSNR","kurt_DMSNR","skew_DMSNR", "target_class")) %>% 
                    mutate(target_class = as_factor(target_class)) %>% 
                    mutate(target_class = fct_recode(target_class,
                                                    "non_pulsar" = '0',
                                                    "pulsar" = '1')) 
           
head(pulsar_base_data)

Parsed with column specification:
cols(
  mean_prof = [32mcol_double()[39m,
  sd_prof = [32mcol_double()[39m,
  kurt_prof = [32mcol_double()[39m,
  skew_prof = [32mcol_double()[39m,
  mean_DMSNR = [32mcol_double()[39m,
  sd_DMSNR = [32mcol_double()[39m,
  kurt_DMSNR = [32mcol_double()[39m,
  skew_DMSNR = [32mcol_double()[39m,
  target_class = [32mcol_double()[39m
)



mean_prof,sd_prof,kurt_prof,skew_prof,mean_DMSNR,sd_DMSNR,kurt_DMSNR,skew_DMSNR,target_class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
121.15625,48.37297,0.3754847,-0.01316549,3.168896,18.39937,7.449874,65.159298,non_pulsar
76.96875,36.17556,0.7128979,3.38871856,2.399666,17.571,9.414652,102.722975,non_pulsar
130.58594,53.22953,0.1334083,-0.29724164,2.743311,22.36255,8.508364,74.031324,non_pulsar
156.39844,48.86594,-0.2159886,-0.17129365,17.471572,,2.958066,7.197842,non_pulsar
84.80469,36.11766,0.8250128,3.27412537,2.790134,20.61801,8.405008,76.291128,non_pulsar
121.00781,47.17694,0.2297081,0.09133623,2.036789,,9.546051,112.131721,non_pulsar


In [3]:
set.seed(632)
pulsar_split = initial_split(pulsar_base_data, prop = 0.8, strata = target_class)
pulsar_train = training(pulsar_split)
pulsar_test = testing(pulsar_split)
head(pulsar_train); head(pulsar_test)

mean_prof,sd_prof,kurt_prof,skew_prof,mean_DMSNR,sd_DMSNR,kurt_DMSNR,skew_DMSNR,target_class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
121.15625,48.37297,0.3754847,-0.01316549,3.168896,18.39937,7.4498741,65.159298,non_pulsar
76.96875,36.17556,0.7128979,3.38871856,2.399666,17.571,9.4146523,102.722975,non_pulsar
130.58594,53.22953,0.1334083,-0.29724164,2.743311,22.36255,8.5083638,74.031324,non_pulsar
84.80469,36.11766,0.8250128,3.27412537,2.790134,20.61801,8.4050084,76.291128,non_pulsar
121.00781,47.17694,0.2297081,0.09133623,2.036789,,9.5460511,112.131721,non_pulsar
79.34375,42.40217,1.0634129,2.24437669,141.641304,,-0.7008088,-1.200653,non_pulsar


mean_prof,sd_prof,kurt_prof,skew_prof,mean_DMSNR,sd_DMSNR,kurt_DMSNR,skew_DMSNR,target_class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
156.3984,48.86594,-0.2159886,-0.17129365,17.471572,,2.958066,7.197842,non_pulsar
83.9375,45.71272,0.74877682,1.34290171,1.747492,12.07058,11.838557,,non_pulsar
111.7109,46.57054,0.27286063,-0.061646,2.139632,16.65571,9.353872,97.73376,non_pulsar
127.3281,45.33954,-0.02866461,-0.20653223,3.955686,26.25178,6.815905,46.623791,non_pulsar
108.7578,47.47084,0.23388338,-0.01183974,3.42893,22.80573,7.431087,58.339075,non_pulsar
117.6797,47.45704,0.2148014,-0.13550329,2.731605,21.99329,8.401669,71.587641,non_pulsar


In [15]:
#Pulsar train and test with no NAs
pulsar_train_noNA = drop_na(pulsar_train)
pulsar_test_noNA = drop_na(pulsar_test)

#Standardization of all predictor variables
kurt_recipe_train = recipe(target_class ~ kurt_prof + kurt_DMSNR, data = pulsar_train_noNA) %>% 
                    step_scale(all_predictors()) %>% 
                    step_center(all_predictors()) %>% 
#                     step_upsample(target_class, over_ratio = 1, skip = FALSE) %>%
                    prep()
scaled_kurt_train = bake(kurt_recipe_train, pulsar_train_noNA)
head(scaled_kurt_train)

# kurt_recipe_test = recipe(target_class ~ kurt_prof + kurt_DMSNR, data = pulsar_test_noNA) %>% 
#                     step_scale(all_predictors()) %>% 
#                     step_center(all_predictors()) %>% 
# #                     step_upsample(target_class, over_ratio = 1, skip = FALSE) %>%
#                     prep()
# scaled_kurt_test = bake(kurt_recipe_test, pulsar_test_noNA)
# head(scaled_kurt_test)

kurt_prof,kurt_DMSNR,target_class
<dbl>,<dbl>,<fct>
-0.10101033,-0.19431386,non_pulsar
0.2079344,0.23448236,non_pulsar
-0.32266206,0.03669256,non_pulsar
0.31058988,0.01413612,non_pulsar
0.07261209,0.24073059,non_pulsar
-0.12656234,-0.10672679,non_pulsar


## Kurtosis

In [16]:
set.seed(781)
kurt_vfold = vfold_cv(scaled_kurt_train, v = 5, strata = target_class)
gridvals = tibble(neighbors = seq(1,50))
knn_kurt_spec = nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>% 
    set_engine("kknn") %>% 
    set_mode("classification")    

In [17]:
set.seed(781)
kurt_fit = workflow() %>% 
    add_recipe(kurt_recipe_train) %>% 
    add_model(knn_kurt_spec) %>% 
    tune_grid(resamples = kurt_vfold, grid = gridvals) %>% 
    collect_metrics() %>% 
    filter(.metric == "roc_auc") %>% #or accuracy
    arrange(mean) %>% 
    slice(1)
kurt_fit

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,accuracy,binary,0.9619028,5,0.002139457,Model01


In [15]:
#Use F1 score: uses ROC_AUC

# create the grid of area/smoothness vals, and arrange in a data frame
# are_grid <- seq(min(unscaled_cancer$Area), max(unscaled_cancer$Area), length.out = 100)
# smo_grid <- seq(min(unscaled_cancer$Smoothness), max(unscaled_cancer$Smoothness), length.out = 100)
# asgrid <- as_tibble(expand.grid(Area = are_grid, Smoothness = smo_grid))

# use the fit workflow to make predictions at the grid points
# knnPredGrid <- predict(knn_fit, asgrid)

# bind the predictions as a new column with the grid points
# prediction_table <- bind_cols(knnPredGrid, asgrid) %>% rename(Class = .pred_class)

# plot:
# 1. the coloured scatter of the original data
# 2. the faded coloured scatter for the grid points
# wkflw_plot <-
#   ggplot() +
#   geom_point(data = , mapping = aes(x = , y = , color = ), alpha = 0.75) +
#   geom_point(data = , mapping = aes(x = , y = , color = ), alpha = 0.02, size = 5.) +
#   labs(color = "") +
#   scale_color_manual(labels = c("", ""), values = )