# DSCI Group Project -- Group 42

## Research Question

In [3]:
library(tidyverse)
library(dplyr)
library(tidymodels)
set.seed(6666)

players <- read_csv("players.csv")


# clean_players <- players |>
#     mutate(subscribe = as.factor(subscribe),
#            experience = as.factor(experience),
#            gender = as.factor(gender)) |>
#     select(-hashedEmail, -Age, -name)

clean_players <- players |>
    mutate(subscribe = as.factor(subscribe)) |>
    mutate(experience = as.numeric(factor(experience, 
                        levels = c("Beginner", "Amateur", "Pro", "Veteran")))) |>
    mutate(Age = as.numeric(Age)) |>
  
    select(-hashedEmail, -gender, -name)

clean_players

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

experience,subscribe,played_hours,Age
<dbl>,<fct>,<dbl>,<dbl>
3,TRUE,30.3,9
4,TRUE,3.8,17
4,FALSE,0.0,17
2,TRUE,0.7,21
,TRUE,0.1,21
2,TRUE,0.0,17
,TRUE,0.0,19
2,FALSE,0.0,21
2,TRUE,0.1,47
4,TRUE,0.0,22


## Setting Up Training and Testing Data

In [4]:
players_split <- initial_split(clean_players, prop = 0.7, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)

## Creating a Recipe and Model

In [5]:
players_recipe <- recipe(subscribe ~ ., data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
players_recipe

knn_spec_tuned <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:   1
predictor: 3



── Operations 

[36m•[39m Scaling for: [34mall_predictors()[39m

[36m•[39m Centering for: [34mall_predictors()[39m



## Using 5 Fold Cross Validation and Train the Model

In [None]:
players_vfold <- vfold_cv(players_train, v = 5, strata = subscribe)
k_vals <- tibble(neighbors = seq(from = 1, to = 103, by = 1))

knn_tuned_fit <- workflow() |>
    add_recipe(players_recipe) |>
    add_model(knn_spec_tuned) |>
    tune_grid(resamples = players_vfold, grid = k_vals) |>
    collect_metrics()

→ [31m[1mA[22m[39m | [31merror[39m:   only 0's may be mixed with negative subscripts

There were issues with some computations   [1m[31mA[39m[22m: x1



## Visualize Accuracy with Different K Values

In [None]:
accuracy <- knn_tuned_fit |>
    filter(.metric == "accuracy")
accuracy_plot <- accuracy |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    labs(x = "Number of Neighbors",
         y = "Mean Value of Accuracy") +
    ggtitle("The Relationship between Number of Neighbors (k) and Their Mean Value") +
    theme(text = element_text(size = 20))
accuracy_plot

## Find the Best K and Fitting it in Training Data

In [13]:
best_k <- accuracy |>
    filter(mean == max(mean)) |>
    pull(neighbors)
knn_spec_best <- nearest_neighbor(weight_func = "rectangular", neighbor = 29) |>
    set_engine("kknn") |>
    set_mode("classification")
knn_best_fit <- workflow() |>
    add_recipe(players_recipe) |>
    add_model(knn_spec_best) |>
    fit(players_test)


## Predict Using Testing Data

In [14]:
subscribe_predictions <- predict(knn_best_fit, players_test) |>
    bind_cols(players_test)
subscribe_metrics <- subscribe_predictions |>
    metrics(truth = subscribe, estimate = .pred_class)
subscribe_metrics
subscribe_predictions

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.7333333
kap,binary,0.0


.pred_class,experience,subscribe,played_hours,gender
<fct>,<fct>,<fct>,<dbl>,<fct>
True,Veteran,False,0.0,Male
True,Amateur,False,0.0,Male
True,Pro,True,0.0,Male
True,Amateur,True,0.2,Male
True,Veteran,True,0.0,Non-binary
True,Regular,True,0.3,Male
True,Amateur,False,0.1,Female
True,Beginner,True,0.0,Male
True,Amateur,True,1.8,Male
True,Veteran,True,0.1,Male
