# DSCI Group Project -- Group 42

## Research Question

In [None]:
library(tidyverse)
library(dplyr)
library(tidymodels)
library(themis)

# Seed the random number generator so later random steps are reproducible.
set.seed(6666)

# Read players.csv from the working directory.
players <- read_csv("players.csv")

# Build the analysis table:
#   * drop the hashedEmail, gender and name columns,
#   * convert subscribe and experience to factors,
#   * keep only rows where Age, experience and subscribe are all present.
clean_players <- players |>
    select(-hashedEmail, -gender, -name) |>
    mutate(
        subscribe = as.factor(subscribe),
        experience = as.factor(experience)
    ) |>
    filter(
        !is.na(Age),
        !is.na(experience),
        !is.na(subscribe)
    )

# Display the cleaned table.
clean_players

## Setting Up Training and Testing Data

In [None]:
# Split the cleaned data 70% / 30%, stratifying on subscribe, then extract
# the training and testing portions from the split object.
players_split <- clean_players |>
    initial_split(prop = 0.7, strata = subscribe)
players_train <- players_split |> training()
players_test <- players_split |> testing()

## Creating a Recipe and Model

In [None]:
# Preprocessing recipe: subscribe is the outcome and every other column of
# players_train is a predictor.
players_recipe <- recipe(subscribe ~ ., data = players_train)

# Scale, then center, the two numeric predictors.
players_recipe <- players_recipe |>
    step_scale(Age, played_hours) |>
    step_center(Age, played_hours)

# Upsample subscribe to a 1:1 class ratio. skip = TRUE marks the step to be
# skipped when the recipe is baked on new data.
players_recipe <- players_recipe |>
    step_upsample(subscribe, over_ratio = 1, skip = TRUE)

# K-nearest-neighbours classifier with unweighted ("rectangular") votes;
# the number of neighbours is left as a tuning placeholder.
knn_spec_tuned <- nearest_neighbor(
    neighbors = tune(),
    weight_func = "rectangular"
) |>
    set_mode("classification") |>
    set_engine("kknn")

## Using 5-Fold Cross-Validation to Tune the Model

In [None]:
# Five cross-validation folds of the training data, stratified on subscribe.
players_vfold <- vfold_cv(players_train, v = 5, strata = subscribe)

# Candidate values of k: every whole number from 1 to 103.
k_vals <- tibble(neighbors = seq(1, 103, by = 1))

# Tune the workflow over the grid and keep only the collected metrics.
# Note: despite its name, knn_tuned_fit holds the metrics table returned by
# collect_metrics(), not a fitted model.
knn_tuned_fit <- tune_grid(
    workflow() |>
        add_model(knn_spec_tuned) |>
        add_recipe(players_recipe),
    resamples = players_vfold,
    grid = k_vals
) |>
    collect_metrics()

## Visualize Accuracy with Different K Values

In [None]:
# Keep only the accuracy rows of the tuning metrics.
accuracy <- filter(knn_tuned_fit, .metric == "accuracy")

# Scatter plot of the mean accuracy against the number of neighbours.
accuracy_plot <- ggplot(accuracy, aes(x = neighbors, y = mean)) +
    geom_point() +
    labs(
        x = "Number of Neighbors",
        y = "Mean Value of Accuracy",
        title = "The Relationship between Number of Neighbors (k) and Their Mean Value"
    ) +
    theme(text = element_text(size = 20))

accuracy_plot

## Finding the Best K and Fitting It on the Training Data

In [None]:
# Pick the number of neighbours with the highest mean cross-validated
# accuracy. (Selecting on std_err would return the most *stable* k, not the
# most accurate one.) with_ties = FALSE guarantees exactly one row, so best_k
# is a single number that can be passed to the model specification.
best_k <- accuracy |>
    slice_max(mean, n = 1, with_ties = FALSE) |>
    pull(neighbors)
best_k

# Final model specification using the selected k.
knn_spec_best <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the final workflow on the TRAINING data only; the testing data must stay
# unseen until evaluation, otherwise the reported test metrics are inflated.
knn_best_fit <- workflow() |>
    add_recipe(players_recipe) |>
    add_model(knn_spec_best) |>
    fit(data = players_train)


## Predict Using Testing Data

In [None]:
# Predict the class of every testing row with knn_best_fit and place the
# predictions in front of the original testing columns.
subscribe_predictions <- bind_cols(
    predict(knn_best_fit, players_test),
    players_test
)

# Compare the predicted class (.pred_class) with the true subscribe value.
subscribe_metrics <- metrics(
    subscribe_predictions,
    truth = subscribe,
    estimate = .pred_class
)

subscribe_metrics
subscribe_predictions