# Title

## Introduction

Our question is: Can a player's age and experience level predict whether or not they are subscribed to a game-related newsletter, based on the `players.csv` dataset?

## Methods

In [None]:
#Run these beforehand
library(tidyverse)
library(repr)
library(tidymodels)
library(ggplot2)
library(gridExtra)

In [None]:
# Load the raw player data and prepare it for modelling
players <- read_csv("players.csv")

# Overwrite the headers positionally: this assumes the CSV has exactly these
# seven columns, in this order
names(players) <- c(
  "experience", "subscribe", "hashed_email", "played_hours",
  "name", "gender", "age"
)

# Drop rows missing either predictor, then derive the modelling columns:
#   experience_numeric - experience level encoded as 1 (Beginner) .. 5 (Pro)
#   subscribe_factor   - subscription status as a factor (classification target)
players <- players |>
  filter(!is.na(age), !is.na(experience)) |>
  mutate(
    experience_numeric = recode(
      experience,
      "Pro" = 5,
      "Veteran" = 4,
      "Regular" = 3,
      "Amateur" = 2,
      "Beginner" = 1
    ),
    subscribe_factor = factor(subscribe)
  )

In [None]:
# Scatter plot of numeric experience level against age, with points coloured
# by subscription status
plot <- ggplot(
  players,
  aes(x = age, y = experience_numeric, colour = subscribe)
) +
  geom_point() +
  labs(
    title = "Player Age vs Experience with Subscription",
    x = "Player Age (years)",
    y = "Player Experience",
    colour = "Subscribed?"
  )
plot

In [None]:
# Split the dataset into training (75%) and testing (25%) sets, stratified on
# the subscription outcome so both sets keep a similar class balance.
# initial_split() samples rows at random, so fix the RNG seed first; without
# it the partition (and every metric computed from it) changes on each run.
set.seed(2024)
player_split <- initial_split(players, prop = 0.75, strata = subscribe_factor)
player_train <- training(player_split)
player_test <- testing(player_split)

# Set up the K-nearest-neighbours classification pipeline

# Preprocessing: predict subscription from age and numeric experience; every
# predictor is scaled and then centered using the training data
player_recipe <- recipe(
  subscribe_factor ~ age + experience_numeric,
  data = player_train
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model: unweighted ("rectangular") KNN; the number of neighbours is left as a
# tuning parameter to be filled in by tune_grid()
player_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Bundle the model and the recipe into a single workflow
player_workflow <- workflow() |>
  add_model(player_spec) |>
  add_recipe(player_recipe)

# Find the most accurate number of neighbours via cross-validation

# vfold_cv() assigns rows to folds at random; seed the RNG so the fold
# assignment, and therefore the accuracy estimates, are reproducible.
set.seed(2024)
# 10-fold cross-validation on the training set, repeated 5 times, stratified
# on the outcome
player_vfold <- vfold_cv(
  player_train,
  v = 10,
  strata = subscribe_factor,
  repeats = 5
)

# Candidate values of K: 1 through 20
gridvals <- tibble(neighbors = seq(from = 1, to = 20, by = 1))

# Fit the workflow for every K on every resample and collect the metrics
player_results <- player_workflow |>
  tune_grid(resamples = player_vfold, grid = gridvals) |>
  collect_metrics()

# Keep only the accuracy rows (one per K; `mean` is the estimate)
accuracies <- player_results |>
  filter(.metric == "accuracy")

# Plot estimated accuracy against K
accuracy_versus_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  # Take the tick marks from the tuning grid so every tested K (1-20) is
  # labelled; the previous hard-coded 0-14 range left K = 15..20 unlabelled
  scale_x_continuous(breaks = gridvals$neighbors) +
  scale_y_continuous(limits = c(0.4, 1.0)) # adjusting the y-axis
accuracy_versus_k

In [None]:
# Select the most accurate number of neighbours and evaluate on the test set

# Take K from the cross-validation results instead of hard-coding it, so the
# final model always matches the tuning output. `accuracies` holds one row per
# K with its mean accuracy; ties are resolved in favour of the row that
# appears first.
best_k <- accuracies |>
  arrange(desc(mean)) |>
  slice(1) |>
  pull(neighbors)

# Same model as used for tuning, now with K fixed to the selected value
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("classification")
player_best_spec <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_spec)

# Fit on the full training set, then predict the held-out test set
player_fit <- fit(player_best_spec, data = player_train)
player_predictions <- predict(player_fit, player_test) |>
  bind_cols(player_test)

# Compare predicted classes against the true subscription labels
player_metrics <- player_predictions |>
  metrics(truth = subscribe_factor, estimate = .pred_class)
player_metrics

## Results

## Discussion