In [None]:
#load the necessary packages
library(repr)  #notebook display settings (configured by the options() call below)
library(themis)  #step_upsample(), used in the recipe to rebalance the classes
library(tidyverse)  #read_csv(), select(), mutate(), summarize(), ...
library(tidymodels)  #initial_split(), recipe(), workflow(), tune_grid(), ...
#cap the number of rows shown when a data frame is displayed in the notebook
options(repr.matrix.max.rows = 10)

In [None]:
#download the players data set (a CSV file hosted on Google Drive) and read it into a data frame
url <- "https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
players <- url |>
    read_csv()

In [None]:
#convert the players data set into a tidy format by removing unnecessary columns:
#keep the columns from experience through age, minus the hashedEmail and name identifiers
players_tidy <- players |>
    select(experience:age, -hashedEmail, -name)

#preview only the first 5 rows; players_tidy itself keeps every row for the later analysis
#(head() is called on the finished table rather than piped into the assignment,
#which would have truncated the data set to 5 rows)
head(players_tidy, 5)

In [None]:
#use the mean of played_hours (missing values ignored) as the boundary separating high and low contributors
average_played_hours <- mean(pull(players_tidy, played_hours), na.rm = TRUE)

#report the boundary value
print(paste("The average number of played hours:", average_played_hours))

The output above shows that the average number of played hours in the players data set is approximately 5.85 hours. We use this average as the boundary between the two classes: players who contributed 5.85 hours or more are classified as "High Contributors," and players who contributed fewer than 5.85 hours are classified as "Low Contributors."

In [None]:
#turn the experience and gender columns into factors so each distinct value becomes a category level
players_tidy <- mutate(players_tidy, across(c(experience, gender), as.factor))

In [None]:
#replace experience and gender with numeric codes so KNN can compute distances between players
#NOTE(review): for a factor, as.numeric() returns the position of each value in the factor's level order,
#which is presumably alphabetical here - confirm that this ordering is meaningful for experience
players_tidy <- mutate(players_tidy, across(c(experience, gender), as.numeric))

In [None]:
#assign a contributor label to each played hours value:
#players at or above the average are "High Contributor", players below it are "Low Contributor"
#(>= rather than > so that a player exactly at the boundary counts as high, as the written definition states)
#rows with a missing played_hours value receive an NA label
players_tidy <- players_tidy |> 
    mutate(contributor = factor(ifelse(played_hours >= average_played_hours, "High Contributor", "Low Contributor"))) 

In [None]:
#quick sanity check of the transformed table: show its first rows (can delete later)
players_tidy |>
    head()

In [None]:
#fix the random seed so the split below is reproducible
set.seed(2024)

#put 90% of the rows in the training set and 10% in the test set (train:test = 9:1),
#stratifying on contributor so both sets keep a similar class balance
players_split <- players_tidy |>
    initial_split(prop = 0.9, strata = contributor)
players_test <- testing(players_split)
players_train <- training(players_split)

#show data sample
players_train
players_test

In [None]:
#reset the seed so the steps below are reproducible
set.seed(2024)

#baseline model with k = 3

#preprocessing recipe: predict contributor from gender, age and experience;
#scale and center every predictor, then upsample so the rarer class is as frequent
#as the other one (over_ratio = 1); skip = TRUE means upsampling is not applied to new data at prediction time
players_recipe <- recipe(contributor ~ gender + age + experience, data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors()) |>
    step_upsample(contributor, over_ratio = 1, skip = TRUE)

players_recipe

#KNN classifier with 3 neighbors, each neighbor voting with equal weight ("rectangular")
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

#bundle the recipe and the model into a workflow and fit it on the training set
players_fit <- workflow(preprocessor = players_recipe, spec = knn_spec) |>
    fit(data = players_train)

players_fit

In [None]:
#reset the seed so the fold assignment and tuning are reproducible
set.seed(2024)

#candidate numbers of neighbors: k = 1, 2, ..., 10
k_vals <- tibble(neighbors = seq(1, 10, by = 1))

#5-fold cross-validation on the training set, stratified on contributor
player_vfold <- vfold_cv(players_train, v = 5, strata = contributor)

#KNN specification with the number of neighbors left as a tuning parameter
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

#evaluate every candidate k on the folds and collect the resampled metrics
knn_results <- workflow(preprocessor = players_recipe, spec = knn_spec) |>
  tune_grid(resamples = player_vfold, grid = k_vals) |>
  collect_metrics()

#keep only the accuracy rows
accuracies <- filter(knn_results, .metric == "accuracy")

accuracies