# Title

# Introduction 

A research group in Computer Science at UBC, led by Frank Wood, is collecting data about people's gaming behavior using a Minecraft server. To better target recruitment efforts and use resources efficiently, the research group is interested in better understanding the dataset and identifying underlying patterns.

**Overview:** The "players.csv" dataset was collected by a computer science research group at UBC, led by Frank Wood, using a Minecraft server. The dataset lists all unique players and describes each one with seven variables: experience level, subscription status, hashed email, number of hours played, name, gender, and age. The dataset has 196 total observations.

**Variables:**
- experience: A character type variable, describing the experience level of the player. Values are one of Beginner, Amateur, Regular, Pro, or Veteran.
- subscribe: A logical type variable, indicating whether the player has a subscription to a game-related newsletter. 'TRUE' indicates the player has the subscription, while 'FALSE' indicates that the player does not.
- hashedEmail: A character type variable, where each player's email address has been passed through a cryptographic hash function to protect the player's privacy.
- played_hours: A double type variable, indicating the number of hours the player has spent on the server. Median is 0.1 hours and mean is 5.85 hours.
- name: A character type variable, displaying the player's name.
- gender: A character type variable, indicating the gender of the player.
- Age: A double type variable, indicating the age of the player. Median is 19.00 years old and mean is 21.14 years old.

# Methods & Results:

In [None]:
library(tidyverse)
library(repr)
options(repr.matrix.max.rows = 6)
library(patchwork)
library(tidymodels)
source("cleanup.R")

In [None]:
# Seed the RNG so the random steps later in the notebook (data split,
# cross-validation folds) start from a known state.
set.seed(1)

# Read the player table directly from the project's GitHub repository.
player_data <- read_csv(
  file = "https://raw.githubusercontent.com/Inesh-DSCI/Group-Project-Final/refs/heads/main/players.csv"
)
player_data

# Data Wrangling
- Removal of NA's
- Subscriber value column and value name change
- Selection of Age, subscriber_status, experience columns
- Ranking of Experience levels and assigning numeric values to levels

In [None]:
# Build the modelling table: drop rows with a missing Age, turn the logical
# `subscribe` flag into a labelled factor, and keep only the three columns
# used by the classifier.
player_data_selected <- player_data |>
  filter(!is.na(Age)) |>
  mutate(
    subscriber_status = fct_recode(
      as_factor(subscribe),
      "subscribed" = "TRUE",
      "Not Subscribed" = "FALSE"
    )
  ) |>
  select(Age, subscriber_status, experience)
player_data_selected

In [None]:
# Rank the experience levels and expose the rank as a number.
# `experience` becomes an ordered factor (Beginner < ... < Veteran), and
# as.numeric() on that factor yields its position in the level order.
player_order <- player_data_selected |>
  mutate(
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Pro", "Veteran"),
      ordered = TRUE
    ),
    experience_level = as.numeric(experience)
  )
player_order

Note that, in order to rank the experience levels, we used `factor()`, a base R function that was not covered in class. Information regarding the use of `factor()` was collected from: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/factor


<h2> Player Data </h2>

In [None]:
# Played-hours summary per subscription status: maximum, mean and minimum,
# each rounded to two decimals. Missing hours are ignored.
subscriber_max_min_avrg_hours <- player_data |>
  group_by(subscribe) |>
  summarize(
    max_hours = round(max(played_hours, na.rm = TRUE), digits = 2),
    average_hours = round(mean(played_hours, na.rm = TRUE), digits = 2),
    min_hours = round(min(played_hours, na.rm = TRUE), digits = 2)
  )

# Subscriber / non-subscriber head counts per experience level.
# Summing a logical counts its TRUE values.
number_of_subscribers <- player_data |>
  group_by(experience) |>
  summarize(
    yes_subscribed = sum(subscribe, na.rm = TRUE),
    no_subscribed = sum(!subscribe, na.rm = TRUE)
  )

subscriber_max_min_avrg_hours
number_of_subscribers

<h2> Visualization </h2>

In [None]:
# Size used for the plots rendered from this cell onwards.
options(repr.plot.width = 13, repr.plot.height = 8)

# Figure 1A: number of subscribed players at each experience level.
subscribe_and_experience_plot1 <- number_of_subscribers |>
  ggplot(aes(x = experience, y = yes_subscribed, fill = experience)) +
  geom_col() +
  ylim(0, 50) +
  labs(
    x = "Experience level",
    y = "Number of subscribers",
    fill = "Experience level"
  ) +
  ggtitle("Figure 1A: Number of subscribers at each experience level")

# Figure 1B: number of non-subscribed players at each experience level.
# Same y-range as 1A so the two panels can be compared directly.
subscribe_and_experience_plot2 <- number_of_subscribers |>
  ggplot(aes(x = experience, y = no_subscribed, fill = experience)) +
  geom_col() +
  ylim(0, 50) +
  labs(
    x = "Experience level",
    y = "Number of non-subscribers",
    fill = "Experience level"
  ) +
  ggtitle("Figure 1B: Number of non-subscribers at each experience level")

# patchwork's `+` places the two panels side by side.
subscribe_and_experience_plot0 <-
  subscribe_and_experience_plot1 + subscribe_and_experience_plot2

# Figure 2: age distribution, one facet per subscription status.
# - The y-axis counts all players in each facet (subscribed or not), so it is
#   labelled "Number of players" rather than "Number of subscribers".
# - `bins = 30` states ggplot2's default explicitly (silences the
#   "pick a better value" message without changing the plot).
# - `na.rm = TRUE` drops players with a missing Age without a warning.
subscribe_vs_age <- player_data |>
  ggplot(aes(x = Age, fill = subscribe)) +
  geom_histogram(bins = 30, na.rm = TRUE) +
  facet_grid(cols = vars(subscribe)) +
  labs(x = "Age (years)", y = "Number of players", fill = "Subscription") +
  ggtitle("Figure 2: Age of subscribed and not subscribed players")

subscribe_and_experience_plot0
subscribe_vs_age

# Determining K

Data Scaling

In [None]:

# Add standardised (centred, unit-variance) copies of Age and experience_level.
# scale() returns a one-column matrix with attributes; as.numeric() flattens
# it so the new columns are plain numeric vectors instead of matrix columns
# embedded in the tibble.
player_data_scaled <- player_order |>
  mutate(
    scaled_Age = as.numeric(scale(Age, center = TRUE)),
    scaled_experience_level = as.numeric(scale(experience_level, center = TRUE))
  )

Data Split

In [None]:
# 75/25 train/test split, stratified on subscriber_status so both parts keep
# a similar share of subscribers.
player_split <- player_order |>
  initial_split(prop = 0.75, strata = subscriber_status)

player_train <- player_split |> training()
player_test <- player_split |> testing()

Recipe

In [None]:
# Preprocessing recipe for the classifier: predict subscriber_status from
# experience_level and Age, with both predictors scaled and centred.
# Left unprepped because it is handed to workflows, which prep it on fit.
player_recipe <- recipe(subscriber_status ~ experience_level + Age, data = player_train) |>
                step_scale(all_predictors()) |>
                step_center(all_predictors()) 
                
# Same specification, but prepped on player_train so it can be passed to
# bake() directly.
# NOTE(review): the two recipes are built independently. Collapsing this into
# prep(player_recipe) may change how many random step ids are drawn and thus
# the RNG state seen by later resampling -- confirm before deduplicating.
player_recipe2 <- recipe(subscriber_status ~ experience_level + Age, data = player_train) |>
                step_scale(all_predictors()) |>
                step_center(all_predictors()) |>
                prep()

In [None]:
# Apply the prepped recipe to the full (NA-free) player table to inspect the
# standardised predictors.
scaled_data <- player_recipe2 |>
  bake(new_data = player_data_scaled)
scaled_data

Model

In [None]:
# Baseline K-nearest-neighbours classifier: K = 3, unweighted votes
# ("rectangular" kernel), fitted with the kknn engine.
knn_spec <- nearest_neighbor(
  neighbors = 3,
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

Fit

In [None]:
# Bundle the baseline model with the preprocessing recipe and train the
# workflow on the training set.
player_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(player_recipe) |>
  fit(data = player_train)
player_fit

Predictions

In [None]:
# Predict the held-out players and attach the predictions (.pred_class) to
# the test rows for side-by-side comparison.
player_test_predictions <- player_fit |>
  predict(new_data = player_test) |>
  bind_cols(player_test)
player_test_predictions

Prediction Accuracy

In [None]:
# Score the baseline predictions against the true subscription status.
player_prediction_accuracy <- metrics(
  player_test_predictions,
  truth = subscriber_status,
  estimate = .pred_class
)
player_prediction_accuracy

K results

In [None]:
# 5-fold cross-validation on the training set, stratified on the outcome.
player_vfold <- player_train |>
  vfold_cv(v = 5, strata = subscriber_status)

# Refit the baseline (K = 3) workflow on every fold.
player_resample_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(player_recipe) |>
  fit_resamples(resamples = player_vfold)

# Metrics aggregated across the folds.
player_metrics <- collect_metrics(player_resample_fit)
player_metrics


In [None]:
# Same KNN specification, but with the number of neighbours left as a
# tuning parameter.
knn_tune <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Candidate values of K: 1, 2, ..., 10.
k_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Evaluate every candidate K on the cross-validation folds and collect the
# resulting metrics.
knn_results <- workflow() |>
  add_model(knn_tune) |>
  add_recipe(player_recipe) |>
  tune_grid(resamples = player_vfold, grid = k_vals) |>
  collect_metrics()
knn_results

In [None]:
# Keep only the accuracy rows of the tuning results.
accuracies <- filter(knn_results, .metric == "accuracy")

# Cross-validated accuracy estimate as a function of K.
accuracy_versus_k <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 14, by = 1)) +
  scale_y_continuous(limits = c(0.4, 1.0)) +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 20))
accuracy_versus_k


In [None]:
# K with the highest mean cross-validated accuracy (first one on ties).
best_k <- accuracies |>
  arrange(desc(mean)) |>
  slice(1) |>
  pull(neighbors)
best_k

# Data Analysis With Optimal K

Tuned Accuracy 

In [None]:
# Final KNN specification using the K selected by cross-validation.
# `best_k` is used instead of a hard-coded number so this cell stays in sync
# with the tuning results if the data, seed or K grid ever change.
knn_spec_tuned <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Retrain on the full training set with the tuned K.
player_fit_tuned <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_spec_tuned) |>
  fit(data = player_train)

# Predict the held-out players and keep the predictions next to the truth.
player_test_predictions_tuned <- predict(player_fit_tuned, player_test) |>
  bind_cols(player_test)
player_test_predictions_tuned

# Test-set performance of the tuned classifier.
player_prediction_accuracy_tuned <- player_test_predictions_tuned |>
  metrics(truth = subscriber_status, estimate = .pred_class)
player_prediction_accuracy_tuned


Visualization

# Discussion 