In [None]:
library(tidyverse)
library(repr)
library(infer)
library(cowplot)
library(tidymodels)
# Cap notebook table output (repr) at 6 rows so printed data frames stay short.
options(repr.matrix.max.rows = 6)

<h2>Introduction</h2> 

<h4>Background Information</h4>  

We are interested in what player characteristics and behaviours are most predictive of subscribing to a game-related newsletter, and how these  
features differ between various player types. Specifically, we are interested in whether **a player's total playtime and age can predict whether that player is  
subscribed to a game-related newsletter or not**. In order to carry out our analysis, we must load the data in and make changes to the data so that it  
is formatted appropriately. This also allows us to properly visualize the data which helps us navigate through our analysis as well as convey information.  
With the properly formatted data and the information provided by the visualizations, we can perform the data analysis. More specifically, we  
will be able to create an algorithm that tries to classify whether a player is subscribed to a game-related newsletter based on total playtime and age.  
This algorithm can be trained and tested and its accuracy will help us come to a conclusion. Additional visualizations will help convey the reasoning  
behind our conclusion.

<h4>Data Description</h4>

The datasets provided for the analysis are **sessions.csv** and **players.csv**.

Dataset **sessions.csv** contains data for unique play sessions including the session start and end times and a string unique to each player.  
The columns in this data set are:
1. **hashedEmail**: string variable unique for the player
2. **start_time**: character variable indicating date and time of start of session
3. **end_time**: character variable indicating date and time of end of session
4. **original_start_time**: numeric variable indicating start time of session in unknown format
5. **original_end_time**: numeric variable indicating end time of session in unknown format

Dataset **players.csv** contains data for each unique player including personal and game-related information.  
The columns in this data set are:
1. **experience**: categorical variable stating the experience level of the player
2. **subscribe**: boolean variable stating if the player is subscribed to a game-related newsletter
3. **hashedEmail**: string variable unique for the player
4. **played_hours**: numeric variable stating total hours played for the player
5. **name**: string variable stating the name of the player
6. **gender**: string variable stating the gender of the player
7. **Age**: numeric variable stating the age of the player

<h2>Methods & Results</h2>

<h4>Data Wrangling</h4>

In [None]:
# Read the player table from the project repository, store the response
# `subscribe` as a factor (required for classification), and discard players
# whose Age is missing.
players_url <- "https://raw.githubusercontent.com/Kqvii/DSCI100-Group13/refs/heads/main/data/players.csv"
players <- read_csv(players_url) |>
  mutate(subscribe = as_factor(subscribe)) |>
  filter(!is.na(Age))
players

In [None]:
# Keep only the response and the two predictors used in the analysis.
players_wrangled <- select(players, subscribe, played_hours, Age)

In [None]:
# Per subscription status: number of players (`subscribed`) and their mean
# total playtime (`average_hours_played`).
subscribed_viz_data <- summarize(
  group_by(players_wrangled, subscribe),
  subscribed = n(),
  average_hours_played = mean(played_hours)
)
subscribed_viz_data

# Bar chart of the player counts; heights come straight from the summary
# table, hence stat = "identity".
subscribed_viz <- ggplot(
  subscribed_viz_data,
  aes(x = subscribe, y = subscribed, fill = subscribe)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Bar Plot for Number of Players Subscribed to a Game-Related Newsletter",
    x = "Subscribed to Game-Related Newsletter",
    y = "Number of Players",
    fill = "Subscribed"
  )
subscribed_viz

In [None]:
# initial_split() assigns rows to the training/testing sets at random.
# Fix the RNG seed first so the split (and every result derived from it)
# is the same each time the notebook is run.
set.seed(2024)

# 75% training / 25% testing, stratified on `subscribe` so both sets keep
# roughly the same proportion of subscribers.
players_split <- initial_split(players_wrangled, prop = 0.75, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)
players_train
players_test

In [None]:
# Preprocessing: centre and scale both predictors so that played_hours and Age
# contribute comparably to the K-NN distance. The recipe is specified from the
# training set only, so nothing about the held-out test rows is involved in
# defining the preprocessing.
subscribe_recipe <- recipe(subscribe ~ played_hours + Age, data = players_train) |>
  step_center(all_predictors()) |>
  step_scale(all_predictors())

# K-nearest-neighbours classifier with K = 5 and unweighted ("rectangular")
# votes from each neighbour.
subscribe_model <- nearest_neighbor(weight_func = "rectangular", neighbors = 5) |>
  set_engine("kknn") |>
  set_mode("classification")

# Bundle the recipe and model into a workflow and fit it on the training data.
subscribe_fit <- workflow() |>
  add_recipe(subscribe_recipe) |>
  add_model(subscribe_model) |>
  fit(data = players_train)

In [None]:
# Predict the subscription class for every test-set player, then attach the
# predictions as an extra column to the right of the original test columns.
players_predictions <- subscribe_fit |>
  predict(new_data = players_test)
players_test_predictions <- players_test |>
  bind_cols(players_predictions)

players_test_predictions

In [None]:
# Display the held-out test rows and the predicted classes as two separate
# tables (players_test_predictions shows the same information side by side).
players_test
players_predictions

In [None]:
# Scatter plot of total playtime against age, one point per player, coloured
# by subscription status.
players_plot <- ggplot(players_wrangled, aes(x = Age, y = played_hours)) +
  geom_point(aes(colour = subscribe)) +
  xlab("Age (years)") +
  ylab("Overall Playtime (Hours)")
players_plot

In [None]:
# Histogram of total playtime in 1-hour bins, filled by subscription status.
# The x-axis is limited to (-1, 20) hours.
players_dist <- ggplot(players_wrangled, aes(x = played_hours, fill = subscribe)) +
  geom_histogram(color = "Black", binwidth = 1) +
  scale_x_continuous(limits = c(-1, 20)) +
  labs(title = "Distribution of Players", x = "Number of Hours Played")

players_dist

In [None]:
# Classify a single hypothetical player: 0.1 hours played, 40 years old.
new_obs <- tibble(played_hours = 0.1, Age = 40)
class_prediction <- subscribe_fit |>
  predict(new_data = new_obs)
class_prediction

In [None]:
# vfold_cv() assigns training rows to folds at random; fix the seed so that
# re-running this cell reproduces the same folds and the same accuracy
# estimates.
set.seed(2024)

# 5-fold cross-validation on the training set, stratified on `subscribe`.
subscribe_vfold <- vfold_cv(players_train, v = 5, strata = subscribe)

In [None]:
# Fit the K = 5 workflow on each cross-validation fold of the training data.
players_resample_fit <- fit_resamples(
  workflow() |>
    add_recipe(subscribe_recipe) |>
    add_model(subscribe_model),
  resamples = subscribe_vfold
)

In [None]:
# Aggregate the per-fold performance metrics across the folds.
players_metrics <- players_resample_fit |>
  collect_metrics()
players_metrics

In [None]:
# K-NN specification whose number of neighbours is left as a tuning
# parameter (tune()) instead of a fixed value.
players_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

players_tune

In [None]:
# Candidate values of K to evaluate: 1, 2, ..., 10.
k_vals <- tibble(neighbors = seq(1, 10, by = 1))

# Workflow pairing the preprocessing recipe with the tunable K-NN model.
knn_workflow <- workflow() |>
  add_recipe(subscribe_recipe) |>
  add_model(players_tune)

# Evaluate every candidate K on the cross-validation folds and collect the
# aggregated performance metrics.
knn_results <- collect_metrics(
  tune_grid(knn_workflow, resamples = subscribe_vfold, grid = k_vals)
)

knn_results

In [None]:
# Keep only the accuracy rows of the tuning results.
accuracies <- filter(knn_results, .metric == "accuracy")

# Estimated cross-validation accuracy as a function of K.
accuracy_versus_k <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 14, by = 1)) + # integer ticks on the x-axis
  scale_y_continuous(limits = c(0.4, 1.0)) + # fixed y-axis range
  labs(x = "Neighbors", y = "Accuracy Estimate")
accuracy_versus_k