In [None]:
# all libraries needed
library(tidyverse)
library(tidymodels)
library(forcats)
library(repr)
library(dplyr)

In [None]:
# wrangling
# Load the per-player attributes and the per-session log.
player_data <- read_csv('data/players.csv')
session_data <- read_csv('data/sessions.csv')

# One row per (player, session). The left join keeps players that have no
# row in session_data as a single row whose session columns are NA.
merged_player_data <- left_join(player_data, session_data, by = 'hashedEmail')

# Keep a copy of the merged table on disk.
write.csv(merged_player_data, "merged_file.csv", row.names = FALSE)

# Collapse the merged table to one row per player.
final_data_players <- merged_player_data |>
  group_by(hashedEmail) |>
  summarize(
    # n() would report 1 session for a player who has no sessions at all,
    # because the left join still emits one (NA-filled) row for them.
    # Counting only rows whose key exists in session_data yields 0 instead.
    num_sessions = sum(hashedEmail %in% session_data$hashedEmail),
    Age = first(Age),
    played_hours = first(played_hours)
  ) |>
  # Binary response: under 18 vs. 18 and over. A missing Age stays NA here.
  mutate(age_group = as_factor(if_else(Age < 18, "Underaged", "Adult"))) |>
  # Age is dropped so it cannot be used as a predictor of age_group.
  select(-Age)

final_data_players

In [None]:
# Size of the rendered plot in the notebook output.
options(repr.plot.width = 8, repr.plot.height = 7)

# First visualization: number of sessions against total hours played,
# coloured by age group, to look for a correlation. Both axes are limited
# to the 0-50 range.
session_to_hours_plot <- ggplot(
  final_data_players,
  aes(x = num_sessions, y = played_hours, color = age_group)
) +
  geom_point() +
  lims(x = c(0, 50), y = c(0, 50)) +
  labs(
    color = "Age group",
    x = "number of sessions played",
    y = "total hours played"
  )

session_to_hours_plot

# If the experience levels are ordered Amateur, Beginner, Regular, Pro, and then Veteran, this data actually suggests that more experienced players play less.

In [None]:
# Split into training (75%) and testing (25%) sets, stratified on age_group
# so both sets keep a similar class balance. The seed makes the split
# reproducible.
set.seed(3456)

player_split <- final_data_players |>
  initial_split(prop = 0.75, strata = age_group)

player_train <- player_split |> training()
player_test <- player_split |> testing()

player_train
player_test

In [None]:
# model test
# Baseline: cross-validated K-nearest-neighbours classifier with K fixed at 5.
set.seed(3456)

# 10-fold cross-validation repeated 3 times, stratified on the response.
player_vfold <- player_train |>
  vfold_cv(v = 10, repeats = 3, strata = age_group)

# Unweighted ("rectangular") KNN classifier using the kknn engine.
knn_spec <- nearest_neighbor(neighbors = 5, weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# Scale, then center, both predictors so neither dominates the distance.
player_recipe <- recipe(
  age_group ~ num_sessions + played_hours,
  data = player_train
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Fit the workflow on every resample.
player_resample_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(player_recipe) |>
  fit_resamples(resamples = player_vfold)

# Summarise the resampled performance metrics.
player_metrics <- player_resample_fit |>
  collect_metrics()
player_metrics

In [None]:
# tune model
# Tune the number of neighbours (K = 1..20) with 10-fold cross-validation.

set.seed(3456)

# Single (non-repeated) 10-fold CV on the training set, stratified on the
# response. This replaces the repeated-CV object of the same name.
player_vfold <- vfold_cv(player_train, v = 10, strata = age_group)

# Candidate values of K.
k_vals <- tibble(neighbors = 1:20)

# weight_func is set to "rectangular" to match the baseline knn_spec; leaving
# it unset would tune a differently weighted classifier than the baseline.
knn_tune_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

knn_workflow <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_tune_spec)

better_tuning_results <- knn_workflow |>
  tune_grid(resamples = player_vfold, grid = k_vals)

# Best-performing values of K, ranked by cross-validated accuracy.
results <- show_best(better_tuning_results, metric = "accuracy")
results

In [None]:
# select the best model and update the workflow

# Highest mean accuracy wins; ties are broken in favour of the smallest K.
best_k <- results |>
  filter(.metric == "accuracy") |>
  filter(mean == max(mean)) |>
  arrange(neighbors) |>
  slice(1) |>
  pull(neighbors)

# Same model family and weighting as the tuned spec, with K fixed at best_k.
knn_tune_spec_final <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
) |>
  set_engine("kknn") |>
  set_mode("classification")

final_knn_workflow <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_tune_spec_final)

# Refit on the full training set with the selected K.
final_model_fit <- fit(
  final_knn_workflow,
  data = player_train
)

final_model_fit

### Predicting the age of the user

Introduction

Online games allow an extensive amount of data to be collected on users for understanding user behavior. A research group in Computer Science at the University of British Columbia (UBC), led by Frank Wood, is collecting data from a custom Minecraft server to study how people play video games. They aim to optimize their project by targeting recruitment efforts and ensuring server resources are sufficient for the player base. To do this, they need to better understand the characteristics and behaviors of their players. One question is: can a player's engagement metrics, specifically the total number of sessions played and the total hours played, be used to predict whether a player is an adult (18+) or not an adult (under 18) in the UBC server dataset?

The analysis uses two datasets provided by the research group: players.csv, containing unique player attributes, and sessions.csv, containing records of individual play sessions. To prepare the data for analysis, the two datasets were combined on the common variable hashedEmail. The goal was to isolate the number of times a player logged in and the player's total play hours.

The merged data was then grouped by each unique player (hashedEmail).

From these groups, a final player-level summary dataset was created by aggregating the data to calculate the total number of sessions (num_sessions) and extracting the player's total play time (played_hours) and Age.

A new binary response variable, age_group, was engineered from the Age column to classify players into two categories: "Underaged" (less than 18 years old) and "Adult" (18 years or older). The original Age column was then removed to prevent data leakage.

The final dataset used for analysis consists of 196 observations (unique players) and the following variables: