In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

- Broad Question: What player characteristics and behaviours are most predictive of subscribing to a game-related newsletter, and how do these features differ between various player types?

- Specific Question: Can a player's total hours played (`played_hours`) and age (`Age`) predict whether they subscribe to the newsletter (`subscribe`) in `players.csv`?

In [None]:
# Report the size of the players dataset.
#
# `players` is created by the data-loading cell further down the notebook, so
# fail with an actionable message when this cell is run before that one.
# Note: the loading cell keeps only three columns, so the variable count
# printed here reflects the trimmed table, not the raw CSV.
if (!exists("players")) {
  stop(
    "`players` has not been loaded yet; run the data-loading cell first.",
    call. = FALSE
  )
}
cat("\nNumber of observations in players dataset:", nrow(players))
cat("\nNumber of variables in players dataset:", ncol(players))

In [None]:
# Load the players data, keep only the variables used in the analysis, and
# recode `subscribe` (levels FALSE/TRUE) as a labelled factor, the outcome.
players <- read_csv(
  "https://raw.githubusercontent.com/Jay7615/Project/refs/heads/main/players.csv",
  show_col_types = FALSE
) |>
  select(played_hours, Age, subscribe) |>
  rename(age = Age) |>
  mutate(
    subscribe = factor(
      subscribe,
      levels = c(FALSE, TRUE),
      labels = c("Non-subscriber", "Subscriber")
    )
  )

# Per-class descriptive statistics. Every statistic drops missing values so
# that a single NA cannot blank out a whole column; the number of NAs per
# variable is reported separately in `na_age` / `na_hours`.
subscription_summary <- players |>
  group_by(subscribe) |>
  summarise(
    n = n(),
    percent = n / nrow(players),
    mean_age = mean(age, na.rm = TRUE),
    mean_hours = mean(played_hours, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    median_hours = median(played_hours, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    sd_hours = sd(played_hours, na.rm = TRUE),
    na_age = sum(is.na(age)),
    na_hours = sum(is.na(played_hours))
  ) |>
  # Namespace-qualified so the formatter is not confused with the `percent`
  # column it is applied to.
  mutate(percent = scales::percent(percent, accuracy = 0.1))

# Missing ages are intentionally NOT imputed here. Filling them with the
# median of the full dataset before the train/test split would leak test-set
# information into preprocessing; the modelling recipe's
# `step_impute_median(age)` imputes using the training data only.

cat("=== Summary ===\n")
print(subscription_summary, width = Inf)


In [None]:
# Scatter plot of hours played against age, one point per player, coloured by
# subscription status. Points are semi-transparent so overlaps stay visible.
players |>
  ggplot(aes(age, played_hours, colour = subscribe)) +
  geom_point(alpha = 0.6) +
  ggtitle("Played Hours vs Age by Subscription Status") +
  xlab("Age") +
  ylab("Hours Played") +
  theme_minimal()


In [None]:
# Fix the RNG state so the train/test split, the CV folds and therefore the
# selected number of neighbours are reproducible. The seed value is arbitrary.
set.seed(2024)

# 70/30 train/test split, stratified so both parts keep the class balance.
players_split <- initial_split(players, prop = 0.7, strata = subscribe)
training_set <- training(players_split)
testing_set <- testing(players_split)

# Preprocessing is estimated on the training data only: fill missing ages
# with the median, then centre and scale both predictors so that neither one
# dominates the distance computation used by k-NN.
player_recipe <- recipe(
  subscribe ~ played_hours + age,
  data = training_set
) |>
  step_impute_median(age) |>
  step_normalize(all_numeric_predictors())

# Unweighted k-NN classifier; the number of neighbours is left to be tuned.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_mode("classification") |>
  set_engine("kknn")

# 5-fold cross-validation on the training set, stratified by the outcome.
cv_folds <- vfold_cv(training_set, v = 5, strata = subscribe)

knn_workflow <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_spec)

# Evaluate every k from 1 to 100 on the resamples.
knn_tune <- knn_workflow |>
  tune_grid(
    resamples = cv_folds,
    grid = tibble(neighbors = seq(1, 100, by = 1)),
    metrics = metric_set(accuracy, roc_auc)
  )

# Pick the k with the best cross-validated accuracy. `metric` is passed by
# name because recent tune releases do not accept it positionally. The
# selection is run once and `best_k` is derived from it so the two objects
# cannot disagree.
best_knn <- select_best(knn_tune, metric = "accuracy")
best_k <- best_knn$neighbors


In [None]:
# Unweighted k-NN classifier using the neighbour count chosen during tuning.
final_spec <- nearest_neighbor(
  neighbors = best_k,
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Train the preprocessing recipe and the final model on the training set.
final_fit <- fit(
  workflow() |>
    add_model(final_spec) |>
    add_recipe(player_recipe),
  data = training_set
)

# Predicted classes for the held-out test set, placed next to the test data.
player_predictions <- bind_cols(
  predict(final_fit, new_data = testing_set),
  testing_set
)

player_results <- rename(
  player_predictions,
  predicted_subscription = .pred_class
)

# Test-set accuracy and confusion matrix of predicted vs. actual status.
player_metrics <- accuracy(
  player_results,
  truth = subscribe,
  estimate = predicted_subscription
)

player_conf_mat <- conf_mat(
  player_results,
  truth = subscribe,
  estimate = predicted_subscription
)

player_metrics
player_conf_mat