In [None]:
library(tidymodels)
library(tidyverse)

In [None]:
# Read players.csv and sessions.csv directly from the project's GitHub
# repository.
players <- "https://raw.githubusercontent.com/Modas101/dsci-100-project-final/refs/heads/main/data/players.csv" |>
    read_csv()
sessions <- "https://raw.githubusercontent.com/Modas101/dsci-100-project-final/refs/heads/main/data/sessions.csv" |>
    read_csv()

# Print the players table.
players

In [None]:
# Set the repr plot width and height options to 8 for the figures.
options(repr.plot.height = 8, repr.plot.width = 8)

# Keep only the players whose Age is recorded.
clean_players <- filter(players, !is.na(Age))

# 75th percentile of played_hours among the remaining players; used as the
# cut-off for labelling a player a high contributor.
played_hours_75th_percentile <- quantile(
    pull(clean_players, played_hours),
    probs = 0.75
)
# played_hours_75th_percentile

# Flag players whose played_hours is strictly above the cut-off.
clean_players <- mutate(
    clean_players,
    high_contributor = played_hours > played_hours_75th_percentile
)




# plots
# Age, stacked counts: number of players in each 2-year age bin, split by
# high-contributor status.
clean_players |> ggplot(aes(x = Age, fill = high_contributor)) +
    geom_histogram(binwidth = 2, position = "stack", alpha = 0.8) +
    labs(title = "Amount of High Contributors by Age",
        x = "Age (bins of 2 years)",
        y = "Amount of Players",
        fill = "High Contributor") +
    # The element must be passed as `text =`; an unnamed element_text() is not
    # applied as the plot-wide text setting.
    theme(text = element_text(size = 20))

# Age, proportions: position = "fill" rescales every bin to a total of 1, so
# the y-axis is the share (0 to 1) of players in each status.
clean_players |> ggplot(aes(x = Age, fill = high_contributor)) +
    geom_histogram(binwidth = 2, position = "fill", alpha = 0.8) +
    labs(title = "Proportion of High Contributors by Age",
        x = "Age (bins of 2 years)",
        y = "Proportion of Players",
        fill = "High Contributor") +
    theme(text = element_text(size = 20))
# Experience level, stacked counts: number of players per experience level,
# split by high-contributor status.
clean_players |> ggplot(aes(x = experience, fill = high_contributor)) +
    geom_bar(position = "stack") +
    labs(title = "Amount of High Contributors by Experience Level",
        x = "Experience Level",
        y = "Amount of Players",
        fill = "High Contributor") +
    # The element must be passed as `text =`; an unnamed element_text() is not
    # applied as the plot-wide text setting.
    theme(text = element_text(size = 20))

# Experience level, proportions: position = "fill" rescales every bar to a
# total of 1, so the y-axis is the share (0 to 1) of players in each status.
clean_players |> ggplot(aes(x = experience, fill = high_contributor)) +
    geom_bar(position = "fill") +
    labs(title = "Proportion of High Contributors by Experience Level",
        x = "Experience Level",
        y = "Proportion of Players",
        fill = "High Contributor") +
    theme(text = element_text(size = 20))
# Newsletter subscription, stacked counts: number of players per subscription
# status, split by high-contributor status.
clean_players |> ggplot(aes(x = subscribe, fill = high_contributor)) +
    geom_bar(position = "stack") +
    labs(title = "Amount of High Contributors by Subscription",
        x = "Subscribed to Newsletter",
        y = "Amount of Players",
        fill = "High Contributor") +
    # The element must be passed as `text =`; an unnamed element_text() is not
    # applied as the plot-wide text setting.
    theme(text = element_text(size = 20))

# Newsletter subscription, proportions: position = "fill" rescales every bar
# to a total of 1, so the y-axis is the share (0 to 1) of players in each
# status.
clean_players |> ggplot(aes(x = subscribe, fill = high_contributor)) +
    geom_bar(position = "fill") +
    labs(title = "Proportion of High Contributors by Subscription",
        x = "Subscribed to Newsletter",
        y = "Proportion of Players",
        fill = "High Contributor") +
    theme(text = element_text(size = 20))

# Average played_hours and average Age across the cleaned players table.
mean_hours <- mean(pull(clean_players, played_hours))
mean_age <- mean(pull(clean_players, Age))

# mean_hours
# mean_age

In [None]:
# Keep the outcome and the four candidate predictors, and convert the outcome
# to a factor.
model_data <- clean_players |>
    select(high_contributor, Age, experience, subscribe, gender) |>
    mutate(high_contributor = as.factor(high_contributor))

# Fix the random seed before any random splitting is done.
set.seed(123)

# 75% training / 25% testing split, stratified on the outcome.
data_split <- initial_split(model_data, strata = high_contributor, prop = 0.75)

clean_players_training <- data_split |> training()
clean_players_testing <- data_split |> testing()

# Preprocessing recipe for the K-nearest-neighbours classifier, predicting
# high_contributor from every other column of the training set.
knn_recipe <- recipe(high_contributor ~ ., data = clean_players_training) |>
    # Encode the subscription flag as an integer.
    step_mutate(subscribe = as.integer(subscribe)) |>
    # Encode experience as a numeric score from 1 to 5.
    step_mutate(experience = case_match(experience,
        "Beginner" ~ 1,
        "Regular" ~ 2,
        "Amateur" ~ 3,
        "Veteran" ~ 4,
        "Pro" ~ 5)) |>
    # Reserve a level for categories not seen during training, then expand the
    # remaining nominal predictors into indicator columns.
    step_novel(all_nominal_predictors()) |>
    step_dummy(all_nominal_predictors()) |>
    # Drop constant columns before standardising: the reserved novel level has
    # no training rows, and a rare category can be absent from a resample, so
    # their indicators have zero variance and cannot be scaled.
    step_zv(all_predictors()) |>
    # Standardise every remaining predictor to mean 0 and standard deviation 1.
    step_center(all_predictors()) |>
    step_scale(all_predictors())

# K-nearest-neighbours classification model on the "kknn" engine; the number
# of neighbours is left as a placeholder to be tuned.
knn_spec <- nearest_neighbor(neighbors = tune()) |>
    set_mode("classification") |>
    set_engine("kknn")

# Workflow that pairs the model specification with the preprocessing recipe.
knn_workflow <- workflow() |>
    add_model(knn_spec) |>
    add_recipe(knn_recipe)

# 5-fold cross-validation on the training set, stratified on the outcome.
cv_folds <- clean_players_training |>
    vfold_cv(v = 5, strata = high_contributor)

# Candidate numbers of neighbours: every value from 1 to 80.
k_grid <- tibble(neighbors = seq(1, 80, by = 1))

# Tune the workflow over the grid on the folds and collect the resulting
# metrics.
knn_results <- collect_metrics(
    tune_grid(knn_workflow, resamples = cv_folds, grid = k_grid)
)

# Keep only the accuracy rows of the tuning metrics.
accuracies <- knn_results |>
    filter(.metric == "accuracy")

# Mean accuracy estimate against the number of neighbours.
accuracy_versus_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    # Place a break every 5 neighbours up to the largest K in the results, so
    # the whole tuned range is labelled.
    scale_x_continuous(breaks = seq(0, max(accuracies$neighbors), by = 5)) +
    # Restrict the y-axis to accuracies between 0.4 and 1.0.
    scale_y_continuous(limits = c(0.4, 1.0))
accuracy_versus_k
