In [None]:
# all libraries needed
library(tidyverse)
library(tidymodels)
library(forcats)
library(repr)
library(dplyr)

In [None]:
# wrangling: load the raw tables, join them, and build one row per player
player_data <- read_csv("data/players.csv")
session_data <- read_csv("data/sessions.csv")

# One row per (player, session). Because this is a left join, a player with
# no sessions is still kept as a single row whose session columns are all NA.
merged_player_data <- left_join(player_data, session_data, by = "hashedEmail")

# Side effect: save the joined table to the working directory.
write.csv(merged_player_data, "merged_file.csv", row.names = FALSE)

# Experience labels that fold into each coarse group. Both naming schemes are
# accepted so the grouping works whichever one players.csv uses.
# NOTE(review): the notebook's plot discussion lists the levels as Amateur,
# Beginner, Regular, Pro, Veteran; treating "Regular" as "Newer" mirrors the
# earlier "Intermediate" -> "Newer" mapping. Confirm against the data.
experienced_levels <- c("Pro", "Veteran", "Advanced", "Expert")
newer_levels <- c("Amateur", "Beginner", "Regular", "Novice", "Intermediate")

final_data_players <- merged_player_data |>
  group_by(hashedEmail) |>
  summarize(
    # n() would report 1 for a player with zero sessions (the all-NA row the
    # left join keeps), so only count rows whose player appears in
    # session_data.
    num_sessions = sum(hashedEmail %in% session_data$hashedEmail),
    experience = first(experience),
    played_hours = first(played_hours)
  ) |>
  mutate(
    experience = as_factor(experience),
    # Labels outside both lists become NA instead of silently keeping their
    # original level.
    experience_group = factor(
      case_when(
        experience %in% experienced_levels ~ "Experienced",
        experience %in% newer_levels ~ "Newer"
      ),
      levels = c("Newer", "Experienced")
    )
  ) |>
  # Keep the columns used by the plot and the model; an empty select() would
  # drop every column.
  select(hashedEmail, num_sessions, played_hours, experience, experience_group)

final_data_players

In [None]:
# Size of plots rendered in the notebook.
options(repr.plot.width = 8, repr.plot.height = 7)

# First visualization: number of sessions vs. total hours played, coloured by
# experience level, to look for a correlation.
# xlim()/ylim() set the scale limits, so players with more than 50 sessions or
# more than 50 played hours are dropped from the plot (ggplot2 warns about the
# removed rows).
session_to_hours_plot <- final_data_players |>
  ggplot(aes(x = num_sessions, y = played_hours, color = experience)) +
  geom_point() +
  labs(
    x = "number of sessions played",
    y = "total hours played",
    color = "Minecraft experience"
  ) +
  xlim(0, 50) +
  ylim(0, 50)
session_to_hours_plot

# Assuming the experience levels run from Amateur, Beginner, Regular, Pro, to Veteran, this plot actually suggests that more experienced players play less.

In [None]:
# Split the per-player table into training (75%) and testing (25%) sets,
# stratified on experience. The seed is fixed before the random split.
set.seed(3456)

player_split <- final_data_players |>
  initial_split(prop = 0.75, strata = experience)
player_train <- player_split |> training()
player_test <- player_split |> testing()

# Display both partitions.
player_train
player_test

In [None]:
# Build and fit the classification workflow.

# Preprocessing: predict experience from session count and played hours,
# scaling and then centering both predictors.
player_recipe <- recipe(
  experience ~ num_sessions + played_hours,
  data = player_train
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model: k-nearest neighbours (k = 3, "rectangular" weight function) on the
# kknn engine.
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# Bundle the model with the recipe and fit on the training set.
player_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(player_recipe) |>
  fit(data = player_train)

In [None]:
# Predict experience for the held-out players and place the predictions next
# to the original test columns.
player_test_predictions <- bind_cols(
  predict(player_fit, new_data = player_test),
  player_test
)

player_test_predictions