# Title: Which players have the highest engagement in the server? 

## Introduction


Background information: <br>
XXX XXX

Question: Can the age and played hours of a player predict their subscription status to the game newsletter in the players.csv data? <br>

Dataset included: players.csv

In [None]:
# Packages used throughout the notebook.
# tidyverse: data wrangling and plotting (summarize, mutate, ggplot, ...).
library(tidyverse)
library(ggplot2)
# repr: controls how objects are rendered in the notebook output.
library(repr)
# tidymodels: recipe / model specification / workflow tools used for the
# k-nn classifier in the Data Analysis section.
library(tidymodels)
# Limit how many rows are shown when a data frame is displayed.
options(repr.matrix.max.rows = 6)

In [None]:
# Load the player-level data set from the project's data folder.
players <- read.csv(file = "data/players.csv")

# Display the loaded data frame as the cell output.
players

#### Description about `players` dataset: <br>

This data set contains information on 196 different player accounts. This information includes 7 variables:
- Experience (Beginner, Amateur, Regular, Pro, Veteran)
- Whether or not they are subscribed
  - Average 73.5% are subscribed
- The hashed email associated with their account
- Hours spent on the game
  - Average = 5.85 hours
- First name
- Gender
- Age
  - Average = 20.52 years
 
Some player accounts are missing values for a few variables, so it is important that <code>na.rm = TRUE</code> is used when computing summaries so that the analysis can be done without issue. Experience, hashed email, name, and gender are in character format. Age and hours played are in double-precision numeric format. Subscriber status is in logical format, which can be quantified as TRUE = 1 and FALSE = 0.

In [None]:
# Overall mean age and mean hours played across all players.
# Missing values are excluded from both means via `na.rm = TRUE`.
players |>
  summarize(
    average_age = mean(Age, na.rm = TRUE),
    average_played_hours = mean(played_hours, na.rm = TRUE)
  )

In [None]:
# Figure 1.1: hours played against age, one point per player, coloured by
# newsletter subscription status.
options(repr.plot.width = 15, repr.plot.height = 10)

players_plot <- ggplot(players, aes(x = Age, y = played_hours)) +
  geom_point(aes(colour = subscribe)) +
  labs(
    title = "Figure 1.1: Overview of the relationship between played time and age/subscription status",
    x = "Age in years",
    y = "Played time in hours",
    colour = "Subscription status"
  ) +
  # Enlarge all text so the figure stays readable at the chosen plot size.
  theme(text = element_text(size = 20))

players_plot

#### Explanation of the visualization:
XXX

In [None]:
# Figure 1.2: distribution of hours played, in 5-hour-wide bins.
ggplot(players, aes(x = played_hours)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    # Separator added after the figure number; it was previously fused with
    # the title text ("Figure 1.2Distribution ...").
    title = "Figure 1.2: Distribution of Played Hours",
    x = "Played Hours",
    y = "Count"
  )

#### Explanation of the visualization:
XXX

In [None]:
# Bar chart counting players in each subscription group. The same factor is
# mapped to both the x position and the fill colour of the bars.
ggplot(
  players,
  aes(x = as.factor(subscribe), fill = as.factor(subscribe))
) +
  geom_bar() +
  labs(
    title = "Subscription Status Distribution",
    x = "Subscribed (TRUE/FALSE)",
    y = "Count"
  ) +
  # Colours are assigned in factor-level order: first level red, second green.
  scale_fill_manual(values = c("red", "green")) +
  theme(text = element_text(size = 20))

#### Explanation of the visualization:
XXX

In [None]:
options(repr.plot.width = 12, repr.plot.height = 12)

# Per-group means of age and hours played: one row per value of `subscribe`.
# This table feeds the dashed reference lines below, so each facet gets
# exactly one vertical and one horizontal line.
group_means <- players |>
  group_by(subscribe) |>
  summarize(
    mean_age = mean(Age, na.rm = TRUE),
    mean_played_hours = mean(played_hours, na.rm = TRUE)
  )

# Same per-group summaries in the two-column (subscribe, mean) layout.
average_age <- select(group_means, subscribe, mean = mean_age)
average_played_hours <- select(group_means, subscribe, mean = mean_played_hours)

# Look each group's mean up by its `subscribe` value instead of by row
# position, so the result does not depend on the order of the summary rows.
average_age_false <- average_age |> filter(!subscribe) |> pull(mean)
average_age_true <- average_age |> filter(subscribe) |> pull(mean)

average_played_hours_false <- average_played_hours |>
  filter(!subscribe) |>
  pull(mean)
average_played_hours_true <- average_played_hours |>
  filter(subscribe) |>
  pull(mean)

# Figure 1.3: hours played against age, one facet row per subscription group,
# with dashed lines marking each group's mean age and mean hours played.
played_hours_vs_age_plot <- ggplot(
  players,
  aes(x = Age, y = played_hours, color = as_factor(subscribe))
) +
  geom_point(alpha = 0.4) +
  facet_grid(rows = vars(subscribe)) +
  labs(
    x = "Age of different players in years",
    y = "Number of hours playing on the server",
    title = "Figure 1.3 Number of hours on server vs age of players",
    color = "Subscription to Game Newsletter?"
  ) +
  theme(text = element_text(size = 14)) +
  # One line per facet, taken from `group_means`. Mapping a constant against
  # the full player data would draw one identical line per player and the
  # stacked lines would cancel out the transparency.
  geom_vline(
    data = group_means,
    aes(xintercept = mean_age),
    linetype = "dashed",
    alpha = 0.3
  ) +
  geom_hline(
    data = group_means,
    aes(yintercept = mean_played_hours),
    linetype = "dashed",
    alpha = 0.3
  ) +
  # Text labels for the reference lines; `annotate()` repeats them in every
  # facet at the same coordinates.
  annotate("text", x = 21, y = 70, label = "Average age of players", angle = 90) +
  annotate("text", x = 40, y = 15, label = "Average hours played")


played_hours_vs_age_plot



#### Explanation of the visualization:
XXX

In [None]:
# cowplot provides plot_grid() for stacking the two histograms.
library(cowplot)

# Figure 1.4: age distribution, bars filled by subscription status.
age_hist <- players |>
  ggplot(aes(x = Age, fill = as_factor(subscribe))) +
  geom_histogram(bins = 20) +
  labs(
    title = "Figure 1.4 Age of players and their subscription status",
    x = "Age of different players in years",
    y = "Number of players in the range",
    fill = "Subscription to Game Newsletter?"
  )

# Figure 1.5: hours-played distribution, bars filled by subscription status.
played_hours_hist <- players |>
  ggplot(aes(x = played_hours, fill = as_factor(subscribe))) +
  geom_histogram(bins = 20) +
  labs(
    title = "Figure 1.5 Hours of players played on server and their subscription status",
    x = "Number of hours played",
    y = "Number of players in the range",
    fill = "Subscription to Game Newsletter?"
  )

# Show both histograms in a single column, age on top.
plot_grid(age_hist, played_hours_hist, ncol = 1)

#### Explanation of the visualization:
XXX

## Data Analysis

### Methods
XXX <br>
XXX <br>
XXX

In [None]:
## k-nn classification: predict newsletter subscription from age and hours played
set.seed(5555)

# Change subscription status to a factor (the classification outcome) and keep
# only rows with a positive age; `filter()` also drops rows where Age is NA.
players_mod <- players |>
  mutate(subscribe = as_factor(subscribe)) |>
  filter(Age > 0)

# Split the data into training (75%) and testing (25%) sets, stratified by the
# outcome so both sets keep a similar share of subscribers.
players_split <- initial_split(players_mod, prop = 0.75, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)

# Recipe: centre and scale both predictors so neither dominates the distance
# calculation. It is declared on the training set so no test rows are involved
# in setting up the preprocessing.
players_recipe <- recipe(subscribe ~ Age + played_hours, data = players_train) |>
  step_center(all_predictors()) |>
  step_scale(all_predictors())

# Model specification with the number of neighbours left open for tuning.
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Choose k by 5-fold cross-validation on the training set only.
players_vfold <- vfold_cv(players_train, v = 5, strata = subscribe)
k_values <- tibble(neighbors = seq(from = 1, to = 25, by = 2))

knn_results <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = players_vfold, grid = k_values) |>
  collect_metrics()

# Highest cross-validated accuracy wins; ties go to the smallest k.
best_k <- knn_results |>
  filter(.metric == "accuracy") |>
  arrange(desc(mean), neighbors) |>
  slice(1) |>
  pull(neighbors)

# Final model specification using the selected k. This defines the spec that
# the workflow below fits; it was previously referenced without being defined.
knn_spec3 <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("classification")

# Train the workflow on the training data.
players_workflow <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(knn_spec3) |>
  fit(data = players_train)

# Predict the held-out test set and keep the predicted class next to the true
# class and the two predictors.
predictions <- predict(players_workflow, players_test) |>
  bind_cols(players_test) |>
  select(.pred_class, subscribe, Age, played_hours)

predictions

# Confusion matrix of true vs. predicted subscription status on the test set.
# NOTE(review): the name `table` shadows base::table(); it is kept so any code
# that already refers to it keeps working.
table <- predictions |>
  conf_mat(truth = subscribe, estimate = .pred_class)
table

### Results from the predictive analysis

### Visualizations of the predictive analysis

## Final Discussions and concluding thoughts

- summarize what you found
- discuss whether this is what you expected to find?
- discuss what impact could such findings have?
- discuss what future questions could this lead to?