In [20]:
# Importing Libraries
library(tidyverse)
library(repr)
library(tidymodels)

In [33]:
# Raw dataset URLs (players and sessions CSV files)
player_data_url <- "https://raw.githubusercontent.com/MohakB3/dsci100-project/refs/heads/main/data/players.csv"
sessions_data_url <- "https://raw.githubusercontent.com/MohakB3/dsci100-project/refs/heads/main/data/sessions.csv"

# Read each CSV without printing the column specification message, then
# discard every row that contains at least one NA
player_data <- player_data_url |>
    read_csv(show_col_types = FALSE) |>
    na.omit()
sessions_data <- sessions_data_url |>
    read_csv(show_col_types = FALSE) |>
    na.omit()

# Wrangling Sessions Data
# For every session, compute its length in minutes from the start/end
# timestamps (parsed as day/month/year hour:minute), then average the
# lengths per player (one row per hashedEmail).
#
# The unit is fixed to minutes explicitly: subtracting two POSIXct vectors
# lets R choose the difftime unit automatically from the data, so the numeric
# result could silently become seconds, hours or days.
# NOTE(review): no `tz` is given, so timestamps are parsed in the R session's
# time zone; confirm this matches how the timestamps were recorded.
sessions_wrangled_data <- sessions_data |>
    mutate(
        session_length = difftime(
            as.POSIXct(end_time, format = "%d/%m/%Y %H:%M"),
            as.POSIXct(start_time, format = "%d/%m/%Y %H:%M"),
            units = "mins"
        )
    ) |>
    group_by(hashedEmail) |>
    summarize(
        player_average_session_length = as.numeric(mean(session_length), units = "mins")
    )

# Merging Player Data and Wrangled Sessions Data
# Attach the player columns to each per-player session average by hashedEmail,
# then drop any row that still contains an NA after the join.
merged_data <- sessions_wrangled_data |>
    left_join(player_data, by = "hashedEmail") |>
    na.omit()

# Calculating Mean Values for all Quantitative Variables
# Produces a single-row summary with one mean per quantitative column.
mean_quantitative_data <- summarize(
    merged_data,
    mean_age = mean(Age),
    mean_played_hours = mean(played_hours),
    mean_session_length = mean(player_average_session_length)
)
mean_quantitative_data

# Creating a Density Plot for Players' Total Time Played
# Only players with between 0 and 20 played hours (inclusive) are plotted;
# densities are drawn separately for each value of `subscribe`.
# NOTE(review): this plot size is replaced by a later options() call before
# any plot is rendered.
options(repr.plot.width = 15, repr.plot.height = 5)
time_plot <- merged_data |>
    filter(between(played_hours, 0, 20)) |>
    select(played_hours, subscribe) |>
    ggplot(aes(x = played_hours, fill = subscribe, colour = subscribe)) +
    geom_density(alpha = 0.5) +
    labs(
        x = "Time Played (hours)",
        y = "Density",
        fill = "Is Subscribed?",
        colour = "Is Subscribed?"
    )

# Creating a Density Plot for Players' Age
# Densities of Age are drawn separately for each value of `subscribe`.
options(repr.plot.width = 7, repr.plot.height = 5)
age_plot <- ggplot(
    select(merged_data, Age, subscribe),
    aes(x = Age, fill = subscribe, colour = subscribe)
) +
    geom_density(alpha = 0.5) +
    labs(
        x = "Player Age (years)",
        y = "Density",
        fill = "Is Subscribed?",
        colour = "Is Subscribed?"
    )

# Creating a Density Plot for Players' Average Session Length
# Densities of the per-player average session length are drawn separately for
# each value of `subscribe`.
# NOTE(review): the axis label assumes player_average_session_length is in
# minutes; confirm against how that column is computed.
session_length_plot <- ggplot(
    select(merged_data, player_average_session_length, subscribe),
    aes(x = player_average_session_length, fill = subscribe, colour = subscribe)
) +
    geom_density(alpha = 0.5) +
    labs(
        x = "Average Session Length (minutes)",
        y = "Density",
        fill = "Is Subscribed?",
        colour = "Is Subscribed?"
    )

# Rendering all Plots
# Every line below is commented out, so no plot is displayed by this cell;
# uncomment a line to print the corresponding plot object.
#time_plot
#age_plot
#session_length_plot

mean_age,mean_played_hours,mean_session_length
<dbl>,<dbl>,<dbl>
21.58537,9.313008,32.85302


# Data Description
The dataset used is the player dataset merged with each player's average session length, which is computed from the sessions dataset; the two are joined on `hashedEmail`.

## **Chosen Broad Question:** What player characteristics and behaviours are most predictive of subscribing to a game-related newsletter, and how do these features differ between various player types?
### **Specific Question:** To what accuracy can a player's *age*, *average session length*, and *total play time* predict whether the player is subscribed to a game-related newsletter in the merged player and sessions dataset?