In [2]:
library(dplyr)
library(ggplot2)
library(readr)
library(tidyverse)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     [32m✔[39m [34mtidyr    [39m 1.3.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [4]:
# Loading data
players <- read_csv("data/players.csv")
sessions <- read_csv("data/sessions.csv")

# Basic information about datasets
summary(players)
summary(sessions)


[1mRows: [22m[34m196[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, age
[33mlgl[39m (3): subscribe, individualId, organizationName

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m1535[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): hashedEmail, start_time, end_time
[32mdbl[39m (2): original_start_time, original_end_time

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this messag

  experience        subscribe       hashedEmail         played_hours    
 Length:196         Mode :logical   Length:196         Min.   :  0.000  
 Class :character   FALSE:52        Class :character   1st Qu.:  0.000  
 Mode  :character   TRUE :144       Mode  :character   Median :  0.100  
                                                       Mean   :  5.846  
                                                       3rd Qu.:  0.600  
                                                       Max.   :223.100  
     name              gender               age        individualId  
 Length:196         Length:196         Min.   : 8.00   Mode:logical  
 Class :character   Class :character   1st Qu.:17.00   NA's:196      
 Mode  :character   Mode  :character   Median :19.00                 
                                       Mean   :21.28                 
                                       3rd Qu.:22.00                 
                                       Max.   :99.00                 

 hashedEmail         start_time          end_time         original_start_time
 Length:1535        Length:1535        Length:1535        Min.   :1.712e+12  
 Class :character   Class :character   Class :character   1st Qu.:1.716e+12  
 Mode  :character   Mode  :character   Mode  :character   Median :1.719e+12  
                                                          Mean   :1.719e+12  
                                                          3rd Qu.:1.722e+12  
                                                          Max.   :1.727e+12  
                                                                             
 original_end_time  
 Min.   :1.712e+12  
 1st Qu.:1.716e+12  
 Median :1.719e+12  
 Mean   :1.719e+12  
 3rd Qu.:1.722e+12  
 Max.   :1.727e+12  
 NA's   :2          

In [5]:
# Checking for missing values and data types
sapply(players, function(x) sum(is.na(x)))
sapply(sessions, function(x) sum(is.na(x)))

# Getting column types
str(players)
str(sessions)

spc_tbl_ [196 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ experience      : chr [1:196] "Pro" "Veteran" "Veteran" "Amateur" ...
 $ subscribe       : logi [1:196] TRUE TRUE FALSE TRUE TRUE TRUE ...
 $ hashedEmail     : chr [1:196] "f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d" "f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9" "b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28" "23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5" ...
 $ played_hours    : num [1:196] 30.3 3.8 0 0.7 0.1 0 0 0 0.1 0 ...
 $ name            : chr [1:196] "Morgan" "Christian" "Blake" "Flora" ...
 $ gender          : chr [1:196] "Male" "Male" "Male" "Female" ...
 $ age             : num [1:196] 9 17 17 21 21 17 19 21 17 22 ...
 $ individualId    : logi [1:196] NA NA NA NA NA NA ...
 $ organizationName: logi [1:196] NA NA NA NA NA NA ...
 - attr(*, "spec")=
  .. cols(
  ..   experience = [31mcol_character()[39m,
  ..   subscribe = [33mcol_l

In [6]:
# Summary statistics for session lengths or other relevant metrics
sessions
session_length = (original_end_time - original_start_time)
summary(sessions$session_length)


hashedEmail,start_time,end_time,original_start_time,original_end_time
<chr>,<chr>,<chr>,<dbl>,<dbl>
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,30/06/2024 18:12,30/06/2024 18:24,1.71977e+12,1.71977e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,17/06/2024 23:33,17/06/2024 23:46,1.71867e+12,1.71867e+12
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,25/07/2024 17:34,25/07/2024 17:57,1.72193e+12,1.72193e+12
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,25/07/2024 03:22,25/07/2024 03:58,1.72188e+12,1.72188e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,25/05/2024 16:01,25/05/2024 16:12,1.71665e+12,1.71665e+12
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,23/06/2024 15:08,23/06/2024 17:10,1.71916e+12,1.71916e+12
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,15/04/2024 07:12,15/04/2024 07:21,1.71317e+12,1.71317e+12
ad6390295640af1ed0e45ffc58a53b2d9074b0eea694b16210addd44d7c81f83,21/09/2024 02:13,21/09/2024 02:30,1.72688e+12,1.72689e+12
96e190b0bf3923cd8d349eee467c09d1130af143335779251492eb4c2c058a5f,21/06/2024 02:31,21/06/2024 02:49,1.71894e+12,1.71894e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,16/05/2024 05:13,16/05/2024 05:52,1.71584e+12,1.71584e+12


ERROR: Error in eval(expr, envir, enclos): object 'original_end_time' not found


In [7]:
# Example histogram for session lengths
ggplot(sessions, aes(x = session_length)) +
  geom_histogram(binwidth = 10) +
  labs(title = "Distribution of Session Lengths",
       x = "Session Length (minutes)",
       y = "Frequency")

# Example line plot for number of players over time
sessions %>%
  group_by(session_date) %>%
  summarise(player_count = n()) %>%
  ggplot(aes(x = session_date, y = player_count)) +
  geom_line() +
  labs(title = "Daily Player Count",
       x = "Date",
       y = "Number of Players")


ERROR while rich displaying an object: [1m[33mError[39m in `geom_histogram()`:[22m
[1m[22m[33m![39m Problem while computing aesthetics.
[36mℹ[39m Error occurred in the 1st layer.
[1mCaused by error:[22m
[33m![39m object 'session_length' not found

Traceback:
1. tryCatch(withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
2. tryCatchList(expr, classes, parentenv, handlers)
3. tryCatchOne(expr, names, parentenv, handlers[[1L]])
4. doTryCatch(return(expr), name, parentenv, handler)
5. withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         re

ERROR: [1m[33mError[39m in `group_by()`:[22m
[1m[22m[33m![39m Must group by variables found in `.data`.
[31m✖[39m Column `session_date` is not found.


In [9]:
# Example of splitting data (in case you use classification later)
set.seed(123)
sample <- sample.int(n = nrow(sessions), size = floor(.8*nrow(sessions)), replace = F)
train <- sessions[sample, ]
test  <- sessions[-sample, ]
train
test

hashedEmail,start_time,end_time,original_start_time,original_end_time
<chr>,<chr>,<chr>,<dbl>,<dbl>
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,04/05/2024 04:11,04/05/2024 04:31,1.71480e+12,1.71480e+12
90f1495942837b1cde67cc9e3119421e38183502a4c6dee304df3bf0c21e0dd0,19/08/2024 00:52,19/08/2024 01:08,1.72403e+12,1.72403e+12
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,07/07/2024 01:14,07/07/2024 02:48,1.72031e+12,1.72032e+12
f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,03/08/2024 02:30,03/08/2024 03:54,1.72265e+12,1.72266e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,17/05/2024 23:08,18/05/2024 00:16,1.71599e+12,1.71599e+12
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,27/06/2024 04:17,27/06/2024 04:26,1.71946e+12,1.71946e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,10/05/2024 03:31,10/05/2024 04:50,1.71531e+12,1.71532e+12
e74c60a92c0100e7240be56d66969db85856152b048c639331a3595f901461ae,25/06/2024 23:54,25/06/2024 23:58,1.71936e+12,1.71936e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,09/05/2024 05:30,09/05/2024 05:40,1.71523e+12,1.71523e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,12/05/2024 04:33,12/05/2024 04:53,1.71549e+12,1.71549e+12


hashedEmail,start_time,end_time,original_start_time,original_end_time
<chr>,<chr>,<chr>,<dbl>,<dbl>
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,25/07/2024 17:34,25/07/2024 17:57,1.72193e+12,1.72193e+12
b622593d2ef8b337dc554acb307d04a88114f2bf453b18fb5d2c80052aeb2319,18/08/2024 00:51,18/08/2024 03:15,1.72394e+12,1.72395e+12
24d4892c2c3ce11d3e54c3bf31ee218901cac7a2f564911d67424c8fdf0874fe,23/05/2024 18:12,23/05/2024 18:22,1.71649e+12,1.71649e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,07/05/2024 19:23,07/05/2024 19:34,1.71511e+12,1.71511e+12
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,27/06/2024 21:40,27/06/2024 21:48,1.71952e+12,1.71952e+12
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,04/05/2024 21:41,04/05/2024 21:50,1.71486e+12,1.71486e+12
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,22/08/2024 03:02,22/08/2024 03:49,1.72430e+12,1.72430e+12
24d4892c2c3ce11d3e54c3bf31ee218901cac7a2f564911d67424c8fdf0874fe,01/08/2024 21:30,01/08/2024 21:33,1.72255e+12,1.72255e+12
6b1cdc07fcc1f7ea09509341fd245dd34fdba386f14a493e9107c18604baedc9,25/06/2024 22:58,25/06/2024 23:09,1.71936e+12,1.71936e+12
ad6390295640af1ed0e45ffc58a53b2d9074b0eea694b16210addd44d7c81f83,23/08/2024 01:10,23/08/2024 01:19,1.72438e+12,1.72438e+12
