In [57]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Simplify working with large datasets in Altair
alt.data_transformers.enable('vegafusion')

# Output dataframes instead of arrays
set_config(transform_output="pandas")

https://github.com/IslandOfPencils/dsci-100-2025w-group-21.git

In [58]:
url_players = "https://drive.google.com/uc?id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
url_sessions = "https://drive.google.com/uc?id=14O91N5OlVkvdGxXNJUj5jIsV5RexhzbB"
players = pd.read_csv(url_players)
players = players.drop(columns = ["individualId", "organizationName"])
sessions = pd.read_csv(url_sessions)
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21
...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17


In [59]:
sessions

Unnamed: 0,hashedEmail,start_time,end_time,original_start_time,original_end_time
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,30/06/2024 18:12,30/06/2024 18:24,1.719770e+12,1.719770e+12
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,17/06/2024 23:33,17/06/2024 23:46,1.718670e+12,1.718670e+12
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,25/07/2024 17:34,25/07/2024 17:57,1.721930e+12,1.721930e+12
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,25/07/2024 03:22,25/07/2024 03:58,1.721880e+12,1.721880e+12
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,25/05/2024 16:01,25/05/2024 16:12,1.716650e+12,1.716650e+12
...,...,...,...,...,...
1530,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,10/05/2024 23:01,10/05/2024 23:07,1.715380e+12,1.715380e+12
1531,7a4686586d290c67179275c7c3dfb4ea02f4d317d9ee0e...,01/07/2024 04:08,01/07/2024 04:19,1.719810e+12,1.719810e+12
1532,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,28/07/2024 15:36,28/07/2024 15:57,1.722180e+12,1.722180e+12
1533,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,25/07/2024 06:15,25/07/2024 06:22,1.721890e+12,1.721890e+12


In [60]:
gender_counts = players.groupby('gender').size().reset_index(name='count')
gender_counts

Unnamed: 0,gender,count
0,Agender,2
1,Female,37
2,Male,124
3,Non-binary,15
4,Other,1
5,Prefer not to say,11
6,Two-Spirited,6


                                                                Interpretation
The majority of participants identify as Male (63.3%), making it the dominant gender group.

Female participants make up the second largest group (18.9%).

Non-binary and Two-Spirited individuals are present in smaller but notable proportions (7.7% and 3.1%, respectively), showing some diversity in gender identities.

A small number of participants chose Agender (1.0%) or Other (0.5%), while 11 participants (5.6%) preferred not to disclose their gender.

| Gender            | Count | Percentage |
| ----------------- | ----- | ---------- |
| Agender           | 2     | 1.0%       |
| Female            | 37    | 18.9%      |
| Male              | 124   | 63.3%      |
| Non-binary        | 15    | 7.7%       |
| Other             | 1     | 0.5%       |
| Prefer not to say | 11    | 5.6%       |
| Two-Spirited      | 6     | 3.1%       |


In [61]:
merged = sessions.merge(players, on="hashedEmail")
merged

Unnamed: 0,hashedEmail,start_time,end_time,original_start_time,original_end_time,experience,subscribe,played_hours,name,gender,age
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,30/06/2024 18:12,30/06/2024 18:24,1.719770e+12,1.719770e+12,Regular,True,223.1,Hiroshi,Male,17
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,17/06/2024 23:33,17/06/2024 23:46,1.718670e+12,1.718670e+12,Amateur,True,53.9,Alex,Male,17
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,25/07/2024 17:34,25/07/2024 17:57,1.721930e+12,1.721930e+12,Amateur,True,150.0,Delara,Female,16
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,25/07/2024 03:22,25/07/2024 03:58,1.721880e+12,1.721880e+12,Regular,True,223.1,Hiroshi,Male,17
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,25/05/2024 16:01,25/05/2024 16:12,1.716650e+12,1.716650e+12,Amateur,True,53.9,Alex,Male,17
...,...,...,...,...,...,...,...,...,...,...,...
1530,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,10/05/2024 23:01,10/05/2024 23:07,1.715380e+12,1.715380e+12,Amateur,True,53.9,Alex,Male,17
1531,7a4686586d290c67179275c7c3dfb4ea02f4d317d9ee0e...,01/07/2024 04:08,01/07/2024 04:19,1.719810e+12,1.719810e+12,Veteran,True,1.6,Lane,Female,23
1532,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,28/07/2024 15:36,28/07/2024 15:57,1.722180e+12,1.722180e+12,Amateur,True,56.1,Dana,Male,23
1533,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,25/07/2024 06:15,25/07/2024 06:22,1.721890e+12,1.721890e+12,Amateur,True,56.1,Dana,Male,23


In [62]:
summary = merged.groupby("gender")
merged['age'].value_counts(normalize=True)
alt.Chart(merged).mark_point(opacity = 0.5).encode(
    x="gender",
    y="age",
    color="gender"
)

In [63]:
affect_by_gender = alt.Chart(merged).mark_circle().encode(
    x='gender:N',
    y='played_hours:Q',
    color='gender:N'
)
affect_by_gender

In [64]:
affect_by_age = alt.Chart(merged).mark_point(opacity=0.4).encode(
    x='age:Q',
    y='played_hours:Q',
    color='gender:N'
)
affect_by_age

In [65]:
from sklearn.linear_model import LinearRegression
le = LabelEncoder()
merged["gender_encoded"] = le.fit_transform(merged["gender"])
X = merged[['age', 'gender_encoded']] 
y = merged['played_hours']
lm = LinearRegression().fit(X, y)
importance = pd.Series(abs(lm.coef_), index=X.columns)
importance


age               3.248163
gender_encoded    7.121261
dtype: float64