In [102]:
#importing modules
import pandas as pd
import sklearn as skl
import altair as alt
import numpy as np
from sklearn.preprocessing import StandardScaler


In [4]:
# save the URL
players_url = "https://raw.githubusercontent.com/Lionung/dsci_100_group_project/refs/heads/main/players.csv"

# importing the files as CSV
players = pd.read_csv(players_url)

In [92]:
# removing NAN values, dropping columns with NAN vals
players.dropna(axis=1, inplace=True)

# removing other unnecessary variables to our investigation
players_clean = players.drop(columns=["hashedEmail", "name", "experience", "subscribe", "gender"])
players_clean.head()

Unnamed: 0,played_hours,age
0,30.3,9
1,3.8,17
2,0.0,17
3,0.7,21
4,0.1,21


In [93]:
#finding the bin values 

median_playtime = players_clean["played_hours"].median()

#can use the percentile function in numpy to find the values for the bins
top_20_playtime = np.percentile(players_clean["played_hours"], 80)

print("median playtime is", median_playtime, "hours" "\ntop 20% playtime is", top_20_playtime, "hours")

median playtime is 0.1 hours
top 20% playtime is 1.0 hours


In [94]:
# method is predicting based on age
playtime_distribution = alt.Chart(players_clean).mark_bar().encode(
    x=alt.X("played_hours").title("Played hours").bin(maxbins=90).scale(), 
    y=alt.Y("count()").title("Number of players")
).properties(title="Distribution of Played hours")

median = alt.Chart().mark_rule().encode(x=alt.datum(median_playtime))
top_20 = alt.Chart().mark_rule().encode(x=alt.datum(top_20_playtime))

hist_with_ranges = playtime_distribution + median + top_20
hist_with_ranges

# the vast, vast, vast majority of players barely even get an hour. There is a very small amount of players that have
# a signficant amount of playtime 

# is a significant issue for k means, as we need to create a large amount of data

In [95]:
# for instance, there are only 10 played with played_hours > 20
players_clean[players_clean["played_hours"] >= 20]

Unnamed: 0,played_hours,age
0,30.3,9
17,48.4,17
51,218.1,20
71,53.9,17
74,223.1,17
90,150.0,16
130,56.1,23
144,23.7,24
158,178.2,19
183,32.0,22


In [96]:
#creating bin categories

#did research on pandas cut function, works exactly for categories like this
#used when have to segment and sort data into bins (pandas documentation)

players_clean["category"] = pd.cut(
    players_clean["played_hours"],
    bins=[0, median_playtime, top_20_playtime, float("inf")],
    labels=["low", "medium", "high"],
    right=False, #means that the bins don't include rightmost edge 
    include_lowest=True #include players with 0
) 

players_clean

Unnamed: 0,played_hours,age,category
0,30.3,9,high
1,3.8,17,high
2,0.0,17,low
3,0.7,21,medium
4,0.1,21,medium
...,...,...,...
191,0.0,17,low
192,0.3,22,medium
193,0.0,17,low
194,2.3,17,high


In [97]:
#cleaning data
players_data = players_clean.drop(columns=["played_hours"])
players_data

Unnamed: 0,age,category
0,9,high
1,17,high
2,17,low
3,21,medium
4,21,medium
...,...,...
191,17,low
192,22,medium
193,17,low
194,17,high


In [98]:
# how many exist per category

low = players_data[players_data["category"] == "low"]
medium = players_data[players_data["category"] == "medium"]
high = players_data[players_data["category"] == "high"]

print("There are \n", len(low), ": low\n",
     len(medium), ": medium\n",
     len(high), ": high")

#therefore we need to equalize the categories for the values to have proper k means
#going to make 85 each to make it easiest for ourselves

There are 
 85 : low
 69 : medium
 42 : high


In [99]:

low = players_data[players_data["category"] == "low"]
medium = players_data[players_data["category"] == "medium"]
high = players_data[players_data["category"] == "high"]

high_upsample = high.sample(n=low.shape[0], replace=True)
medium_upsample = medium.sample(n=low.shape[0], replace=True)

upsampled_playtime = pd.concat((low, medium_upsample, high_upsample))
upsampled_playtime["category"].value_counts()

category
low       85
medium    85
high      85
Name: count, dtype: int64

Now the categories are equalized, we can officially start analysis

In [132]:
#train test split

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

x = upsampled_playtime[["age"]]
y = upsampled_playtime["category"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=69, stratify=y)

preprocessor = make_column_transformer((StandardScaler(),
                                      ["age"]))

pipeline = make_pipeline(preprocessor, KNeighborsClassifier())

param_grid = {"kneighborsclassifier__n_neighbors":range(1, 30, 1)}

tune_grid = GridSearchCV(estimator=pipeline, 
                        param_grid=param_grid,
                        cv=10, #used 10 as a good trade off between accuracy and computation/
                        return_train_score=True,
                        n_jobs=-1)

tune_grid.fit(x_train, y_train)

accuracies_grid = pd.DataFrame(tune_grid.cv_results_)
accuracies_grid.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.003436,0.000211,0.003364,0.00012,1,{'kneighborsclassifier__n_neighbors': 1},0.5,0.5,0.555556,0.444444,...,0.475,0.4375,0.4375,0.45625,0.46875,0.45,0.478261,0.447205,0.461922,0.018035
1,0.003341,8.7e-05,0.003331,0.000111,2,{'kneighborsclassifier__n_neighbors': 2},0.388889,0.611111,0.5,0.444444,...,0.45625,0.48125,0.51875,0.525,0.5125,0.54375,0.552795,0.521739,0.502453,0.035129
2,0.003516,0.000378,0.003887,0.001497,3,{'kneighborsclassifier__n_neighbors': 3},0.388889,0.611111,0.611111,0.444444,...,0.5125,0.5125,0.50625,0.525,0.525,0.51875,0.546584,0.521739,0.510582,0.023962
3,0.003246,2.3e-05,0.003198,3.3e-05,4,{'kneighborsclassifier__n_neighbors': 4},0.388889,0.555556,0.444444,0.444444,...,0.4625,0.49375,0.4875,0.50625,0.5,0.525,0.515528,0.521739,0.489977,0.02895
4,0.003277,8.3e-05,0.003183,1.8e-05,5,{'kneighborsclassifier__n_neighbors': 5},0.5,0.444444,0.444444,0.444444,...,0.43125,0.48125,0.5,0.50625,0.50625,0.525,0.521739,0.52795,0.497469,0.027132


In [133]:
cross_val_plot = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("Number of Neighbors").scale(zero=False),
    y=alt.Y("mean_test_score").title("Mean Test Score").scale(zero=False))

cross_val_plot

from the gridsearch, the optimal number of neighbors is 3, based on the accuracy.
we are not worrying about precision or recall, as it is not particularly necessary to minimize false negatives or false positives. There is no lifechanging ethical issue, we just need the most accurate model. However, it is important to note that the accuracy is not great at all even in the best case scenario

In [142]:
optimized = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=3))
optimized.fit(x_train, y_train)


predictions = pd.DataFrame(y_test).assign(predicted=optimized.predict(x_test))
predictions

Unnamed: 0,category,predicted
142,high,low
132,medium,low
186,medium,medium
171,high,low
2,low,low
...,...,...
121,medium,medium
101,low,low
73,low,low
180,high,medium


In [143]:
#this function is simpler as it just takes the real ones, and the predicted ones

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predictions["category"], predictions["predicted"])
accuracy

0.4025974025974026