In [1]:
### Run this cell before continuing.

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simplify working with large datasets in Altair
alt.data_transformers.enable('vegafusion')

# Output dataframes instead of arrays
set_config(transform_output="pandas")

In [2]:
url = "https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
players_df = pd.read_csv (url)
players_df

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [3]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   experience        196 non-null    object 
 1   subscribe         196 non-null    bool   
 2   hashedEmail       196 non-null    object 
 3   played_hours      196 non-null    float64
 4   name              196 non-null    object 
 5   gender            196 non-null    object 
 6   age               196 non-null    int64  
 7   individualId      0 non-null      float64
 8   organizationName  0 non-null      float64
dtypes: bool(1), float64(3), int64(1), object(4)
memory usage: 12.6+ KB


In [4]:
players_tidy = players_df[['age','played_hours','subscribe']]
players_tidy

Unnamed: 0,age,played_hours,subscribe
0,9,30.3,True
1,17,3.8,True
2,17,0.0,False
3,21,0.7,True
4,21,0.1,True
...,...,...,...
191,17,0.0,True
192,22,0.3,False
193,17,0.0,False
194,17,2.3,False


In [5]:
players_plot = alt.Chart(players_tidy).mark_point(size=20, opacity = 0.5).encode(
    x = alt.X('age').title('Players Age (in years)'),
    y = alt.Y('played_hours').title('Playing Time (in hours)'),
).configure_axis(titleFontSize=12)
players_plot

In [6]:
players_plot_classified = alt.Chart(players_tidy).mark_point(size=20, opacity = 0.5).encode(
    x = alt.X('age')
    .title('Players Age (in years)'),
    y = alt.Y('played_hours')
    .title('Playing Time (in hours)'),
    color=alt.Color("subscribe")
    .legend(orient="top")
    .scale(scheme="dark2"),
    shape="subscribe"
).configure_axis(titleFontSize=12)
players_plot_classified

In [7]:
players_train, players_test = train_test_split(
    players_tidy, 
    test_size = 0.25,
    random_state = 123
)
players_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147 entries, 100 to 109
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           147 non-null    int64  
 1   played_hours  147 non-null    float64
 2   subscribe     147 non-null    bool   
dtypes: bool(1), float64(1), int64(1)
memory usage: 3.6 KB


In [8]:
players_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 136 to 82
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           49 non-null     int64  
 1   played_hours  49 non-null     float64
 2   subscribe     49 non-null     bool   
dtypes: bool(1), float64(1), int64(1)
memory usage: 1.2 KB


In [9]:
players_processor = make_column_transformer(
    (StandardScaler(),['age','played_hours']),
    remainder="passthrough",
    verbose_feature_names_out=False
)

In [10]:
knn = KNeighborsClassifier()

X_train = players_train[['age','played_hours']]
y_train = players_train['subscribe']

X_test = players_test[['age','played_hours']]
y_test = players_test['subscribe']

players_pipe = make_pipeline(players_processor, knn)
param_grid = {
    "kneighborsclassifier__n_neighbors": range(2, 25, 1),
}
players_pipe

In [11]:
knn_tune_grid = GridSearchCV(
    players_pipe, param_grid, cv = 10,
)
knn_tune_grid

In [12]:
knn_model_grid = knn_tune_grid.fit(X_train, y_train)

accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)
accuracies_grid

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004927,0.001071,0.004391,0.000427,2,{'kneighborsclassifier__n_neighbors': 2},0.733333,0.533333,0.733333,0.666667,0.266667,0.6,0.6,0.571429,0.5,0.5,0.570476,0.129482,23
1,0.004295,7.4e-05,0.004166,8.2e-05,3,{'kneighborsclassifier__n_neighbors': 3},0.666667,0.666667,0.733333,0.8,0.333333,0.733333,0.8,0.714286,0.642857,0.5,0.659048,0.13623,21
2,0.004386,0.000196,0.004182,0.000141,4,{'kneighborsclassifier__n_neighbors': 4},0.733333,0.6,0.733333,0.733333,0.333333,0.666667,0.533333,0.571429,0.714286,0.5,0.611905,0.124817,22
3,0.004243,0.000112,0.004075,3.7e-05,5,{'kneighborsclassifier__n_neighbors': 5},0.733333,0.733333,0.666667,0.866667,0.533333,0.8,0.8,0.785714,0.785714,0.785714,0.749048,0.087677,2
4,0.004349,0.00034,0.00413,0.000121,6,{'kneighborsclassifier__n_neighbors': 6},0.533333,0.666667,0.666667,0.733333,0.533333,0.733333,0.666667,0.785714,0.785714,0.642857,0.674762,0.085265,20
5,0.00429,0.000117,0.004132,0.000108,7,{'kneighborsclassifier__n_neighbors': 7},0.733333,0.733333,0.733333,0.866667,0.6,0.8,0.8,0.857143,0.785714,0.642857,0.755238,0.081182,1
6,0.004229,8.1e-05,0.004065,2.4e-05,8,{'kneighborsclassifier__n_neighbors': 8},0.6,0.6,0.733333,0.8,0.6,0.8,0.733333,0.785714,0.785714,0.642857,0.708095,0.083382,19
7,0.004332,0.000193,0.00416,0.00014,9,{'kneighborsclassifier__n_neighbors': 9},0.733333,0.666667,0.733333,0.8,0.666667,0.733333,0.733333,0.785714,0.714286,0.785714,0.735238,0.043758,13
8,0.004201,5.8e-05,0.004064,3.1e-05,10,{'kneighborsclassifier__n_neighbors': 10},0.6,0.666667,0.733333,0.8,0.6,0.733333,0.733333,0.785714,0.714286,0.785714,0.715238,0.068571,18
9,0.004225,6.4e-05,0.004841,0.002273,11,{'kneighborsclassifier__n_neighbors': 11},0.733333,0.733333,0.733333,0.733333,0.666667,0.733333,0.8,0.785714,0.571429,0.785714,0.727619,0.063453,15


In [13]:
accuracy_versus_k_grid = alt.Chart(accuracies_grid).mark_line(point = True).encode(
    x=alt.X('param_kneighborsclassifier__n_neighbors')
        .title('K Nearest Neighbor')
        .scale(zero=False),
    y=alt.Y('mean_test_score')
        .title('Mean Accuracy')
        .scale(zero=False)
)
accuracy_versus_k_grid

In [14]:
best_knn = KNeighborsClassifier(n_neighbors = 7)
best_fit = best_knn.fit(X_train, y_train)
best_fit

In [15]:
best_fit_df = players_test.assign(
    predicted = best_fit.predict(X_test)
)
best_fit_acc = best_fit.score(X_test,y_test)
best_fit_acc

0.7346938775510204