In [4]:
### Run this cell before continuing.

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier

# Route Altair's data handling through the VegaFusion transformer.
alt.data_transformers.enable("vegafusion")

# Ask scikit-learn transformers to return pandas objects rather than arrays.
set_config(transform_output="pandas")

In [5]:
# Read the players table straight from its Google Drive download link
# and display the resulting frame.
url = "https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
players_df = pd.read_csv(filepath_or_buffer=url)
players_df

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [6]:
# Print a per-column summary (non-null counts and dtypes) of players_df
# to check for missing values before selecting columns.
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   experience        196 non-null    object 
 1   subscribe         196 non-null    bool   
 2   hashedEmail       196 non-null    object 
 3   played_hours      196 non-null    float64
 4   name              196 non-null    object 
 5   gender            196 non-null    object 
 6   age               196 non-null    int64  
 7   individualId      0 non-null      float64
 8   organizationName  0 non-null      float64
dtypes: bool(1), float64(3), int64(1), object(4)
memory usage: 12.6+ KB


In [7]:
# Keep only the three columns used by the plots that follow: the two numeric
# axes (age, played_hours) and the subscribe flag used for colour/shape.
# The explicit .copy() makes players_tidy an independent frame, so any later
# column assignment on it cannot raise SettingWithCopyWarning or be confused
# with a write into players_df.
players_tidy = players_df[['age', 'played_hours', 'subscribe']].copy()
players_tidy

Unnamed: 0,age,played_hours,subscribe
0,9,30.3,True
1,17,3.8,True
2,17,0.0,False
3,21,0.7,True
4,21,0.1,True
...,...,...,...
191,17,0.0,True
192,22,0.3,False
193,17,0.0,False
194,17,2.3,False


In [8]:
# Scatter plot of played_hours against age: one small, half-transparent
# point per row of players_tidy, with enlarged axis titles.
players_plot = (
    alt.Chart(players_tidy)
    .mark_point(size=20, opacity=0.5)
    .encode(
        x=alt.X('age', title='Players Age (in years)'),
        y=alt.Y('played_hours', title='Playing Time (in hours)'),
    )
    .configure_axis(titleFontSize=12)
)
players_plot

In [9]:
# Same age vs. played_hours scatter plot, with each point's colour and shape
# both driven by the subscribe column; the legend is placed above the chart
# and colours come from the "dark2" scheme.
players_plot_classified = (
    alt.Chart(players_tidy)
    .mark_point(size=20, opacity=0.5)
    .encode(
        x=alt.X('age', title='Players Age (in years)'),
        y=alt.Y('played_hours', title='Playing Time (in hours)'),
        color=alt.Color(
            "subscribe",
            legend=alt.Legend(orient="top"),
            scale=alt.Scale(scheme="dark2"),
        ),
        shape="subscribe",
    )
    .configure_axis(titleFontSize=12)
)
players_plot_classified