# Uncover the factors to help measure how young children learn

### Setting Up

In [8]:
# JSON parsing (used to decode the event_data column)
import json

# File system management
import os

# Numpy and pandas for data analysis and manipulation
import numpy as np
import pandas as pd

# Matplotlib pyplot and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from sklearn.ensemble import RandomForestClassifier as rf
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm

# Sklearn data-splitting utilities
from sklearn.model_selection import train_test_split

# Sklearn importing evaluation metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix

# Hyperparameter optimisation
import hyperopt as hopt

In [13]:
# Notebook-wide settings (plain Python variables, not OS environment variables)
random_seed = 42  # seed value; not yet consumed by any cell visible in this section
debug = True      # when True, the loading cells read only the first rows of each CSV

### Importing Data

In [54]:
# Load the training events. In debug mode only the first 20,000 rows are read;
# nrows=None tells pandas to read the whole file.
X_train = pd.read_csv("../data/train.csv", nrows=20000 if debug else None)
X_train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [55]:
# Load the assessment labels. In debug mode only the first 10,000 rows are read;
# nrows=None tells pandas to read the whole file.
y_train = pd.read_csv("../data/train_labels.csv", nrows=10000 if debug else None)
y_train.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


In [71]:
# Load the test events. In debug mode only the first 10,000 rows are read;
# nrows=None tells pandas to read the whole file.
X_test = pd.read_csv("../data/test.csv", nrows=10000 if debug else None)
X_test.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


### Exploring data

In [57]:
# Expand the JSON strings in `event_data` into one column per key, each prefixed
# with "event_data.", and swap them in for the raw `event_data` column.
# The guard makes the cell safe to re-run: once `event_data` has been expanded and
# dropped, a second run leaves X_train untouched instead of raising a KeyError.
if 'event_data' in X_train.columns:
    event_data_train = pd.DataFrame(
        X_train['event_data'].apply(json.loads).tolist(),
        index=X_train.index,  # reuse X_train's row labels so the concat aligns row-for-row
    )
    event_data_train.columns = [f"event_data.{col}" for col in event_data_train.columns]

    X_train = pd.concat([X_train.drop(columns=['event_data']), event_data_train], axis=1)
X_train.head()

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world,...,event_data.scale_weights,event_data.weights,event_data.nest,event_data.pillars,event_data.max_position,event_data.end_position,event_data.gate,event_data.dinosaur_weight,event_data.dinosaur_count,event_data.chests
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,...,,,,,,,,,,
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,...,,,,,,,,,,
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,...,,,,,,,,,,
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK,...,,,,,,,,,,
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK,...,,,,,,,,,,


In [58]:
# (rows, columns) of the training frame after the event_data expansion above
X_train.shape

(20000, 132)

In [75]:
# Expand the JSON strings in the test set's `event_data` into one column per key,
# each prefixed with "event_data.", and swap them in for the raw column.
# The guard makes the cell safe to re-run: once `event_data` has been expanded and
# dropped, a second run leaves X_test untouched instead of raising a KeyError.
if 'event_data' in X_test.columns:
    event_data_test = pd.DataFrame(
        X_test['event_data'].apply(json.loads).tolist(),
        index=X_test.index,  # reuse X_test's row labels so the concat aligns row-for-row
    )
    event_data_test.columns = [f"event_data.{col}" for col in event_data_test.columns]

    X_test = pd.concat([X_test.drop(columns=['event_data']), event_data_test], axis=1)

# TODO: the train and test frames end up with different numbers of columns
# (compare this shape with X_train.shape). The tables should be joined before
# the event_data columns are split out, so both share the same columns.
X_test.shape

(10000, 124)

In [84]:
# Number of rows with a non-null event_data.event_code
len(X_train['event_data.event_code'].dropna())

20000

In [83]:
# Number of rows with a non-null event_data.scale_weights.
# Some event columns are very sparse.
len(X_train['event_data.scale_weights'].dropna())

51