# Uncover the factors to help measure how young children learn

### Setting Up

In [1]:
# File system management
import os
import gc

# Numpy and pandas for data analysis and manipulation
import numpy as np
import pandas as pd

# Json for converting string to dict
import json

# Random for random
import random

# Matplotlib pyplot and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from sklearn.ensemble import RandomForestClassifier as rf
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm

# Sklearn train/test splitting helper
from sklearn.model_selection import train_test_split

# Sklearn importing evaluation metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix

# Hyperparameter optimisation
import hyperopt as hopt

In [2]:
# Run-wide settings.
# debug: when True the loading cells read the small *_sample feather files
# instead of the full data sets.
debug = True
# Single seed shared by every source of randomness in the notebook.
random_seed = 42
random.seed(random_seed)
# numpy's global RNG is separate from the stdlib one; seed it as well so that
# numpy/pandas sampling without an explicit random_state is reproducible.
np.random.seed(random_seed)

### Importing Data

Run the three conversion cells below once to build the feather files from the CSVs, then comment them out again.

In [3]:
# One-off conversion of the training events from CSV to feather; kept commented
# out because the loading cells further down read the feather files directly.
# X_train = pd.read_csv("../data/train.csv")
# # Dropping ids that never had an assessment
# assessed = X_train[X_train['type'] == "Assessment"]['installation_id']
# # NOTE(review): .where(mask).dropna() also removes kept rows that contain a
# # NaN in any column and turns integer columns into floats; plain boolean
# # indexing (X_train[mask]) would avoid both -- confirm which is intended.
# X_train = X_train.where(X_train['installation_id'].isin(assessed)).dropna()
# # Saving full training set as feather
# # NOTE(review): unlike the sample below, the old index is kept here as an
# # extra 'index' column in the full feather file.
# X_train.reset_index().to_feather("../data/X_train.feather")
# # Creating sample for debugging
# # NOTE(review): `y_train` is only read in the labels conversion cell below,
# # so that read has to run before this line. This samples 100 label rows, so
# # the number of distinct installation ids can be smaller than 100.
# sample = y_train['installation_id'].sample(100, random_state=random_seed)
# X_train_sample = X_train.where(X_train['installation_id'].isin(sample)).dropna()
# X_train_sample.reset_index().drop(columns = ['index']).to_feather("../data/X_train_sample.feather")
# X_train_sample.shape

In [4]:
# One-off conversion of the training labels from CSV to feather; kept commented
# out because the loading cells further down read the feather files directly.
# y_train = pd.read_csv("../data/train_labels.csv")
# # Saving full label set as feather
# y_train.reset_index().to_feather("../data/y_train.feather")
# # Creating sample for debugging
# # NOTE(review): `sample` is created in the X_train conversion cell above,
# # which itself reads `y_train` -- read the labels CSV before running it.
# y_train_sample = y_train.where(y_train['installation_id'].isin(sample)).dropna()
# # NOTE(review): this line used to write the full `y_train` (not the sample)
# # to the sample path after dropping an 'index' column that is never added
# # to `y_train` here (unless the CSV itself has one); it now mirrors the
# # X_train / X_test sample cells.
# y_train_sample.reset_index().drop(columns = ['index']).to_feather("../data/y_train_sample.feather")
# y_train_sample.shape

In [5]:
# One-off conversion of the test events from CSV to feather; kept commented
# out because the loading cells further down read the feather files directly.
# X_test = pd.read_csv("../data/test.csv")
# # Saving full test set as feather
# X_test.reset_index().to_feather("../data/X_test.feather")
# # Creating test sample for debugging
# # NOTE(review): this samples 50 event rows, so the number of distinct
# # installation ids in the sample can be smaller than 50.
# test_sample = X_test['installation_id'].sample(50, random_state=random_seed)
# # NOTE(review): .where(mask).dropna() also drops kept rows containing any NaN
# # and converts integer columns to float -- confirm this is intended.
# X_test_sample = X_test.where(X_test['installation_id'].isin(test_sample)).dropna()
# X_test_sample.reset_index().drop(columns = ['index']).to_feather("../data/X_test_sample.feather")
# X_test_sample.shape

In [6]:
# Read the training events from feather: the small debugging sample when
# `debug` is set, the full set otherwise.
X_train = pd.read_feather(
    "../data/X_train_sample.feather" if debug else "../data/X_train.feather"
)
X_train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,a3325b38ad7292c1,2019-08-10T17:46:03.082Z,"{""event_code"": 2000, ""event_count"": 1}",00e17272,1.0,2000.0,0.0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,4d23feebc743419e,2019-08-10T17:46:18.684Z,"{""event_code"": 2000, ""event_count"": 1}",00e17272,1.0,2000.0,0.0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,4b0eef97c4521434,2019-08-10T17:47:13.391Z,"{""event_code"": 2000, ""event_count"": 1}",00e17272,1.0,2000.0,0.0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,c3e85417476eeae5,2019-08-10T17:47:51.101Z,"{""event_code"": 2000, ""event_count"": 1}",00e17272,1.0,2000.0,0.0,Welcome to Lost Lagoon!,Clip,NONE
4,27253bdc,9da134e1cb08364c,2019-08-10T17:48:21.252Z,"{""event_code"": 2000, ""event_count"": 1}",00e17272,1.0,2000.0,0.0,Crystal Caves - Level 1,Clip,CRYSTALCAVES


In [7]:
# Summary statistics for every training column: count/unique/top/freq for the
# non-numeric columns, mean/std/quantiles for the numeric ones.
X_train.describe(include='all')

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
count,492271,492271,492271,492271,492271,492271.0,492271.0,492271.0,492271,492271,492271
unique,365,10879,466069,477977,97,,,,44,4,4
top,bb3e370b,18596d22e575632f,2019-09-21T20:00:46.078Z,"{""event_code"": 2000, ""event_count"": 1}",4d7d3d97,,,,Bottle Filler (Activity),Game,MAGMAPEAK
freq,10975,1125,7,5552,32749,,,,42890,237952,213175
mean,,,,,,102.368338,3508.390807,173041.0,,,
std,,,,,,120.827067,678.246526,697583.2,,,
min,,,,,,1.0,2000.0,0.0,,,
25%,,,,,,26.0,3020.0,30019.0,,,
50%,,,,,,61.0,4020.0,72980.0,,,
75%,,,,,,131.0,4035.0,159037.0,,,


In [8]:
# Read the training labels from feather: the small debugging sample when
# `debug` is set, the full set otherwise.
y_train = pd.read_feather(
    "../data/y_train_sample.feather" if debug else "../data/y_train.feather"
)
y_train.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,13fb574a0788a342,00e17272,Cart Balancer (Assessment),1.0,0.0,1.0,3.0
1,6c8fab7964639f37,00e17272,Bird Measurer (Assessment),1.0,1.0,0.5,2.0
2,79b23df7134680d6,00e17272,Cart Balancer (Assessment),1.0,0.0,1.0,3.0
3,7c99edfbd81036e7,00e17272,Chest Sorter (Assessment),1.0,1.0,0.5,2.0
4,9444d34477d7f2c5,00e17272,Cauldron Filler (Assessment),1.0,0.0,1.0,3.0


In [9]:
# Summary statistics for every label column (numeric and non-numeric).
y_train.describe(include='all')

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
count,1276,1276,1276,1276.0,1276.0,1276.0,1276.0
unique,1276,97,5,,,,
top,5e85beae44b05019,08987c08,Cauldron Filler (Assessment),,,,
freq,1,156,316,,,,
mean,,,,0.703762,1.958464,0.52534,1.681818
std,,,,0.456776,3.288578,0.43997,1.292761
min,,,,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0
50%,,,,1.0,1.0,0.5,2.0
75%,,,,1.0,3.0,1.0,3.0


In [10]:
# Read the test events from feather: the small debugging sample when `debug`
# is set, the full set otherwise.
X_test = pd.read_feather(
    "../data/X_test_sample.feather" if debug else "../data/X_test.feather"
)
X_test.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,8f8b93e30590f263,2019-09-22T20:58:41.078Z,"{""event_code"": 2000, ""event_count"": 1}",1242218,1.0,2000.0,0.0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,7f4ff05e14cea4ea,2019-09-22T20:59:09.640Z,"{""event_code"": 2000, ""event_count"": 1}",1242218,1.0,2000.0,0.0,Tree Top City - Level 1,Clip,TREETOPCITY
2,27253bdc,a09b503c4b8d7000,2019-09-22T20:59:42.371Z,"{""event_code"": 2000, ""event_count"": 1}",1242218,1.0,2000.0,0.0,Magma Peak - Level 1,Clip,MAGMAPEAK
3,77261ab5,ffec6761763be29e,2019-09-22T21:00:12.227Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",1242218,1.0,2000.0,0.0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,b2dba42b,ffec6761763be29e,2019-09-22T21:00:12.331Z,"{""description"":""Let's build a sandcastle! Firs...",1242218,2.0,3010.0,75.0,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [None]:
# Summary statistics for every test column (numeric and non-numeric).
X_test.describe(include='all')

In [None]:
# Force a garbage-collection pass after loading the three data sets.
gc.collect()

### Preparing the Data

In [None]:
# Expand the JSON strings in `event_data` into one column per key, prefixed
# with "event_data." so they cannot collide with the existing columns.
# The guard makes re-running this cell a no-op instead of a KeyError on the
# already-dropped `event_data` column.
if 'event_data' in X_train.columns:
    event_data_train = pd.DataFrame(
        X_train['event_data'].apply(json.loads).values.tolist(),
        # Reuse X_train's index so the column-wise concat below lines rows up
        # even when the index is not the default 0..n-1 range.
        index=X_train.index,
    )
    event_data_train.columns = [f"event_data.{col}" for col in event_data_train.columns]
    X_train = pd.concat([X_train, event_data_train], axis=1)
    # Drop the raw JSON plus the two expanded keys whose names already exist
    # as top-level columns (event_count, event_code).
    X_train = X_train.drop(columns=['event_data', 'event_data.event_count', 'event_data.event_code'])
X_train.head()

In [None]:
# List every column name now present on the training frame.
X_train.columns.values

In [None]:
# Summary statistics for the numeric columns of the training frame.
X_train.describe()

In [None]:
# Expand the JSON strings in the test set's `event_data` into one column per
# key, prefixed with "event_data." so they cannot collide with existing columns.
# The guard makes re-running this cell a no-op instead of a KeyError on the
# already-dropped `event_data` column.
if 'event_data' in X_test.columns:
    event_data_test = pd.DataFrame(
        X_test['event_data'].apply(json.loads).values.tolist(),
        # Reuse X_test's index so the column-wise concat below lines rows up
        # even when the index is not the default 0..n-1 range.
        index=X_test.index,
    )
    event_data_test.columns = [f"event_data.{col}" for col in event_data_test.columns]
    X_test = pd.concat([X_test, event_data_test], axis=1)
    # Drop the raw JSON plus the two expanded keys whose names already exist
    # as top-level columns (event_count, event_code).
    X_test = X_test.drop(columns=['event_data', 'event_data.event_count', 'event_data.event_code'])
X_test.shape

In [None]:
# Number of events whose JSON payload carried an `event_code` key.
# `event_data.event_code` is dropped from X_train when event_data is expanded,
# so read it from the expanded frame, which still holds the column.
event_data_train['event_data.event_code'].dropna().size

In [None]:
# Number of rows with a non-null `event_data.end_position`; compare with the
# total row count to see how sparse some of the expanded event columns are.
X_train['event_data.end_position'].dropna().size # Some event columns are very sparse

In [None]:
# Summary statistics for every label column, shown again for reference.
y_train.describe(include='all')

In [None]:
# Disabled helper, kept for reference (its call in the next cell is disabled too).
# # One-hot encoder for categorical columns with get_dummies
# # Encodes every object-dtype column; with nan_as_category=True a separate
# # indicator column is added for missing values (dummy_na).
# # Returns the encoded frame and the list of newly created column names.
# def one_hot_encoder(df, nan_as_category=True):
#     original_columns = list(df.columns)
#     categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
#     df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
#     new_columns = [c for c in df.columns if c not in original_columns]
#     return df, new_columns

In [None]:
# Disabled: the object columns have too many distinct values, so the one-hot
# encoded frame does not fit in RAM.
# X_train, _ = one_hot_encoder(X_train, nan_as_category=True)

In [None]:
# X_train.shape