In [None]:
### Fill in missing accuracy_group values and count events per group
# Step 1: move [installation_id, world, timestamp] into the index and sort it,
#   so rows are in chronological order inside each [installation_id, world] pair.
# Step 2: backfill NaNs inside each [installation_id, world] group, so a row
#   takes the next non-missing value that follows it in that group.
#   Only the backfill is applied here; no forward fill is performed.
# Step 3: replace every NaN that is still left with 0 (rows with no later
#   non-missing value in their group).
#   NOTE(review): both fills act on every column of the frame, not only on
#   accuracy_group -- confirm that this is intended.
# Step 4: append accuracy_group and type to the index so the counts below can
#   be grouped on all four levels.
train_order = (
    train_order
    .set_index(['installation_id', 'world', 'timestamp'])
    .sort_index()
    .groupby(level=['installation_id', 'world'])
    .bfill()
    .fillna(0)
    .set_index(['accuracy_group', 'type'], append=True)
)

# Number of events for each installation_id -> world -> accuracy_group -> type
# combination; used as a rough estimate of the time spent in each type of
# activity.
train_order_agg = (
    train_order['event_id']
    .groupby(['installation_id', 'world', 'accuracy_group', 'type'])
    .count()
)



In [None]:
# Spread the 'type' index level into columns, giving one event-count column per
# activity type for every [installation_id, world, accuracy_group] row.
# Combinations that have no events of a given type come out of the unstack as
# NaN and are set to 0. Finally accuracy_group (index level 2) is moved out of
# the index into an ordinary column so it can be compared with the counts.
#
# A longer alternative to the unstack is to reset index level 3 ('type') and
# build the same table with pivot_table(index=[installation_id, world,
# accuracy_group], columns='type', values='event_id').
#
# To treat accuracy_group as strings instead of ints, cast index level 2 with
# astype('str') before the reset_index step.
train_type = (
    train_order_agg
    .unstack('type')
    .fillna(0)
    .reset_index(level=2)
)

In [None]:
# Pairwise scatter plots of the columns of train_type, with the points coloured
# by accuracy_group.
# The original call was `train_type[train_type,` with an unclosed bracket, which
# is a syntax error; the whole frame is passed instead.
# NOTE(review): if only a subset of columns was meant to be plotted, select it
# with train_type[[...]] here.
sns.pairplot(train_type,
             kind="scatter",
             hue='accuracy_group');

In [None]:
# 3D scatter of the per-type event counts (Assessment, Activity, Game), coloured
# by accuracy_group.
# The Axes3D import is kept because older matplotlib releases need it to
# register the '3d' projection; on current releases it is harmless.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(6,6))
# Create the axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure on recent matplotlib, which leaves
# the plot empty.
ax = fig.add_subplot(111, projection='3d')

xs = train_type['Assessment']
ys = train_type['Activity']
zs = train_type['Game']
color = train_type['accuracy_group']
# `label` expects a single legend string, not a Series, so it is not passed;
# the colour bar below explains the colouring instead.
points = ax.scatter(xs, ys, zs, c=color)

ax.set_xlabel('Assessment')
ax.set_ylabel('Activity')
ax.set_zlabel('Game')
fig.colorbar(points, ax=ax, shrink=0.6, label='accuracy_group')

plt.show()

In [None]:
# Extract all system-initiated instruction events.
# The search text is matched literally (regex=False), and rows of specs with a
# missing 'info' value count as "no match" (na=False) so the mask stays purely
# boolean and can be used for indexing.
sys_instr_events = specs['info'].str.contains('system-initiated instruction',
                                              regex=False, na=False)
sys_instr_event_ids = specs[sys_instr_events].event_id.astype('str')

# Keep the rows of train whose event_id is one of those ids. Exact membership
# is used instead of a '|'-joined regular expression: the joined pattern does
# substring matching, and when no ids are found it degenerates to the empty
# pattern, which matches every row.
train_instr_bool = train['event_id'].isin(sys_instr_event_ids)
train_instr = train[train_instr_bool]
train_instr.head()

In [None]:
# Boolean mask of the rows of train whose raw event_data text contains the
# substring 'dwell' (this is how events carrying a 'dwell_time' are located).
dwell_events = train['event_data'].str.contains('dwell')

# Keep only those rows and renumber them from 0; the previous index is kept as
# an ordinary 'index' column.
train_instruction = train.loc[dwell_events].reset_index()

In [None]:
# Parse the event_data JSON strings and flatten them into columns.
import json
import pandas as pd

# json_normalize is a top-level pandas function from pandas 1.0 onwards; the
# pandas.io.json import path is deprecated and not available on newer
# releases. Fall back to the old location only when the new one is missing.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

# Convert event_data from JSON text to (possibly nested) dictionaries
train_instruction_events = train_instruction.event_data.apply(json.loads)

# Flatten the nested dictionaries from train_instruction.event_data into a
# dataframe with one column per (dotted) key. A plain list is passed so the
# result is numbered 0..n-1 regardless of the pandas version.
event_data_flat = json_normalize(train_instruction_events.tolist())

# Put the flattened columns next to the original event rows. The concat aligns
# on the index.
# NOTE(review): this assumes train_instruction is numbered 0..n-1 (e.g. after a
# reset_index) -- confirm.
frames = [train_instruction, event_data_flat]
train_dwell = pd.concat(frames,axis=1)
train_dwell.head()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of
# train_dwell. The frame also holds text columns (event_data among them), which
# a heatmap cannot draw, so only the numeric columns are used.
# NOTE(review): the correlation matrix is an assumption about what this plot
# was meant to show -- to plot the raw values instead, pass
# train_dwell.select_dtypes(include='number') directly.
numeric_dwell = train_dwell.select_dtypes(include='number')
sns.heatmap(numeric_dwell.corr());

In [None]:
# Picking the 'dwell_time' out of event_data in the train.csv data set.
#
# Strategies that were tried and abandoned (kept here as a record):
#   * Applying pd.DataFrame.from_dict over train_dwell -- this didn't work well.
#   * String parsing with str.split on ':' and ',' to cut out the dwell_time --
#     this couldn't be applied across nested JSON.
#   * Regular expressions (re.compile(':*,?') with findall) -- more complicated
#     than necessary.
#
# What works: event_data has already been parsed and flattened into columns of
# train_dwell, so the dwell values can be read straight from the columns whose
# name mentions 'dwell'.
# NOTE(review): the exact column names depend on the JSON keys (nested keys get
# dotted names) -- check dwell_columns before relying on a specific name.
dwell_columns = [col for col in train_dwell.columns if 'dwell' in str(col)]
train_dwell[dwell_columns].head()