# 02. Prepare Training Data

Implement the methods that will be needed to train the deep learning model.

In [32]:
import pandas as pd
import numpy as np

from tqdm.std import tqdm
from typing import Tuple
from sklearn.model_selection import train_test_split

## Load Source Data

In [2]:
# read the gzip-compressed training events, using the CSV's second column as the row index
df_source = pd.read_csv(
    'data/train.csv.gz',
    compression='gzip',
    index_col=1,
)

# report the size, then preview the first rows with every column visible
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(n=3))

(13174211, 19)


Unnamed: 0_level_0,session_id,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,20090312431273200,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [3]:
# read the per-question training labels
df_source_labels = pd.read_csv(
    'data/train_labels.csv',
)

# report the size, then preview the first rows with every column visible
print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(n=3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Pre-process Data

### Pre-defined Values

In [4]:
# Fixed, ordered feature layout of the sequence matrix. It is passed to
# prepare_sequence_matrix(), which returns exactly these columns in this order and
# zero-fills any that a given slice of the data does not produce.
# Layout: the standardized numeric columns (elapsed_time and the four coordinates),
# then one column per one-hot category named '<source column>_<value>' as emitted by
# pd.get_dummies (the '*_0' entries come from missing values filled with 0), and
# finally the 'question_<n>' indicator columns that create_vector_array() sets.
vector_columns = ['elapsed_time', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'event_name_checkpoint', 'event_name_cutscene_click', 'event_name_map_click', 'event_name_map_hover', 'event_name_navigate_click', 'event_name_notebook_click', 'event_name_notification_click', 'event_name_object_click', 'event_name_object_hover', 'event_name_observation_click', 'event_name_person_click', 'name_basic', 'name_close', 'name_next', 'name_open', 'name_prev', 'name_undefined', 'level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6', 'level_7', 'level_8', 'level_9', 'level_10', 'level_11', 'level_12', 'level_13', 'level_14', 'level_15', 'level_16', 'level_17', 'level_18', 'level_19', 'level_20', 'level_21', 'level_22', 'fqid_0', 'fqid_archivist', 'fqid_archivist_glasses', 'fqid_block', 'fqid_block_0', 'fqid_block_1', 'fqid_block_badge', 'fqid_block_badge_2', 'fqid_block_magnify', 'fqid_block_nelson', 'fqid_block_tocollection', 'fqid_block_tomap1', 'fqid_block_tomap2', 'fqid_boss', 'fqid_businesscards', 'fqid_businesscards.card_0.next', 'fqid_businesscards.card_1.next', 'fqid_businesscards.card_bingo.bingo', 'fqid_businesscards.card_bingo.next', 'fqid_ch3start', 'fqid_chap1_finale', 'fqid_chap1_finale_c', 'fqid_chap2_finale_c', 'fqid_chap4_finale_c', 'fqid_coffee', 'fqid_colorbook', 'fqid_confrontation', 'fqid_crane_ranger', 'fqid_cs', 'fqid_directory', 'fqid_directory.closeup.archivist', 'fqid_door_block_clean', 'fqid_door_block_talk', 'fqid_doorblock', 'fqid_expert', 'fqid_flag_girl', 'fqid_fox', 'fqid_glasses', 'fqid_gramps', 'fqid_groupconvo', 'fqid_groupconvo_flag', 'fqid_intro', 'fqid_janitor', 'fqid_journals', 'fqid_journals.hub.topics', 'fqid_journals.pic_0.next', 'fqid_journals.pic_1.next', 'fqid_journals.pic_2.bingo', 'fqid_journals.pic_2.next', 'fqid_journals_flag', 'fqid_journals_flag.hub.topics', 'fqid_journals_flag.hub.topics_old', 'fqid_journals_flag.pic_0.bingo', 'fqid_journals_flag.pic_0.next', 
'fqid_journals_flag.pic_0_old.next', 'fqid_journals_flag.pic_1.bingo', 'fqid_journals_flag.pic_1.next', 'fqid_journals_flag.pic_1_old.next', 'fqid_journals_flag.pic_2.bingo', 'fqid_journals_flag.pic_2.next', 'fqid_journals_flag.pic_2_old.next', 'fqid_key', 'fqid_lockeddoor', 'fqid_logbook', 'fqid_logbook.page.bingo', 'fqid_magnify', 'fqid_need_glasses', 'fqid_notebook', 'fqid_outtolunch', 'fqid_photo', 'fqid_plaque', 'fqid_plaque.face.date', 'fqid_reader', 'fqid_reader.paper0.next', 'fqid_reader.paper0.prev', 'fqid_reader.paper1.next', 'fqid_reader.paper1.prev', 'fqid_reader.paper2.bingo', 'fqid_reader.paper2.next', 'fqid_reader.paper2.prev', 'fqid_reader_flag', 'fqid_reader_flag.paper0.next', 'fqid_reader_flag.paper0.prev', 'fqid_reader_flag.paper1.next', 'fqid_reader_flag.paper1.prev', 'fqid_reader_flag.paper2.bingo', 'fqid_reader_flag.paper2.next', 'fqid_reader_flag.paper2.prev', 'fqid_remove_cup', 'fqid_report', 'fqid_retirement_letter', 'fqid_savedteddy', 'fqid_seescratches', 'fqid_teddy', 'fqid_tobasement', 'fqid_tocage', 'fqid_tocloset', 'fqid_tocloset_dirty', 'fqid_tocollection', 'fqid_tocollectionflag', 'fqid_toentry', 'fqid_tofrontdesk', 'fqid_togrampa', 'fqid_tohallway', 'fqid_tomap', 'fqid_tomicrofiche', 'fqid_tostacks', 'fqid_tracks', 'fqid_tracks.hub.deer', 'fqid_trigger_coffee', 'fqid_trigger_scarf', 'fqid_tunic', 'fqid_tunic.capitol_0', 'fqid_tunic.capitol_1', 'fqid_tunic.capitol_2', 'fqid_tunic.drycleaner', 'fqid_tunic.flaghouse', 'fqid_tunic.historicalsociety', 'fqid_tunic.hub.slip', 'fqid_tunic.humanecology', 'fqid_tunic.kohlcenter', 'fqid_tunic.library', 'fqid_tunic.wildlife', 'fqid_unlockdoor', 'fqid_wells', 'fqid_wellsbadge', 'fqid_what_happened', 'fqid_worker', 'room_fqid_tunic.capitol_0.hall', 'room_fqid_tunic.capitol_1.hall', 'room_fqid_tunic.capitol_2.hall', 'room_fqid_tunic.drycleaner.frontdesk', 'room_fqid_tunic.flaghouse.entry', 'room_fqid_tunic.historicalsociety.basement', 'room_fqid_tunic.historicalsociety.cage', 
'room_fqid_tunic.historicalsociety.closet', 'room_fqid_tunic.historicalsociety.closet_dirty', 'room_fqid_tunic.historicalsociety.collection', 'room_fqid_tunic.historicalsociety.collection_flag', 'room_fqid_tunic.historicalsociety.entry', 'room_fqid_tunic.historicalsociety.frontdesk', 'room_fqid_tunic.historicalsociety.stacks', 'room_fqid_tunic.humanecology.frontdesk', 'room_fqid_tunic.kohlcenter.halloffame', 'room_fqid_tunic.library.frontdesk', 'room_fqid_tunic.library.microfiche', 'room_fqid_tunic.wildlife.center', 'text_fqid_0', 'text_fqid_tunic.capitol_0.hall.boss.talktogramps', 'text_fqid_tunic.capitol_0.hall.chap1_finale_c', 'text_fqid_tunic.capitol_1.hall.boss.haveyougotit', 'text_fqid_tunic.capitol_1.hall.boss.writeitup', 'text_fqid_tunic.capitol_1.hall.chap2_finale_c', 'text_fqid_tunic.capitol_2.hall.boss.haveyougotit', 'text_fqid_tunic.capitol_2.hall.chap4_finale_c', 'text_fqid_tunic.drycleaner.frontdesk.block_0', 'text_fqid_tunic.drycleaner.frontdesk.block_1', 'text_fqid_tunic.drycleaner.frontdesk.logbook.page.bingo', 'text_fqid_tunic.drycleaner.frontdesk.worker.done', 'text_fqid_tunic.drycleaner.frontdesk.worker.done2', 'text_fqid_tunic.drycleaner.frontdesk.worker.hub', 'text_fqid_tunic.drycleaner.frontdesk.worker.takealook', 'text_fqid_tunic.flaghouse.entry.colorbook', 'text_fqid_tunic.flaghouse.entry.flag_girl.hello', 'text_fqid_tunic.flaghouse.entry.flag_girl.hello_recap', 'text_fqid_tunic.flaghouse.entry.flag_girl.symbol', 'text_fqid_tunic.flaghouse.entry.flag_girl.symbol_recap', 'text_fqid_tunic.historicalsociety.basement.ch3start', 'text_fqid_tunic.historicalsociety.basement.gramps.seeyalater', 'text_fqid_tunic.historicalsociety.basement.gramps.whatdo', 'text_fqid_tunic.historicalsociety.basement.janitor', 'text_fqid_tunic.historicalsociety.basement.savedteddy', 'text_fqid_tunic.historicalsociety.basement.seescratches', 'text_fqid_tunic.historicalsociety.cage.confrontation', 'text_fqid_tunic.historicalsociety.cage.glasses.afterteddy', 
'text_fqid_tunic.historicalsociety.cage.glasses.beforeteddy', 'text_fqid_tunic.historicalsociety.cage.lockeddoor', 'text_fqid_tunic.historicalsociety.cage.need_glasses', 'text_fqid_tunic.historicalsociety.cage.teddy.trapped', 'text_fqid_tunic.historicalsociety.cage.unlockdoor', 'text_fqid_tunic.historicalsociety.closet.doorblock', 'text_fqid_tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'text_fqid_tunic.historicalsociety.closet.intro', 'text_fqid_tunic.historicalsociety.closet.notebook', 'text_fqid_tunic.historicalsociety.closet.photo', 'text_fqid_tunic.historicalsociety.closet.retirement_letter.hub', 'text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'text_fqid_tunic.historicalsociety.closet_dirty.door_block_clean', 'text_fqid_tunic.historicalsociety.closet_dirty.door_block_talk', 'text_fqid_tunic.historicalsociety.closet_dirty.gramps.archivist', 'text_fqid_tunic.historicalsociety.closet_dirty.gramps.helpclean', 'text_fqid_tunic.historicalsociety.closet_dirty.gramps.news', 'text_fqid_tunic.historicalsociety.closet_dirty.gramps.nothing', 'text_fqid_tunic.historicalsociety.closet_dirty.photo', 'text_fqid_tunic.historicalsociety.closet_dirty.trigger_coffee', 'text_fqid_tunic.historicalsociety.closet_dirty.trigger_scarf', 'text_fqid_tunic.historicalsociety.closet_dirty.what_happened', 'text_fqid_tunic.historicalsociety.collection.cs', 'text_fqid_tunic.historicalsociety.collection.gramps.found', 'text_fqid_tunic.historicalsociety.collection.gramps.look_0', 'text_fqid_tunic.historicalsociety.collection.gramps.lost', 'text_fqid_tunic.historicalsociety.collection.tunic', 'text_fqid_tunic.historicalsociety.collection.tunic.slip', 'text_fqid_tunic.historicalsociety.collection_flag.gramps.flag', 'text_fqid_tunic.historicalsociety.collection_flag.gramps.recap', 'text_fqid_tunic.historicalsociety.entry.block_tocollection', 'text_fqid_tunic.historicalsociety.entry.block_tomap1', 
'text_fqid_tunic.historicalsociety.entry.block_tomap2', 'text_fqid_tunic.historicalsociety.entry.boss.flag', 'text_fqid_tunic.historicalsociety.entry.boss.flag_recap', 'text_fqid_tunic.historicalsociety.entry.boss.talktogramps', 'text_fqid_tunic.historicalsociety.entry.directory.closeup.archivist', 'text_fqid_tunic.historicalsociety.entry.gramps.hub', 'text_fqid_tunic.historicalsociety.entry.groupconvo', 'text_fqid_tunic.historicalsociety.entry.groupconvo_flag', 'text_fqid_tunic.historicalsociety.entry.wells.flag', 'text_fqid_tunic.historicalsociety.entry.wells.flag_recap', 'text_fqid_tunic.historicalsociety.entry.wells.talktogramps', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.hello', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper', 'text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'text_fqid_tunic.historicalsociety.frontdesk.block_magnify', 'text_fqid_tunic.historicalsociety.frontdesk.key', 'text_fqid_tunic.historicalsociety.frontdesk.magnify', 'text_fqid_tunic.historicalsociety.stacks.block', 'text_fqid_tunic.historicalsociety.stacks.journals.pic_2.bingo', 'text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'text_fqid_tunic.historicalsociety.stacks.outtolunch', 'text_fqid_tunic.humanecology.frontdesk.block_0', 
'text_fqid_tunic.humanecology.frontdesk.block_1', 'text_fqid_tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'text_fqid_tunic.humanecology.frontdesk.worker.badger', 'text_fqid_tunic.humanecology.frontdesk.worker.intro', 'text_fqid_tunic.kohlcenter.halloffame.block_0', 'text_fqid_tunic.kohlcenter.halloffame.plaque.face.date', 'text_fqid_tunic.kohlcenter.halloffame.togrampa', 'text_fqid_tunic.library.frontdesk.block_badge', 'text_fqid_tunic.library.frontdesk.block_badge_2', 'text_fqid_tunic.library.frontdesk.block_nelson', 'text_fqid_tunic.library.frontdesk.wellsbadge.hub', 'text_fqid_tunic.library.frontdesk.worker.droppedbadge', 'text_fqid_tunic.library.frontdesk.worker.flag', 'text_fqid_tunic.library.frontdesk.worker.flag_recap', 'text_fqid_tunic.library.frontdesk.worker.hello', 'text_fqid_tunic.library.frontdesk.worker.hello_short', 'text_fqid_tunic.library.frontdesk.worker.nelson', 'text_fqid_tunic.library.frontdesk.worker.nelson_recap', 'text_fqid_tunic.library.frontdesk.worker.preflag', 'text_fqid_tunic.library.frontdesk.worker.wells', 'text_fqid_tunic.library.frontdesk.worker.wells_recap', 'text_fqid_tunic.library.microfiche.block_0', 'text_fqid_tunic.library.microfiche.reader.paper2.bingo', 'text_fqid_tunic.library.microfiche.reader_flag.paper2.bingo', 'text_fqid_tunic.wildlife.center.coffee', 'text_fqid_tunic.wildlife.center.crane_ranger.crane', 'text_fqid_tunic.wildlife.center.expert.recap', 'text_fqid_tunic.wildlife.center.expert.removed_cup', 'text_fqid_tunic.wildlife.center.fox.concern', 'text_fqid_tunic.wildlife.center.remove_cup', 'text_fqid_tunic.wildlife.center.tracks.hub.deer', 'text_fqid_tunic.wildlife.center.wells.animals', 'text_fqid_tunic.wildlife.center.wells.animals2', 'text_fqid_tunic.wildlife.center.wells.nodeer', 'text_fqid_tunic.wildlife.center.wells.nodeer_recap', 'level_group_0-4', 'level_group_13-22', 'level_group_5-12', 
'question_1',
'question_2',
'question_3',
'question_4',
'question_5',
'question_6',
'question_7',
'question_8',
'question_9',
'question_10',
'question_11',
'question_12',
'question_13',
'question_14',
'question_15',
'question_16',
'question_17',
'question_18',]

### Functions to prepare the data

In [5]:
def map_question_to_level_group(question_number):
    """
    Look up the level group that a question belongs to.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str or None
        '0-4' for questions 1-3, '5-12' for questions 4-13,
        '13-22' for questions 14-18, and None for any other value.
    """
    # (questions, level group) pairs, checked in order
    question_ranges = (
        (range(1, 4), '0-4'),
        (range(4, 14), '5-12'),
        (range(14, 19), '13-22'),
    )

    for questions, group_label in question_ranges:
        if question_number in questions:
            return group_label

    # not one of the 18 known questions
    return None

def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the label dataset and adds columns for the level group
    and the question number.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset. Must contain a string column 'session_id' of the
        form '<session id>_q<question number>' and a column 'correct'.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with the columns
        'session_id', 'question_num', 'correct' and 'level_group',
        indexed like the input.
    """
    # split '<session id>_q<question number>' once and reuse both halves
    id_parts = data['session_id'].str.split('_')

    # use an explicit 64-bit dtype: the session ids have 17 digits and overflow
    # the 32-bit integer that plain `int` maps to on some platforms
    session_id = id_parts.str[0].astype('int64')

    # drop the leading 'q' from the question token
    question_num = id_parts.str[1].str[1:].astype('int64')

    # build the result in one step rather than assigning onto a column slice
    df_labels = data[['correct']].assign(
        session_id=session_id,
        question_num=question_num,
        level_group=question_num.apply(map_question_to_level_group),
    )

    return df_labels[['session_id', 'question_num', 'correct', 'level_group']]

In [6]:
def prepare_main_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the main dataset by removing duplicate rows and removing
    columns that are not needed.

    Duplicates are detected on the full set of input columns, before any
    column is dropped. Columns that are already absent are skipped, so the
    function can safely be applied to its own output (for example when the
    notebook cell that overwrites the source frame is re-run).

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index and without the columns
        'fullscreen', 'hq', 'music', 'page', 'hover_duration' and 'text'.
    """
    unused_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration', 'text']

    df_main = data \
        .drop_duplicates() \
        .reset_index(drop=True) \
        .drop(columns=unused_columns, errors='ignore')

    return df_main

In [7]:
def _standardize_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Z-score every column of `frame`; columns without spread become 0 instead of NaN."""
    std = frame.std()
    # std is 0 for a constant column and NaN for a single row; dividing the
    # (all-zero) centred values by 1 keeps them at 0 instead of producing NaN
    safe_std = std.where(std > 0, 1.0)
    return (frame - frame.mean()) / safe_std


def vectorize_dataset(data: pd.DataFrame, standardize_coordinates: bool=True) -> pd.DataFrame:
    """
    Vectorizes the dataset for deep learning by one-hot encoding and standardizing.

    Missing values are replaced by 0 first, so a missing category shows up as
    its own '<column>_0' indicator. Standardization uses the mean and standard
    deviation of the rows passed in.

    Parameters
    ----------
    data : pd.DataFrame
        The dataset to prepare. Must contain 'session_id', 'elapsed_time',
        the four coordinate columns and the categorical columns
        'event_name', 'name', 'level', 'fqid', 'room_fqid', 'text_fqid'
        and 'level_group'.

    standardize_coordinates : bool, optional
        Whether to standardize the room / screen coordinate columns as well.

    Returns
    -------
    pd.DataFrame
        The vectorized dataset: 'session_id' removed, numeric columns
        standardized and every categorical column replaced by numeric
        (uint8) indicator columns.
    """
    categorical_cols = ['event_name', 'name', 'level', 'fqid', 'room_fqid', 'text_fqid', 'level_group']
    numerical_cols = ['elapsed_time']
    coordinates_cols = ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y']

    df_vectorized = data \
        .drop('session_id', axis=1) \
        .fillna(0)

    # standardize the numerical variables
    df_vectorized[numerical_cols] = _standardize_columns(df_vectorized[numerical_cols])

    # standardize the coordinates
    if standardize_coordinates:
        df_vectorized[coordinates_cols] = _standardize_columns(df_vectorized[coordinates_cols])

    # one-hot encode the categorical variables; the dtype is pinned because
    # newer pandas versions default to bool indicators, which would turn the
    # final numpy matrix into an object array
    df_vectorized = pd.get_dummies(df_vectorized, columns=categorical_cols, dtype=np.uint8)

    return df_vectorized

In [8]:
def prepare_sequence_matrix(data: pd.DataFrame, vector_columns: list, standardize_coordinates: bool=True) -> pd.DataFrame:
    """
    Prepare a sequence matrix with a fixed column layout from a DataFrame.

    The data is vectorized with `vectorize_dataset` and then conformed to
    `vector_columns`: columns the data did not produce are added and filled
    with 0, columns not listed are discarded, and remaining missing values
    are replaced by 0.

    Parameters
    ----------
    data : pd.DataFrame
        The dataframe containing the data.

    vector_columns : list
        The columns that should appear in the sequence matrix, in order.

    standardize_coordinates : bool, optional
        Whether to standardize the coordinates.

    Returns
    -------
    pd.DataFrame
        The sequence matrix with exactly the columns in `vector_columns`.
    """
    df_sequence_matrix = vectorize_dataset(data, standardize_coordinates=standardize_coordinates)

    # reindex adds the missing columns as numeric zeros and fixes the column
    # order in one step; padding with an empty (object dtype) frame instead
    # would depend on fillna downcasting the filler columns back to numbers
    return df_sequence_matrix \
        .reindex(columns=vector_columns, fill_value=0) \
        .fillna(0)

### Perform Pre-Processing

In [9]:
# derive the session id, question number and level group for every label
df_source_labels = prepare_label_dataset(data=df_source_labels)

# spot-check three reproducibly sampled rows with every column visible
with pd.option_context('display.max_columns', None):
    label_preview = df_source_labels.sample(n=3, random_state=51)
    display(label_preview)

Unnamed: 0,session_id,question_num,correct,level_group
21476,22010116250792520,2,1,0-4
84068,21000111433937450,8,1,5-12
171219,21040510125933256,15,0,13-22


In [10]:
# de-duplicate the events and drop the columns that are not needed
df_source = prepare_main_dataset(data=df_source)

# show the resulting size, then preview the first rows with every column visible
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(n=3))

(13173445, 13)


Unnamed: 0,session_id,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,cutscene_click,basic,0,-413.991405,-159.314686,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1323,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,831,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


## Test Code for creating the training dataset

In [11]:
# pick one session and one level group to try the pipeline on
session_id = 21040510125933256
level_group = '0-4'

# keep only the events of that session within that level group
is_session = df_source['session_id'] == session_id
is_level_group = df_source['level_group'] == level_group
df_sample = df_source[is_session & is_level_group]
print(df_sample.shape)

(172, 13)


In [12]:
# vectorize the sample and conform it to the fixed column layout
df_sequence_matrix = prepare_sequence_matrix(
    data=df_sample,
    vector_columns=vector_columns,
)

# show the resulting size, then preview the first rows with every column visible
print(df_sequence_matrix.shape)
with pd.option_context('display.max_columns', None):
    display(df_sequence_matrix.head(n=3))

(172, 340)


Unnamed: 0,elapsed_time,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,event_name_checkpoint,event_name_cutscene_click,event_name_map_click,event_name_map_hover,event_name_navigate_click,event_name_notebook_click,event_name_notification_click,event_name_object_click,event_name_object_hover,event_name_observation_click,event_name_person_click,name_basic,name_close,name_next,name_open,name_prev,name_undefined,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,level_11,level_12,level_13,level_14,level_15,level_16,level_17,level_18,level_19,level_20,level_21,level_22,fqid_0,fqid_archivist,fqid_archivist_glasses,fqid_block,fqid_block_0,fqid_block_1,fqid_block_badge,fqid_block_badge_2,fqid_block_magnify,fqid_block_nelson,fqid_block_tocollection,fqid_block_tomap1,fqid_block_tomap2,fqid_boss,fqid_businesscards,fqid_businesscards.card_0.next,fqid_businesscards.card_1.next,fqid_businesscards.card_bingo.bingo,fqid_businesscards.card_bingo.next,fqid_ch3start,fqid_chap1_finale,fqid_chap1_finale_c,fqid_chap2_finale_c,fqid_chap4_finale_c,fqid_coffee,fqid_colorbook,fqid_confrontation,fqid_crane_ranger,fqid_cs,fqid_directory,fqid_directory.closeup.archivist,fqid_door_block_clean,fqid_door_block_talk,fqid_doorblock,fqid_expert,fqid_flag_girl,fqid_fox,fqid_glasses,fqid_gramps,fqid_groupconvo,fqid_groupconvo_flag,fqid_intro,fqid_janitor,fqid_journals,fqid_journals.hub.topics,fqid_journals.pic_0.next,fqid_journals.pic_1.next,fqid_journals.pic_2.bingo,fqid_journals.pic_2.next,fqid_journals_flag,fqid_journals_flag.hub.topics,fqid_journals_flag.hub.topics_old,fqid_journals_flag.pic_0.bingo,fqid_journals_flag.pic_0.next,fqid_journals_flag.pic_0_old.next,fqid_journals_flag.pic_1.bingo,fqid_journals_flag.pic_1.next,fqid_journals_flag.pic_1_old.next,fqid_journals_flag.pic_2.bingo,fqid_journals_flag.pic_2.next,fqid_journals_flag.pic_2_old.next,fqid_key,fqid_lockeddoor,fqid_logbook,fqid_logbook.page.bingo,fqid_magnify,fqid_need_glasses,fqid_notebook,fqi
d_outtolunch,fqid_photo,fqid_plaque,fqid_plaque.face.date,fqid_reader,fqid_reader.paper0.next,fqid_reader.paper0.prev,fqid_reader.paper1.next,fqid_reader.paper1.prev,fqid_reader.paper2.bingo,fqid_reader.paper2.next,fqid_reader.paper2.prev,fqid_reader_flag,fqid_reader_flag.paper0.next,fqid_reader_flag.paper0.prev,fqid_reader_flag.paper1.next,fqid_reader_flag.paper1.prev,fqid_reader_flag.paper2.bingo,fqid_reader_flag.paper2.next,fqid_reader_flag.paper2.prev,fqid_remove_cup,fqid_report,fqid_retirement_letter,fqid_savedteddy,fqid_seescratches,fqid_teddy,fqid_tobasement,fqid_tocage,fqid_tocloset,fqid_tocloset_dirty,fqid_tocollection,fqid_tocollectionflag,fqid_toentry,fqid_tofrontdesk,fqid_togrampa,fqid_tohallway,fqid_tomap,fqid_tomicrofiche,fqid_tostacks,fqid_tracks,fqid_tracks.hub.deer,fqid_trigger_coffee,fqid_trigger_scarf,fqid_tunic,fqid_tunic.capitol_0,fqid_tunic.capitol_1,fqid_tunic.capitol_2,fqid_tunic.drycleaner,fqid_tunic.flaghouse,fqid_tunic.historicalsociety,fqid_tunic.hub.slip,fqid_tunic.humanecology,fqid_tunic.kohlcenter,fqid_tunic.library,fqid_tunic.wildlife,fqid_unlockdoor,fqid_wells,fqid_wellsbadge,fqid_what_happened,fqid_worker,room_fqid_tunic.capitol_0.hall,room_fqid_tunic.capitol_1.hall,room_fqid_tunic.capitol_2.hall,room_fqid_tunic.drycleaner.frontdesk,room_fqid_tunic.flaghouse.entry,room_fqid_tunic.historicalsociety.basement,room_fqid_tunic.historicalsociety.cage,room_fqid_tunic.historicalsociety.closet,room_fqid_tunic.historicalsociety.closet_dirty,room_fqid_tunic.historicalsociety.collection,room_fqid_tunic.historicalsociety.collection_flag,room_fqid_tunic.historicalsociety.entry,room_fqid_tunic.historicalsociety.frontdesk,room_fqid_tunic.historicalsociety.stacks,room_fqid_tunic.humanecology.frontdesk,room_fqid_tunic.kohlcenter.halloffame,room_fqid_tunic.library.frontdesk,room_fqid_tunic.library.microfiche,room_fqid_tunic.wildlife.center,text_fqid_0,text_fqid_tunic.capitol_0.hall.boss.talktogramps,text_fqid_tunic.capitol_0.hall.chap1_finale_c,text_f
qid_tunic.capitol_1.hall.boss.haveyougotit,text_fqid_tunic.capitol_1.hall.boss.writeitup,text_fqid_tunic.capitol_1.hall.chap2_finale_c,text_fqid_tunic.capitol_2.hall.boss.haveyougotit,text_fqid_tunic.capitol_2.hall.chap4_finale_c,text_fqid_tunic.drycleaner.frontdesk.block_0,text_fqid_tunic.drycleaner.frontdesk.block_1,text_fqid_tunic.drycleaner.frontdesk.logbook.page.bingo,text_fqid_tunic.drycleaner.frontdesk.worker.done,text_fqid_tunic.drycleaner.frontdesk.worker.done2,text_fqid_tunic.drycleaner.frontdesk.worker.hub,text_fqid_tunic.drycleaner.frontdesk.worker.takealook,text_fqid_tunic.flaghouse.entry.colorbook,text_fqid_tunic.flaghouse.entry.flag_girl.hello,text_fqid_tunic.flaghouse.entry.flag_girl.hello_recap,text_fqid_tunic.flaghouse.entry.flag_girl.symbol,text_fqid_tunic.flaghouse.entry.flag_girl.symbol_recap,text_fqid_tunic.historicalsociety.basement.ch3start,text_fqid_tunic.historicalsociety.basement.gramps.seeyalater,text_fqid_tunic.historicalsociety.basement.gramps.whatdo,text_fqid_tunic.historicalsociety.basement.janitor,text_fqid_tunic.historicalsociety.basement.savedteddy,text_fqid_tunic.historicalsociety.basement.seescratches,text_fqid_tunic.historicalsociety.cage.confrontation,text_fqid_tunic.historicalsociety.cage.glasses.afterteddy,text_fqid_tunic.historicalsociety.cage.glasses.beforeteddy,text_fqid_tunic.historicalsociety.cage.lockeddoor,text_fqid_tunic.historicalsociety.cage.need_glasses,text_fqid_tunic.historicalsociety.cage.teddy.trapped,text_fqid_tunic.historicalsociety.cage.unlockdoor,text_fqid_tunic.historicalsociety.closet.doorblock,text_fqid_tunic.historicalsociety.closet.gramps.intro_0_cs_0,text_fqid_tunic.historicalsociety.closet.intro,text_fqid_tunic.historicalsociety.closet.notebook,text_fqid_tunic.historicalsociety.closet.photo,text_fqid_tunic.historicalsociety.closet.retirement_letter.hub,text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_0,text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_5,text_fqid_tunic.historicalsoci
ety.closet_dirty.door_block_clean,text_fqid_tunic.historicalsociety.closet_dirty.door_block_talk,text_fqid_tunic.historicalsociety.closet_dirty.gramps.archivist,text_fqid_tunic.historicalsociety.closet_dirty.gramps.helpclean,text_fqid_tunic.historicalsociety.closet_dirty.gramps.news,text_fqid_tunic.historicalsociety.closet_dirty.gramps.nothing,text_fqid_tunic.historicalsociety.closet_dirty.photo,text_fqid_tunic.historicalsociety.closet_dirty.trigger_coffee,text_fqid_tunic.historicalsociety.closet_dirty.trigger_scarf,text_fqid_tunic.historicalsociety.closet_dirty.what_happened,text_fqid_tunic.historicalsociety.collection.cs,text_fqid_tunic.historicalsociety.collection.gramps.found,text_fqid_tunic.historicalsociety.collection.gramps.look_0,text_fqid_tunic.historicalsociety.collection.gramps.lost,text_fqid_tunic.historicalsociety.collection.tunic,text_fqid_tunic.historicalsociety.collection.tunic.slip,text_fqid_tunic.historicalsociety.collection_flag.gramps.flag,text_fqid_tunic.historicalsociety.collection_flag.gramps.recap,text_fqid_tunic.historicalsociety.entry.block_tocollection,text_fqid_tunic.historicalsociety.entry.block_tomap1,text_fqid_tunic.historicalsociety.entry.block_tomap2,text_fqid_tunic.historicalsociety.entry.boss.flag,text_fqid_tunic.historicalsociety.entry.boss.flag_recap,text_fqid_tunic.historicalsociety.entry.boss.talktogramps,text_fqid_tunic.historicalsociety.entry.directory.closeup.archivist,text_fqid_tunic.historicalsociety.entry.gramps.hub,text_fqid_tunic.historicalsociety.entry.groupconvo,text_fqid_tunic.historicalsociety.entry.groupconvo_flag,text_fqid_tunic.historicalsociety.entry.wells.flag,text_fqid_tunic.historicalsociety.entry.wells.flag_recap,text_fqid_tunic.historicalsociety.entry.wells.talktogramps,text_fqid_tunic.historicalsociety.frontdesk.archivist.foundtheodora,text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass,text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass_recap,text_fqid_tunic.historicalsociety.fro
ntdesk.archivist.hello,text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_0,text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_1,text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper,text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper_recap,text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation,text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap,text_fqid_tunic.historicalsociety.frontdesk.block_magnify,text_fqid_tunic.historicalsociety.frontdesk.key,text_fqid_tunic.historicalsociety.frontdesk.magnify,text_fqid_tunic.historicalsociety.stacks.block,text_fqid_tunic.historicalsociety.stacks.journals.pic_2.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_0.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_1.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_2.bingo,text_fqid_tunic.historicalsociety.stacks.outtolunch,text_fqid_tunic.humanecology.frontdesk.block_0,text_fqid_tunic.humanecology.frontdesk.block_1,text_fqid_tunic.humanecology.frontdesk.businesscards.card_bingo.bingo,text_fqid_tunic.humanecology.frontdesk.worker.badger,text_fqid_tunic.humanecology.frontdesk.worker.intro,text_fqid_tunic.kohlcenter.halloffame.block_0,text_fqid_tunic.kohlcenter.halloffame.plaque.face.date,text_fqid_tunic.kohlcenter.halloffame.togrampa,text_fqid_tunic.library.frontdesk.block_badge,text_fqid_tunic.library.frontdesk.block_badge_2,text_fqid_tunic.library.frontdesk.block_nelson,text_fqid_tunic.library.frontdesk.wellsbadge.hub,text_fqid_tunic.library.frontdesk.worker.droppedbadge,text_fqid_tunic.library.frontdesk.worker.flag,text_fqid_tunic.library.frontdesk.worker.flag_recap,text_fqid_tunic.library.frontdesk.worker.hello,text_fqid_tunic.library.frontdesk.worker.hello_short,text_fqid_tunic.library.frontdesk.worker.nelson,text_fqid_tunic.library.frontdesk.worker.nelson_recap,text_fqid_tunic.library.frontdesk.worker.preflag,text_fqid_tunic.libra
ry.frontdesk.worker.wells,text_fqid_tunic.library.frontdesk.worker.wells_recap,text_fqid_tunic.library.microfiche.block_0,text_fqid_tunic.library.microfiche.reader.paper2.bingo,text_fqid_tunic.library.microfiche.reader_flag.paper2.bingo,text_fqid_tunic.wildlife.center.coffee,text_fqid_tunic.wildlife.center.crane_ranger.crane,text_fqid_tunic.wildlife.center.expert.recap,text_fqid_tunic.wildlife.center.expert.removed_cup,text_fqid_tunic.wildlife.center.fox.concern,text_fqid_tunic.wildlife.center.remove_cup,text_fqid_tunic.wildlife.center.tracks.hub.deer,text_fqid_tunic.wildlife.center.wells.animals,text_fqid_tunic.wildlife.center.wells.animals2,text_fqid_tunic.wildlife.center.wells.nodeer,text_fqid_tunic.wildlife.center.wells.nodeer_recap,level_group_0-4,level_group_13-22,level_group_5-12,question_1,question_2,question_3,question_4,question_5,question_6,question_7,question_8,question_9,question_10,question_11,question_12,question_13,question_14,question_15,question_16,question_17,question_18
7092360,-2.09047,-0.967372,0.861148,-0.134207,-0.721605,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7092361,-2.065485,-0.951334,0.878207,-0.10715,-0.741296,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7092362,-2.046951,-0.951334,0.878207,-0.10715,-0.741296,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# view the sequence matrix as a plain numpy array (default dtype and copy behaviour)
df_sequence_matrix.to_numpy(dtype=None, copy=False)

array([[-2.09046958, -0.96737152,  0.8611484 , ...,  0.        ,
         0.        ,  0.        ],
       [-2.06548529, -0.95133352,  0.8782071 , ...,  0.        ,
         0.        ,  0.        ],
       [-2.04695149, -0.95133352,  0.8782071 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.61601768,  0.68144264,  0.65643806, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.63448334,  0.75346571,  0.16894744, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.08763019,  0.08653465,  0.24882472, ...,  0.        ,
         0.        ,  0.        ]])

### Test generating a split dataset

In [14]:
# distinct session ids that have labels
pd.unique(df_source_labels['session_id'])

array([20090312431273200, 20090312433251036, 20090314121766812, ...,
       22100217104993650, 22100219442786200, 22100221145014656])

In [15]:
# Split the unique session ids into 70% train, 15% validation and 15% test.
# The seed is passed to both calls so the partition is identical on every run;
# without it each execution would shuffle the sessions differently.
random_state = 51
session_ids = df_source_labels['session_id'].unique()

train, holdout = train_test_split(session_ids, test_size=0.3, random_state=random_state)
test, val = train_test_split(holdout, test_size=0.5, random_state=random_state)

# print the number of sessions in each set
print(f'Train: {len(train)}')
print(f'Validation: {len(val)}')
print(f'Test: {len(test)}')

Train: 8245
Validation: 1767
Test: 1767


In [16]:
# first ten distinct session ids, in order of appearance
pd.unique(df_source_labels['session_id'])[:10]

array([20090312431273200, 20090312433251036, 20090314121766812,
       20090314363702160, 20090314441803444, 20090315081004164,
       20090315085850788, 20090315101457836, 20090315170769824,
       20090317080721164])

## Putting it all together

### Question Vector Array

In [17]:
def create_vector_array(X: pd.DataFrame, session_id: int, question_number: int) -> np.ndarray:
    """
    Creates a vector array for a specific session and question number.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset. It is filtered with ``DataFrame.query`` on
        ``session_id`` and ``level_group``.

    session_id : int
        The session to select.

    question_number : int
        The question number. It is mapped to a level group with
        ``map_question_to_level_group`` and flagged in the
        ``question_<n>`` column of the result.

    Returns
    -------
    np.ndarray
        The sequence matrix of the selected session and level group,
        converted to a NumPy array.
    """
    # Normalise to a plain int: a value read through `DataFrame.iterrows()`
    # may arrive as a NumPy scalar (or as a float when the row was upcast),
    # which would otherwise produce a column name such as `question_3.0`.
    question_number = int(question_number)

    # get the level group
    level_group = map_question_to_level_group(question_number)

    # get the data for the session and level group
    df_session = X.query('session_id == @session_id and level_group == @level_group')

    # prepare the sequence matrix (uses the notebook-level `vector_columns`)
    df_sequence_matrix = prepare_sequence_matrix(df_session, vector_columns)

    # set the question number value
    # NOTE(review): presumably `prepare_sequence_matrix` already provides the
    # `question_<n>` columns initialised to 0 -- confirm; if it does not, this
    # assignment appends a new column and the width depends on the question.
    df_sequence_matrix[f'question_{question_number}'] = 1

    # convert it to a numpy array
    return df_sequence_matrix.to_numpy()


# Smoke test: build the array for question 3 of the session held in the
# `session_id` variable defined earlier in the notebook.
vector_array = create_vector_array(X=df_source, session_id=session_id, question_number=3)

# Report the dimensions, the element type and the content of the result.
print('Shape:', vector_array.shape)
print('Data type:', vector_array.dtype)
print(vector_array)

Shape: (172, 340)
Data type: float64
[[-2.09046958 -0.96737152  0.8611484  ...  0.          0.
   0.        ]
 [-2.06548529 -0.95133352  0.8782071  ...  0.          0.
   0.        ]
 [-2.04695149 -0.95133352  0.8782071  ...  0.          0.
   0.        ]
 ...
 [ 1.61601768  0.68144264  0.65643806 ...  0.          0.
   0.        ]
 [ 1.63448334  0.75346571  0.16894744 ...  0.          0.
   0.        ]
 [ 2.08763019  0.08653465  0.24882472 ...  0.          0.
   0.        ]]


### Create Dataset

In [33]:
def create_dataset(X: pd.DataFrame, y: pd.DataFrame, session_list: list) -> Tuple[np.ndarray, np.ndarray]:
    """
    Creates a dataset for a specific set of sessions and question numbers.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset.

    y : pd.DataFrame
        The label dataset. The ``session_id``, ``question_num`` and
        ``correct`` fields are read from it.

    session_list : list
        The list of session ids.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        The dataset and the labels. The dataset is always a 1-D object
        array holding one vector array per labelled (session, question)
        pair; the labels are in the same order.
    """
    X_dataset = []
    y_dataset = []

    for session_id in tqdm(session_list):
        # get the session labels
        df_session_labels = y.query('session_id == @session_id')

        # Filter the main dataset once per session instead of once per
        # question: every question of the session selects from these rows,
        # so the full frame does not have to be scanned again for each one.
        df_session = X.query('session_id == @session_id')

        # iterate over all the questions answered in the session
        # (`leave=False` removes the finished inner bar, so the output does
        # not grow by one line per session)
        for _, row in tqdm(df_session_labels.iterrows(),
                           total=df_session_labels.shape[0],
                           leave=False):
            question_number = row['question_num']
            correct = row['correct']

            # get the vector array
            vector_array = create_vector_array(df_session, session_id, question_number)

            # add the vector array to the dataset
            X_dataset.append(vector_array)

            # add the label to the dataset
            y_dataset.append(correct)

    # Fill a pre-allocated 1-D object array element by element.
    # `np.array(X_dataset, dtype=object)` would instead build an N-D object
    # array whenever all vector arrays happen to share the same shape (for
    # example with a single entry), so the result shape would depend on the
    # data.
    X_array = np.empty(len(X_dataset), dtype=object)
    for position, vector_array in enumerate(X_dataset):
        X_array[position] = vector_array

    return X_array, np.array(y_dataset)

# Smoke test: build the dataset for the first ten sessions of the label table.
session_list = df_source_labels['session_id'].unique()[:10]
X_dataset, y_dataset = create_dataset(
    X=df_source,
    y=df_source_labels,
    session_list=session_list,
)

# Show how many samples and how many labels were produced.
print(f'X dataset: {len(X_dataset)}')
print(f'y dataset: {len(y_dataset)}')

100%|██████████| 18/18 [00:05<00:00,  3.47it/s]
100%|██████████| 18/18 [00:05<00:00,  3.30it/s]
100%|██████████| 18/18 [00:05<00:00,  3.31it/s]
100%|██████████| 18/18 [00:05<00:00,  3.34it/s]
100%|██████████| 18/18 [00:05<00:00,  3.37it/s]
100%|██████████| 18/18 [00:05<00:00,  3.32it/s]
100%|██████████| 18/18 [00:05<00:00,  3.35it/s]
100%|██████████| 18/18 [00:05<00:00,  3.38it/s]
100%|██████████| 18/18 [00:05<00:00,  3.30it/s]
100%|██████████| 18/18 [00:05<00:00,  3.23it/s]
100%|██████████| 10/10 [00:54<00:00,  5.40s/it]

X dataset: 180
y dataset: 180



