# 01. Data Preparation

1. Load the source training set and source training labels.
2. Add columns that will be needed for generating the collections of event sequences per question.
3. Perform basic EDA.

In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

## Load Source Data

In [2]:
# load the source training set
df_source = pd.read_csv('data/train.csv.gz', compression='gzip', index_col=1)

print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(13174211, 19)


Unnamed: 0_level_0,session_id,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,20090312431273200,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [3]:
# load the source training labels
df_source_labels = pd.read_csv('data/train_labels.csv')

print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Handle Duplicate Rows

In [4]:
# show the number of duplicates
print('Number of duplicates in the source training set: {} ({})'.format(df_source.duplicated().sum(), df_source.duplicated().sum() / df_source.shape[0]))
print('Number of duplicates in the source training labels: {}'.format(df_source_labels.index.duplicated().sum()))

Number of duplicates in the source training set: 766 (5.814389947147499e-05)
Number of duplicates in the source training labels: 0


In [5]:
# remove duplicate rows and reset the index
df_unique = df_source \
    .drop_duplicates() \
    .reset_index(drop=True)

print(df_unique.shape)

(13173445, 19)


## Prepare the Main Dataset

In [6]:
# find columns that are empty
df_unique.columns[df_unique.isnull().all()].tolist()

['fullscreen', 'hq', 'music']

In [7]:
# how many rows have missing values in the page column as a percentage of the total number of rows
df_unique['page'].isnull().sum() / df_unique.shape[0]

0.9783850769483609

In [8]:
# how many rows have missing values in the hover_duration column as a percentage of the total number of rows
df_unique['hover_duration'].isnull().sum() / df_unique.shape[0]

0.9240337664141764

In [9]:
# drop the empty columns
empty_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration']
df_no_empty_colums = df_unique.drop(empty_columns, axis=1)

print(df_no_empty_colums.shape)
with pd.option_context('display.max_columns', None):
    display(df_no_empty_colums.head(3))

(13173445, 14)


Unnamed: 0,session_id,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,text,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,cutscene_click,basic,0,-413.991405,-159.314686,380.0,494.0,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1323,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,831,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


In [10]:
# drop the text column
df_core = df_no_empty_colums.drop('text', axis=1)
df_core.dtypes

session_id         int64
elapsed_time       int64
event_name        object
name              object
level              int64
room_coor_x      float64
room_coor_y      float64
screen_coor_x    float64
screen_coor_y    float64
fqid              object
room_fqid         object
text_fqid         object
level_group       object
dtype: object

In [11]:
# group the data by session_id and level_group
for name, group in df_core.groupby(['session_id', 'level_group']):
    display(group.info())
    break

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165 entries, 0 to 164
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   session_id     165 non-null    int64  
 1   elapsed_time   165 non-null    int64  
 2   event_name     165 non-null    object 
 3   name           165 non-null    object 
 4   level          165 non-null    int64  
 5   room_coor_x    156 non-null    float64
 6   room_coor_y    156 non-null    float64
 7   screen_coor_x  156 non-null    float64
 8   screen_coor_y  156 non-null    float64
 9   fqid           104 non-null    object 
 10  room_fqid      165 non-null    object 
 11  text_fqid      62 non-null     object 
 12  level_group    165 non-null    object 
dtypes: float64(4), int64(3), object(6)
memory usage: 18.0+ KB


None

## Prepare the Label Dataset

Additional columns need to be added to the labeling data set to allow matching to a collection of events.

1. session_id split from question
2. question number as an integer
3. the leve_group the question belongs to


In [12]:
# add the columns to determine the level group
df_augmented_labels = df_source_labels \
    .rename(columns={'session_id': 'id'}) \
    .assign(session_id=lambda df: df['id'].str.split('_').str[0]) \
    .assign(question_id=lambda df: df['id'].str.split('_').str[1]) \
    .assign(question_num=lambda df: df['question_id'].str[1:].astype(int)) \
    [['session_id', 'question_num', 'correct']]


with pd.option_context('display.max_columns', None):
    display(df_augmented_labels.head(3))

Unnamed: 0,session_id,question_num,correct
0,20090312431273200,1,1
1,20090312433251036,1,0
2,20090314121766812,1,1


In [13]:
def map_question_to_level_group(question_number):
        if question_number in [1, 2, 3]:
            return '0-4'
        elif question_number in [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
            return '5-12'
        elif question_number in [14, 15, 16, 17, 18]:
            return '13-22'
        else:
            return None
        
# add the level group column
df_augmented_labels['level_group'] = df_augmented_labels['question_num'].apply(map_question_to_level_group)

with pd.option_context('display.max_columns', None):
    display(df_augmented_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
21476,22010116250792520,2,1,0-4
84068,21000111433937450,8,1,5-12
171219,21040510125933256,15,0,13-22


### Get an idea how many questions gets answered correctly on average

In [14]:
df_augmented_labels \
    .groupby(['session_id', 'level_group']) \
    .agg({'correct': ['sum']}) \
    .reset_index() \
    .drop(columns=['session_id']) \
    .droplevel(1, axis=1) \
    .groupby('level_group') \
    .agg({'correct': ['min', 'max', 'mean']}) \
    .droplevel(0, axis=1) \
    .assign(correct_perc=lambda df: df['mean'] / df['max']) \


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0_level_0,min,max,mean,correct_perc
level_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-4,0,3,2.634944,0.878315
13-22,0,5,3.566771,0.713354
5-12,0,10,6.46931,0.646931


In [15]:
# how about we just classify everything as correct, what is the f1 score?
print(classification_report(df_augmented_labels['correct'], np.ones(df_augmented_labels.shape[0]).astype(int), zero_division=1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00     62770
           1       0.70      1.00      0.83    149252

    accuracy                           0.70    212022
   macro avg       0.85      0.50      0.41    212022
weighted avg       0.79      0.70      0.58    212022



#### Let's try to cheat by creating "predictor" that uses the percentage of correct answers per question

In [16]:
# for each question, how many times was it answered correctly?
df_question_perf = df_augmented_labels \
    .groupby(['question_num']) \
    .agg({'correct': ['count', 'sum']}) \
    .droplevel(0, axis=1) \
    .assign(correct_perc=lambda df: df['sum'] / df['count']) \
    
df_question_perf

Unnamed: 0_level_0,count,sum,correct_perc
question_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11779,8528,0.724
2,11779,11529,0.978776
3,11779,10980,0.932167
4,11779,9415,0.799304
5,11779,6436,0.546396
6,11779,9094,0.772052
7,11779,8590,0.729264
8,11779,7236,0.614314
9,11779,8663,0.735461
10,11779,5894,0.500382


In [17]:
def predict_cheater(x : list, rations: list):
    y_pred = []
    np.random.seed(1230)

    for question_num in tqdm(x):
        y_pred.append(np.random.binomial(1, rations[question_num - 1]))

    return y_pred
        
y_pred = predict_cheater(
    x=df_augmented_labels.question_num.values,
    rations=df_question_perf.correct_perc.values
)

# show the classification report
print(classification_report(df_augmented_labels['correct'], y_pred))

  0%|          | 0/212022 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.40      0.40      0.40     62770
           1       0.75      0.75      0.75    149252

    accuracy                           0.64    212022
   macro avg       0.57      0.57      0.57    212022
weighted avg       0.64      0.64      0.64    212022



## Putting it all together

### Prepare the Label Dataset

In [18]:
def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the label dataset.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The prepared label dataset.
    """
    # add the columns to determine the level group
    df_labels = data \
        .rename(columns={'session_id': 'id'}) \
        .assign(session_id=lambda df: df['id'].str.split('_').str[0]) \
        .assign(question_id=lambda df: df['id'].str.split('_').str[1]) \
        .assign(question_num=lambda df: df['question_id'].str[1:].astype(int)) \
        [['session_id', 'question_num', 'correct']]
    
    # add the level group column
    def map_question_to_level_group(question_number):
            if question_number in [1, 2, 3]:
                return '0-4'
            elif question_number in [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
                return '5-12'
            elif question_number in [14, 15, 16, 17, 18]:
                return '13-22'
            else:
                return None

    # add the level group column
    df_labels['level_group'] = df_labels['question_num'].apply(map_question_to_level_group) 

        
    return df_labels

# prepare the label dataset
df_labels = prepare_label_dataset(df_source_labels)

with pd.option_context('display.max_columns', None):
    display(df_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
21476,22010116250792520,2,1,0-4
84068,21000111433937450,8,1,5-12
171219,21040510125933256,15,0,13-22


### Perpare the Main Dataset

In [19]:
def prepare_main_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the main dataset.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    empty_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration']

    df_main = data \
        .drop_duplicates() \
        .reset_index(drop=True) \
        .drop(empty_columns, axis=1) \
        .drop('text', axis=1)

    return df_main     

# prepare the main dataset
df_main = prepare_main_dataset(df_source)

with pd.option_context('display.max_columns', None):
    print(df_main.shape)
    display(df_main.head(3))    

(13173445, 13)


Unnamed: 0,session_id,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,cutscene_click,basic,0,-413.991405,-159.314686,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1323,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,831,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


#### -- DETOUR --

We need to do the encoding consistently, and we cannot focus on just a subset of data to come up with the categories.

In [21]:
# vectorize the dataset
def vectorize_dataset(data: pd.DataFrame, standardize_coordinates: bool=True) -> pd.DataFrame:
    """
    Vectorizes the dataset for deep learning.
    
    Parameters
    ----------
    data : pd.DataFrame
        The dataset to prepare.

    Returns
    -------
    pd.DataFrame
        The vectorized dataset.
    """
    categorical_cols = ['event_name', 'name', 'level', 'fqid', 'room_fqid', 'text_fqid', 'level_group']
    numerical_cols = ['elapsed_time']    
    coordinates_cols = ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y']

    df_vectorized = data \
        .drop('session_id', axis=1) \
        .fillna(0)

    # standardize the numerical variables
    df_vectorized[numerical_cols] = (df_vectorized[numerical_cols] - df_vectorized[numerical_cols].mean()) / df_vectorized[numerical_cols].std()

    # standardize the coordinates
    if standardize_coordinates:
        df_vectorized[coordinates_cols] = (df_vectorized[coordinates_cols] - df_vectorized[coordinates_cols].mean()) / df_vectorized[coordinates_cols].std()

    # one-hot encode the categorical variables
    df_vectorized = pd.get_dummies(df_vectorized, columns=categorical_cols)
    
    return df_vectorized

# df_vectorized = vectorize_dataset(
#     df_main.query('session_id == 20090312431273200 and level_group == "0-4"'))

df_vectorized = vectorize_dataset(df_main)

with pd.option_context('display.max_columns', None):
    print(df_vectorized.shape)
    display(df_vectorized.head(3))  

(13173445, 322)


Unnamed: 0,elapsed_time,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,event_name_checkpoint,event_name_cutscene_click,event_name_map_click,event_name_map_hover,event_name_navigate_click,event_name_notebook_click,event_name_notification_click,event_name_object_click,event_name_object_hover,event_name_observation_click,event_name_person_click,name_basic,name_close,name_next,name_open,name_prev,name_undefined,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,level_11,level_12,level_13,level_14,level_15,level_16,level_17,level_18,level_19,level_20,level_21,level_22,fqid_0,fqid_archivist,fqid_archivist_glasses,fqid_block,fqid_block_0,fqid_block_1,fqid_block_badge,fqid_block_badge_2,fqid_block_magnify,fqid_block_nelson,fqid_block_tocollection,fqid_block_tomap1,fqid_block_tomap2,fqid_boss,fqid_businesscards,fqid_businesscards.card_0.next,fqid_businesscards.card_1.next,fqid_businesscards.card_bingo.bingo,fqid_businesscards.card_bingo.next,fqid_ch3start,fqid_chap1_finale,fqid_chap1_finale_c,fqid_chap2_finale_c,fqid_chap4_finale_c,fqid_coffee,fqid_colorbook,fqid_confrontation,fqid_crane_ranger,fqid_cs,fqid_directory,fqid_directory.closeup.archivist,fqid_door_block_clean,fqid_door_block_talk,fqid_doorblock,fqid_expert,fqid_flag_girl,fqid_fox,fqid_glasses,fqid_gramps,fqid_groupconvo,fqid_groupconvo_flag,fqid_intro,fqid_janitor,fqid_journals,fqid_journals.hub.topics,fqid_journals.pic_0.next,fqid_journals.pic_1.next,fqid_journals.pic_2.bingo,fqid_journals.pic_2.next,fqid_journals_flag,fqid_journals_flag.hub.topics,fqid_journals_flag.hub.topics_old,fqid_journals_flag.pic_0.bingo,fqid_journals_flag.pic_0.next,fqid_journals_flag.pic_0_old.next,fqid_journals_flag.pic_1.bingo,fqid_journals_flag.pic_1.next,fqid_journals_flag.pic_1_old.next,fqid_journals_flag.pic_2.bingo,fqid_journals_flag.pic_2.next,fqid_journals_flag.pic_2_old.next,fqid_key,fqid_lockeddoor,fqid_logbook,fqid_logbook.page.bingo,fqid_magnify,fqid_need_glasses,fqid_notebook,fqid_outtolunch,fqid_photo,fqid_plaque,fqid_plaque.face.date,fqid_reader,fqid_reader.paper0.next,fqid_reader.paper0.prev,fqid_reader.paper1.next,fqid_reader.paper1.prev,fqid_reader.paper2.bingo,fqid_reader.paper2.next,fqid_reader.paper2.prev,fqid_reader_flag,fqid_reader_flag.paper0.next,fqid_reader_flag.paper0.prev,fqid_reader_flag.paper1.next,fqid_reader_flag.paper1.prev,fqid_reader_flag.paper2.bingo,fqid_reader_flag.paper2.next,fqid_reader_flag.paper2.prev,fqid_remove_cup,fqid_report,fqid_retirement_letter,fqid_savedteddy,fqid_seescratches,fqid_teddy,fqid_tobasement,fqid_tocage,fqid_tocloset,fqid_tocloset_dirty,fqid_tocollection,fqid_tocollectionflag,fqid_toentry,fqid_tofrontdesk,fqid_togrampa,fqid_tohallway,fqid_tomap,fqid_tomicrofiche,fqid_tostacks,fqid_tracks,fqid_tracks.hub.deer,fqid_trigger_coffee,fqid_trigger_scarf,fqid_tunic,fqid_tunic.capitol_0,fqid_tunic.capitol_1,fqid_tunic.capitol_2,fqid_tunic.drycleaner,fqid_tunic.flaghouse,fqid_tunic.historicalsociety,fqid_tunic.hub.slip,fqid_tunic.humanecology,fqid_tunic.kohlcenter,fqid_tunic.library,fqid_tunic.wildlife,fqid_unlockdoor,fqid_wells,fqid_wellsbadge,fqid_what_happened,fqid_worker,room_fqid_tunic.capitol_0.hall,room_fqid_tunic.capitol_1.hall,room_fqid_tunic.capitol_2.hall,room_fqid_tunic.drycleaner.frontdesk,room_fqid_tunic.flaghouse.entry,room_fqid_tunic.historicalsociety.basement,room_fqid_tunic.historicalsociety.cage,room_fqid_tunic.historicalsociety.closet,room_fqid_tunic.historicalsociety.closet_dirty,room_fqid_tunic.historicalsociety.collection,room_fqid_tunic.historicalsociety.collection_flag,room_fqid_tunic.historicalsociety.entry,room_fqid_tunic.historicalsociety.frontdesk,room_fqid_tunic.historicalsociety.stacks,room_fqid_tunic.humanecology.frontdesk,room_fqid_tunic.kohlcenter.halloffame,room_fqid_tunic.library.frontdesk,room_fqid_tunic.library.microfiche,room_fqid_tunic.wildlife.center,text_fqid_0,text_fqid_tunic.capitol_0.hall.boss.talktogramps,text_fqid_tunic.capitol_0.hall.chap1_finale_c,text_fqid_tunic.capitol_1.hall.boss.haveyougotit,text_fqid_tunic.capitol_1.hall.boss.writeitup,text_fqid_tunic.capitol_1.hall.chap2_finale_c,text_fqid_tunic.capitol_2.hall.boss.haveyougotit,text_fqid_tunic.capitol_2.hall.chap4_finale_c,text_fqid_tunic.drycleaner.frontdesk.block_0,text_fqid_tunic.drycleaner.frontdesk.block_1,text_fqid_tunic.drycleaner.frontdesk.logbook.page.bingo,text_fqid_tunic.drycleaner.frontdesk.worker.done,text_fqid_tunic.drycleaner.frontdesk.worker.done2,text_fqid_tunic.drycleaner.frontdesk.worker.hub,text_fqid_tunic.drycleaner.frontdesk.worker.takealook,text_fqid_tunic.flaghouse.entry.colorbook,text_fqid_tunic.flaghouse.entry.flag_girl.hello,text_fqid_tunic.flaghouse.entry.flag_girl.hello_recap,text_fqid_tunic.flaghouse.entry.flag_girl.symbol,text_fqid_tunic.flaghouse.entry.flag_girl.symbol_recap,text_fqid_tunic.historicalsociety.basement.ch3start,text_fqid_tunic.historicalsociety.basement.gramps.seeyalater,text_fqid_tunic.historicalsociety.basement.gramps.whatdo,text_fqid_tunic.historicalsociety.basement.janitor,text_fqid_tunic.historicalsociety.basement.savedteddy,text_fqid_tunic.historicalsociety.basement.seescratches,text_fqid_tunic.historicalsociety.cage.confrontation,text_fqid_tunic.historicalsociety.cage.glasses.afterteddy,text_fqid_tunic.historicalsociety.cage.glasses.beforeteddy,text_fqid_tunic.historicalsociety.cage.lockeddoor,text_fqid_tunic.historicalsociety.cage.need_glasses,text_fqid_tunic.historicalsociety.cage.teddy.trapped,text_fqid_tunic.historicalsociety.cage.unlockdoor,text_fqid_tunic.historicalsociety.closet.doorblock,text_fqid_tunic.historicalsociety.closet.gramps.intro_0_cs_0,text_fqid_tunic.historicalsociety.closet.intro,text_fqid_tunic.historicalsociety.closet.notebook,text_fqid_tunic.historicalsociety.closet.photo,text_fqid_tunic.historicalsociety.closet.retirement_letter.hub,text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_0,text_fqid_tunic.historicalsociety.closet.teddy.intro_0_cs_5,text_fqid_tunic.historicalsociety.closet_dirty.door_block_clean,text_fqid_tunic.historicalsociety.closet_dirty.door_block_talk,text_fqid_tunic.historicalsociety.closet_dirty.gramps.archivist,text_fqid_tunic.historicalsociety.closet_dirty.gramps.helpclean,text_fqid_tunic.historicalsociety.closet_dirty.gramps.news,text_fqid_tunic.historicalsociety.closet_dirty.gramps.nothing,text_fqid_tunic.historicalsociety.closet_dirty.photo,text_fqid_tunic.historicalsociety.closet_dirty.trigger_coffee,text_fqid_tunic.historicalsociety.closet_dirty.trigger_scarf,text_fqid_tunic.historicalsociety.closet_dirty.what_happened,text_fqid_tunic.historicalsociety.collection.cs,text_fqid_tunic.historicalsociety.collection.gramps.found,text_fqid_tunic.historicalsociety.collection.gramps.look_0,text_fqid_tunic.historicalsociety.collection.gramps.lost,text_fqid_tunic.historicalsociety.collection.tunic,text_fqid_tunic.historicalsociety.collection.tunic.slip,text_fqid_tunic.historicalsociety.collection_flag.gramps.flag,text_fqid_tunic.historicalsociety.collection_flag.gramps.recap,text_fqid_tunic.historicalsociety.entry.block_tocollection,text_fqid_tunic.historicalsociety.entry.block_tomap1,text_fqid_tunic.historicalsociety.entry.block_tomap2,text_fqid_tunic.historicalsociety.entry.boss.flag,text_fqid_tunic.historicalsociety.entry.boss.flag_recap,text_fqid_tunic.historicalsociety.entry.boss.talktogramps,text_fqid_tunic.historicalsociety.entry.directory.closeup.archivist,text_fqid_tunic.historicalsociety.entry.gramps.hub,text_fqid_tunic.historicalsociety.entry.groupconvo,text_fqid_tunic.historicalsociety.entry.groupconvo_flag,text_fqid_tunic.historicalsociety.entry.wells.flag,text_fqid_tunic.historicalsociety.entry.wells.flag_recap,text_fqid_tunic.historicalsociety.entry.wells.talktogramps,text_fqid_tunic.historicalsociety.frontdesk.archivist.foundtheodora,text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass,text_fqid_tunic.historicalsociety.frontdesk.archivist.have_glass_recap,text_fqid_tunic.historicalsociety.frontdesk.archivist.hello,text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_0,text_fqid_tunic.historicalsociety.frontdesk.archivist.need_glass_1,text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper,text_fqid_tunic.historicalsociety.frontdesk.archivist.newspaper_recap,text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation,text_fqid_tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap,text_fqid_tunic.historicalsociety.frontdesk.block_magnify,text_fqid_tunic.historicalsociety.frontdesk.key,text_fqid_tunic.historicalsociety.frontdesk.magnify,text_fqid_tunic.historicalsociety.stacks.block,text_fqid_tunic.historicalsociety.stacks.journals.pic_2.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_0.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_1.bingo,text_fqid_tunic.historicalsociety.stacks.journals_flag.pic_2.bingo,text_fqid_tunic.historicalsociety.stacks.outtolunch,text_fqid_tunic.humanecology.frontdesk.block_0,text_fqid_tunic.humanecology.frontdesk.block_1,text_fqid_tunic.humanecology.frontdesk.businesscards.card_bingo.bingo,text_fqid_tunic.humanecology.frontdesk.worker.badger,text_fqid_tunic.humanecology.frontdesk.worker.intro,text_fqid_tunic.kohlcenter.halloffame.block_0,text_fqid_tunic.kohlcenter.halloffame.plaque.face.date,text_fqid_tunic.kohlcenter.halloffame.togrampa,text_fqid_tunic.library.frontdesk.block_badge,text_fqid_tunic.library.frontdesk.block_badge_2,text_fqid_tunic.library.frontdesk.block_nelson,text_fqid_tunic.library.frontdesk.wellsbadge.hub,text_fqid_tunic.library.frontdesk.worker.droppedbadge,text_fqid_tunic.library.frontdesk.worker.flag,text_fqid_tunic.library.frontdesk.worker.flag_recap,text_fqid_tunic.library.frontdesk.worker.hello,text_fqid_tunic.library.frontdesk.worker.hello_short,text_fqid_tunic.library.frontdesk.worker.nelson,text_fqid_tunic.library.frontdesk.worker.nelson_recap,text_fqid_tunic.library.frontdesk.worker.preflag,text_fqid_tunic.library.frontdesk.worker.wells,text_fqid_tunic.library.frontdesk.worker.wells_recap,text_fqid_tunic.library.microfiche.block_0,text_fqid_tunic.library.microfiche.reader.paper2.bingo,text_fqid_tunic.library.microfiche.reader_flag.paper2.bingo,text_fqid_tunic.wildlife.center.coffee,text_fqid_tunic.wildlife.center.crane_ranger.crane,text_fqid_tunic.wildlife.center.expert.recap,text_fqid_tunic.wildlife.center.expert.removed_cup,text_fqid_tunic.wildlife.center.fox.concern,text_fqid_tunic.wildlife.center.remove_cup,text_fqid_tunic.wildlife.center.tracks.hub.deer,text_fqid_tunic.wildlife.center.wells.animals,text_fqid_tunic.wildlife.center.wells.animals2,text_fqid_tunic.wildlife.center.wells.nodeer,text_fqid_tunic.wildlife.center.wells.nodeer_recap,level_group_0-4,level_group_13-22,level_group_5-12
0,-0.142403,-0.727483,-0.245883,-0.157351,0.859873,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,-0.142354,-0.727483,-0.245883,-0.157351,0.859873,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,-0.142372,-0.727483,-0.245883,-0.157351,0.859873,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [26]:
for column in df_vectorized.columns:
    print(column)

elapsed_time
room_coor_x
room_coor_y
screen_coor_x
screen_coor_y
event_name_checkpoint
event_name_cutscene_click
event_name_map_click
event_name_map_hover
event_name_navigate_click
event_name_notebook_click
event_name_notification_click
event_name_object_click
event_name_object_hover
event_name_observation_click
event_name_person_click
name_basic
name_close
name_next
name_open
name_prev
name_undefined
level_0
level_1
level_2
level_3
level_4
level_5
level_6
level_7
level_8
level_9
level_10
level_11
level_12
level_13
level_14
level_15
level_16
level_17
level_18
level_19
level_20
level_21
level_22
fqid_0
fqid_archivist
fqid_archivist_glasses
fqid_block
fqid_block_0
fqid_block_1
fqid_block_badge
fqid_block_badge_2
fqid_block_magnify
fqid_block_nelson
fqid_block_tocollection
fqid_block_tomap1
fqid_block_tomap2
fqid_boss
fqid_businesscards
fqid_businesscards.card_0.next
fqid_businesscards.card_1.next
fqid_businesscards.card_bingo.bingo
fqid_businesscards.card_bingo.next
fqid_ch3start
fqid_ch

#### -- END DETOUR --