In [66]:
import numpy as np
import os
import re

from collections import defaultdict

In [7]:
# Select the experiment directory; the leading slash matters because
# dataset paths are assembled by plain string concatenation.
exp = '/exp_1a'

In [80]:
def s2i(sents:list, w2i:dict):
    """
    Convert tokenised sentences into arrays of vocabulary indices.

    Args:
        sents (list): sentences, each one a sequence of word strings
        w2i (dict): word-to-index mapping
    Returns:
        np.ndarray: 1-D array of dtype object with one entry per sentence;
            every entry is a 1-D integer array with that sentence's indices
    Raises:
        KeyError: if a word is missing from w2i and w2i has no '<UNK>' entry
    """
    # Out-of-vocabulary words map to the '<UNK>' index when the mapping has one.
    unk = w2i.get('<UNK>')

    def lookup(word):
        idx = w2i.get(word, unk)
        if idx is None:
            raise KeyError(word)
        return idx

    encoded = [np.array([lookup(w) for w in sent], dtype=int) for sent in sents]

    # Fill a pre-allocated object array slot by slot. Handing a list of
    # variable-length arrays straight to np.array() is rejected by recent
    # NumPy releases, and for equal-length sentences it would return a 2-D
    # integer matrix instead, making the result shape depend on the data.
    all_sents = np.empty(len(encoded), dtype=object)
    for n, indices in enumerate(encoded):
        all_sents[n] = indices
    return all_sents

In [81]:
def sort_dict(some_dict:dict):
    """
    Return a new dict with the items of some_dict ordered by value, largest first.

    Items with equal values keep their original relative order.
    """
    ranked = list(some_dict.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [82]:
def w2i(vocab:dict):
    """
    Build word-to-index and index-to-word lookup tables for a vocabulary.

    Indices 0, 1 and 2 are reserved for '<SOS>', '<EOS>' and '<UNK>'; the
    words of vocab follow in iteration order with consecutive indices.

    Args:
        vocab (dict): mapping whose keys are the vocabulary words
    Returns:
        tuple: (word2idx, idx2word), where idx2word is the exact inverse
            of word2idx
    """
    word2idx = {'<SOS>': 0, '<EOS>': 1, '<UNK>': 2}
    for w in vocab:
        # setdefault leaves a reserved token's index alone if the same string
        # also occurs in vocab, and len() keeps the index range gap-free.
        word2idx.setdefault(w, len(word2idx))
    # Invert the finished mapping rather than enumerating its keys, so both
    # tables agree by construction.
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word

In [83]:
def load_dataset(exp:str, split:str, subdir:str='./data'):
    """
    Load one split of an experiment and build its vocabularies.

    The data file is looked up in the directory subdir + exp + split (plain
    string concatenation, so exp and split carry their own leading slash).
    If the directory holds several files, the one whose name sorts last is
    used. Every non-blank line must contain an 'IN:' marker followed by the
    command words and an 'OUT:' marker followed by the action words.

    Args:
        exp (str): experiment directory, e.g. '/exp_1a'
        split (str): train or test dataset directory, e.g. '/train'
        subdir (str): root data directory
    Returns:
        cmd_vocab (dict): command word2freq dictionary, most frequent first
        w2i_cmds (dict): command word-to-index mapping
        i2w_cmds (dict): command index-to-word mapping
        cmds (list): tokenised commands, one list of words per line
        act_vocab (dict): action word2freq dictionary, most frequent first
        w2i_acts (dict): action word-to-index mapping
        i2w_acts (dict): action index-to-word mapping
        acts (list): tokenised actions, one list of words per line
    Raises:
        FileNotFoundError: if the split directory contains no regular file
        ValueError: if a non-blank line lacks the 'IN:' or 'OUT:' marker
    """
    data_dir = subdir+exp+split
    # os.listdir() returns entries in arbitrary order and also lists
    # sub-directories, so keep regular files only and sort them to make the
    # choice of data file deterministic.
    candidates = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isfile(data_dir+'/'+name)
    )
    if not candidates:
        raise FileNotFoundError('no data file found in ' + data_dir)
    file = data_dir+'/'+candidates[-1]

    cmd_start = 'IN:'
    act_start = 'OUT:'
    cmds, acts = [], []
    cmd_vocab, act_vocab = defaultdict(int), defaultdict(int)

    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            cmd_pos = line.index(cmd_start)
            act_pos = line.index(act_start)
            # The command sits between the two markers, the action after 'OUT:'.
            cmd = line[cmd_pos+len(cmd_start):act_pos].split()
            act = line[act_pos+len(act_start):].split()
            for w in cmd: cmd_vocab[w] += 1
            for w in act: act_vocab[w] += 1
            cmds.append(cmd)
            acts.append(act)

    # order both vocabularies by descending frequency
    cmd_vocab = sort_dict(cmd_vocab)
    act_vocab = sort_dict(act_vocab)
    # create w2i and i2w mappings
    w2i_cmds, i2w_cmds = w2i(cmd_vocab)
    w2i_acts, i2w_acts = w2i(act_vocab)
    return cmd_vocab, w2i_cmds, i2w_cmds, cmds, act_vocab, w2i_acts, i2w_acts, acts

In [84]:
# Load the training split of the experiment chosen via `exp`, so that changing
# that one setting actually changes which data is loaded.
cmd_vocab, w2i_cmds, i2w_cmds, cmds, act_vocab, w2i_acts, i2w_acts, acts = load_dataset(exp=exp, split='/train')

In [71]:
# Encode the tokenised training commands and actions as index arrays.
train_commands, train_actions = s2i(cmds, w2i_cmds), s2i(acts, w2i_acts)

In [72]:
# Display the encoded training commands (one index array per sentence).
train_commands

array([array([ 9,  5,  1,  3,  8, 13,  5,  1,  4]),
       array([11,  5,  2,  7, 12,  1]), array([12,  7, 11,  6,  1,  3]),
       ..., array([11,  1,  4,  8, 13,  6,  1,  3]),
       array([11,  5,  1,  4,  8, 12,  3]),
       array([10,  5,  1,  3,  7,  9,  6,  1,  3])], dtype=object)