In [1]:
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

## Loading files

In [2]:
df = pd.read_csv(os.path.join("../dataset/train.csv"))

df.head()

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE
1,1011,0,9,amihretu,move the :BOTTLE left
2,1012,0,9,amihretu,put the :BOTTLE to the right of :MUG
3,1013,0,9,amihretu,shift the :CUP backwards
4,1015,0,9,amihretu,shift the :BOTTLE forwards


In [3]:
with open(os.path.join("../dataset/motor.txt"), "r") as f: 
    motor_cmds = f.readlines()
    
motor_cmds[:5]

["0@  :BOTTLE BLUE POSE-4 :BOTTLE  #'*leftward-transformation*  :BOTTLE\n",
 "1@  :BOTTLE BLUE POSE-6 :CUP BLUE POSE-1 :BOTTLE  #'*forward-transformation*  :CUP\n",
 "2@  :BOTTLE GREEN POSE-7 :WEISSWURST GREEN POSE-2 :BOTTLE  #'*backward-transformation*  :WEISSWURST\n",
 '3@  :CUP GREEN POSE-8 :MUG BLUE POSE-3 :CUP NIL :MUG\n',
 "4@  :BOTTLE BLUE POSE-7 :MUG BLUE POSE-5 :BOTTLE  #'*rightward-transformation*  :MUG\n"]

## Filter cmds

In [66]:
cmd_df = pd.DataFrame({
    "sample_ID": [int(cmd.split("@")[0]) for cmd in motor_cmds],
    "motor_cmd": [cmd.split("@")[1].strip() for cmd in motor_cmds]
}).reset_index(drop=True)

filtered_cmds = cmd_df.loc[df.sample_ID.tolist()]

filtered_cmds

Unnamed: 0,sample_ID,motor_cmd
1005,1005,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...
1011,1011,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...
1012,1012,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...
1013,1013,:CUP RED POSE-4 :CUP #'*backward-transformati...
1015,1015,:BOTTLE GREEN POSE-3 :BOTTLE #'*forward-trans...
...,...,...
1005,1005,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...
1011,1011,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...
1012,1012,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...
1013,1013,:CUP RED POSE-4 :CUP #'*backward-transformati...


In [67]:
df.shape, filtered_cmds.shape

((1984, 5), (1984, 2))

## Map cmds with their corresponding sample index

In [68]:
final_df = pd.merge(left=df, right=filtered_cmds, on="sample_ID", how='outer').drop_duplicates(subset=["sample_ID"])

final_df

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...
4,1011,0,9,amihretu,move the :BOTTLE left,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...
8,1012,0,9,amihretu,put the :BOTTLE to the right of :MUG,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...
12,1013,0,9,amihretu,shift the :CUP backwards,:CUP RED POSE-4 :CUP #'*backward-transformati...
16,1015,0,9,amihretu,shift the :BOTTLE forwards,:BOTTLE GREEN POSE-3 :BOTTLE #'*forward-trans...
...,...,...,...,...,...,...
2003,990,0,9,dmusingu,put the :CUP to the right of :BREAKFAST-CEREAL,:CUP BLUE POSE-10 :BREAKFAST-CEREAL BLUE POSE-...
2004,993,0,9,dmusingu,put the :CUP in front of :GLASSES,:CUP RED POSE-9 :GLASSES RED POSE-3 :CUP #'*f...
2005,994,0,9,dmusingu,move the :CUP forwards,:CUP RED POSE-1 :CUP #'*forward-transformatio...
2006,995,0,9,dmusingu,shift the :BOTTLE left,:BOTTLE GREEN POSE-3 :BOTTLE #'*leftward-tran...


## Save updated training dataset

In [69]:
final_df.to_csv(path_or_buf=os.path.join("../dataset/updated_train.csv"), index=False)

## Load consolidated dataset

In [8]:
final_df = pd.read_csv("../../dataset/updated_train.csv")
# final_df["len_action_desc"] = final_df["action_description"].apply(lambda ad: len(ad))
# final_df["len_motor_cmd"] = final_df["motor_cmd"].apply(lambda mc: len(mc))

final_df.head()

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...,38,84
1,1011,0,9,amihretu,move the :BOTTLE left,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...,21,65
2,1012,0,9,amihretu,put the :BOTTLE to the right of :MUG,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...,36,79
3,1013,0,9,amihretu,shift the :CUP backwards,:CUP RED POSE-4 :CUP #'*backward-transformati...,24,55
4,1015,0,9,amihretu,shift the :BOTTLE forwards,:BOTTLE GREEN POSE-3 :BOTTLE #'*forward-trans...,26,65


In [9]:
final_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sample_ID,1972.0,2082.621197,1157.723272,5.0,1098.75,2128.0,3087.5,4025.0
in_state,1972.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
goal_state,1972.0,8.994422,0.074497,8.0,9.0,9.0,9.0,9.0
len_action_desc,1972.0,29.116126,7.594706,18.0,23.0,27.0,35.0,49.0
len_motor_cmd,1972.0,73.448276,13.019162,51.0,63.0,72.0,82.0,108.0


In [7]:
# final_df.to_csv(path_or_buf=os.path.join("../../dataset/updated_train.csv"), index=False)

## Tokenization
Adapted from: [https://huggingface.co](https://huggingface.co/learn/nlp-course/chapter6/8?fw=pt#building-a-wordpiece-tokenizer-from-scratch)

In [35]:
# ! pip install tokenizers

In [10]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


In [11]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(), 
    # normalizers.Lowercase(), 
    normalizers.StripAccents()
])
tokenizer

<tokenizers.Tokenizer at 0x55fa2e79d810>

In [12]:
tokenizer.normalizer.normalize_str("move the :BOWL left")

'move the :BOWL left'

In [13]:
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [14]:
tokenizer.pre_tokenizer.pre_tokenize_str("move the :SPOON forward")


[('move', (0, 4)), ('the', (5, 8)), (':SPOON', (9, 15)), ('forward', (16, 23))]

### Create corpus

In [84]:
# with open("../dataset/jeps_corpus.txt", "w", encoding="utf-8") as f:
#     for i in tqdm(range(final_df.shape[0])):
#         ad = final_df.iloc[i].action_description
#         cmd = final_df.iloc[i].motor_cmd
#         f.write(ad+ "\n")    
#         f.write(cmd+ "\n")    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1972/1972 [00:00<00:00, 5244.28it/s]


In [46]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

trainer = trainers.WordPieceTrainer(
    vocab_size=300, 
    special_tokens=special_tokens
)

In [47]:
tokenizer.train(files=["../../dataset/jeps_corpus.txt"], trainer=trainer)






#### Test tokenizer

##### Encoding

In [48]:
encoding = tokenizer.encode(":CUP RED POSE-8 :MONDAMIN BLUE POSE-2 :CUP  #'*forward-transformation*  :MONDAMIN")
print(encoding.tokens)

['[CLS]', ':CUP', 'RED', 'POSE-8', ':MONDAMIN', 'BLUE', 'POSE-2', ':CUP', "#'*forward-transformation*", ':MONDAMIN', '[SEP]']


In [49]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [50]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [51]:
encoding = tokenizer.encode("put the :CUP to the left of :GLOVE")
print(encoding.tokens)
print(encoding.type_ids)


['[CLS]', 'put', 'the', ':CUP', 'to', 'the', 'left', 'of', ':GLOVE', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


##### Decoding

In [52]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [53]:
tokenizer.decode(encoding.ids)


'put the :CUP to the left of :GLOVE'

## Save tokenizer

In [54]:
tokenizer.save("../../dataset/jeps_tokenizer.json")

## Load and use trained tokenizer

In [55]:
tok = Tokenizer.from_file("../../dataset/jeps_tokenizer.json")

In [56]:
tok

<tokenizers.Tokenizer at 0x55fa336640d0>

In [57]:
enc = tok.encode(
    sequence=":BOTTLE GREEN POSE-3 :BOTTLE  #'*forward-transformation*  :BOTTLE",)

print(enc)
print(enc.tokens)
print(enc.ids)


Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', ':BOTTLE', 'GREEN', 'POSE-3', ':BOTTLE', "#'*forward-transformation*", ':BOTTLE', '[SEP]']
[2, 138, 171, 228, 138, 184, 138, 3]


In [58]:
tokenizer.decode(enc.ids)


":BOTTLE GREEN POSE-3 :BOTTLE #'*forward-transformation* :BOTTLE"

In [59]:
# !pip install transformers

In [60]:
from transformers import PreTrainedTokenizerFast

In [61]:
tt = PreTrainedTokenizerFast(
    tokenizer_file="../../dataset/jeps_tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [62]:
enc = tt(
    text=":BOTTLE GREEN POSE-3 :BOTTLE  #'*forward-transformation*  :BOTTLE",
    padding="max_length", 
    truncation=False, 
    max_length=128,
    return_tensors="pt"
)


In [63]:
enc

{'input_ids': tensor([[  2, 138, 171, 228, 138, 184, 138,   3,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0