# Colorful Extended Cleanup World (CECW)
The Colorful Extended Cleanup World (CECW) dataset is a color-extended version of Cleanup World (CW), which is borrowed from the mobile-manipulation robot domain [(MacGlashan et al., 2015)](http://cs.brown.edu/~jmacglashan/pubpdfs/rss_commands.pdf). CW is a world containing a movable object and four rooms in four colors ("blue," "green," "red," and "yellow"); it is designed as a simulation environment in which an agent acts on the instructions it receives [(Gopalan et al., 2018)](http://roboticsproceedings.org/rss14/p67.pdf).

# Data Preprocessing 
This notebook shows how we make use of CECW to investigate models' compositional learning.

In [1]:
# dependency
import os
# import json
import random
# import numpy as np
from collections import Counter
from helper_functions import *

In [2]:
# path definition
# raw input files
RAW_TRAIN_SRC_PATH = 'train_src.txt' # source data for training
RAW_TRAIN_TAR_PATH = 'train_tar.txt' # target data for training
RAW_TEST_SRC_PATH = 'test_src.txt' # source data for testing
RAW_TEST_TAR_PATH = 'test_tar.txt' # target data for testing

# file names that are written into every subset directory
PP_DATA_PATH = 'data_dict.json' # to save output data dictionary
VOCAB_PATH = 'vocab_dict.json' # to save output vocabulary dictionary


def _subset_paths(subset_dir):
    """Return the (data, vocab) output paths located under `subset_dir`."""
    return (os.path.join(subset_dir, PP_DATA_PATH),
            os.path.join(subset_dir, VOCAB_PATH))


B_DATA_PATH, B_VOCAB_PATH = _subset_paths('B') # with entity for B
BC_DATA_PATH, BC_VOCAB_PATH = _subset_paths('BC') # with entity for B and C
BCR_DATA_PATH, BCR_VOCAB_PATH = _subset_paths('BCR') # with entity for B, C and R
BCRY_DATA_PATH, BCRY_VOCAB_PATH = _subset_paths('BCRY') # with entity for B, C, R and Y
Colorless_DATA_PATH, Colorless_VOCAB_PATH = _subset_paths('Colorless') # with no entity
ALL_DATA_PATH, ALL_VOCAB_PATH = _subset_paths('ALL') # original dataset

In [3]:
# load the raw files: source then target, training split before test split
raw_train_src_list, raw_train_tar_list = (
    read_txt(path) for path in (RAW_TRAIN_SRC_PATH, RAW_TRAIN_TAR_PATH))
raw_test_src_list, raw_test_tar_list = (
    read_txt(path) for path in (RAW_TEST_SRC_PATH, RAW_TEST_TAR_PATH))

# split sizes are measured on the source side
raw_train_size, raw_test_size = len(raw_train_src_list), len(raw_test_src_list)
print('raw training size:', raw_train_size)
print('raw test size:', raw_test_size)

raw training size: 8922
raw test size: 2231


In [4]:
# white space tokenization
def _tokenize(sequences):
    """Split every sequence on whitespace and return the list of token lists."""
    return [sequence.split() for sequence in sequences]


tk_train_src_list = _tokenize(raw_train_src_list)
tk_train_tar_list = _tokenize(raw_train_tar_list)
tk_test_src_list = _tokenize(raw_test_src_list)
tk_test_tar_list = _tokenize(raw_test_tar_list)

In [5]:
# take a look
# print one randomly chosen training pair and one randomly chosen test pair;
# no random seed is set in the visible cells, so the examples shown are
# expected to differ from run to run
index = random.choice(range(raw_train_size))
print('CECW training command:', tk_train_src_list[index])
print('CECW training expression:', tk_train_tar_list[index])
# `index` is drawn again so that it is valid for the (smaller) test split
index = random.choice(range(raw_test_size))
print('CECW test command:', tk_test_src_list[index])
print('CECW test expression:', tk_test_tar_list[index])

CECW training command: ['go', 'to', 'orange', 'room', 'push', 'yellow', 'thing', 'into', 'green', 'room']
CECW training expression: ['F', '&', 'R', 'F', 'Z']
CECW test command: ['go', 'to', 'the', 'yellow', 'room', 'but', 'avoid', 'the', 'blue', 'room']
CECW test expression: ['&', 'F', 'Y', 'G', '!', 'B']


In [6]:
# token frequency dictionary
# counts cover the training split first and the test split second
# for source side
src_c = Counter()
for tokens in tk_train_src_list + tk_test_src_list:
    src_c.update(tokens)
# for target side
tar_c = Counter()
for tokens in tk_train_tar_list + tk_test_tar_list:
    tar_c.update(tokens)

In [7]:
# vocab frequency dict for source side
# plain-dict copy of the counter, used later to build the source vocabulary
src_freq_dict = dict(src_c)
# NOTE(review): 'souce' is a typo for 'source'; the labels are left untouched
# so that the printed output stays the same
print('CECW souce vocab size:', len(src_freq_dict))
print('CECW souce vocab frequency')
# every (token, count) pair, most frequent first
print(src_c.most_common())

CECW souce vocab size: 193
CECW souce vocab frequency
[('the', 19623), ('room', 18760), ('to', 11655), ('go', 6898), ('through', 5429), ('blue', 4226), ('green', 4148), ('red', 3920), ('yellow', 3502), ('move', 2565), ('and', 2309), ('or', 2004), ('into', 1934), ('get', 1752), ('enter', 1632), ('then', 1447), ('purple', 1284), ('pink', 1265), ('navy', 1254), ('olive', 1213), ('lime', 1213), ('orange', 1157), ('not', 1138), ('brown', 976), ('tan', 976), ('avoiding', 963), ('but', 881), ('that', 820), ('by', 724), ('reach', 681), ('without', 675), ('going', 609), ('area', 575), ('avoid', 565), ('chair', 540), ('robot', 535), ('while', 527), ('t', 515), ('box', 514), ('only', 505), ('rooms', 452), ('is', 445), ('do', 431), ('from', 427), ('pass', 384), ('a', 368), ('push', 358), ('isn', 335), ('large', 316), ('either', 310), ('travel', 305), ('via', 277), ('way', 272), ('first', 268), ('towards', 265), ('entering', 262), ('in', 254), ('you', 239), ('square', 235), ('small', 216), ('object

In [8]:
# vocab frequency dict for target side
# plain-dict copy of the counter, used later to build the target vocabulary
tar_freq_dict = dict(tar_c)
print('CECW target vocab size:', len(tar_freq_dict))
print('CECW target vocab frequency')
# every (token, count) pair, most frequent first
print(tar_c.most_common())

CECW target vocab size: 11
CECW target vocab frequency
[('F', 18054), ('&', 10888), ('C', 6223), ('B', 6101), ('R', 5918), ('Y', 5030), ('G', 3987), ('!', 3987), ('|', 1970), ('X', 428), ('Z', 311)]


In [9]:
# generate source vocabulary2index dictionary
# the four special tokens occupy the first indices
src_vocab_dict = {'<s>': 0, '</s>': 1, '<pad>': 2, '<unk>': 3}

# corpus tokens follow, numbered in the iteration order of src_freq_dict
for token_id, token in enumerate(src_freq_dict, start=len(src_vocab_dict)):
    src_vocab_dict[token] = token_id

# NOTE(review): 'souce' is a typo for 'source'; kept so the output is unchanged
print('CECW souce vocab size:', len(src_vocab_dict))

CECW souce vocab size: 197


In [10]:
# source index2vocabulary dictionary
# inverse lookup of src_vocab_dict: index -> token
src_index_dict = {index: token for token, index in src_vocab_dict.items()}

In [11]:
# generate target vocabulary2index dictionary
# three special tokens come first (the target side has no '<unk>' entry)
tar_vocab_dict = {'<s>': 0, '</s>': 1, '<pad>': 2}

# corpus tokens follow, numbered in the iteration order of tar_freq_dict
for token_id, token in enumerate(tar_freq_dict, start=len(tar_vocab_dict)):
    tar_vocab_dict[token] = token_id

print('CECW target vocab size:', len(tar_vocab_dict))

CECW target vocab size: 14


In [12]:
# target index2vocabulary dictionary
# inverse lookup of tar_vocab_dict: index -> token
tar_index_dict = {index: token for token, index in tar_vocab_dict.items()}

In [13]:
# save data list to a dictionary
# 'x' holds the tokenized source input commands,
# 'y' holds the tokenized target output expressions
train_dict = {'x': tk_train_src_list, 'y': tk_train_tar_list}
test_dict = {'x': tk_test_src_list, 'y': tk_test_tar_list}

# ALL
As a "subset" of CECW, ALL is actually the original CECW dataset without any modification.

In [14]:
# train size and test size
# ALL reuses the tokenized data unchanged, so the numbers printed here are the
# lengths of the original training and test command lists
print('ALL train size:', len(train_dict['x']))
print('ALL test size:', len(test_dict['x']))

ALL train size: 8922
ALL test size: 2231


In [15]:
# make everything a dictionary
# both splits in one container
data_dict = {'train_dict': train_dict, 'test_dict': test_dict}

# both lookup directions for the source side and for the target side
# NOTE(review): the index2vocab dictionaries have integer keys; if save_json
# serialises with the json module these are written as strings - confirm that
# whatever loads the file converts them back
vocab_dict = {
    'src_vocab2index_dict': src_vocab_dict,
    'src_index2vocab_dict': src_index_dict,
    'tar_vocab2index_dict': tar_vocab_dict,
    'tar_index2vocab_dict': tar_index_dict,
}

# output data dict and vocab dict as two json files
save_json(ALL_DATA_PATH, data_dict)
save_json(ALL_VOCAB_PATH, vocab_dict)

# Colorless
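As a subset of CECW, Colorless uses only the four original color words, that is, "blue," "green," "red," and "yellow." Every new color word in a training command is replaced by the original color it stands for (e.g., "purple" and "navy" become "blue"), and the duplicate commands that result are removed. The test set and the vocabulary are left unchanged.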

In [16]:
# fresh color words for each entity in CECW
# two new color words per entity; the list name carries the entity symbol
B_list = ['purple', 'navy']  # entity B
C_list = ['olive', 'lime']   # entity C
R_list = ['pink', 'orange']  # entity R
Y_list = ['brown', 'tan']    # entity Y

In [17]:
# replace every new color word in the training commands with the original
# color word it stands for; no sequence pair is dropped in this step
new2original_color = {}
for original_color, new_color_list in (('blue', B_list), ('green', C_list),
                                       ('red', R_list), ('yellow', Y_list)):
    for new_color in new_color_list:
        new2original_color[new_color] = original_color

# build fresh token lists instead of editing the existing ones in place: the
# existing lists are still referenced by the `train_dict` that was built for
# ALL, so mutating them would silently change that data as well
tk_train_src_list = [
    # tokens that are not new color words are copied through unchanged
    [new2original_color.get(token, token) for token in train_command_list]
    for train_command_list in tk_train_src_list
]

In [18]:
# remove duplicates
# keep the first occurrence of every distinct command together with the
# expression stored at the same position; later repeats of a command are
# skipped. A set of token tuples makes each membership test constant time
# instead of scanning the whole output list for every command.
src_list = []
tar_list = []
seen_commands = set()
for i, command in enumerate(tk_train_src_list):
    command_key = tuple(command)  # lists are unhashable, tuples are not
    if command_key in seen_commands:
        continue
    seen_commands.add(command_key)
    src_list.append(command)
    tar_list.append(tk_train_tar_list[i])

print('Colorless train size:', len(src_list))

Colorless train size: 2118


In [19]:
# save data list to a dictionary
# 'x' holds the de-duplicated source input commands,
# 'y' holds the matching target output expressions
train_dict = {'x': src_list, 'y': tar_list}

In [20]:
# test dict and vocab dict are reused as they are;
# only the training split inside data_dict is swapped
data_dict.update(train_dict=train_dict)

# write the Colorless data dict and vocab dict as two json files
save_json(Colorless_DATA_PATH, data_dict)
save_json(Colorless_VOCAB_PATH, vocab_dict)

## B
As a subset of CECW, B is Colorless in addition to two primitive rules including "purple" $\rightarrow$ "B" and "navy" $\rightarrow$ "B."

In [21]:
# add two primitive rules: one call per new color word, pairing the
# single-token command [word] with the single-token expression ['B']
# (presumably add_primitive_rule appends that pair - see helper_functions)
for new_color in B_list:
    src_list, tar_list = add_primitive_rule([new_color], ['B'], src_list, tar_list)

In [22]:
# size of the training set after the two B rules were added
print('B train size:', len(src_list))

B train size: 2120


In [23]:
# save data list to a dictionary
# 'x' holds the source input commands, 'y' the target output expressions
train_dict = {'x': src_list, 'y': tar_list}

# test dict and vocab dict are reused as they are;
# only the training split inside data_dict is swapped
data_dict.update(train_dict=train_dict)

# write the B data dict and vocab dict as two json files
save_json(B_DATA_PATH, data_dict)
save_json(B_VOCAB_PATH, vocab_dict)

## BC
As a subset of CECW, BC is B in addition to two primitive rules including "olive" $\rightarrow$ "C" and "lime" $\rightarrow$ "C."

In [24]:
# add two primitive rules: one call per new color word, pairing the
# single-token command [word] with the single-token expression ['C']
# (presumably add_primitive_rule appends that pair - see helper_functions)
for new_color in C_list:
    src_list, tar_list = add_primitive_rule([new_color], ['C'], src_list, tar_list)

In [25]:
# size of the training set after the two C rules were added
print('BC train size:', len(src_list))

BC train size: 2122


In [26]:
# save data list to a dictionary
# 'x' holds the source input commands, 'y' the target output expressions
train_dict = {'x': src_list, 'y': tar_list}

# test dict and vocab dict are reused as they are;
# only the training split inside data_dict is swapped
data_dict.update(train_dict=train_dict)

# write the BC data dict and vocab dict as two json files
save_json(BC_DATA_PATH, data_dict)
save_json(BC_VOCAB_PATH, vocab_dict)

## BCR
As a subset of CECW, BCR is BC in addition to two primitive rules including "pink" $\rightarrow$ "R" and "orange" $\rightarrow$ "R."

In [27]:
# add two primitive rules: one call per new color word, pairing the
# single-token command [word] with the single-token expression ['R']
# (presumably add_primitive_rule appends that pair - see helper_functions)
for new_color in R_list:
    src_list, tar_list = add_primitive_rule([new_color], ['R'], src_list, tar_list)

In [28]:
# size of the training set after the two R rules were added
print('BCR train size:', len(src_list))

BCR train size: 2124


In [29]:
# save data list to a dictionary
# 'x' holds the source input commands, 'y' the target output expressions
train_dict = {'x': src_list, 'y': tar_list}

# test dict and vocab dict are reused as they are;
# only the training split inside data_dict is swapped
data_dict.update(train_dict=train_dict)

# write the BCR data dict and vocab dict as two json files
save_json(BCR_DATA_PATH, data_dict)
save_json(BCR_VOCAB_PATH, vocab_dict)

## BCRY
As a subset of CECW, BCRY is BCR in addition to two primitive rules including "brown" $\rightarrow$ "Y" and "tan" $\rightarrow$ "Y."

In [30]:
# add two primitive rules: one call per new color word, pairing the
# single-token command [word] with the single-token expression ['Y']
# (presumably add_primitive_rule appends that pair - see helper_functions)
for new_color in Y_list:
    src_list, tar_list = add_primitive_rule([new_color], ['Y'], src_list, tar_list)

In [31]:
# size of the training set after the two Y rules were added
print('BCRY train size:', len(src_list))

BCRY train size: 2126


In [32]:
# save data list to a dictionary
# 'x' holds the source input commands, 'y' the target output expressions
train_dict = {'x': src_list, 'y': tar_list}

# test dict and vocab dict are reused as they are;
# only the training split inside data_dict is swapped
data_dict.update(train_dict=train_dict)

# write the BCRY data dict and vocab dict as two json files
save_json(BCRY_DATA_PATH, data_dict)
save_json(BCRY_VOCAB_PATH, vocab_dict)

## Reference

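1. MacGlashan, J., Babeş-Vroman, M., desJardins, M., Littman, M. L., Muresan, S., Squire, S., Tellex, S., Arumugam, D., & Yang, L. (2015). Grounding English Commands to Reward Functions. In Robotics: Science and Systems.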
2. Gopalan, N., Arumugam, D., Wong, L. L., & Tellex, S. (2018). Sequence-to-Sequence Language Grounding of Non-Markovian Task Specifications. In Robotics: Science and Systems.