# Colorful Extended Cleanup World (CECW)
The Colorful Extended Cleanup World (CECW) dataset is a color-extended version of Cleanup World (CW), a dataset borrowed from the mobile-manipulation robot domain [(MacGlashan et al., 2015)](http://cs.brown.edu/~jmacglashan/pubpdfs/rss_commands.pdf). CW is a world containing a movable object and four rooms, each in one of four colors ("blue," "green," "red," and "yellow"). It is designed as a simulation environment in which an agent acts on the instructions it receives [(Gopalan et al., 2018)](http://roboticsproceedings.org/rss14/p67.pdf).

# Data Extraction, Cleaning, and Splitting
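This notebook shows how we convert the original CW data into our CECW dataset: we load the raw command/expression pairs, augment the commands with additional color words, remove duplicates, and split the result into training and test sets.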

In [1]:
# dependency
import random
import re
from collections import Counter

import numpy as np

from helper_functions import *

In [2]:
# path definition
#
# Inputs: the raw CW parallel corpus (one file per side).
# credits to MacGlashan et al., 2015 and Gopalan et al., 2018
# more info here - https://github.com/jmacglashan/commandsToTasks
RAW_TAR_PATH = 'CW/hard_pc_tar_syn.txt'  # input target data
RAW_SRC_PATH = 'CW/hard_pc_src_syn.txt'  # input source data

# Outputs: the train/test split written at the end of the notebook.
TRAIN_SRC_PATH, TRAIN_TAR_PATH = 'train_src.txt', 'train_tar.txt'  # training source / target
TEST_SRC_PATH, TEST_TAR_PATH = 'test_src.txt', 'test_tar.txt'      # test source / target

In [3]:
# Load both sides of the raw corpus; entry i of one list is paired with
# entry i of the other in the cells below.
# NOTE(review): read_txt comes from helper_functions; presumably it returns
# one list item per line of the file -- confirm against the helper.
raw_src_list = read_txt(RAW_SRC_PATH) # input source data list
raw_tar_list = read_txt(RAW_TAR_PATH) # input target data list

# The two lists are later combined with zip(), which silently truncates to
# the shorter one, so fail loudly here if the files are misaligned.
assert len(raw_src_list) == len(raw_tar_list), \
    'source and target files contain a different number of entries'

# 3382 commands reflecting a total of 39 GLTL expressions
raw_data_size = len(raw_src_list)
print('raw data size:', raw_data_size)

raw data size: 3382


In [4]:
# number of distinct commands and distinct expressions in the raw data
unique_command_count = len(set(raw_src_list))
unique_expression_count = len(set(raw_tar_list))
print('kinds of commands', unique_command_count)
print('kinds of expressions', unique_expression_count)

kinds of commands 2130
kinds of expressions 39


In [5]:
# spot-check: show one randomly chosen (command, expression) pair
index = random.choice(range(raw_data_size))
sample_command, sample_expression = raw_src_list[index], raw_tar_list[index]
print('command:', sample_command)
print('expression:', sample_expression)

command: go through the yellow or green room to reach the blue room
expression: F & | C Y F B


In [6]:
# token frequencies on the source side
# (tokens are the whitespace-separated pieces of each command)
src_counter = Counter(
    token
    for command in raw_src_list
    for token in command.split()
)
src_token_freq_dict = dict(src_counter)
print('source vocab size:', len(src_token_freq_dict))
print('source vocab frequency')
print(src_counter.most_common())

source vocab size: 188
source vocab frequency
[('the', 5236), ('room', 5224), ('to', 3476), ('go', 2243), ('blue', 1744), ('green', 1708), ('red', 1529), ('yellow', 1387), ('through', 1275), ('move', 846), ('and', 486), ('into', 474), ('get', 411), ('enter', 342), ('or', 321), ('then', 320), ('not', 264), ('avoiding', 244), ('but', 243), ('that', 180), ('without', 170), ('avoid', 159), ('while', 155), ('area', 145), ('by', 142), ('reach', 138), ('t', 125), ('going', 123), ('chair', 123), ('rooms', 101), ('only', 100), ('do', 100), ('a', 98), ('robot', 95), ('is', 90), ('entering', 84), ('large', 83), ('box', 78), ('push', 78), ('pass', 77), ('via', 75), ('from', 75), ('isn', 75), ('pink', 68), ('towards', 65), ('travel', 65), ('small', 63), ('in', 58), ('way', 55), ('first', 52), ('you', 51), ('don', 50), ('on', 48), ('square', 45), ('using', 44), ('either', 44), ('up', 41), ('passing', 40), ('one', 39), ('proceed', 38), ('object', 37), ('it', 36), ('walk', 35), ('door', 31), ('of', 31

In [7]:
# token frequencies on the target side
# (tokens are the whitespace-separated symbols of each expression)
tar_counter = Counter(
    symbol
    for expression in raw_tar_list
    for symbol in expression.split()
)
tar_token_freq_dict = dict(tar_counter)
print('target vocab size:', len(tar_token_freq_dict))
print('target vocab frequency')
print(tar_counter.most_common())

target vocab size: 11
target vocab frequency
[('F', 4962), ('&', 2585), ('C', 1648), ('B', 1616), ('R', 1537), ('Y', 1329), ('G', 1005), ('!', 1005), ('|', 319), ('X', 99), ('Z', 57)]


In [8]:
# Color words grouped by the entity they map to.
# Each list starts with the color of its primitive rule (noted on the right);
# the words in a list are treated as interchangeable in the cells below.
B_list, C_list, R_list, Y_list = (
    ['blue', 'purple', 'navy'],    # blue   -> B
    ['green', 'olive', 'lime'],    # green  -> C
    ['red', 'pink', 'orange'],     # red    -> R
    ['yellow', 'brown', 'tan'],    # yellow -> Y
)

In [9]:
# replace the color word in the command
# but keep it map to the same expression
#
# For every command that contains a color word as a whole token, one extra
# command is generated per other word of the same color group; the new
# command is paired with the original, unchanged expression. Only one color
# group is swapped at a time.


def _swap_color_token(sentence, old, new):
    """Return `sentence` with every whole-token occurrence of `old` replaced by `new`.

    A token is a maximal run of non-whitespace characters, i.e. the same
    notion of token as `sentence.split()`. Plain `str.replace` is not used
    because it also rewrites substrings of longer words (for example the
    "red" inside "entered"), which would corrupt the generated command.
    All other characters, including the original spacing, are left untouched.
    """
    # (?<!\S) / (?!\S): the match must not be preceded / followed by a
    # non-whitespace character, so it has to be a complete token.
    token_pattern = r'(?<!\S)' + re.escape(old) + r'(?!\S)'
    # A callable replacement inserts `new` literally (no backslash handling).
    return re.sub(token_pattern, lambda _match: new, sentence)


# start from a copy of the raw pairs, then append the generated variants
exp_src_list = list(raw_src_list)
exp_tar_list = list(raw_tar_list)
for src, tar in zip(raw_src_list, raw_tar_list):
    src_tokens = set(src.split())
    # group order (B, R, Y, C) fixes the order in which variants are appended
    for color_group in (B_list, R_list, Y_list, C_list):
        for present_color in color_group:
            if present_color not in src_tokens:
                continue
            for substitute in color_group:
                if substitute != present_color:
                    exp_src_list.append(_swap_color_token(src, present_color, substitute))
                    exp_tar_list.append(tar)

In [10]:
# remove duplicate
# A command is kept the first time it is seen, together with the expression
# it was paired with at that point; later repeats of the same command are
# dropped. Membership is tracked in a set so each lookup is constant time
# instead of a scan over the whole output list.
seen_src = set()
new_src_list, new_tar_list = [], []
for src, tar in zip(exp_src_list, exp_tar_list):
    if src in seen_src:
        continue
    seen_src.add(src)
    new_src_list.append(src)
    new_tar_list.append(tar)

In [11]:
# sizes of the deduplicated source and target lists
# (label typo fixed: 'souze' -> 'source')
print('CECW source data size:', len(new_src_list))
print('CECW target data size:', len(new_tar_list))

CECW souze data size: 11153
CECW target data size: 11153


In [12]:
# spot-check: show one randomly chosen pair from the deduplicated data
new_data_size = len(new_src_list)
index = random.choice(range(new_data_size))
picked_command, picked_expression = new_src_list[index], new_tar_list[index]
print('command:', picked_command)
print('expression:', picked_expression)

command: move to the large blue room while avoiding the small lime room
expression: & F B G ! C


In [13]:
# 80% for training
# 20% for test
train_test_rate = 0.8  # fraction of the data assigned to the training split

# Use a dedicated, seeded generator so the shuffle (and therefore the files
# written at the end of the notebook) is the same on every run.
SPLIT_SEED = 0
index = np.random.RandomState(SPLIT_SEED).permutation(new_data_size)

# Cut the shuffled indices at the train/test boundary. Because the second cut
# point is the end of the array, np.split returns a third, empty piece that
# is simply ignored.
index_list = np.split(index, [int(train_test_rate*new_data_size), new_data_size])
train_index, test_index = index_list[0], index_list[1]
assert len(train_index) + len(test_index) == new_data_size
print('CECW training size:', len(train_index))
print('CECW test size', len(test_index))

CECW training size: 8922
CECW test size 2231


In [14]:
# train test split
# Convert each side to an array once, select the rows of each split by index,
# and convert the selections back to plain Python lists.
src_array, tar_array = np.asarray(new_src_list), np.asarray(new_tar_list)
train_src_list, train_tar_list = (
    src_array[train_index].tolist(),
    tar_array[train_index].tolist(),
)
test_src_list, test_tar_list = (
    src_array[test_index].tolist(),
    tar_array[test_index].tolist(),
)

In [15]:
# save output as .txt files
# (written in the order: train source, train target, test source, test target)
for out_path, out_lines in (
    (TRAIN_SRC_PATH, train_src_list),
    (TRAIN_TAR_PATH, train_tar_list),
    (TEST_SRC_PATH, test_src_list),
    (TEST_TAR_PATH, test_tar_list),
):
    save_txt(out_path, out_lines)