In [11]:
import h5py
import tensorflow as tf
import os
import numpy as np
import json
import shutil

In [12]:
# --- Dataset mix -------------------------------------------------------------
# Episodes come from two source directories and are interleaved into shards.
num_episodes1 = 700      # episodes taken from input_hdf5_dir1
num_episodes2 = 300      # episodes taken from input_hdf5_dir2
episodes_per_file = 10   # episodes written into each TFRecord shard

# Alternative dataset configurations, kept for reference:
# input_hdf5_dir = '/media/sbr-tech/Desk SSD/aloha/datas_cucumber_franka'
# dataset_name = "franka_dual_sim_cucumber_dataset"
# task_name = "franka_dual_sim_cucumber_dataset"

# input_hdf5_dir = '/media/sbr-tech/Desk SSD/aloha/datas_cucumber_mobile'
# dataset_name = "mobile_aloha_sim_cucumber_dataset"
# task_name = "mobile_aloha_sim_cucumber_dataset"

# input_hdf5_dir = '/media/sbr-tech/Desk SSD/datas_couple_franka_bimanual'
# dataset_name = "franka_dual_bimanual_sim_couple_dataset"
# task_name = "franka_dual_bimanual_sim_couple_dataset"

# input_hdf5_dir = '/media/sbr-tech/Desk SSD/datas_quadruple_franka_bimanual'
# dataset_name = "franka_dual_bimanual_sim_quadruple_dataset"
# task_name = "franka_dual_bimanual_sim_quadruple_dataset"

# --- Active configuration ----------------------------------------------------
input_hdf5_dir1 = '/media/sbr-tech/Desk SSD/datas_cushion_v'
input_hdf5_dir2 = '/media/sbr-tech/Desk SSD/datas_cushion_recovery'
dataset_name = "franka_dual_sim_cushion_dataset_recovery_30"
task_name = "franka_dual_sim_cushion_dataset"


# Episode files are named episode_<index>.hdf5 inside each input directory.
input_hdf5_basename = "episode_{}.hdf5"
version = "1.0.0"

output_root = "/media/sbr-tech/Desk SSD/"
# The two placeholders are filled with the shard index and the shard count.
output_basename = task_name + "-train.tfrecord-{}-of-{}"

# Dataset-family flags are derived from substrings of the dataset name.
mobile = "mobile" in dataset_name
franka = "franka" in dataset_name
bimanual = "bimanual" in dataset_name
cushion = "cushion" in dataset_name

# --- Shard layout ------------------------------------------------------------
num_episodes_total = num_episodes1 + num_episodes2
num_files = num_episodes_total // episodes_per_file
# Number of slots per shard filled from input_hdf5_dir1; the remaining slots are
# filled from input_hdf5_dir2. Pure integer arithmetic avoids float rounding.
episodes_per_file1 = episodes_per_file * num_episodes1 // num_episodes_total

# Fail fast on a mix that cannot be laid out exactly: otherwise trailing episodes
# are silently dropped, or one directory is read past its configured episode
# count, while the written metadata still reports the configured totals.
if num_episodes_total % episodes_per_file != 0:
    raise ValueError(
        "num_episodes1 + num_episodes2 ({}) must be a multiple of episodes_per_file ({})".format(
            num_episodes_total, episodes_per_file
        )
    )
if episodes_per_file1 * num_files != num_episodes1:
    raise ValueError(
        "cannot take exactly {} of {} episodes from input_hdf5_dir1 with {} episodes per file".format(
            num_episodes1, num_episodes_total, episodes_per_file
        )
    )

In [13]:
# Output layout: <output_root>/<dataset_name>/<task_name>/<version>/
output_dir = os.path.join(output_root, dataset_name, task_name, version)
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [14]:

# Fill in the dataset_info template for this dataset and write it into output_dir.
with open("dataset_info_template.json", "r", encoding="utf-8") as f:
    dataset_info = json.load(f)

dataset_info["moduleName"] = task_name + "." + task_name
dataset_info["name"] = task_name
dataset_info["version"] = version
# One entry per shard; every shard holds episodes_per_file episodes (stored as strings).
dataset_info["splits"][0]["shardLengths"] = [str(episodes_per_file)] * num_files

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform locale.
with open(os.path.join(output_dir, "dataset_info.json"), "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, ensure_ascii=False, indent=2, sort_keys=True, separators=(',', ': '))

In [15]:
# Franka datasets use their own feature-spec template; everything else uses the
# static one. The chosen template is copied verbatim to <output_dir>/features.json.
features_template = "features_template_franka.json" if franka else "features_template_static.json"
ret = shutil.copy(features_template, os.path.join(output_dir, "features.json"))
ret

'/media/sbr-tech/Desk SSD/franka_dual_sim_cushion_dataset_recovery_30/franka_dual_sim_cushion_dataset/1.0.0/features.json'

In [16]:
# Path of episode 0 in the first input directory (stored later as METADATA).
first_episode_name = input_hdf5_basename.format(0)
sample_hdf5_file = os.path.join(input_hdf5_dir1, first_episode_name)

In [17]:
# Despite its name, NUM_EPISODE is used as the number of steps per episode: it
# sizes every per-step array below. The value depends on the dataset family.
if franka:
    if bimanual:
        NUM_EPISODE = 800
    elif cushion:
        NUM_EPISODE = 600
    else:
        NUM_EPISODE = 720
elif mobile:
    NUM_EPISODE = 720
else:
    NUM_EPISODE = 600

# Per-step arrays shared by every episode: only the first step is flagged as
# first, and only the final step is flagged as last / terminal and rewarded.
IS_FIRSTS = np.zeros(NUM_EPISODE, dtype=int)
IS_FIRSTS[0] = 1
DISCOUNTS = np.ones(NUM_EPISODE, dtype=float)
IS_LASTS = np.zeros(NUM_EPISODE, dtype=int)
IS_LASTS[-1] = 1

# Task instruction. `cushion` is tested first: previously `if not bimanual` came
# first, so a non-bimanual cushion dataset received the cucumber instruction and
# the cushion branch was only reachable for bimanual datasets.
if cushion:
    LANGUAGE_INSTRUCTION = b"put the cushion in the empty space"
elif bimanual:
    LANGUAGE_INSTRUCTION = b"pick up the cucumber and the red cube and put them in the bucket"
else:
    LANGUAGE_INSTRUCTION = b"pick up the cucumber and put it in the bucket"
# The same instruction is repeated once per step.
LANGUAGE_INSTRUCTIONS = np.array([LANGUAGE_INSTRUCTION]*NUM_EPISODE)

REWARDS = np.zeros(NUM_EPISODE, dtype=float)
REWARDS[-1] = 1
# NOTE: "TARMINALS" is a misspelling of "TERMINALS"; the name is kept because
# numpy_to_tf_example refers to it.
IS_TARMINALS = np.zeros(NUM_EPISODE, dtype=int)
IS_TARMINALS[-1] = 1
# File path recorded as episode metadata.
METADATA = sample_hdf5_file

In [18]:
def _image_bytes_feature(images):
    """JPEG-encode every image in `images` and pack the results into one bytes_list Feature."""
    encoded_frames = []
    for frame in images:
        jpeg = tf.image.encode_jpeg(frame)
        encoded_frames.append(jpeg.numpy())
    bytes_list = tf.train.BytesList(value=encoded_frames)
    return tf.train.Feature(bytes_list=bytes_list)

def _language_bytes_feature(langs):
    """Build a bytes_list Feature with one entry per element of `langs`.

    Each element is converted with its own `.tobytes()` method.
    """
    raw_strings = []
    for entry in langs:
        raw_strings.append(entry.tobytes())
    bytes_list = tf.train.BytesList(value=raw_strings)
    return tf.train.Feature(bytes_list=bytes_list)
    
def _bytes_feature(value):
    """Wrap a single bytes value in a one-element bytes_list Feature."""
    single_item = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=single_item)

def _float_feature(value):
    """Wrap an iterable of floats in a float_list Feature."""
    floats = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=floats)

def _int64_feature(value):
    """Wrap an iterable of integers in an int64_list Feature."""
    integers = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=integers)

In [19]:
def numpy_to_tf_example(states, images_top, images_angle, images_left_wrist, images_right_wrist, actions):
    """Assemble one episode into a tf.train.Example.

    `states` and `actions` are flattened into float lists, and each of the four
    camera streams is JPEG-encoded frame by frame. The remaining per-step fields
    (first/last/terminal flags, discounts, rewards, language instruction) come
    from the module-level constant arrays and are identical for every episode.
    The recorded file path is the module-level METADATA for every example, not
    the path of the episode being converted.

    Returns:
        The populated tf.train.Example.
    """
    # Keys are inserted in the same order as the original dict literal.
    episode = {}
    episode["steps/is_first"] = _int64_feature(IS_FIRSTS)
    episode["steps/action"] = _float_feature(actions.flatten())
    episode["steps/discount"] = _float_feature(DISCOUNTS)
    episode["steps/is_last"] = _int64_feature(IS_LASTS)
    episode["steps/language_instruction"] = _language_bytes_feature(LANGUAGE_INSTRUCTIONS)
    episode["steps/rewards"] = _float_feature(REWARDS)
    episode["steps/observation/top"] = _image_bytes_feature(images_top)
    episode["steps/observation/angle"] = _image_bytes_feature(images_angle)
    episode["steps/observation/left_wrist"] = _image_bytes_feature(images_left_wrist)
    episode["steps/observation/right_wrist"] = _image_bytes_feature(images_right_wrist)
    episode["steps/is_terminal"] = _int64_feature(IS_TARMINALS)
    episode["steps/observation/state"] = _float_feature(states.flatten())
    episode["episode_metadata/file_path"] = _bytes_feature(METADATA.encode())

    return tf.train.Example(features=tf.train.Features(feature=episode))

In [20]:
# Per-episode state/action arrays, collected for the dataset statistics computed later.
states_all = []
actions_all = []

# Running episode counters for the two input directories.
episode_idx1 = 0
episode_idx2 = 0

for file_idx in range(num_files):
    print ("\nfile_idx: ", file_idx)
    output_tfrecord_file = os.path.join(output_dir, output_basename.format(str(file_idx).zfill(5), str(num_files).zfill(5)))
    with tf.io.TFRecordWriter(output_tfrecord_file) as writer:
        for i in range(episodes_per_file):

            # The first episodes_per_file1 slots of each shard come from
            # input_hdf5_dir1, the remaining slots from input_hdf5_dir2.
            if i < episodes_per_file1:
                hdf5_file = os.path.join(input_hdf5_dir1, input_hdf5_basename.format(episode_idx1))
                print ("episode1: ", episode_idx1)
                episode_idx1 += 1
            else:
                hdf5_file = os.path.join(input_hdf5_dir2, input_hdf5_basename.format(episode_idx2))
                print ("episode2: ", episode_idx2)
                episode_idx2 += 1

            with h5py.File(hdf5_file, 'r') as hdf5_f:
                # `[:]` reads the dataset into a new in-memory NumPy array, so the
                # data stays valid after the file closes without an extra .copy().
                states = hdf5_f["observations"]["qpos"][:]
                images_top = hdf5_f["observations"]["images"]["top"][:]
                images_angle = hdf5_f["observations"]["images"]["angle"][:]
                images_left_wrist = hdf5_f["observations"]["images"]["left_wrist"][:]
                images_right_wrist = hdf5_f["observations"]["images"]["right_wrist"][:]
                actions = hdf5_f["action"][:]

            # The constant per-step arrays (is_first, rewards, ...) all have
            # NUM_EPISODE entries. An episode of a different length would produce a
            # record whose per-step features disagree in length, so stop here.
            per_step_arrays = (
                ("observations/qpos", states),
                ("observations/images/top", images_top),
                ("observations/images/angle", images_angle),
                ("observations/images/left_wrist", images_left_wrist),
                ("observations/images/right_wrist", images_right_wrist),
                ("action", actions),
            )
            for array_name, array in per_step_arrays:
                if len(array) != NUM_EPISODE:
                    raise ValueError(
                        "{}: '{}' has {} steps, expected NUM_EPISODE={}".format(
                            hdf5_file, array_name, len(array), NUM_EPISODE
                        )
                    )

            states_all.append(states)
            actions_all.append(actions)

            example = numpy_to_tf_example(states, images_top, images_angle, images_left_wrist, images_right_wrist, actions)

            writer.write(example.SerializeToString())


file_idx:  0


episode1:  0
episode1:  1
episode1:  2
episode1:  3
episode1:  4
episode1:  5
episode1:  6
episode2:  0
episode2:  1
episode2:  2

file_idx:  1
episode1:  7
episode1:  8
episode1:  9
episode1:  10
episode1:  11
episode1:  12
episode1:  13
episode2:  3
episode2:  4
episode2:  5

file_idx:  2
episode1:  14
episode1:  15
episode1:  16
episode1:  17
episode1:  18
episode1:  19
episode1:  20
episode2:  6
episode2:  7
episode2:  8

file_idx:  3
episode1:  21
episode1:  22
episode1:  23
episode1:  24
episode1:  25
episode1:  26
episode1:  27
episode2:  9
episode2:  10
episode2:  11

file_idx:  4
episode1:  28
episode1:  29
episode1:  30
episode1:  31
episode1:  32
episode1:  33
episode1:  34
episode2:  12
episode2:  13
episode2:  14

file_idx:  5
episode1:  35
episode1:  36
episode1:  37
episode1:  38
episode1:  39
episode1:  40
episode1:  41
episode2:  15
episode2:  16
episode2:  17

file_idx:  6
episode1:  42
episode1:  43
episode1:  44
episode1:  45
episode1:  46
episode1:  47
episode1:  4

In [21]:
# Join the per-episode arrays along axis 0. The isinstance guard makes re-running
# this cell a no-op: concatenating an already-joined array again would iterate it
# along its first axis and either fail or silently change its shape.
if isinstance(states_all, list):
    states_all = np.concatenate(states_all)
if isinstance(actions_all, list):
    actions_all = np.concatenate(actions_all)

In [22]:
def _summarize(values):
    """Return mean/std/max/min/p99/p01 of `values`, each reduced along axis 0, as plain Python lists."""
    return {
        "mean": values.mean(axis=0).tolist(),
        "std": values.std(axis=0).tolist(),
        "max": values.max(axis=0).tolist(),
        "min": values.min(axis=0).tolist(),
        "p99": np.percentile(values, 99, axis=0).tolist(),
        "p01": np.percentile(values, 1, axis=0).tolist(),
    }


total_episodes = num_episodes1 + num_episodes2

# Key order matches the order in which the entries are later serialized to JSON.
statistics_dic = {
    "action": _summarize(actions_all),
    # Transition count is the configured episode count times the per-episode step count.
    "num_transitions": total_episodes * NUM_EPISODE,
    "num_trajectories": total_episodes,
    "proprio": _summarize(states_all),
}


In [23]:
# Serialize the statistics dictionary to <output_dir>/dataset_statistics.json.
statistics_path = os.path.join(output_dir, "dataset_statistics.json")
with open(statistics_path, "w") as statistics_file:
    statistics_file.write(json.dumps(statistics_dic))