In [1]:
import torch
import numpy as np
from model.action_predictor import VideoToAction
import yaml
import os
import cv2
import imageio
from PIL import Image

In [2]:
def animate_trajectories(orig_trajectory, pred_trajectory, path='./traj_anim.gif', duration=4 / 50, rec_to_pred_t=10,
                         title=None):
    """Save a GIF that shows ground-truth and predicted frames side by side.

    Every output frame is laid out as ``[GT | 4 px white separator | prediction]``.
    Ground-truth frames are labelled ``GT:<t>`` and framed with a (255, 0, 0) border.
    Predicted frames are labelled ``REC:<t>`` with a (0, 0, 255) border while
    ``t < rec_to_pred_t`` and ``PRED:<t>`` with a (0, 255, 0) border afterwards.

    Args:
        orig_trajectory: numpy array of ground-truth frames, time on axis 0. Values are
            clipped to [0, 1] and scaled by 255 before being written.
        pred_trajectory: numpy array of predicted frames; indexed with the same time
            indices as ``orig_trajectory``, so it needs at least as many frames.
            Values are clipped to [0, 1] and scaled by 255.
        path: output file handed to ``imageio.mimsave``.
        duration: per-frame duration, forwarded to ``imageio.mimsave`` unchanged.
        rec_to_pred_t: the timestep from which prediction transitions from
            reconstruction to generation (only affects label and border colour).
        title: optional text drawn on a 25 px white plate stacked above every frame.
            The plate is created with 3 channels, so frames must be 3-channel when a
            title is given.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    label_origin = (5, 15)
    label_scale = 0.4
    label_color = (255, 255, 255)
    label_thickness = 1
    gt_border_color = (255, 0, 0)
    rec_border_color = (0, 0, 255)
    gen_border_color = (0, 255, 0)
    border_size = 2
    separator_width = 4

    def _prepare_frame(frame, label, border_color):
        # Clip before the uint8 cast so out-of-range values saturate instead of wrapping
        # around. The copy gives cv2 its own buffer to draw on (presumably needed because
        # callers pass transposed views).
        image = (frame.clip(0, 1) * 255).astype(np.uint8).copy()
        image = cv2.putText(image, label, label_origin, font, label_scale, label_color, label_thickness,
                            cv2.LINE_AA)
        return cv2.copyMakeBorder(image, border_size, border_size, border_size, border_size,
                                  cv2.BORDER_CONSTANT, value=border_color)

    # The title plate only depends on the (constant) frame width, so it is rendered once
    # on the first frame and reused for all others.
    title_plate = None
    total_images = []
    for i in range(orig_trajectory.shape[0]):
        gt_image = _prepare_frame(orig_trajectory[i], f'GT:{i}', gt_border_color)

        is_reconstruction = i < rec_to_pred_t
        pred_image = _prepare_frame(pred_trajectory[i],
                                    f'REC:{i}' if is_reconstruction else f'PRED:{i}',
                                    rec_border_color if is_reconstruction else gen_border_color)

        separator = np.full((gt_image.shape[0], separator_width, gt_image.shape[-1]), 255, dtype=np.uint8)
        concat_img = np.concatenate([gt_image, separator, pred_image], axis=1)

        if title is not None:
            if title_plate is None:
                plate_height = 25
                title_plate = np.full((plate_height, concat_img.shape[1], 3), 255, dtype=np.uint8)
                title_origin = (orig_trajectory.shape[1] // 2 // 6, plate_height // 2)
                title_plate = cv2.putText(title_plate, title, title_origin, font, 0.25, (0, 0, 0), 1,
                                          cv2.LINE_AA)
            concat_img = np.concatenate([title_plate, concat_img], axis=0)
        total_images.append(concat_img)

    imageio.mimsave(path, total_images, duration=duration, loop=0)  # 1/50


In [3]:
# Load Config
config_path = 'configs/config.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

model_cfg = config['model']
train_cfg = config['train']

# Load Model
model_path = 'results/run2/model.100000.pt'
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' keeps loading independent of the device the checkpoint was saved
# from; load_state_dict copies the values into the freshly constructed model anyway.
state = torch.load(model_path, map_location='cpu')

# Constructor arguments that are forwarded 1:1 from the 'model' section of the config.
model_kwarg_names = (
    'input_dim',
    'model_dim',
    'action_dim',
    'encoder_depth',
    'decoder_depth',
    'heads',
    'dim_head',
    'ff_mult',
    'attn_dropout',
    'ff_dropout',
    'use_rel_pos_spatial',
    'use_rel_pos_temporal',
    'use_peg_spatial_layers_enc',
    'use_peg_temporal_layers_enc',
    'use_peg_spatial_layers_dec',
    'use_peg_temporal_layers_dec',
    'attn_num_null_kv',
)
model = VideoToAction(
    **{name: model_cfg[name] for name in model_kwarg_names},
    loss_type=train_cfg['loss_type'],
    tokenizer_config=config,  # the tokenizer receives the full config, not only a sub-section
    use_tokenizer=True
)
# Left as the last expression so the cell displays the key-matching report.
model.load_state_dict(state['model'])

loaded pretrained LPIPS loss from /scratch/iew/Learning-From-Human-Demonstrations/pretraining/sequence_tokenizer/src/modules/OmniTokenizer/modules/cache/vgg.pth


<All keys matched successfully>

In [4]:
def load_video(vid_path, start=30, end=50, size=128, device='cuda'):
    """Load a window of frames from a directory of numbered image files.

    File names must have an integer stem (e.g. ``12.jpg``); they are sorted numerically
    and the slice ``[start:end]`` is loaded. Each frame is converted to RGB, resized to
    ``size x size`` and its axes are reversed with ``transpose(2, 1, 0)`` (channels
    first, the two spatial axes swapped relative to the PIL array).

    Args:
        vid_path: directory containing the frame images.
        start: index of the first frame (after sorting) to load.
        end: index one past the last frame to load.
        size: side length the frames are resized to.
        device: device the stacked model input is moved to.

    Returns:
        A tuple ``(inp, orig_images)``:
        ``inp`` is a float32 tensor of shape ``(num_frames, 3, size, size)`` scaled to
        [-1, 1] and placed on ``device``; ``orig_images`` is a list of CPU float32
        tensors of shape ``(1, 3, size, size)`` scaled to [0, 1].
    """
    frame_names = sorted(os.listdir(vid_path), key=lambda name: int(os.path.splitext(name)[0]))

    orig_images = []
    imgs = []
    for name in frame_names[start:end]:
        # The context manager closes the file handle once the pixels are copied out.
        with Image.open(os.path.join(vid_path, name)) as img:
            # Force 3 channels so grayscale / RGBA frames do not break the reshape below.
            pixels = np.array(img.convert('RGB').resize((size, size)))
        frame = torch.tensor(pixels.transpose(2, 1, 0).reshape((1, 3, size, size)), dtype=torch.float32)
        unit_frame = frame / 255
        orig_images.append(unit_frame)
        imgs.append(2 * unit_frame - 1)

    inp = torch.concatenate(imgs, dim=0)
    inp = inp.to(device)
    return inp, orig_images

In [5]:
# Load Video
# Frame folders used for this run:
#   165134 -> pouring juice into a glass (demonstration clip)
#   190456 -> pouring milk into a glass  (observation clip)
# Commented-out alternative clip: '/scratch/iew/sthv2/frames/frames/730'
demo_dir, obs_dir = (
    '/scratch/iew/sthv2/frames/frames/165134',
    '/scratch/iew/sthv2/frames/frames/190456',
)

# Each call returns (model input on the GPU, list of [0, 1]-scaled frames).
(demo_inp, demo_imgs), (obs_inp, obs_imgs) = load_video(demo_dir), load_video(obs_dir)

In [6]:
# Spatial token grid (width x height) used to reshape the tokenizer output below.
width = height = 16

In [7]:
# Convert Video to Embeddings and Extract Latent Actions
def _encode_to_grid(frames):
    """Tokenize `frames` and reshape the tokens to (1, T, width, height, D) on the CPU."""
    tokens, actions, lo, hi = model.tokenizer.encode(frames, reconstructions=False, return_min_max=True)
    grid = tokens.reshape((1, tokens.shape[0], width, height, tokens.shape[2])).cpu()
    return grid, actions, lo, hi


# NOTE(review): A, min_val and max_val returned for the demo clip are overwritten by the
# observation clip's values on the next line - confirm that this is the intended source.
V, A, min_val, max_val = _encode_to_grid(demo_inp)
# Commented-out alternative: reuse the demo tokens as the observation, S = V.detach().clone()
S, A, min_val, max_val = _encode_to_grid(obs_inp)

In [None]:
# Autoregressive rollout on the CPU: start from the first observed token frame and fill
# S_t one step at a time with the frames reconstructed from the predicted actions.
model.eval().cpu()
S_t = torch.zeros_like(V)
S_t[:, 0] = S[:, 0]
sequence_len = V.shape[1]
mask = torch.zeros(sequence_len, dtype=torch.bool).unsqueeze(0)

# Inference only: no gradients are needed, and without no_grad the in-place writes to
# S_t would keep autograd history alive across all rollout steps.
with torch.no_grad():
    for i in range(sequence_len - 1):
        # NOTE(review): at step i frames 0..i of S_t are filled but only 0..i-1 are
        # flagged here - confirm against the semantics of temporal_mask_S.
        mask[:, :i] = True
        A_hat = model(V, S_t, A, return_loss=False, temporal_mask_S=mask)
        A_hat = A_hat[:, :, 0, 0, :]  # keep one action vector per timestep
        recons, recon_vid = model.tokenizer.reconstruct(S_t, A_hat[:, 1:], min_val, max_val, actions_only=False)
        S_t[:, i + 1] = recons[i]

tensor([[[[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]],

          [[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]],

          [[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]],

          ...,

          [[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           

In [9]:
# Run video through action predictor


# Flatten Actions


In [10]:
# Bring the reconstruction to host memory and prepend the first observed frame, which
# the rollout takes as given rather than reconstructs.
# NOTE: this cell rebinds recon_vid, so it must run exactly once after the rollout cell.
recon_vid = recon_vid.detach().cpu().numpy()
first_obs_frame = obs_inp[:1].detach().cpu().numpy()
# np.concatenate instead of np.concat: the latter only exists in NumPy >= 2.0.
recon_vid = np.concatenate((first_obs_frame, recon_vid), axis=0)

In [11]:
#V, latent_recon_vid = model.tokenizer.encode(inp, reconstructions=True, latent_actions=False, return_min_max=False)

In [12]:

# Predicted frames: apply (x + 1) / 2 (the inverse of the 2 * x - 1 scaling done in
# load_video) and reverse each frame's axis order with transpose(2, 1, 0).
images = np.array([((frame + 1) / 2).transpose(2, 1, 0) for frame in recon_vid])

print(obs_imgs[0].shape)
# Ground-truth frames: stack the first 20 observed frames along axis 0 and apply the
# same axis reversal per frame, keeping the time axis first.
orig_images = np.concatenate(obs_imgs[:20]).transpose((0, 3, 2, 1))

torch.Size([1, 3, 128, 128])


In [13]:
# Write the side-by-side comparison GIF to the function's default output path.
animate_trajectories(orig_trajectory=orig_images, pred_trajectory=images, duration=10)