# I/O check

In [1]:
!nvidia-smi

Fri Jul 28 11:58:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.199.02   Driver Version: 470.199.02   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   63C    P0    25W /  N/A |    375MiB /  6069MiB |      5%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# reload sccripts on change
import IPython
%load_ext autoreload

%autoreload 2

In [3]:
# import scripts
import sys
sys.path.append("../src/")

# Libs

In [18]:
import os
import os.path as osp

import pandas as pd

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from torchinfo import summary

In [5]:
from config import Config as cfg
from jepsam_tokenizer import SimpleTokenizer
import vocabulary as vocab

from dataloader import get_dataloaders,SimpleJEPSAMDataset

# Get data

In [6]:
tdf = pd.read_csv(
    osp.join(cfg.DATASET['PATH'], "v1/updated_train.csv")
)

tdf.head()

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...,8,11
1,1011,0,9,amihretu,move the :BOTTLE left,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...,4,8
2,1012,0,9,amihretu,put the :BOTTLE to the right of :MUG,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...,8,11
3,1013,0,9,amihretu,shift the :CUP backwards,:CUP RED POSE-4 :CUP #'*backward-transformati...,4,8
4,1015,0,9,amihretu,shift the :BOTTLE forwards,:BOTTLE GREEN POSE-3 :BOTTLE #'*forward-trans...,4,8


In [47]:
B = 5

s = tdf.sample(n=B)

s

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd
324,1701,0,9,amihretu,put the :CUP in front of :BREAKFAST-CEREAL,:CUP BLUE POSE-8 :BREAKFAST-CEREAL RED POSE-1 ...,7,11
1699,444,0,9,dmusingu,move the :BOTTLE forwards,:BOTTLE BLUE POSE-2 :BOTTLE #'*forward-transf...,4,8
1009,3373,0,9,mdanso,put the :CEREAL to the right of :CAP,:CEREAL RED POSE-13 :CAP BLUE POSE-10 :CEREAL ...,8,11
64,1166,0,9,amihretu,move the :BOTTLE backwards,:BOTTLE BLUE POSE-2 :BOTTLE #'*backward-trans...,4,8
344,1734,0,9,amihretu,put the :BOTTLE in front of :SHOE,:BOTTLE GREEN POSE-7 :SHOE BLUE POSE-3 :BOTTLE...,7,11


In [8]:
s.action_description.values.tolist()

['put the :CUP in front of :PLATE',
 'put the :FORK to the right of :RED-METAL-PLATE',
 'put the :CEREAL to the right of :RED-METAL-PLATE',
 'move the :CEREAL left',
 'put the :BOTTLE to the left of :RED-METAL-PLATE']

# Model utils

1. Episodic embedding `(Module)`
2. Semantic embedding `(Module)`
3. Sequence (action) encoder `(RNN)`
4. Feature mixer `(Linear or RNN)`
5. Attention module `(Module)`
6. Image generator `(Module)`
7. Action generator `(Module)`
8. Image reconstructor `(Module)`
9. Action description reconstructor `(Module)`

#### Episodic embedding

In [9]:
from utils.ae_resnet import get_configs, ResNetEncoder, ResNetDecoder

class EpisodicEncoder(nn.Module):
    def __init__(
        self, 
        cnn_backbone_name:str="resnet50",
        hidden_dim:int=cfg.MODEL["CNN_FC_DIM"]
    ):
        super().__init__()
        
        configs, bottleneck = get_configs(cnn_backbone_name)
        
        self.feature_extractor = ResNetEncoder(configs, bottleneck)
        
        n_ftrs = 2048 * (cfg.IMAGE_SIZE // 32) * (cfg.IMAGE_SIZE // 32)
        self.projection_layer = nn.Linear(
            in_features=n_ftrs,
            out_features=hidden_dim
        )
        

    def forward(self, x_perceived):
        
        B, C, H, W = x_perceived.shape
        
        ftrs = self.feature_extractor(x_perceived)
        # print("ftrs: ", ftrs.shape)
        out = self.projection_layer(ftrs.view(B, -1))
        # print("out: ", out.shape)
        
        return out

In [10]:
epEmb = EpisodicEncoder(cnn_backbone_name="resnet50").cuda()

summary(
    model=epEmb
)

Layer (type:depth-idx)                        Param #
EpisodicEncoder                               --
├─ResNetEncoder: 1-1                          --
│    └─Sequential: 2-1                        --
│    │    └─Conv2d: 3-1                       9,408
│    │    └─BatchNorm2d: 3-2                  128
│    │    └─ReLU: 3-3                         --
│    └─EncoderBottleneckBlock: 2-2            --
│    │    └─MaxPool2d: 3-4                    --
│    │    └─EncoderBottleneckLayer: 3-5       75,008
│    │    └─EncoderBottleneckLayer: 3-6       70,400
│    │    └─EncoderBottleneckLayer: 3-7       70,400
│    └─EncoderBottleneckBlock: 2-3            --
│    │    └─EncoderBottleneckLayer: 3-8       379,392
│    │    └─EncoderBottleneckLayer: 3-9       280,064
│    │    └─EncoderBottleneckLayer: 3-10      280,064
│    │    └─EncoderBottleneckLayer: 3-11      280,064
│    └─EncoderBottleneckBlock: 2-4            --
│    │    └─EncoderBottleneckLayer: 3-12      1,512,448
│    │    └─EncoderBo

In [48]:
ex = torch.randn((B,3,128,128))

ep_emb = epEmb(ex.cuda()).cpu().detach()

ep_emb.shape

torch.Size([5, 256])

#### Semantic Embedding

In [13]:
tokenizer = SimpleTokenizer(vocab=vocab)
text_ex = s.action_description.values.tolist()

tok_text = tokenizer.batch_encode(text_ex)
tok_text

[tensor([48, 59, 62, 12, 54, 53, 57, 20, 46]),
 tensor([48, 59, 62, 13, 63, 62, 60, 57, 22, 46]),
 tensor([48, 59, 62, 10, 63, 62, 60, 57, 22, 46]),
 tensor([48, 56, 62, 10, 55, 46]),
 tensor([48, 59, 62,  5, 63, 62, 55, 57, 22, 46])]

In [57]:
lens = torch.as_tensor([t.shape[0] for t in tok_text])
lens

tensor([ 9, 10, 10,  6, 10])

In [58]:
padded = pad_sequence(
            tok_text, 
            batch_first=True, 
            padding_value=cfg.DATASET["PAD"]
        ).unsqueeze(1)

padded, padded.shape

(tensor([[[48, 59, 62, 12, 54, 53, 57, 20, 46, 47]],
 
         [[48, 59, 62, 13, 63, 62, 60, 57, 22, 46]],
 
         [[48, 59, 62, 10, 63, 62, 60, 57, 22, 46]],
 
         [[48, 56, 62, 10, 55, 46, 47, 47, 47, 47]],
 
         [[48, 59, 62,  5, 63, 62, 55, 57, 22, 46]]]),
 torch.Size([5, 1, 10]))

In [59]:
packed = pack_padded_sequence(
    input=padded, 
    lengths= lens, 
    batch_first=True, 
    enforce_sorted=False
)

packed

PackedSequence(data=tensor([[48, 59, 62, 13, 63, 62, 60, 57, 22, 46],
        [48, 59, 62, 10, 63, 62, 60, 57, 22, 46],
        [48, 59, 62,  5, 63, 62, 55, 57, 22, 46],
        [48, 59, 62, 12, 54, 53, 57, 20, 46, 47],
        [48, 56, 62, 10, 55, 46, 47, 47, 47, 47]]), batch_sizes=tensor([5, 5, 5, 5, 5, 5, 4, 4, 4, 3]), sorted_indices=tensor([1, 2, 4, 0, 3]), unsorted_indices=tensor([3, 0, 1, 4, 2]))

In [60]:
input_data, batch_sizes, _, _ = packed
input_data, batch_sizes

(tensor([[48, 59, 62, 13, 63, 62, 60, 57, 22, 46],
         [48, 59, 62, 10, 63, 62, 60, 57, 22, 46],
         [48, 59, 62,  5, 63, 62, 55, 57, 22, 46],
         [48, 59, 62, 12, 54, 53, 57, 20, 46, 47],
         [48, 56, 62, 10, 55, 46, 47, 47, 47, 47]]),
 tensor([5, 5, 5, 5, 5, 5, 4, 4, 4, 3]))

In [61]:
class SemanticEncoder(nn.Module):
    def __init__(
        self, 
        vocab_size:int=cfg.DATASET["NUM_TOTAL_TOKENS"], 
        embedding_dim:int=cfg.MODEL["CNN_FC_DIM"],
        num_layers:int=2
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=embedding_dim,
            # padding_idx=cfg.DATASET["PAD"]
        )
        
        self.action_encoder = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=embedding_dim // 2,
            num_layers=num_layers,
            bidirectional = True,
            batch_first=True
        ) 

    def forward(self, packed_ad, ad_lens):
        
        # Unpack the packed sequence
        input_data, batch_sizes, _, _ = packed_ad
        embedded = self.embedding(input_data)
        # Pack the embedded sequence back
        packed_embedded = pack_padded_sequence(embedded, ad_lens, enforce_sorted=False, batch_first=True)
        # Apply LSTM on the packed sequence
        packed_output, (h_n, c_n) = self.action_encoder(packed_embedded)
        # Unpack the LSTM output
        out, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        return out, h_n, c_n

In [62]:
semEmb = SemanticEncoder()
    
summary(
    model=semEmb
)

Layer (type:depth-idx)                   Param #
SemanticEncoder                          --
├─Embedding: 1-1                         19,456
├─LSTM: 1-2                              790,528
Total params: 809,984
Trainable params: 809,984
Non-trainable params: 0

In [63]:
semantic_emb, h_ad, c_ad = semEmb(packed, lens)
semantic_emb.shape, h_ad.shape, c_ad.shape

(torch.Size([5, 10, 256]), torch.Size([4, 5, 128]), torch.Size([4, 5, 128]))

#### Concat operation

In [64]:
_, seq_len, _ = semantic_emb.shape

repeated_ep_emb = ep_emb.cpu().detach().unsqueeze(1).expand(
    ep_emb.shape[0], 
    seq_len, 
    ep_emb.shape[-1]
)

repeated_ep_emb.shape

torch.Size([5, 10, 256])

In [65]:
concat_ftrs = torch.cat((repeated_ep_emb, semantic_emb), dim=-1)

concat_ftrs.shape

torch.Size([5, 10, 512])

In [66]:
feature_mixer = nn.LSTM(
    input_size=cfg.MODEL["CNN_FC_DIM"]*2, 
    hidden_size=cfg.MODEL["CNN_FC_DIM"],
    num_layers= 2,
    bidirectional = True,
    batch_first=True
)

feature_mixer

LSTM(512, 256, num_layers=2, batch_first=True, bidirectional=True)

In [67]:
fused_inp, (h, c) = feature_mixer(concat_ftrs)

fused_inp.shape, h.shape, c.shape

(torch.Size([5, 10, 512]), torch.Size([4, 5, 256]), torch.Size([4, 5, 256]))

#### Attention module

In [None]:
class Attention(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Attention, self).__init__()
        self.W_query = nn.Linear(input_dim, hidden_dim)
        self.W_key = nn.Linear(input_dim, hidden_dim)
        self.W_value = nn.Linear(input_dim, hidden_dim)
        self.output_dim = input_dim

    def forward(self, query, key, value):
        Q = self.W_query(query)
        K = self.W_key(key)
        V = self.W_value(value)
        attention_scores = torch.matmul(Q, K.transpose(-2, -1))
        attention_scores = F.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_scores, V)
        return output



class CrossModalAttention(nn.Module):
    def __init__(self, ) -> None:
        super().__init__()

    def forward(self):
        pass



#### Image generator

In [None]:
class GeneratorCNN(nn.Module):
    def __init__(self, latent_dim, output_channels):
        super(GeneratorCNN, self).__init__()
        self.latent_dim = latent_dim
        self.output_channels = output_channels

        # CNN layers for the generator
        self.conv1 = nn.Conv2d(
            latent_dim, 128, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(32, output_channels,
                               kernel_size=3, stride=1, padding=1)

        # Activation function
        self.relu = nn.ReLU()

    def forward(self, latent_repr):
        # Reshape latent representation to a 4D tensor (batch_size, channels, height, width)
        latent_repr = latent_repr.view(
            latent_repr.size(0), self.latent_dim, 1, 1)

        # Pass the latent representation through the CNN layers
        x = self.relu(self.conv1(latent_repr))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        generated_image = self.conv4(x)

        return generated_image


class ImageGenerationModule(nn.Module):
    def __init__(
        self,
        input_channels,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        motor_commands_dim,
        latent_dim
    ):
        super().__init__()

        # Attrs
        self.latent_dim = latent_dim

        # Semantic Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Encoder
        self.encoder_lstm = nn.LSTM(embedding_dim + input_channels, hidden_dim)

        # Episodic Memory (if applicable)
        self.episodic_memory = EpisodicMemory(hidden_dim, hidden_dim)

        # Attention Mechanism (if applicable)
        self.attention = Attention(hidden_dim, hidden_dim)

        # Decoder
        self.generator = GeneratorCNN(latent_dim, output_dim)

    def forward(self, action_desc, initial_state):
        # 1. Encode
        # Semantic Embedding
        embedded_action = self.embedding(action_desc)

        concatenated_input = torch.cat((embedded_action, initial_state), dim=1)

        # Encoder
        encoder_output, (hidden, cell) = self.encoder_lstm(concatenated_input)

        # Episodic Memory
        episodic_output, _ = self.episodic_memory(encoder_output)

        # Attention Mechanism
        attended_output = self.attention(
            episodic_output, hidden, encoder_output)

        # 2. Decode
        latent_repr = torch.randn(encoder_output.size(
            0), self.latent_dim).to(encoder_output.device)
        generated_image = self.generator(latent_repr)

        return generated_image

#### Action generator

In [None]:
class ActionGenerationModule(nn.Module):
    def __init__(
            self,
            latent_dim,
            output_dim
    ) -> None:
        super().__init__()

    def forward(self):
        pass


### JEPSAMEncoder

In [68]:
class JEPSAMEncoder(nn.Module):
    def __init__(
            self,
            vocab_size:int=cfg.DATASET["NUM_TOTAL_TOKENS"], 
            embedding_dim:int=cfg.MODEL["CNN_FC_DIM"],
            num_layers:int=2
    ):
        super().__init__()
        
        # Semantic Encoder
        self.semantic_encoder = SemanticEncoder(
            vocab_size=vocab_size,
            num_layers=num_layers,
            embedding_dim=embedding_dim
        )

        # Episodic Encoder
        self.episodic_encoder = EpisodicEncoder(hidden_dim=embedding_dim)

        # Features mixer
        self.feature_mixer = nn.LSTM(
            input_size=embedding_dim*2, 
            hidden_size=embedding_dim,
            num_layers= num_layers,
            bidirectional = True,
            batch_first=True
        )

    def forward(self, x_ad, x_ad_lens, x_perceived):
        
        # 1. Semantic Embedding & encoding
        semantic_enc, h_ad, c_ad = self.semantic_encoder(x_ad, x_ad_lens)
        _, seq_len, _ = semantic_enc.shape
        
        # 2. Episodic encoding
        episodic_enc = self.episodic_encoder(x_perceived)
        
        repeated_ep_emb = episodic_enc.unsqueeze(1).expand(
            episodic_enc.shape[0], 
            seq_len, 
            episodic_enc.shape[-1]
        )

        # 3. Fusion
        concat_ftrs = torch.cat((repeated_ep_emb, semantic_enc), dim=-1)
        fused_data, h_fused, c_fused = feature_mixer(concat_ftrs)

        return fused_data, h_fused, c_fused, h_ad, c_ad

In [72]:
jepsam_encoder = JEPSAMEncoder().to(cfg.TRAIN["GPU_DEVICE"])

summary(model=jepsam_encoder)

Layer (type:depth-idx)                             Param #
JEPSAMEncoder                                      --
├─SemanticEncoder: 1-1                             --
│    └─Embedding: 2-1                              19,456
│    └─LSTM: 2-2                                   790,528
├─EpisodicEncoder: 1-2                             --
│    └─ResNetEncoder: 2-3                          --
│    │    └─Sequential: 3-1                        9,536
│    │    └─EncoderBottleneckBlock: 3-2            215,808
│    │    └─EncoderBottleneckBlock: 3-3            1,219,584
│    │    └─EncoderBottleneckBlock: 3-4            7,098,368
│    │    └─EncoderBottleneckBlock: 3-5            14,964,736
│    └─Linear: 2-4                                 8,388,864
├─LSTM: 1-3                                        3,153,920
Total params: 35,860,800
Trainable params: 35,860,800
Non-trainable params: 0

In [73]:
E, hn_E, cn_E, hn_ad, cn_ad = jepsam_encoder(packed.cuda(), lens, ex.cuda())

RuntimeError: Input and parameter tensors are not at the same device, found input tensor at cuda:0 and parameter tensor at cpu

### JEPSAMDecoder