The goal here is to build an FSQ-VAE that produces latent vectors for the spatio-temporal "tubelets" described in ViViT. Sequences of tubelet latent vectors are then to be modeled by a Transformer decoder.

![](https://i.imgur.com/9G7QTfV.png)

In [1]:
import torch
from torch import nn

In [2]:
# Goal: take a video tensor of shape (B, C, T, H, W) and cut it into
# (t, p, p) spatio-temporal patches, a.k.a. "tubelets". Rather than embedding
# each tubelet with one Conv3d whose kernel equals the tubelet size, every
# tubelet is meant to go through a small VAE so that its latent can be decoded
# back to pixel space.

# Video dimensions: batch, channels, frames, height, width.
B = 4
C = 3
T = 64
H = 256
W = 256

# Tubelet size: p x p pixels spatially, t frames temporally.
p, t = 16, 4

# Random stand-in for a real batch of videos.
vid = torch.randn((B, C, T, H, W))

In [3]:
# Cut the frames into non-overlapping p x p spatial patches. Each unfold call
# replaces the chosen axis with the number of windows and appends the window
# contents as a new trailing axis.
# Height first: (B, C, T, H, W) -> (B, C, T, H // p, W, p)
vid_unfolded_h = vid.unfold(dimension=3, size=p, step=p)
print(vid_unfolded_h.shape)
# Then width: -> (B, C, T, H // p, W // p, p, p)
vid_unfolded_wh = vid_unfolded_h.unfold(dimension=4, size=p, step=p)
print(vid_unfolded_wh.shape)

torch.Size([4, 3, 64, 16, 256, 16])
torch.Size([4, 3, 64, 16, 16, 16, 16])


In [4]:
# Collapse the (H // p, W // p) patch grid into a single patch-index axis:
# (B, C, T, H // p, W // p, p, p) -> (B, C, T, N, p, p), N = (H // p) * (W // p).
vid_patch_seq = vid_unfolded_wh.reshape((B, C, T, -1, p, p))

In [5]:
# Sanity check of the layout after merging the patch grid:
# (B, C, T, num_patches, p, p).
print(vid_patch_seq.shape)

torch.Size([4, 3, 64, 256, 16, 16])


In [6]:
# Bring the patch index next to the batch axis:
# (B, C, T, N, p, p) -> (B, N, C, T, p, p).
vid_patch_seq = vid_patch_seq.permute((0, 3, 1, 2, 4, 5))
# Materialize the permuted layout in memory so that it can be re-viewed with
# .view() afterwards.
vid_patch_seq = vid_patch_seq.contiguous()
print(vid_patch_seq.shape)

torch.Size([4, 256, 3, 64, 16, 16])


In [7]:
# Fold the patch axis into the batch axis so that every row is one p x p
# spatial patch tracked across all T frames:
# (B, N, C, T, p, p) -> (B * N, C, T, p, p).
vid_patch_seq = vid_patch_seq.view((-1, C, T, p, p))
print(vid_patch_seq.shape)

torch.Size([1024, 3, 64, 16, 16])


In [8]:
# TODO: I'm not sure this is correct. I want a sequence of tubelets like the
# illustration, i.e. cat([x_1, x_k], [x_k+1, x_j], ...).
#
# What this does: splits the T axis into T // t non-overlapping windows of t
# frames. unfold puts the window contents on a new trailing axis, so the result
# is laid out as (B * N, C, T // t, p, p, t).
# NOTE(review): the t axis ends up last, not in a (C, t, p, p) tubelet layout.
# Presumably a permute(0, 2, 1, 5, 3, 4) -> (B * N, T // t, C, t, p, p) is
# still needed before feeding tubelets to a model -- confirm against the
# intended encoder input.
vid_patch_seq = vid_patch_seq.unfold(dimension=2, size=t, step=t)
print(vid_patch_seq.shape)

torch.Size([1024, 3, 16, 16, 16, 4])


In [None]:
# Baseline for comparison: a single Conv3d applied to the whole video, with a
# tubelet-sized (t, p, p) kernel but stride 2 and padding 1 on every axis.
# NOTE(review): for vid of shape (4, 3, 64, 256, 256) the Conv3d output-size
# formula gives (4, 32, 32, 122, 122); the recorded output of this cell does
# not match that, so it looks stale -- re-run to confirm.
conv3d_1 = nn.Conv3d(
    C,
    32,
    kernel_size=(t, p, p),
    stride=2,
    padding=1,
)
out1 = conv3d_1(vid)
print(out1.shape)

torch.Size([1, 32, 16, 126, 126])
