In [2]:
## the structor is ResNet50 -> Transformation -> Position Encoding -> Transformer Encoder-> MLP head
## (B , T , 3 , 224 , 224) -> (B , T , 1024 , 14 , 14) -> (B, T , 2048, 7 , 7) 
## -> (B * T , 2048 , 1 , 1) 
## -> (B*T , 2048 ) -> (B*T , 256) -> (B , T , 256) -> (B , T , 256) -> (B , T , 256) -> (B , T , 256) -> (B*T , 256) 
## -> (B * T , 128) -> (B , T , 128)

In [3]:
import torch
import torchvision.models as models
import torch.nn as nn
import math
class ResNet50(nn.Module):
    """
    The resnet50 layer in the carl paper, the resnet is used to extract feature. We freeze all layers prior to -3 
        then train the following layers for our use.
    """
    def __init__(self):
        """
        Download the pretrain model.
        Specify layers to use
        """
        super().__init__()
        self.model = models.resnet50(pretrained=True)
        self.backbone = nn.Sequential(*list(self.model.children())[:-3])
        self.finetune_layer = nn.Sequential(*list(self.model.children())[-3])
        
        self.resnet_pool = nn.AdaptiveAvgPool2d(1)
    def forward(self,x):
        """
        Use Resnet 50 to extract per-frame features.
        -------
        Input:
            x: (B , T , 3 , 224 , 224)
        Output:
            out: (B , T , 2048) (The output dimension is the same as the -2 layer of ResNet, the only different
            is that we finetune layers between ResNet[-3:-1])
        """
        B , T , C , W , H = x.shape
        frames_per_batch = 25 ## Configuration of how many frames resnet can take once.
        num_blocks = int(math.ceil(float(T) / frames_per_batch))
        output = []
        for i in range(num_blocks): 
            ## make sure the boundary case is considered
            if (i+1)*frames_per_batch > T:
                processing = x[:, i*frames_per_batch:]
            else:
                processing = x[:, i*frames_per_batch:(i+1)*frames_per_batch]
            print(processing.shape)
            processing = processing.contiguous().view(-1,C,W,H)
            print(processing.shape)
            ## feed into ResNet
            self.backbone.eval()

            with torch.no_grad():
                resnet_frame = self.backbone(processing)
            ## append finetune part
            finetune_frame = self.finetune_layer(resnet_frame)
            
            processing = finetune_frame.contiguous().view(B,-1,2048,7,7)
            
            output.append(processing)
        x = torch.cat(output,dim=1)
        x = self.resnet_pool(x)
        x = x.flatten(start_dim=2)
        return x
            
r = ResNet50()

  from .autonotebook import tqdm as notebook_tqdm
  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


In [4]:
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
x = torch.tensor(torch.cat((x.unsqueeze(0),y.unsqueeze(0)),dim=0)).float()
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
x.shape

  """


torch.Size([2, 66, 3, 224, 224])

In [5]:
resnet_out = r(x)
print(resnet_out.shape)

torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])
torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])
torch.Size([2, 16, 3, 224, 224])
torch.Size([32, 3, 224, 224])
torch.Size([2, 66, 2048])


In [6]:
class Transformation(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_layer = []
        
        inchannel = 2048
        for layer in range(2):
            self.fc_layer.append(nn.Dropout(0.1))
            self.fc_layer.append(nn.Linear(inchannel,512))
            self.fc_layer.append(nn.BatchNorm1d(512))
            self.fc_layer.append(nn.ReLU(True))
            inchannel = 512
        self.fc_layer = nn.Sequential(*self.fc_layer)
        
        self.video_emb = nn.Linear(512,256)
    def forward(self,x):
        B , T , R = x.shape
        x = x.view(-1,R)
        x = self.fc_layer(x)
        
        x = self.video_emb(x)
        x = x.view(B,T , x.shape[1])
        return x
    
t = Transformation()

In [7]:
transformed = t(resnet_out)
transformed.shape

torch.Size([2, 66, 256])

In [8]:
import numpy as np
class PositionalEncoder(nn.Module):
    def __init__(self):
        super(PositionalEncoder,self).__init__() ## There is NO DIFFERENCES b/t super() and super(className,self) AFTER python3
        self.drop_out = nn.Dropout(0.1)
    def generate_position_encoding(self,seq_len,d_model):
        """
        Position Encoding:
            Generate multiple seq, the dimesion will be (seq_len,d_model)
            For even number of dimension "d_model", generate sin wave.
            For odd numbers, generate cosine wave.
        """
        pos_matrix = np.zeros((seq_len,d_model))
        for pos in range(seq_len):
            for i in np.arange(d_model/2):
                pos_matrix[pos,int(2*i)] = np.sin(pos / 10000**(2*i / d_model))
                pos_matrix[pos,int(2*i)+1] = np.cos(pos / 10000**(2*i / d_model))
        return torch.from_numpy(pos_matrix).unsqueeze(0)
        
    def forward(self,x):
        B , T , D = x.shape
        pos_matrix = self.generate_position_encoding(T,D)
        print(x.shape,pos_matrix.shape)
        x = x + pos_matrix.type_as(x)
        x = self.drop_out(x)
        return x

PE = PositionalEncoder()
pe = PE(transformed)
pe.shape

torch.Size([2, 66, 256]) torch.Size([1, 66, 256])


torch.Size([2, 66, 256])

In [9]:
class Attention(nn.Module):
    def __init__(self,embed_size,heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = self.embed_size // self.heads
        assert self.head_dim * self.heads == self.embed_size, "dim not compatible"
        
        self.Q2d = nn.Linear(embed_size,embed_size)
        self.K2d = nn.Linear(embed_size,embed_size)
        self.V2d = nn.Linear(embed_size,embed_size)
        
        self.d2o = nn.Linear(embed_size,embed_size)
        
        self.drop_out = nn.Dropout(0.1)
    def forward(self,Q,K,V,mask= None):
        ## in essence, Q, K and V comes from the same tensor(which is X). So Q.shape = K.shape = V.shape
        B , T , _ = Q.shape
        
        ## generate embeddings for query, key ,value
        Q = self.Q2d(Q)
        K = self.K2d(K)
        V = self.V2d(V)
        ## split QKV to n_heads
        Q = Q.view(B , -1 , self.head_dim,self.heads)
        K = K.view(B , -1 , self.head_dim,self.heads)
        V = V.view(B , -1 , self.head_dim,self.heads)
        ## do inner-product for queries and keys
        inner_product = torch.einsum("bqhd,bkhd->bhqk",[Q,K])
        print("inner_product shape" , inner_product.shape)
        ## apply mask in case some of the inputs are padded instead of real things
        if mask is not None:
            inner_product = inner_product.mask_filled(mask==0,float("-1e20"))
        ## find how many attention to pay for each place
        attention = torch.softmax(inner_product / (self.embed_size**(1/2)),dim=3) ## divided by self.embed_size ^ 1/2 according to the paper
        ## sum up the values, multiplied by attention in palce
        out = torch.einsum("bhqk,bvhd->bqhd",[attention,V])
        print(out.shape)
        # out.permute(0,3,1,2)
        # print(out.shape)
        ## apply drop out
        out = self.drop_out(out).reshape(B,T,-1)
        
        out = self.d2o(out)
        print(out.shape)
        return out
A = Attention(transformed.shape[2],8)
a = A(transformed,transformed,transformed)

inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])


In [10]:
class ResidualConnection(nn.Module):
    def __init__(self,embed_size,d_out):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.drop_out = nn.Dropout(d_out)
    def forward(self,x,sublayer):
        res = self.norm(x)
        res = sublayer(res)
        res= self.drop_out(res)
        return x + res

In [11]:
class EncodingLayer(nn.Module):
    """
    Transformer's encoding layer, in each layer it will do attention mechanism and feedforward
    """
    def __init__(self,embed_size,d_ff=1024):
        super().__init__()
        self.reslayer = ResidualConnection(embed_size,.1)
        self.attention = Attention(embed_size,8)
        
        self.feedforward = nn.Sequential(
            nn.Linear(embed_size,d_ff),
            nn.ReLU(True),
            nn.Dropout(0.1),
            nn.Linear(d_ff,embed_size),
        )
    def forward(self,x):
        sublayer = lambda x:self.attention(x,x,x,None)
        x = self.reslayer(x,sublayer)
        print(x.shape)
        x = self.reslayer(x,self.feedforward)
        return x

EL = EncodingLayer(transformed.shape[2])
el = EL(a)
el.shape

inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])


torch.Size([2, 66, 256])

In [12]:
class TransformerEncoder(nn.Module):
    def __init__(self,embed_size):
        super().__init__()
        self.positional_encoder = PositionalEncoder()
        self.encode_layers = nn.ModuleList([
            EncodingLayer(embed_size) for _ in range(3)
        ])
        self.embedding_layer = nn.Linear(256,128)
    def forward(self,x):
        x = self.positional_encoder(x)
        for encode_layer in self.encode_layers:
            x = encode_layer(x)
        x = self.embedding_layer(x)
        return x
Transformer = TransformerEncoder(256)
tfr = Transformer(transformed)

tfr.shape

torch.Size([2, 66, 256]) torch.Size([1, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])


torch.Size([2, 66, 128])

In [13]:
class CARL(nn.Module):
    def __init__(self,embed_size):
        super().__init__()
        self.resnet = ResNet50()
        self.transformation = Transformation()
        self.transformerEncoder = TransformerEncoder(embed_size)
    def forward(self,x):
        resnet = self.resnet(x)
        transformed = self.transformation(resnet)
        encoding = self.transformerEncoder(transformed)
        return encoding
carl = CARL(256)
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
x = torch.tensor(torch.cat((x.unsqueeze(0),y.unsqueeze(0)),dim=0)).float()
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
print(x.shape)

y = carl(x)
print(y.shape)

  app.launch_new_instance()


torch.Size([2, 66, 3, 224, 224])
torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])
torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])
torch.Size([2, 16, 3, 224, 224])
torch.Size([32, 3, 224, 224])
torch.Size([2, 66, 256]) torch.Size([1, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])
inner_product shape torch.Size([2, 32, 66, 66])
torch.Size([2, 66, 32, 8])
torch.Size([2, 66, 256])
torch.Size([2, 66, 256])
torch.Size([2, 66, 128])


In [89]:
class MOCA(nn.Module):
    def __init__(self):
        super().__init__()
        self.phi = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.theta = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.g = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.ruo = nn.Conv2d(2,1,kernel_size=1,stride=1)
        self.W = nn.Conv1d(1024,2048,kernel_size=1,stride=1)
        self.video_emb=nn.Linear(2048,256)
    def forward(self,x):
        print(x.shape)
        x = x.permute(0,2,1)
        B, D, T = x.size()
        ## NSSM
        x_ = x.permute(0,2,1)
        NSSM = x_.matmul(x).softmax(dim=-1)
        print("NSSM shape:" , NSSM)
        # AttentionMap
        x_theta = self.theta(x)
        x_phi = self.phi(x).permute(0,2,1).contiguous()
        AttentionMap = x_phi.matmul(x_theta).softmax(dim=-1)
        print("Attention shape:" , AttentionMap)
        # MocaMap
        x_concat = torch.cat((NSSM,AttentionMap),dim=0)
        print("original x concat shape:" , x_concat.shape)
        x_concat = x_concat.view(NSSM.size(0),-1,NSSM.size(1),NSSM.size(2))
        print("x concat shape: " , x_concat.shape)
        MocaMap = self.ruo(x_concat).reshape(B,T,T)
        print("MocaMap shape:" , MocaMap.shape)
        # g branch
        x_g = self.g(x).permute(0,2,1)
        print("X G shape:" , x_g.shape)
        
        Y = MocaMap.matmul(x_g).permute(0,2,1)
        Wz = self.W(Y)
        Z = Wz+x
        Z = Z.reshape(B,T,D)
        Z = self.video_emb(Z)
        return Z

In [74]:
data = torch.rand(2,10,2048)
m = MOCA()
Z = m(data)
print("Z shape:" , Z.shape)
print("Z :" , Z)

torch.Size([2, 10, 2048])
NSSM shape: torch.Size([2, 10, 10])
Attention shape: torch.Size([2, 10, 10])
original x concat shape: torch.Size([4, 10, 10])
x concat shape:  torch.Size([2, 2, 10, 10])
MocaMap shape: torch.Size([2, 10, 10])
X G shape: torch.Size([2, 10, 1024])
Z shape: torch.Size([2, 10, 256])
Z : tensor([[[ 3.6941e-01,  4.1282e-01,  3.5105e-01,  ...,  3.9634e-01,
          -8.2744e-01, -2.8974e-01],
         [ 3.7809e-01,  1.4918e-01,  5.5495e-01,  ...,  3.8702e-01,
          -4.4206e-01,  3.2910e-01],
         [ 6.5945e-01, -4.2198e-01, -3.2817e-02,  ...,  9.3894e-01,
          -4.6161e-04, -3.7253e-01],
         ...,
         [-1.0380e+00, -1.6090e-01,  5.5413e-01,  ...,  3.9025e-01,
           1.3534e+00, -2.0494e-01],
         [-5.8026e-01, -1.1518e-01,  2.5479e-01,  ...,  1.2233e+00,
          -3.6950e-01, -1.0978e-02],
         [-1.3043e-01,  1.1606e-01,  4.8518e-02,  ...,  6.7899e-01,
          -3.6790e-01, -5.2244e-01]],

        [[ 1.1995e-01, -4.3977e-03,  1.3517e

In [99]:
new_carl = CARL(256)
new_carl.transformation = MOCA2()

In [100]:
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")

print(x.shape)
print(y.shape)

#x = torch.rand(200,224,224,3)
#y = torch.rand(200,224,224,3)

x = torch.tensor(torch.cat((x.unsqueeze(0),y.unsqueeze(0)),dim=0)).float()
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
print(x.shape)

y = new_carl(x)
print(y.shape)
print(y)

torch.Size([66, 720, 404, 3])
torch.Size([66, 720, 404, 3])
torch.Size([2, 66, 3, 224, 224])
torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])


  if sys.path[0] == "":


torch.Size([2, 25, 3, 224, 224])
torch.Size([50, 3, 224, 224])
torch.Size([2, 16, 3, 224, 224])
torch.Size([32, 3, 224, 224])
NSSM shape: tensor([[[1.1842e-08, 2.2539e-16, 1.0000e+00,  ..., 9.1323e-36,
          2.8325e-35, 1.3968e-34],
         [6.9568e-16, 2.5110e-14, 1.0000e+00,  ..., 2.3888e-36,
          2.4585e-36, 2.4070e-35],
         [2.1654e-25, 7.0157e-26, 1.0000e+00,  ..., 1.3901e-42,
          2.4657e-41, 2.4158e-40],
         ...,
         [4.3682e-27, 3.7019e-28, 3.0694e-09,  ..., 4.6514e-07,
          5.2711e-06, 1.4452e-05],
         [1.0525e-27, 2.9599e-29, 4.2313e-09,  ..., 4.0950e-07,
          5.2761e-03, 3.6984e-02],
         [2.2724e-28, 1.2687e-29, 1.8149e-09,  ..., 4.9152e-08,
          1.6191e-03, 7.5612e-01]],

        [[1.1842e-08, 2.2539e-16, 1.0000e+00,  ..., 9.1323e-36,
          2.8325e-35, 1.3968e-34],
         [6.9568e-16, 2.5110e-14, 1.0000e+00,  ..., 2.3888e-36,
          2.4585e-36, 2.4070e-35],
         [2.1654e-25, 7.0157e-26, 1.0000e+00,  ..., 1.

In [98]:
class MOCA2(nn.Module):
    def __init__(self):
        super().__init__()
        self.theta = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.phi = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.g = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.rou = nn.Conv2d(2,1,kernel_size=1,stride=1)
        self.w = nn.Conv1d(1024,2048,kernel_size=1,stride=1)

        self.video_emb = nn.Linear(2048,256)
    def forward(self,x,B=2,T=66):
        x = x.view(B,T,2048)
        # B , T , D = x.size()
        ## NSSM
        x_ = x.permute(0,2,1) # B,D,T
        NSSM = x.matmul(x_).softmax(dim=-1) # B,T,T
        print("NSSM shape:" , NSSM)
        ## AttetnionMap

        x = x.permute(0,2,1) # B,D,T
        x_theta = self.theta(x) # B,D/2,T
        x_phi = self.phi(x).permute(0,2,1) # B,T,D/2
        AttentionMap = x_phi.matmul(x_theta).softmax(dim=-1) # B,T,T
        print("Attention shape:" , AttentionMap)
        ## MocaMap
        x_concat = torch.cat([NSSM,AttentionMap],dim=0) # 2B,T,T
        x_concat = x_concat.view(B,-1,T,T) # B,2,T,T
        MocaMap = self.rou(x_concat).squeeze(1).softmax(dim=-1) # B,T,T
        ## G branch
        x_g = self.g(x) # B,D/2,T

        # print("MocaMap.shape: ",MocaMap.shape)
        # print("x_g.shape: ",x_g.shape)

        Y = x_g.matmul(MocaMap) # B,D/2,T
        Wz = self.w(Y)          # B,D,T
        Z = (Wz+x).permute(0,2,1)                # B,T,D
        Z = self.video_emb(Z)                    # B,T,256
        return Z

In [2]:
import torch
import sys
sys.path.append('/home/c1l1mo/projects/VideoAlignment')
import model
imp

In [3]:
cfg=None
carl = model.carl_transformer.transformer.TransformerModel(cfg)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


AttributeError: 'NoneType' object has no attribute 'MODEL'