In [None]:
## The structure is ResNet50 -> Transformation -> Position Encoding -> Transformer Encoder -> MLP head
## (B , T , 3 , 224 , 224) -> (B , T , 1024 , 14 , 14) -> (B, T , 2048, 7 , 7) 
## -> (B * T , 2048 , 1 , 1) 
## -> (B*T , 2048 ) -> (B*T , 256) -> (B , T , 256) -> (B , T , 256) -> (B , T , 256) -> (B , T , 256) -> (B*T , 256) 
## -> (B * T , 128) -> (B , T , 128)

In [None]:
import torch
import torchvision.models as models
import torch.nn as nn
import math
class ResNet50(nn.Module):
    """
    Per-frame feature extractor built on a pretrained torchvision ResNet-50 (CARL paper).

    Every child of the ResNet before index -3 forms ``backbone``, which is always run in
    eval mode under ``torch.no_grad()`` (i.e. frozen). The child at index -3 is kept
    trainable as ``finetune_layer``. Its output is average-pooled to one vector per frame.
    """
    def __init__(self):
        """
        Download the pretrained model and split it into the frozen and fine-tuned parts.
        """
        super().__init__()
        # NOTE(review): `pretrained=True` is deprecated in newer torchvision (a `weights=`
        # argument replaces it); kept so the cell still runs on the installed version.
        self.model = models.resnet50(pretrained=True)
        self.backbone = nn.Sequential(*list(self.model.children())[:-3])
        self.finetune_layer = nn.Sequential(*list(self.model.children())[-3])
        
        self.resnet_pool = nn.AdaptiveAvgPool2d(1)
    def forward(self,x):
        """
        Use ResNet-50 to extract per-frame features.
        -------
        Input:
            x: (B , T , C , H , W) video batch, e.g. (B , T , 3 , 224 , 224)
        Output:
            out: (B , T , D) where D is the channel count produced by ``finetune_layer``
                 (2048 for the stock ResNet-50).
        """
        B , T , C , H , W = x.shape
        frames_per_batch = 25 ## Configuration of how many frames resnet can take once.
        output = []
        # Slicing past the end is safe in PyTorch, so the last (short) chunk needs no special case.
        for start in range(0, T, frames_per_batch):
            processing = x[:, start:start + frames_per_batch]
            processing = processing.contiguous().view(-1, C, H, W)
            ## feed into ResNet (frozen part)
            self.backbone.eval()
            with torch.no_grad():
                resnet_frame = self.backbone(processing)
            ## append finetune part
            finetune_frame = self.finetune_layer(resnet_frame)
            # Pool while the tensor is still 4-D (the documented input rank of
            # AdaptiveAvgPool2d), then restore the (B, frames, D) layout. The channel
            # count is read from the tensor instead of being hard-coded to 2048/7/7.
            pooled = self.resnet_pool(finetune_frame).flatten(start_dim=1)
            output.append(pooled.view(B, -1, pooled.shape[1]))
        return torch.cat(output, dim=1)
            
# Instantiate the feature extractor once; a later cell calls it as `r(x)`.
r = ResNet50()

In [None]:
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
# Read the same clip twice so the two copies can act as a batch of two videos.
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
# torch.stack adds the batch axis directly; wrapping an existing tensor in torch.tensor(...)
# only made an extra copy and triggers PyTorch's copy-construct UserWarning.
x = torch.stack((x, y), dim=0).float()
# Keep the first 224 entries along axes 2 and 3, then move the last axis to position 2.
# NOTE(review): assumes read_video yields (T, H, W, C) frames, so the result is
# (B, T, C, 224, 224) — confirm. No pixel normalisation is applied here.
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
x.shape

In [None]:
# Run the feature extractor on the video batch and inspect the resulting shape.
resnet_out = r(x)
print(resnet_out.shape)

In [None]:
class Transformation(nn.Module):
    """
    Projects 2048-wide per-frame features down to a 256-d embedding.

    Two (Dropout -> Linear -> BatchNorm1d -> ReLU) stages map 2048 -> 512 -> 512,
    followed by a final linear layer 512 -> 256. Frames are flattened to (B*T, R)
    for the stack and reshaped back to (B, T, 256) afterwards.
    """
    def __init__(self):
        super().__init__()
        stages = []
        width_in = 2048
        for _ in range(2):
            stages += [
                nn.Dropout(0.1),
                nn.Linear(width_in, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(True),
            ]
            width_in = 512
        self.fc_layer = nn.Sequential(*stages)

        self.video_emb = nn.Linear(512, 256)

    def forward(self, x):
        """x: (B, T, R) -> (B, T, 256)."""
        batch, steps, feat = x.shape
        flat = x.view(-1, feat)
        embedded = self.video_emb(self.fc_layer(flat))
        return embedded.view(batch, steps, embedded.shape[1])
    
# Instantiate the projection stack; a later cell applies it to `resnet_out`.
t = Transformation()

In [None]:
# Apply the projection stack to the extracted features and inspect the shape.
transformed = t(resnet_out)
transformed.shape

In [None]:
import numpy as np
class PositionalEncoder(nn.Module):
    """
    Adds a fixed sinusoidal positional encoding to a (B, T, D) tensor, then applies dropout.
    """
    def __init__(self):
        super(PositionalEncoder,self).__init__() ## There is NO DIFFERENCES b/t super() and super(className,self) AFTER python3
        self.drop_out = nn.Dropout(0.1)
    def generate_position_encoding(self,seq_len,d_model):
        """
        Build the (1, seq_len, d_model) float64 encoding table.

        For pair index i (columns 2i and 2i+1) and position pos:
            table[pos, 2i]   = sin(pos / 10000**(2i / d_model))
            table[pos, 2i+1] = cos(pos / 10000**(2i / d_model))
        When d_model is odd, the last column only has a sine entry (the previous
        loop-based version raised IndexError in that case).
        """
        pos_matrix = np.zeros((seq_len,d_model))
        positions = np.arange(seq_len)[:, None]                 # (seq_len, 1)
        even_cols = np.arange(0, d_model, 2)                    # the "2i" column indices
        angles = positions / np.power(10000.0, even_cols / d_model)
        pos_matrix[:, 0::2] = np.sin(angles)
        # There is one fewer cosine column than sine column when d_model is odd.
        pos_matrix[:, 1::2] = np.cos(angles[:, : d_model // 2])
        return torch.from_numpy(pos_matrix).unsqueeze(0)
        
    def forward(self,x):
        """x: (B, T, D) -> (B, T, D) with the encoding added (broadcast over B) and dropout applied."""
        B , T , D = x.shape
        pos_matrix = self.generate_position_encoding(T,D)
        # type_as casts the float64 table to x's dtype before the addition.
        x = x + pos_matrix.type_as(x)
        x = self.drop_out(x)
        return x

# Smoke test: apply the positional encoder to the projected features and inspect the shape.
PE = PositionalEncoder()
pe = PE(transformed)
pe.shape

In [None]:
class Attention(nn.Module):
    """
    Multi-head scaled dot-product attention.

    Q, K and V are linearly projected, split into ``heads`` heads of width
    ``head_dim = embed_size // heads``, attended per head, concatenated and projected
    back to ``embed_size``. Dropout (p=0.1) is applied to the attended values.
    """
    def __init__(self,embed_size,heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = self.embed_size // self.heads
        assert self.head_dim * self.heads == self.embed_size, "dim not compatible"
        
        self.Q2d = nn.Linear(embed_size,embed_size)
        self.K2d = nn.Linear(embed_size,embed_size)
        self.V2d = nn.Linear(embed_size,embed_size)
        
        self.d2o = nn.Linear(embed_size,embed_size)
        
        self.drop_out = nn.Dropout(0.1)
    def forward(self,Q,K,V,mask= None):
        """
        Q: (B, Tq, E); K, V: (B, Tk, E).
        mask: optional; entries equal to 0 are excluded. It must broadcast against the
              (B, heads, Tq, Tk) score tensor; a 3-D mask gets a head axis inserted at dim 1.
        Returns: (B, Tq, E).
        """
        B , T , _ = Q.shape
        
        ## generate embeddings for query, key ,value
        Q = self.Q2d(Q)
        K = self.K2d(K)
        V = self.V2d(V)
        ## split QKV to n_heads: (B, T, heads, head_dim), matching the "h"/"d" einsum labels.
        ## (The previous (head_dim, heads) order silently swapped the two axes.)
        Q = Q.view(B , -1 , self.heads , self.head_dim)
        K = K.view(B , -1 , self.heads , self.head_dim)
        V = V.view(B , -1 , self.heads , self.head_dim)
        ## do inner-product for queries and keys -> (B, heads, Tq, Tk)
        inner_product = torch.einsum("bqhd,bkhd->bhqk",[Q,K])
        ## apply mask in case some of the inputs are padded instead of real things
        if mask is not None:
            if mask.dim() == 3:
                mask = mask.unsqueeze(1)
            inner_product = inner_product.masked_fill(mask==0,float("-1e20"))
        ## scale by sqrt(d_k), where d_k is the per-head width, then normalise over the keys
        attention = torch.softmax(inner_product / (self.head_dim**(1/2)),dim=3)
        ## weighted sum of the values; the key index "k" must be shared between the
        ## attention weights and V, otherwise the weights are summed away.
        out = torch.einsum("bhqk,bkhd->bqhd",[attention,V])
        ## apply drop out and merge the heads back into one embedding axis
        out = self.drop_out(out).reshape(B,T,-1)
        
        out = self.d2o(out)
        return out
# Smoke test: 8-head self-attention over the projected features (query = key = value).
A = Attention(transformed.shape[2],8)
a = A(transformed,transformed,transformed)

In [None]:
class ResidualConnection(nn.Module):
    """
    Pre-norm residual wrapper: returns ``x + dropout(sublayer(layer_norm(x)))``.

    embed_size: size of the last axis, normalised by the LayerNorm.
    d_out: dropout probability applied to the sublayer output.
    """
    def __init__(self, embed_size, d_out):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.drop_out = nn.Dropout(d_out)

    def forward(self, x, sublayer):
        """``sublayer`` is any callable mapping a tensor to a tensor of the same shape."""
        branch = self.drop_out(sublayer(self.norm(x)))
        return x + branch

In [None]:
class EncodingLayer(nn.Module):
    """
    Transformer's encoding layer: self-attention followed by a position-wise
    feed-forward network, each wrapped in its own pre-norm residual connection.

    embed_size: model width.
    d_ff: hidden width of the feed-forward network.
    """
    def __init__(self,embed_size,d_ff=1024):
        super().__init__()
        # One residual wrapper per sub-layer, so the attention and feed-forward branches
        # do not share LayerNorm parameters. `reslayer` keeps its original name.
        self.reslayer = ResidualConnection(embed_size,.1)
        self.reslayer_ff = ResidualConnection(embed_size,.1)
        self.attention = Attention(embed_size,8)
        
        self.feedforward = nn.Sequential(
            nn.Linear(embed_size,d_ff),
            nn.ReLU(True),
            nn.Dropout(0.1),
            nn.Linear(d_ff,embed_size),
        )
    def forward(self,x):
        """x: (B, T, embed_size) -> (B, T, embed_size)."""
        # The residual wrapper expects a one-argument callable, hence the lambda
        # that feeds the same tensor in as query, key and value.
        sublayer = lambda x:self.attention(x,x,x,None)
        x = self.reslayer(x,sublayer)
        x = self.reslayer_ff(x,self.feedforward)
        return x

# Smoke test: run one encoding layer on the attention output `a` and inspect the shape.
EL = EncodingLayer(transformed.shape[2])
el = EL(a)
el.shape

In [None]:
class TransformerEncoder(nn.Module):
    """
    Positional encoding -> 3 encoding layers -> linear projection to 128 dimensions.

    embed_size: width of the incoming per-frame embeddings.
    """
    def __init__(self,embed_size):
        super().__init__()
        self.positional_encoder = PositionalEncoder()
        self.encode_layers = nn.ModuleList([
            EncodingLayer(embed_size) for _ in range(3)
        ])
        # Use embed_size rather than a hard-coded 256 so the projection matches the
        # encoder width; identical to before when embed_size == 256.
        self.embedding_layer = nn.Linear(embed_size,128)
    def forward(self,x):
        """x: (B, T, embed_size) -> (B, T, 128)."""
        x = self.positional_encoder(x)
        for encode_layer in self.encode_layers:
            x = encode_layer(x)
        x = self.embedding_layer(x)
        return x
# Smoke test: encode the projected features with a 256-wide encoder and inspect the shape.
Transformer = TransformerEncoder(256)
tfr = Transformer(transformed)

tfr.shape

In [None]:
class CARL(nn.Module):
    """
    Full pipeline: ResNet50 feature extractor -> Transformation -> TransformerEncoder.

    embed_size is forwarded to the TransformerEncoder.
    """
    def __init__(self, embed_size):
        super().__init__()
        self.resnet = ResNet50()
        self.transformation = Transformation()
        self.transformerEncoder = TransformerEncoder(embed_size)

    def forward(self, x):
        """Chain the three stages on ``x`` and return the encoder output."""
        frame_features = self.resnet(x)
        frame_embeddings = self.transformation(frame_features)
        return self.transformerEncoder(frame_embeddings)
carl = CARL(256)
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
# Read the same clip twice so the two copies can act as a batch of two videos.
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
# torch.stack adds the batch axis directly; wrapping an existing tensor in torch.tensor(...)
# only made an extra copy and triggers PyTorch's copy-construct UserWarning.
x = torch.stack((x, y), dim=0).float()
# Keep the first 224 entries along axes 2 and 3, then move the last axis to position 2.
# NOTE(review): assumes read_video yields (T, H, W, C) frames — confirm.
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
print(x.shape)

# End-to-end forward pass through the whole model.
y = carl(x)
print(y.shape)

In [None]:
class MOCA(nn.Module):
    """
    Attention block that fuses a feature self-similarity map (NSSM) with a learned
    attention map, re-weights a value branch with the fused map, adds the result back
    to the input and projects each frame to 256 dimensions.

    Expects 2048-wide frame features (the channel sizes are fixed by the 1x1 convolutions).
    """
    def __init__(self):
        super().__init__()
        self.phi = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.theta = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.g = nn.Conv1d(2048,1024,kernel_size=1,stride=1)
        self.ruo = nn.Conv2d(2,1,kernel_size=1,stride=1)
        self.W = nn.Conv1d(1024,2048,kernel_size=1,stride=1)
        self.video_emb=nn.Linear(2048,256)
    def forward(self,x):
        """x: (B, T, 2048) -> (B, T, 256)."""
        B, T, D = x.shape
        x_t = x.permute(0,2,1)                                   # B,D,T (layout Conv1d expects)
        ## NSSM: softmax-normalised frame-to-frame similarity
        NSSM = x.matmul(x_t).softmax(dim=-1)                     # B,T,T
        # AttentionMap
        x_theta = self.theta(x_t)                                # B,D/2,T
        x_phi = self.phi(x_t).permute(0,2,1).contiguous()        # B,T,D/2
        AttentionMap = x_phi.matmul(x_theta).softmax(dim=-1)     # B,T,T
        # MocaMap: stack the two maps as the two input channels of the 1x1 Conv2d.
        # (cat along the batch axis followed by view(B,-1,T,T) paired maps that belong
        # to different samples whenever B > 1.)
        x_concat = torch.stack((NSSM,AttentionMap),dim=1)        # B,2,T,T
        MocaMap = self.ruo(x_concat).reshape(B,T,T)              # B,T,T
        # g branch
        x_g = self.g(x_t).permute(0,2,1)                         # B,T,D/2
        
        Y = MocaMap.matmul(x_g).permute(0,2,1)                   # B,D/2,T
        Wz = self.W(Y)                                           # B,D,T
        # Residual connection, then swap the axes back to (B,T,D). A plain reshape
        # would reinterpret memory and scramble features across frames.
        Z = (Wz+x_t).permute(0,2,1)                              # B,T,D
        Z = self.video_emb(Z)                                    # B,T,256
        return Z

In [None]:
# Smoke test for MOCA on random features: 2 videos x 10 frames x 2048 channels.
data = torch.rand(2,10,2048)
m = MOCA()
Z = m(data)
print("Z shape:" , Z.shape)
# NOTE(review): this prints the entire output tensor.
print("Z :" , Z)

In [None]:
# Build a CARL model and replace its Transformation stage with MOCA2.
# NOTE(review): MOCA2 is defined in a cell further down, so on a fresh kernel this
# cell raises NameError unless that cell has been run first.
new_carl = CARL(256)
new_carl.transformation = MOCA2()

In [None]:
from torchvision.io import read_video
from torchvision.transforms import Resize,functional
# Read the same clip twice so the two copies can act as a batch of two videos.
x,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")
y,_,_ = read_video("/home/c1l1mo/datasets/ACM_skating/Axel_and_Axel_com/467204328706015300.mp4")

print(x.shape)
print(y.shape)

#x = torch.rand(200,224,224,3)
#y = torch.rand(200,224,224,3)

# torch.stack adds the batch axis directly; wrapping an existing tensor in torch.tensor(...)
# only made an extra copy and triggers PyTorch's copy-construct UserWarning.
x = torch.stack((x, y), dim=0).float()
# Keep the first 224 entries along axes 2 and 3, then move the last axis to position 2.
# NOTE(review): assumes read_video yields (T, H, W, C) frames — confirm.
x = x[:,:,:224,:224]
x = x.permute(0,1,4,2,3)
print(x.shape)

# Forward pass through the CARL variant whose transformation stage was swapped out.
y = new_carl(x)
print(y.shape)
print(y)

In [None]:
class MOCA2(nn.Module):
    """
    Second take on the MOCA block: fuses a feature self-similarity map (NSSM) with a
    learned attention map, re-weights a value branch, adds the result back to the input
    and projects each frame to 256 dimensions.

    Expects 2048-wide frame features (fixed by the 1x1 convolutions).
    """
    def __init__(self):
        super().__init__()
        self.theta = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.phi = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.g = nn.Conv1d(2048 , 1024 , kernel_size=1 , stride=1 )
        self.rou = nn.Conv2d(2,1,kernel_size=1,stride=1)
        self.w = nn.Conv1d(1024,2048,kernel_size=1,stride=1)

        self.video_emb = nn.Linear(2048,256)
    def forward(self,x,B=None,T=None):
        """
        x: (B, T, 2048), or any tensor reshapeable to that when B and T are given.
        B, T: optional batch size / sequence length. When omitted they are read from a
              3-D ``x``; for other ranks the historical defaults (2 and 66) are used so
              existing calls keep working.
        Returns: (B, T, 256).
        """
        if B is None or T is None:
            if x.dim() == 3:
                B = x.shape[0] if B is None else B
                T = x.shape[1] if T is None else T
            else:
                B = 2 if B is None else B
                T = 66 if T is None else T
        x = x.reshape(B,T,2048)
        ## NSSM
        x_ = x.permute(0,2,1) # B,D,T
        NSSM = x.matmul(x_).softmax(dim=-1) # B,T,T
        ## AttentionMap

        x = x.permute(0,2,1) # B,D,T
        x_theta = self.theta(x) # B,D/2,T
        x_phi = self.phi(x).permute(0,2,1) # B,T,D/2
        AttentionMap = x_phi.matmul(x_theta).softmax(dim=-1) # B,T,T
        ## MocaMap: stack the two maps as the two Conv2d input channels. (cat along the
        ## batch axis + view(B,-1,T,T) paired maps from different samples when B > 1.)
        x_concat = torch.stack([NSSM,AttentionMap],dim=1) # B,2,T,T
        MocaMap = self.rou(x_concat).squeeze(1).softmax(dim=-1) # B,T,T
        ## G branch
        x_g = self.g(x) # B,D/2,T

        # NOTE(review): this right-multiplies by MocaMap, whereas MOCA computes
        # MocaMap.matmul(x_g); confirm which orientation is intended.
        Y = x_g.matmul(MocaMap) # B,D/2,T
        Wz = self.w(Y)          # B,D,T
        Z = (Wz+x).permute(0,2,1)                # B,T,D
        Z = self.video_emb(Z)                    # B,T,256
        return Z

In [1]:
import torch
import sys
sys.path.append('/home/c1l1mo/projects/VideoAlignment')
import model
import yaml
# Load the experiment configuration from YAML.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
cfg_file = "/home/c1l1mo/projects/VideoAlignment/result/scl_processed_axel_trimmed/config.yaml"
with open(cfg_file, 'r') as config_file:
    config_dict = yaml.safe_load(config_file)
from easydict import EasyDict as Edict

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Wrap the config for attribute access and build the two models that the following
# cells compare: `carl` from model.carl_transformer and `mine` from model.transformer.
cfg=Edict(config_dict)
carl = model.carl_transformer.transformer.TransformerModel(cfg)
mine = model.transformer.transformer.CARL(cfg)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


256 8 256 False
256 0.0 False
256 0.0 False
256 0.0 False
256 0.0 False
256 0.0 False
256 0.0 False


In [9]:
# Grab the backbone sub-module of each model.
carl_res = carl.backbone
my_res = mine.resnet50.backbone

In [10]:
# Random 5-D input shared by the comparison cells below.
data = torch.rand(1,10,3,224,224)

In [11]:
import math
def carl_process(x):
    """
    Run the `carl` model's feature path on a video batch.

    x: (batch_size, num_steps, c, h, w).
    Frames are pushed through `carl.backbone` (eval mode, no gradients) and
    `carl.res_finetune` in chunks of 40 frames, concatenated along the time axis,
    pooled with `carl.embed.pooling` and flattened to
    (batch_size * num_steps, features).

    Relies on the notebook-level `carl` model.
    """
    # Read the shape from the argument, not from the notebook-level `data` tensor.
    batch_size, num_steps, c , h ,w = x.shape
    frames_per_batch = 40
    num_blocks = int(math.ceil(float(num_steps)/frames_per_batch))
    backbone_out = []
    for i in range(num_blocks):
        curr_idx = i * frames_per_batch
        cur_steps = min(num_steps-curr_idx, frames_per_batch) ## make sure the next line will not be out of bound
        curr_data = x[:, curr_idx:curr_idx+cur_steps]         ## take a batch for resnet to encode (size will be frames_per_batch or the remainder)
        curr_data = curr_data.contiguous().view(-1, c, h, w)
        carl.backbone.eval()
        with torch.no_grad():
            curr_emb = carl.backbone(curr_data)
        curr_emb = carl.res_finetune(curr_emb)
        _, out_c, out_h, out_w = curr_emb.size()
        curr_emb = curr_emb.contiguous().view(batch_size, cur_steps, out_c, out_h, out_w)
        backbone_out.append(curr_emb)
    if not backbone_out:
        # No frames: return the input untouched, as the original did when the loop never ran.
        return x
    # Pool only after every chunk is collected. Doing this inside the loop overwrote
    # `x`, `c`, `h` and `w`, which broke every chunk after the first.
    feats = torch.cat(backbone_out, dim=1)
    _,_,feat_c,feat_h,feat_w = feats.shape
    feats = feats.view(batch_size*num_steps,feat_c,feat_h,feat_w)
    feats = carl.embed.pooling(feats)
    feats = feats.flatten(start_dim=1)
    return feats
def my_process(x):
    """
    Run the re-implemented feature extractor (`mine.resnet50`) on a video batch.

    Relies on the notebook-level `mine` model. The manual chunking loop that used to
    follow the return statement was unreachable and has been removed.
    """
    return mine.resnet50(x)

In [12]:
# Check that both feature paths give bit-identical results, then show the output shapes.
print(torch.equal(carl_process(data),my_process(data)))
print(carl_process(data).shape)
print(my_process(data).shape)

True
torch.Size([10, 2048])
torch.Size([10, 2048])


In [19]:
import torch.nn as nn
import random
import numpy as np
# Seed every RNG in play so the comparison below is reproducible.
seed = 7
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True


def _constant_mlp():
    """Build the 2048 -> 512 -> 512 (Linear, BatchNorm1d, ReLU) x 2 stack used for the comparison."""
    stages = []
    width_in = 2048
    for _ in range(2):
        #stages.append(nn.Dropout(.1))
        stages.extend([nn.Linear(width_in, 512), nn.BatchNorm1d(512), nn.ReLU(True)])
        width_in = 512
    return nn.Sequential(*stages)


# Two structurally identical stacks: C stands in for the reference model, M for the re-implementation.
C = _constant_mlp()
M = _constant_mlp()

# Overwrite every parameter with 0.1 so both stacks compute exactly the same function.
for _net in (C, M):
    for _weight in _net.parameters():
        _weight.data.fill_(.1)


In [20]:
# Both stacks hold identical constant parameters, so identical inputs must give identical outputs.
torch.equal(C(carl_process(data)),M(my_process(data)))

True

In [None]:
def generate_sincos_embedding(seq_len, d_model, train_len=None):
    """
    Build a (1, seq_len, d_model) float64 sinusoidal table.

    Column j of row pos holds sin(pos / 10000**(j / d_model)) for even j and
    cos(pos / 10000**(j / d_model)) for odd j. ``train_len`` is accepted but unused.
    """
    sin_cols = np.arange(0, d_model, 2)   # even column indices
    cos_cols = np.arange(1, d_model, 2)   # odd column indices
    sin_div = 10000 ** (sin_cols / d_model)
    cos_div = 10000 ** (cos_cols / d_model)

    table = np.zeros((seq_len, d_model))
    for row, pos in enumerate(np.arange(seq_len)):
        table[row, sin_cols] = np.sin(pos / sin_div)
        table[row, cos_cols] = np.cos(pos / cos_div)

    return torch.from_numpy(table).unsqueeze(0)

class PositionalEncoder(nn.Module):
    """
    Adds the table from ``generate_sincos_embedding`` to a (B, S, d_model) input and
    applies dropout with probability ``dout_p``.
    """
    def __init__(self, d_model, dout_p, seq_len=3660):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dout_p)
        self.seq_len = seq_len

    def forward(self, x):
        """x: (B, S, d_model) -> same shape."""
        B, S, d_model = x.shape
        # seq_len is only forwarded as train_len when the input length differs from it.
        if S == self.seq_len:
            pos_enc_mat = generate_sincos_embedding(S, d_model)
        else:
            pos_enc_mat = generate_sincos_embedding(S, d_model, self.seq_len)
        return self.dropout(x + pos_enc_mat.type_as(x))
    
class PositionalEncoding(nn.Module):
    """
    Re-implementation of the sinusoidal positional encoder: adds a fixed sin/cos table
    to a (B, T, embed_size) input and applies dropout with probability ``dout_p``.
    """
    def __init__(self, embed_size, dout_p):
        super().__init__()
        self.embed_size = embed_size  # width of the embedding axis, e.g. 256 for a (T x 256) input
        self.dropout = nn.Dropout(dout_p)

    def encode(self, seq_len, embed_size, train_len=None):
        """
        Return a (1, seq_len, embed_size) float64 table whose even columns j hold
        sin(pos / 10000**(j / embed_size)) and odd columns hold the matching cosine.
        ``train_len`` is accepted but unused.
        """
        sin_cols = np.arange(0, embed_size, 2)   # [0, 2, 4, ...]
        cos_cols = np.arange(1, embed_size, 2)   # [1, 3, 5, ...]

        # One row per position.
        table = np.zeros((seq_len, embed_size))
        for row, pos in enumerate(np.arange(seq_len)):
            table[row, sin_cols] = np.sin(pos / (10000 ** (sin_cols / embed_size)))
            table[row, cos_cols] = np.cos(pos / (10000 ** (cos_cols / embed_size)))

        return torch.from_numpy(table).unsqueeze(0)

    def forward(self, x):
        """x: (B, T, embed_size) -> same shape."""
        B, T, embed_size = x.shape
        table = self.encode(T, embed_size)
        return self.dropout(x + table.type_as(x))
# Compare the two positional encoders on random input; dropout probability 0 keeps both deterministic.
pe_data = torch.rand(2,10,256)
carl_pe = PositionalEncoder(256,0)
my_pe = PositionalEncoding(256,0)
torch.equal(carl_pe(pe_data),my_pe(pe_data))

In [None]:
import torch.nn as nn
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward block (Linear -> ReLU -> Linear) applied on the last axis.

    Every weight and bias is overwritten with 0.1 at construction so the output can be
    compared against another implementation initialised the same way.
    """
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.activation = nn.ReLU(inplace=True)

        # Constant initialisation of both linear layers.
        for layer in (self.fc1, self.fc2):
            for weight in layer.parameters():
                weight.data.fill_(.1)

    def forward(self, x):
        '''In, Out: (B, S, D)'''
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)
# Compare the class-based feed-forward block with an equivalent nn.Sequential;
# both have every parameter set to 0.1.
embed_size = d_model = 256
carl_ff = PositionwiseFeedForward(256,256*4)
my_ff = nn.Sequential(
            nn.Linear(embed_size,4 * embed_size),
            nn.ReLU(True),
            nn.Linear(4 * embed_size,embed_size)
        )
for name,parameter in my_ff.named_parameters():
    parameter.data.fill_(.1)
ff_data = torch.rand(2,10,256)
torch.equal(carl_ff(ff_data),my_ff(ff_data))

In [36]:

import torch.nn as nn
import numpy as np
import torch.nn.functional as F
class Attention(nn.Module):
    """
    Multi-head scaled dot-product attention (einsum formulation).

    embed_size: model width; must be divisible by ``heads``.
    dout_p: dropout probability applied to the attended values.
    heads: number of attention heads.
    test: when True, every projection parameter is set to 0.1 and dropout is disabled.

    Note: the attribute ``n_heads`` stores ``embed_size // heads``, i.e. the per-head width.
    """
    def __init__(self,embed_size,dout_p,heads=8 , test = False):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.n_heads = embed_size // heads
        assert self.embed_size % self.heads == 0
        self.Q2d = nn.Linear(embed_size,embed_size)
        self.K2d = nn.Linear(embed_size,embed_size)
        self.V2d = nn.Linear(embed_size,embed_size)
        self.d2O = nn.Linear(embed_size,embed_size)
        if test:
            for proj in (self.Q2d, self.K2d, self.V2d, self.d2O):
                for weight in proj.parameters():
                    weight.data.fill_(.1)
            dout_p = 0
        self.dropout = nn.Dropout(dout_p)

    def forward(self,Q,K,V,mask=None):
        """
        Q, K, V: (B, T, embed_size). mask: optional; a head axis is inserted at dim 1
        and positions where it equals 0 are set to -inf before the softmax.
        Returns: (B, T, embed_size).
        """
        B , T , embed_size = Q.shape

        def split_heads(tensor):
            # (B, T, E) -> (B, T, heads, per-head width)
            return tensor.reshape(B , T , self.heads, self.n_heads)

        queries = split_heads(self.Q2d(Q))
        keys = split_heads(self.K2d(K))
        values = split_heads(self.V2d(V))

        scores = torch.einsum('bqhd,bkhd->bhqk',[queries,keys])
        scores = scores / np.sqrt(self.n_heads)
        if mask is not None:
            mask = mask.unsqueeze(1)#.unsqueeze(2)
            scores = scores.masked_fill(mask==0,-float('inf'))
        weights = torch.softmax(scores,dim=-1)

        context = torch.einsum('bhqk,bkhd->bqhd',[weights,values])
        context = self.dropout(context)
        context = context.contiguous().view(B,T,embed_size)
        return self.d2O(context)
def attention(Q, K, V, mask=None, dropout=None, visual=False):
    """
    Scaled dot-product attention over the last two axes.

    Q, K, V: (..., seq_len, d_k) tensors. Scores are Q K^T / sqrt(d_k); positions where
    ``mask == 0`` are set to -inf before the softmax. ``dropout``, if given, is applied
    to the attended values. With ``visual=True`` the detached softmax weights are
    returned alongside the output.
    """
    scale = np.sqrt(Q.size(-1))
    # (..., Sq, Sk)
    logits = Q.matmul(K.transpose(-1, -2)) / scale

    if mask is not None:
        logits = logits.masked_fill(mask == 0, -float('inf'))

    weights = F.softmax(logits, dim=-1)
    out = weights.matmul(V)

    if dropout is not None:
        out = dropout(out)

    # (..., Sq, d_k), plus the weights when requested
    return (out, weights.detach()) if visual else out


class MultiheadedAttention(nn.Module):
    """
    Multi-head attention built on the module-level ``attention`` function.

    d_model_Q / d_model_K / d_model_V: input widths of the query, key and value.
    H: number of heads. d_model: internal width (defaults to d_model_Q).
    d_out: output width (defaults to d_model_Q). dout_p: dropout probability.
    test: when True, all projection parameters are set to 0.1 and dropout is disabled.
    """
    def __init__(self, d_model_Q, d_model_K, d_model_V, H, dout_p=0.0, d_model=None, d_out=None, test= False):
        super(MultiheadedAttention, self).__init__()
        self.d_model_Q = d_model_Q
        self.d_model_K = d_model_K
        self.d_model_V = d_model_V
        self.H = H
        self.d_model = d_model_Q if d_model is None else d_model
        self.dout_p = dout_p
        self.d_out = d_model_Q if d_out is None else d_out

        self.d_k = self.d_model // H

        self.linear_Q2d = nn.Linear(self.d_model_Q, self.d_model)
        self.linear_K2d = nn.Linear(self.d_model_K, self.d_model)
        self.linear_V2d = nn.Linear(self.d_model_V, self.d_model)
        self.linear_d2Q = nn.Linear(self.d_model, self.d_out)

        if test :
            projections = (self.linear_Q2d, self.linear_K2d, self.linear_V2d, self.linear_d2Q)
            for proj in projections:
                for weight in proj.parameters():
                    weight.data.fill_(.1)
            self.dout_p = 0

        self.dropout = nn.Dropout(self.dout_p)
        self.visual = False

        assert self.d_model % H == 0

    def forward(self, Q, K, V, mask=None):
        ''' 
            Q, K, V: (B, Sq, Dq), (B, Sk, Dk), (B, Sv, Dv)
            mask: (B, 1, Sk)
            Sk = Sv, 
            Dk != self.d_k
        '''
        B, Sq, d_model_Q = Q.shape

        def split_heads(tensor):
            # (B, S, D) -> (B, H, S, d_k)
            return tensor.view(B, -1, self.H, self.d_k).transpose(-3, -2)

        Q = split_heads(self.linear_Q2d(Q))
        K = split_heads(self.linear_K2d(K))
        V = split_heads(self.linear_V2d(V))

        if mask is not None:
            # the same mask for all heads -> (B, 1, 1, Sk)
            mask = mask.unsqueeze(1)

        # (B, H, Sq, d_k)
        if self.visual:
            Q, attn = attention(Q, K, V, mask, self.dropout, self.visual)
            # keep the head-averaged attention weights for inspection
            self.attn_matrix = attn.mean(-3)
        else:
            Q = attention(Q, K, V, mask, self.dropout)
        # (B, Sq, D) <- (B, H, Sq, d_k)
        merged = Q.transpose(-3, -2).contiguous().view(B, Sq, self.d_model)
        # (B, Sq, d_out)
        return self.linear_d2Q(merged)

In [49]:
class EncodingLayer(nn.Module):
    """
    Stub of the re-implemented encoder layer, used to isolate differences: it builds
    the attention and feed-forward sub-modules, but ``forward`` returns its input unchanged.

    test: when True, dropout is disabled and the feed-forward parameters are set to 0.1.
    """
    def __init__(self,embed_size,dout_p,test=False):
        super().__init__()

        #self.residualNetwork_1 = ResidualNetwork(embed_size,dout_p=.1,test=test)
        #self.residualNetwork_2 = ResidualNetwork(embed_size,dout_p=.1,test=test)

        dout_p = 0 if test else dout_p

        print(embed_size,dout_p,test)
        self.attention = Attention(embed_size,dout_p,test=test)
        ff_layers = [
            nn.Linear(embed_size,4 * embed_size),
            nn.ReLU(True),
            nn.Dropout(dout_p),
            nn.Linear(4 * embed_size,embed_size),
        ]
        self.feedForward = nn.Sequential(*ff_layers)

        if test:
            for weight in self.feedForward.parameters():
                weight.data.fill_(.1)

    def forward(self,x,video_mask):
        """Identity: ``video_mask`` is accepted but ignored."""
        return x
class EncoderLayer(nn.Module):
    """
    Stub of the reference encoder layer, used to isolate differences: it builds the
    self-attention module, but ``forward`` returns its input unchanged because the
    residual sub-layers are disabled.

    NOTE(review): ``d_model``, ``dout_p``, ``H``, ``d_ff`` and ``d_hidden`` are accepted
    but not used; the attention module is always built with width 256 and 8 heads.
    """
    
    def __init__(self, d_model, dout_p, H=8, d_ff=None, d_hidden=None , test = False):
        super(EncoderLayer, self).__init__()
        self.self_att = MultiheadedAttention(256, 256, 256, 8, d_model=256, test=test)
        
    def forward(self, x, src_mask=None):
        '''
        in:
            x: (B, S, d_model), src_mask: (B, 1, S)
        out:
            (B, S, d_model)
        '''
        # The residual sub-layers are currently disabled, so the layer is an identity.
        # The old body also read `self.feed_forward`, which __init__ never defines, so
        # every call raised AttributeError; that unused lookup has been removed.
        #x = self.res_layer0(x, lambda t: self.self_att(t, t, t, src_mask))
        #x = self.res_layer1(x, feed_forward)
        
        return x

In [4]:
from model.carl_transformer.transformer import EncoderLayer
from model.transformer.encoder.encodingLayer.encodingLayer import EncodingLayer

# Compare the two encoder-layer implementations imported above from the project
# package, both built in test mode.
carl_en = EncoderLayer(256,0,test=True)
my_en = EncodingLayer(256,0,test=True)

#carl_en.self_att = MultiheadedAttention(256,256,256,8,d_model=256,test=True)
#my_en.attention=Attention(256,0,test=True)

en_data = torch.rand(2,10,256)
torch.equal(carl_en(en_data),my_en(en_data,None))

256 8 256 True
256 0 True


True

In [25]:
import torch.nn as nn
class MLPHead(nn.Module):
    """
    Projection head: Linear -> BatchNorm1d -> ReLU -> Linear, applied per frame.

    Sizes are read from the notebook-level ``cfg`` and asserted to be 128. Every
    parameter is overwritten with 0.1 regardless of the ``test`` flag.
    """
    def __init__(self,test=False):
        super().__init__()
        hidden_size = cfg.MODEL.PROJECTION_SIZE
        self.embedding_size = cfg.MODEL.EMBEDDER_MODEL.EMBEDDING_SIZE
        assert hidden_size==128
        assert self.embedding_size==128
        layers = [
            nn.Linear(self.embedding_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(True),
            nn.Linear(hidden_size, self.embedding_size),
        ]
        self.net = nn.Sequential(*layers)
        for weight in self.net.parameters():
            weight.data.fill_(.1)

    def forward(self, x):
        """x: (b, l, c) -> (b, l, c); frames are flattened to (b*l, c) for the head."""
        b, l, c = x.shape
        flat = x.view(-1, c)
        return self.net(flat).view(b, l, c)
# Plain nn.Sequential counterpart of MLPHead, with every parameter set to 0.1 so the
# two heads can be compared directly.
my_projection = nn.Sequential(
                nn.Linear(128,128),
                nn.BatchNorm1d(128),
                nn.ReLU(True),
                nn.Linear(128,128)
            )
carl_projection = MLPHead(test=True)

for name,parameter in my_projection.named_parameters():
    parameter.data.fill_(.1)

In [28]:
# Flatten (2, 10, 128) to (20, 128) for the plain Sequential head, reshape back, and
# compare with MLPHead, which does the same flattening internally.
pro_data = torch.rand(2,10,128)
my_pro_data = pro_data.view(-1,128)
my_pro_data = my_projection(my_pro_data)
my_pro_data = my_pro_data.view(2,10,128)
carl_pro_data= carl_projection(pro_data)
torch.equal(my_pro_data,carl_pro_data)

True