## Description

Implemented a Transformer decoder block manually

Dummy data is used to understand each component of the block:

- Masked Multi-Head Attention
- Layer Normalization
- Residual Connections
- Feed-Forward Network

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random, math
import numpy as np

In [12]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays which device was selected.
device

device(type='cpu')

In [13]:
def set_causal_mask(size, device=None, dtype=None):
    """Build an additive causal (look-ahead) mask for self-attention.

    The result is a ``(size, size)`` matrix whose entries strictly above the
    main diagonal are ``-inf`` and whose diagonal and lower triangle are ``0``.
    Added to raw attention scores before the softmax, it gives every future
    position (key index > query index) a weight of exactly zero.

    Args:
        size: Number of positions in the sequence (rows == columns).
        device: Optional device to allocate the mask on. ``None`` keeps
            PyTorch's default, which is what the original one-argument call did.
        dtype: Optional floating-point dtype for the mask. ``None`` keeps
            PyTorch's default dtype.

    Returns:
        A ``(size, size)`` tensor containing only ``0`` and ``-inf``.
    """
    # Start from a matrix that is -inf everywhere ...
    blocked=torch.full((size, size), float('-inf'), device=device, dtype=dtype)
    # ... then keep only the part strictly above the diagonal (diagonal=1);
    # triu zeroes the diagonal and everything below it.
    return torch.triu(blocked, diagonal=1)

In [14]:
# Build a causal mask for a 10-position sequence and print it to inspect the pattern.
sequence_length=10
mask=set_causal_mask(sequence_length)
print(mask)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [15]:
# Random stand-in for raw (pre-softmax) attention scores, shaped
# (1, sequence_length, sequence_length).
# NOTE(review): no random seed is set in the cells shown, so these values change on every run.
dummy_attention_weights=torch.randn(1,sequence_length,sequence_length)
print(dummy_attention_weights)


tensor([[[-0.9654,  0.4343, -0.9185, -0.8232, -1.1175, -2.0940, -0.3108,
           1.4962, -0.3618, -2.0375],
         [-2.5759,  1.6493,  1.3595,  1.4973, -1.0889,  0.2724, -0.1583,
          -0.1888,  0.3989, -0.7162],
         [-0.3447,  1.2901,  1.8200, -0.5086, -1.5370,  0.6569, -0.0963,
          -0.4093,  2.0329,  0.0758],
         [-1.6710, -0.0544,  1.4740, -0.0517, -1.3301, -0.3474, -0.2829,
          -1.8867,  2.5350,  0.3790],
         [ 1.6299, -0.0596,  0.0190, -2.5560, -0.0063,  0.3257,  0.1969,
          -0.5971, -0.0352, -1.5251],
         [ 1.0983,  0.3083, -1.8326,  1.0167,  0.3939,  1.5390, -1.2255,
           2.7642,  0.8218,  0.3076],
         [-0.8098, -0.5819, -0.4566,  0.4317,  1.4262,  0.7808, -0.0803,
          -0.7487, -0.5130,  0.0109],
         [ 0.3862,  1.6571,  1.0131, -2.3466,  0.0741,  0.7171,  1.1973,
          -0.1694, -0.6201,  0.4520],
         [-0.4994, -0.0443,  1.2898,  0.1607, -1.3919,  0.0539,  0.3607,
          -0.7042,  0.0172,  1.6976],
 

In [16]:
# The 2-D mask broadcasts over the leading dimension of the scores: entries above
# the diagonal become -inf, all other scores are unchanged (0 is added to them).
dummy_attention_weights+mask

tensor([[[-0.9654,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf,    -inf,    -inf],
         [-2.5759,  1.6493,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf,    -inf,    -inf],
         [-0.3447,  1.2901,  1.8200,    -inf,    -inf,    -inf,    -inf,
             -inf,    -inf,    -inf],
         [-1.6710, -0.0544,  1.4740, -0.0517,    -inf,    -inf,    -inf,
             -inf,    -inf,    -inf],
         [ 1.6299, -0.0596,  0.0190, -2.5560, -0.0063,    -inf,    -inf,
             -inf,    -inf,    -inf],
         [ 1.0983,  0.3083, -1.8326,  1.0167,  0.3939,  1.5390,    -inf,
             -inf,    -inf,    -inf],
         [-0.8098, -0.5819, -0.4566,  0.4317,  1.4262,  0.7808, -0.0803,
             -inf,    -inf,    -inf],
         [ 0.3862,  1.6571,  1.0131, -2.3466,  0.0741,  0.7171,  1.1973,
          -0.1694,    -inf,    -inf],
         [-0.4994, -0.0443,  1.2898,  0.1607, -1.3919,  0.0539,  0.3607,
          -0.7042,  0.0172,    -inf],
 

In [17]:
# Turn the masked scores into attention weights: -inf entries come out as 0 and every row sums to 1.
F.softmax(dummy_attention_weights+mask, dim=-1)# softmax runs over the last dimension, i.e. across the keys

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.0144, 0.9856, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.0674, 0.3456, 0.5870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.0292, 0.1468, 0.6769, 0.1472, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.6273, 0.1158, 0.1253, 0.0095, 0.1221, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.2234, 0.1014, 0.0119, 0.2059, 0.1104, 0.3471, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.0426, 0.0535, 0.0606, 0.1474, 0.3985, 0.2090, 0.0883, 0.0000,
          0.0000, 0.0000],
         [0.0873, 0.3113, 0.1635, 0.0057, 0.0639, 0.1216, 0.1966, 0.0501,
          0.0000, 0.0000],
         [0.0571, 0.0901, 0.3420, 0.1106, 0.0234, 0.0994, 0.1351, 0.0466,
          0.0958, 0.0000],
         [0.0319, 0.3256, 0.1124, 0.1483, 0.0904, 0.0067, 0.0347, 0.0414,
          0.1809,

In [18]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    A table of shape ``(1, max_seq_len, embedding_dimension)`` is computed once
    in the constructor and registered as the buffer ``pem``. Even feature
    indices hold ``sin(position * freq)`` and odd feature indices hold
    ``cos(position * freq)``, with
    ``freq = exp(-ln(10000) * i / embedding_dimension)`` for ``i = 0, 2, 4, ...``.

    Both even and odd values of ``embedding_dimension`` are supported.

    Args:
        max_seq_len: Number of positions to precompute.
        embedding_dimension: Size of the feature axis of the table.
    """
    def __init__(self, max_seq_len, embedding_dimension):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.embedding_dimension=embedding_dimension
        pem=torch.zeros(max_seq_len,embedding_dimension)
        # Column vector of positions: (max_seq_len, 1).
        positions=torch.arange(0,max_seq_len).unsqueeze(1).float()
        # One frequency per even feature index: ceil(embedding_dimension / 2) values.
        even_positions=torch.arange(0,embedding_dimension,2 ).float()
        exponential_term=torch.log(torch.tensor(10000.0))/embedding_dimension
        div_term=torch.exp(even_positions * -(exponential_term))
        # Broadcast to (max_seq_len, ceil(embedding_dimension / 2)).
        angles=positions * div_term
        # Sine to even indices
        pem[:,0::2]=torch.sin(angles)
        # Cosine to odd indices. There are only floor(embedding_dimension / 2)
        # odd columns, so drop the surplus frequency when the dimension is odd
        # (the unsliced assignment raised a shape-mismatch error in that case).
        pem[:,1::2]=torch.cos(angles[:, :embedding_dimension//2])
        # Registered as a buffer, not a parameter: it is not trained.
        self.register_buffer('pem', pem.unsqueeze(0))
    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Only the length of ``x`` along dimension 1 is read; the encoding is
        NOT added to ``x``, so the caller is expected to combine the two.

        Args:
            x: Tensor of shape [batch_size, length_of_seq, embedding_dimension].

        Returns:
            Slice of the table with shape
            ``(1, min(length_of_seq, max_seq_len), embedding_dimension)``.
        """
        # shape of x is [batch_size, length_of_seq, embedding_dimension]
        length_of_seq=x.size(1)
        return self.pem[:,:length_of_seq, :]

In [19]:
def set_padding_mask(sequence,padding_index):
    """Mark which entries of ``sequence`` are padding.

    Args:
        sequence: Values to inspect; presumably a tensor of token ids
            (NOTE(review): confirm against the caller).
        padding_index: The value that denotes padding.

    Returns:
        The result of the element-wise comparison ``sequence == padding_index``,
        i.e. true exactly where an entry equals ``padding_index``.
    """
    is_padding = sequence == padding_index
    return is_padding
    

In [29]:
class DecoderBlock(nn.Module):
    """Pre-norm Transformer decoder block (self-attention only).

    The block applies two sub-layers, each wrapped in a residual connection
    and preceded by its own LayerNorm:

    1. multi-head self-attention (optionally masked), followed by dropout;
    2. a position-wise feed-forward network
       (Linear -> ReLU -> Dropout -> Linear), followed by dropout.

    Args:
        embedding_dimension: Size of the feature axis of the input and output.
        sequence_length: Not used by the block; kept only so existing
            positional calls keep working.
        n_attention_heads: Number of attention heads.
        ff_hidden_layers: Width of the hidden layer of the feed-forward
            network (despite the name, this is a size, not a layer count).
        dropout: Dropout probability used inside the attention module, inside
            the feed-forward network and after each sub-layer.
    """
    def __init__(self, embedding_dimension, sequence_length, n_attention_heads, ff_hidden_layers=1024, dropout=.01):
        super().__init__()
        self.ln_1=nn.LayerNorm(embedding_dimension)
        # batch_first=True: inputs and outputs are (batch, sequence, feature).
        self.multihead_attention=nn.MultiheadAttention(embedding_dimension,n_attention_heads, dropout=dropout, batch_first=True)
        self.dropout_1=nn.Dropout(dropout)
        self.ln_2=nn.LayerNorm(embedding_dimension)
        self.ffn=nn.Sequential(
            nn.Linear(embedding_dimension,ff_hidden_layers ),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_hidden_layers,embedding_dimension)
        )
        self.dropout_2=nn.Dropout(dropout)
    def forward(self,x, causal_mask=None):
        """Run the block on ``x``.

        Args:
            x: Input of shape (batch, sequence, embedding_dimension).
            causal_mask: Optional mask forwarded to the attention module as
                ``attn_mask``, e.g. a (sequence, sequence) additive mask with
                ``-inf`` above the diagonal. ``None`` means unmasked attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Sub-layer 1: normalise, self-attend (query = key = value), add residual.
        x_norm=self.ln_1(x)
        # The attention weights are never used here, so ask the module not to
        # compute and average them; the attention output itself is unaffected.
        att,_=self.multihead_attention(x_norm, x_norm, x_norm, attn_mask=causal_mask, need_weights=False)
        att=self.dropout_1(att)
        x=x+att
        # Sub-layer 2: normalise, feed-forward, add residual.
        x_ffn_norm=self.ln_2(x)
        x_ffn=self.ffn(x_ffn_norm)
        x=x+self.dropout_2(x_ffn)
        return x
        
    

In [30]:
# Smoke test: push one random batch through a freshly initialised decoder block.
# Input layout: 1 sequence, 10 positions, 12 features per position.
dummy_sequence=torch.randn(1, 10, 12)
print(dummy_sequence)

# Additive look-ahead mask covering all 10 positions.
causal_mask=set_causal_mask(size=10)
print(causal_mask)

# 12-dimensional embeddings shared between 4 attention heads.
decoder=DecoderBlock(embedding_dimension=12, sequence_length=10, n_attention_heads=4)

output=decoder(dummy_sequence, causal_mask)
print(output)

tensor([[[ 3.0058e-01, -1.1193e-01, -1.7671e+00, -2.1940e-01,  2.6723e+00,
          -5.2958e-01,  2.7066e-02,  5.8608e-01, -1.6722e+00,  1.8119e+00,
          -2.3175e+00,  8.0755e-01],
         [ 4.8411e-01, -6.9740e-01, -1.8349e+00,  5.9242e-01, -3.9815e-01,
           7.7734e-01,  5.3490e-01, -6.4500e-01, -6.5829e-01, -3.4404e-01,
           4.3873e-01,  6.6312e-02],
         [-2.5691e+00,  1.4824e-02,  2.5424e+00,  7.5870e-01, -1.2597e+00,
          -8.9006e-01, -7.8506e-01,  9.5710e-01,  2.9448e-02, -3.3721e-01,
           7.3392e-01, -7.1969e-01],
         [ 8.0331e-01,  7.6176e-01,  4.9357e-01,  2.0885e+00,  4.1913e-01,
           3.0149e-01,  3.2093e-01, -2.6646e+00,  9.7319e-01,  7.3366e-01,
           1.6656e+00, -2.4378e-01],
         [ 1.1523e+00, -1.0555e+00,  1.1867e-01, -3.9691e-01,  1.0793e+00,
          -3.7202e-01,  1.4140e+00, -2.7426e-01, -4.8509e-02,  1.7031e-01,
           5.7462e-01,  2.4806e-01],
         [ 1.0352e+00,  1.4812e-01, -1.5510e-01,  4.7041e-01,  1.