In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
sequence_length = 4
batch_size = 1
input_dim = 512
d_model = 512
x = torch.randn((batch_size, sequence_length, input_dim))

In [3]:
x.shape

torch.Size([1, 4, 512])

In [4]:
qkv_layer = nn.Linear(input_dim, 3*d_model)

In [5]:
qkv = qkv_layer(x)

In [6]:
qkv.shape

torch.Size([1, 4, 1536])

In [7]:
num_heads = 8
head_dim = d_model//num_heads
qkv = qkv.reshape(batch_size, num_heads, sequence_length , 3*head_dim)

In [8]:
qkv.shape

torch.Size([1, 8, 4, 192])

In [9]:
# now break this into simple q, k, v

In [10]:
q, k,v = qkv.chunk(3, dim = -1)
q.shape, k.shape, v.shape

(torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]))

In [11]:
d_k = q.shape[-1]

In [12]:
scaled = torch.matmul(q,k.transpose(-2, -1))/np.sqrt(d_k)

In [13]:
y = torch.randn(2, 3)

In [14]:
mask = torch.full(scaled.size(), float('-inf'))

In [15]:
mask = torch.triu(mask, diagonal = 1)

In [16]:
mask.shape

torch.Size([1, 8, 4, 4])

In [17]:
mask[0][1]

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [18]:
(scaled + mask)[0][0]

tensor([[-0.6306,    -inf,    -inf,    -inf],
        [ 0.2766, -0.3240,    -inf,    -inf],
        [-0.0979, -0.2961,  0.0378,    -inf],
        [ 0.2177, -0.1842, -0.0748,  0.1721]], grad_fn=<SelectBackward0>)

In [19]:
scaled = scaled + mask

In [20]:
attention = F.softmax(scaled, dim = -1) # dim = -1 means last dimension mei attention use kiya hai cuz voh row by row context nikalega

In [21]:
attention

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.6458, 0.3542, 0.0000, 0.0000],
          [0.3372, 0.2766, 0.3862, 0.0000],
          [0.2967, 0.1985, 0.2214, 0.2834]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.6219, 0.3781, 0.0000, 0.0000],
          [0.2716, 0.2213, 0.5071, 0.0000],
          [0.3096, 0.2237, 0.2087, 0.2581]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.7208, 0.2792, 0.0000, 0.0000],
          [0.3975, 0.3232, 0.2793, 0.0000],
          [0.3718, 0.1743, 0.1997, 0.2543]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.4488, 0.5512, 0.0000, 0.0000],
          [0.2671, 0.3425, 0.3904, 0.0000],
          [0.2677, 0.2747, 0.2968, 0.1608]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.2910, 0.7090, 0.0000, 0.0000],
          [0.3545, 0.3726, 0.2729, 0.0000],
          [0.3298, 0.1000, 0.1494, 0.4208]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.3868, 0.6132, 0.0000, 0.0000],
          [0.2661, 0.5

In [22]:
value = torch.matmul(attention, v)
value.shape

torch.Size([1, 8, 4, 64])

In [23]:
value = value.reshape(batch_size, sequence_length, num_heads*head_dim)

In [24]:
value.shape

torch.Size([1, 4, 512])

In [25]:
linear_layer = nn.Linear(d_model, d_model )

In [26]:
output = linear_layer(value)

In [27]:
output.shape

torch.Size([1, 4, 512])

In [28]:
output

tensor([[[-0.6098,  0.1974,  0.0480,  ...,  0.0383, -0.0209,  0.2957],
         [-0.0973, -0.1122,  0.0256,  ..., -0.1880, -0.0123, -0.4714],
         [-0.1467, -0.0171, -0.0209,  ...,  0.1269, -0.1492, -0.1048],
         [ 0.0994,  0.3236,  0.1151,  ...,  0.3135,  0.2553,  0.3763]]],
       grad_fn=<ViewBackward0>)

In [29]:
import torch
import torch.nn as nn
import math

def scaled_dot_product(q, k, v):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    mask = torch.full(scaled.size(), float('-inf'))
    mask = torch.triu(mask, diagonal=1)
    scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention


class MultiheadAttention(nn.Module):

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(input_dim , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, mask=None):
        batch_size, sequence_length, input_dim = x.size()
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size(): {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q size: {q.size()}, k size: {k.size()}, v size: {v.size()}, ")
        values, attention = scaled_dot_product(q, k, v)
        print(f"values.size(): {values.size()}, attention.size:{ attention.size()} ")
        values = values.reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        print(f"values.size(): {values.size()}")
        out = self.linear_layer(values)
        print(f"out.size(): {out.size()}")
        return out

In [30]:
input_dim = 1024
d_model = 512
num_heads = 8

batch_size = 30
sequence_length = 5
x = torch.randn( (batch_size, sequence_length, input_dim) )

model = MultiheadAttention(input_dim, d_model, num_heads)
out = model.forward(x)

x.size(): torch.Size([30, 5, 1024])
qkv.size(): torch.Size([30, 5, 1536])
qkv.size(): torch.Size([30, 5, 8, 192])
qkv.size(): torch.Size([30, 8, 5, 192])
q size: torch.Size([30, 8, 5, 64]), k size: torch.Size([30, 8, 5, 64]), v size: torch.Size([30, 8, 5, 64]), 
values.size(): torch.Size([30, 8, 5, 64]), attention.size:torch.Size([30, 8, 5, 5]) 
values.size(): torch.Size([30, 5, 512])
out.size(): torch.Size([30, 5, 512])


In [31]:
out.shape

torch.Size([30, 5, 512])

In [32]:
# Positional encoding
import torch.nn as nn

In [33]:
max_len = 10
d_model = 6

In [34]:
even_i = torch.arange(0, d_model, 2).float()
even_i


tensor([0., 2., 4.])

In [35]:
even_denominator = torch.pow(10000, even_i/d_model)

In [36]:
even_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [37]:
odd_i = torch.arange(1, d_model , 2).float()

In [38]:
odd_denominator = torch.pow(10000, (odd_i-1)/d_model)

In [39]:
odd_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [40]:
denominator = even_denominator

In [41]:
position = torch.arange(max_len, dtype = torch.float).reshape(max_len, 1)

In [42]:
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [43]:
even_pe = torch.sin(position/denominator)
odd_pe = torch.cos(position/denominator)

In [44]:
even_pe

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0464,  0.0022],
        [ 0.9093,  0.0927,  0.0043],
        [ 0.1411,  0.1388,  0.0065],
        [-0.7568,  0.1846,  0.0086],
        [-0.9589,  0.2300,  0.0108],
        [-0.2794,  0.2749,  0.0129],
        [ 0.6570,  0.3192,  0.0151],
        [ 0.9894,  0.3629,  0.0172],
        [ 0.4121,  0.4057,  0.0194]])

In [45]:
odd_pe

tensor([[ 1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9989,  1.0000],
        [-0.4161,  0.9957,  1.0000],
        [-0.9900,  0.9903,  1.0000],
        [-0.6536,  0.9828,  1.0000],
        [ 0.2837,  0.9732,  0.9999],
        [ 0.9602,  0.9615,  0.9999],
        [ 0.7539,  0.9477,  0.9999],
        [-0.1455,  0.9318,  0.9999],
        [-0.9111,  0.9140,  0.9998]])

In [46]:
stacked = torch.stack([even_pe, odd_pe], dim = 2)
stacked.shape

torch.Size([10, 3, 2])

In [47]:
stacked

tensor([[[ 0.0000,  1.0000],
         [ 0.0000,  1.0000],
         [ 0.0000,  1.0000]],

        [[ 0.8415,  0.5403],
         [ 0.0464,  0.9989],
         [ 0.0022,  1.0000]],

        [[ 0.9093, -0.4161],
         [ 0.0927,  0.9957],
         [ 0.0043,  1.0000]],

        [[ 0.1411, -0.9900],
         [ 0.1388,  0.9903],
         [ 0.0065,  1.0000]],

        [[-0.7568, -0.6536],
         [ 0.1846,  0.9828],
         [ 0.0086,  1.0000]],

        [[-0.9589,  0.2837],
         [ 0.2300,  0.9732],
         [ 0.0108,  0.9999]],

        [[-0.2794,  0.9602],
         [ 0.2749,  0.9615],
         [ 0.0129,  0.9999]],

        [[ 0.6570,  0.7539],
         [ 0.3192,  0.9477],
         [ 0.0151,  0.9999]],

        [[ 0.9894, -0.1455],
         [ 0.3629,  0.9318],
         [ 0.0172,  0.9999]],

        [[ 0.4121, -0.9111],
         [ 0.4057,  0.9140],
         [ 0.0194,  0.9998]]])

In [48]:
PE = torch.flatten(stacked, start_dim = 1, end_dim = 2)

In [49]:
PE.shape

torch.Size([10, 6])

In [50]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

In [51]:
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pe.forward()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

# Layer Normalization

In [52]:
inputs = torch.Tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])

In [53]:
B, S, E = inputs.shape

In [54]:
inputs = inputs.reshape(S, B, E)

In [55]:
parameter_shape = inputs.size()[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.ones(parameter_shape))

In [56]:
beta.shape, gamma.shape

(torch.Size([1, 3]), torch.Size([1, 3]))

In [57]:
# dims = [-(i + 1) for i in range(len(parameter_shape))]

In [58]:
dims = [-1, -2]  # for batch dimension and embedding dimension

In [59]:
dims

[-1, -2]

In [60]:
mean = inputs.mean(dim = dims,  keepdim = True)
mean.size()

torch.Size([2, 1, 1])

In [61]:
var = ((inputs-mean)**2).mean(dim = dims, keepdim = True)
epsilon = 1e-5
std = (var+epsilon).sqrt()
std

tensor([[[0.0817]],

        [[0.1886]]])

In [62]:
y = (inputs-mean)/std

In [63]:
out = gamma*y + beta

In [64]:
out

tensor([[[ 1.0000, -0.2238,  2.2238]],

        [[ 2.4140,  0.2930,  0.2930]]], grad_fn=<AddBackward0>)

In [65]:
import torch
from torch import nn

class LayerNormalization():
    def __init__(self, parameters_shape, eps=1e-5):
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (inputs - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y  + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        print('parameter shape ', self.parameters_shape)
        return out

In [66]:
batch_size = 3
sentence_length = 5
embedding_dim = 8 
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")


input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.7885, -0.4783, -0.5920,  0.2692, -1.0349,  0.2413,  1.2919,
           1.7395],
         [-1.7447,  0.8992,  0.9784,  0.0715, -0.7955,  1.7451, -1.9119,
          -0.6223],
         [-0.5215,  1.8098,  0.3279, -1.6558, -1.3841, -0.3033, -1.0581,
          -0.5864]],

        [[-0.2612,  0.1764,  1.6328,  0.4338,  1.1443, -0.9820, -0.8404,
          -0.4170],
         [ 0.5260,  0.5908, -0.2610,  0.3541,  1.8695, -0.2470, -0.8677,
          -0.0088],
         [ 0.8854,  0.1873,  0.8674, -1.2851,  0.4021, -0.4690, -2.0062,
          -0.1976]],

        [[-2.2157, -0.0656,  0.8522,  1.8787, -1.2984, -0.2642,  0.4032,
           1.4360],
         [-0.5445, -1.1208,  1.1413, -0.8726,  0.2163,  1.0912,  0.2856,
          -0.4894],
         [ 1.0202, -0.2206,  1.0396,  0.0276,  1.4310, -0.6794,  1.4389,
          -0.1593]],

        [[ 0.5882, -0.7973,  0.7749, -1.2714,  0.8682,  0.7185, -0.8039,
           1.1034],
         [ 1.8745, -0.0967, 

In [67]:
layer_norm = LayerNormalization(inputs.size()[-1:])


In [68]:
out = layer_norm.forward(inputs)


Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[ 0.0810],
         [-0.1725],
         [-0.4214]],

        [[ 0.1109],
         [ 0.2445],
         [-0.2020]],

        [[ 0.0908],
         [-0.0366],
         [ 0.4873]],

        [[ 0.1476],
         [ 0.4578],
         [-0.0483]],

        [[-0.3291],
         [ 0.4393],
         [-0.1009]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[0.9384],
         [1.2398],
         [1.0276]],

        [[0.8667],
         [0.7624],
         [0.9580]],

        [[1.2789],
         [0.8035],
         [0.7808]],

        [[0.8774],
         [0.9298],
         [0.7538]],

        [[0.8200],
         [0.6912],
         [1.2412]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.9266, -0.5961, -0.7172,  0.2005, -1.1891,  0.1708,  1.2903,
           1.7673],
         [-1.2680,  0.8644,  0.9283,  0.1968, -0.5024,  1.5467, -1.4029,
          -0.3628],
         [-0.0974,  2.1714,  0.7292, -1.2013, -0.9368,  0.1150, -0.6196,
          -0.1605]],

