In [5]:
# In this notebook you learn:
#
# 1) How to create positional encodings as described in 'Attention Is All You Need' paper?
#
# NOTE: Intuitively, we don't need to do all this messy stuff (as added below) to calculate positional encodings. We 
# can just run two for loops to calculate the positional encodings according to the formula given in the paper. But,
# we are doing this to calculate the positional encodings in a vectorized way and also avoid numerical instability.

In [None]:
# Resources to understand positional encoding:
#
# 1) https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/
# 2) https://datascience.stackexchange.com/questions/51065/what-is-the-positional-encoding-in-the-transformer-model
# 3) https://www.youtube.com/watch?v=dichIcUZfOw
# 4) https://kazemnejad.com/blog/transformer_architecture_positional_encoding/

In [3]:
import math
import torch
from torch import nn, Tensor

In [None]:
# Positional encoding is a way to inject information about the position of the token in the sequence.
#
# In the 'Attention Is All You Need' paper, the authors used the following formula to calculate the 
# positional encoding:
#   -- PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
#   -- PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
#
# where: 
# 'pos' is the position of the token in the sequence (0, 1, 2, 3, ..., sentence_length - 1).
# 'i' is the index in the positional encoding vector (0, 1, 2, 3, ..., 254, 255).
# 'd_model' is the size of the positional encoding vector (512).

In [19]:
# This is the maximum length of the sentence that we expect as input to the model.
max_len = 20
# This is the size of the positional encoding vector.
d_model = 16

In [None]:
# (10000^(2i/d_model)) involves the calculation of the power of 10000 to (2i/d_model) which can be a very
# small number. Calculations involving the large numbers and small numbers can lead to numerical instability.
# To avoid these numbers, we will calculate positional encodings in the log space.
#
# Resources to understand more about representing floating point numbers in computers and rounding errors:
#
# https://softwareengineering.stackexchange.com/questions/215065/can-anyone-explain-representation-of-float-in-memory
#       -- Quick overview of how floats are represented in computers
# https://www.youtube.com/watch?v=yvdtwKF87Ts
#       -- Quick overview of how floats are represented in computers
# https://www.youtube.com/watch?v=PZRI1IfStY0
#       -- Explains why floating point rounding errors occur
# https://docs.python.org/3/tutorial/floatingpoint.html
#       -- How floats are represented in python
# https://www.youtube.com/watch?v=m_G3z-C1C2g&t=1s
#       -- Two's complement representation of numbers

In [None]:
# A tensor (container) to hold the positional encodings until position 19 where the size of each positional encoding 
# vector is fixed to be 16.
positional_encoding = torch.zeros(max_len, d_model)
print(positional_encoding)
print(positional_encoding.shape)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0

There are multiple steps involved in creating positional encoding vectors. We go step by step.

In [None]:
# The numerator in the above forumale for Positional Encoding is the position. Here we are considering
# positional encoding vectors until 19th position. So, we create a tensor that holds the numerators
# until position 19.
positional_encoding_numerators = torch.arange(0, max_len).unsqueeze(1)
print(positional_encoding_numerators)
print(positional_encoding_numerators.shape)

tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19]])
torch.Size([20, 1])


Now, lets calculate the denominators from the positional encoding formula above. Just calculating the denominators <br>
in log space itself involves multiple steps.


In [None]:
# We will pre-calculate the values of 10000^(2i/d_model) for all the values of 'i' in the positional encoding vector. 
# We will do this calculation in the log space to avoid numerical stability.
# 
# 1 / (10000^(2i/d_model)) = 10000^(-2i/d_model)
#                          = exp(log(10000^(-2i/d_model)))
#                          = exp(-2i/d_model * log(10000))
#                          = exp(2i * (-log(10000) / d_model))
# 
#
# Saves the values of 2i for each i from 0 to (d_model - 2) i.e., [0, 14] in this context.
numerators_in_exponent = torch.arange(0, d_model, 2)
print(numerators_in_exponent)
print(numerators_in_exponent.shape)

tensor([ 0,  2,  4,  6,  8, 10, 12, 14])
torch.Size([8])


In [24]:
# Calculate exp(2i * (-log(10000) / d_model)) which is same as 
# (10000^(2i/d_model)) 
# for 2i in {0, 2, 4, 6, 8, 10, 12, 14}
positional_encoding_denominators = torch.exp(numerators_in_exponent * (-math.log(10000.0) / d_model))
print(positional_encoding_denominators)
print(positional_encoding_denominators.shape)

tensor([1.0000e+00, 3.1623e-01, 1.0000e-01, 3.1623e-02, 1.0000e-02, 3.1623e-03,
        1.0000e-03, 3.1623e-04])
torch.Size([8])


In [None]:
# We already have the denominators for the positional encoding formula.
# Now, we need to calculate the positional encodings for each position in the sequence:
#   -- PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
#   -- PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
#
# Refer to 'understanding_pytorch/tensor_manipulations/understanding_tensor_manipulations_part_4.ipynb' (https://github.com/MB1151/understanding_pytorch/blob/main/tensor_manipulations/understanding_tensor_manipulations_part_4.ipynb) 
# to understand more about broadcasting in python.
#
# The below element wise multiplication is possible because of the python broadcasting feature.
# (20, 1) --> positional_encoding_numerators
# (  , 8) --> positional_encoding_denominators
# (20, 8) --> Resultant tensor shape
# 
# Each element in the 1D tensor (1,) in the positional_encoding_numerators is repeated to create a 
# 2D tensor of shape (20, 8):
# [[0, 0, ..., 0]
# [1, 1, ..., 1]
# ...
# [19, 19, ..., 19]]
#
# The 1D tensor (8,) positional_encoding_denominators itself is repeated 20 times to create a 2D
# tesnor of shape (20, 8):
# [positional_encoding_denominators, positional_encoding_denominators, ..., positional_encoding_denominators]
#
# The broadcasted (expanded) tensors are multiplied element wise to get the positional encodings.
positional_encoding_sin_terms = torch.sin(positional_encoding_numerators * positional_encoding_denominators)
print(positional_encoding_sin_terms)
print(positional_encoding_sin_terms.shape)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  3.1098e-01,  9.9833e-02,  3.1618e-02,  9.9998e-03,
          3.1623e-03,  1.0000e-03,  3.1623e-04],
        [ 9.0930e-01,  5.9113e-01,  1.9867e-01,  6.3203e-02,  1.9999e-02,
          6.3245e-03,  2.0000e-03,  6.3246e-04],
        [ 1.4112e-01,  8.1265e-01,  2.9552e-01,  9.4726e-02,  2.9995e-02,
          9.4867e-03,  3.0000e-03,  9.4868e-04],
        [-7.5680e-01,  9.5358e-01,  3.8942e-01,  1.2615e-01,  3.9989e-02,
          1.2649e-02,  4.0000e-03,  1.2649e-03],
        [-9.5892e-01,  9.9995e-01,  4.7943e-01,  1.5746e-01,  4.9979e-02,
          1.5811e-02,  5.0000e-03,  1.5811e-03],
        [-2.7942e-01,  9.4715e-01,  5.6464e-01,  1.8860e-01,  5.9964e-02,
          1.8973e-02,  6.0000e-03,  1.8974e-03],
        [ 6.5699e-01,  8.0042e-01,  6.4422e-01,  2.1956e-01,  6.9943e-02,
          2.2134e-02,  6.9999e-03,  2.2136e-03],
        [ 9.8936

In [26]:
# Same logic as the above cell but just with the cos function.
positional_encoding_cos_terms = torch.cos(positional_encoding_numerators * positional_encoding_denominators)
print(positional_encoding_cos_terms)
print(positional_encoding_cos_terms.shape)

tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9504,  0.9950,  0.9995,  0.9999,  1.0000,  1.0000,  1.0000],
        [-0.4161,  0.8066,  0.9801,  0.9980,  0.9998,  1.0000,  1.0000,  1.0000],
        [-0.9900,  0.5828,  0.9553,  0.9955,  0.9996,  1.0000,  1.0000,  1.0000],
        [-0.6536,  0.3011,  0.9211,  0.9920,  0.9992,  0.9999,  1.0000,  1.0000],
        [ 0.2837, -0.0103,  0.8776,  0.9875,  0.9988,  0.9999,  1.0000,  1.0000],
        [ 0.9602, -0.3208,  0.8253,  0.9821,  0.9982,  0.9998,  1.0000,  1.0000],
        [ 0.7539, -0.5994,  0.7648,  0.9756,  0.9976,  0.9998,  1.0000,  1.0000],
        [-0.1455, -0.8186,  0.6967,  0.9682,  0.9968,  0.9997,  1.0000,  1.0000],
        [-0.9111, -0.9566,  0.6216,  0.9598,  0.9960,  0.9996,  1.0000,  1.0000],
        [-0.8391, -0.9998,  0.5403,  0.9504,  0.9950,  0.9995,  0.9999,  1.0000],
        [ 0.0044, -0.9438,  0.4536,  0.9401,  0.9940,  0.9994,  0.9999,  1.0000],
        [ 0.8439

In [27]:
# We are filling the even terms in the positional encoding tensor with the sin terms.
# The even terms are at the 0th, 2nd, 4th, 6th, ..., 14th positions in the positional encoding vector.
# Notice that the odd terms (cos terms) are still zeros since we haven't filled them yet.
positional_encoding[:, 0::2] = positional_encoding_sin_terms
print(positional_encoding)
print(positional_encoding.shape)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00],
        [ 8.4147e-01,  0.0000e+00,  3.1098e-01,  0.0000e+00,  9.9833e-02,
          0.0000e+00,  3.1618e-02,  0.0000e+00,  9.9998e-03,  0.0000e+00,
          3.1623e-03,  0.0000e+00,  1.0000e-03,  0.0000e+00,  3.1623e-04,
          0.0000e+00],
        [ 9.0930e-01,  0.0000e+00,  5.9113e-01,  0.0000e+00,  1.9867e-01,
          0.0000e+00,  6.3203e-02,  0.0000e+00,  1.9999e-02,  0.0000e+00,
          6.3245e-03,  0.0000e+00,  2.0000e-03,  0.0000e+00,  6.3246e-04,
          0.0000e+00],
        [ 1.4112e-01,  0.0000e+00,  8.1265e-01,  0.0000e+00,  2.9552e-01,
          0.0000e+00,  9.4726e-02,  0.0000e+00,  2.9995e-02,  0.0000e+00,
          9.4867e-03,  0.0000e+00,  3.0000e-03,  0.0000e+00,  9.4868e-04,
          0.0000e+00],
        [-7.5680e-01

In [28]:
# We are filling the odd terms in the positional encoding tensor with the cos terms.
# The even terms are at the 1th, 3nd, 5th, 7th, ..., 15th positions in the positional encoding vector.
# Now, the positional_encoding tensor is completely filled with the positional encodings.
positional_encoding[:, 1::2] = positional_encoding_cos_terms
print(positional_encoding)
print(positional_encoding.shape)

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  3.1098e-01,  9.5042e-01,  9.9833e-02,
          9.9500e-01,  3.1618e-02,  9.9950e-01,  9.9998e-03,  9.9995e-01,
          3.1623e-03,  9.9999e-01,  1.0000e-03,  1.0000e+00,  3.1623e-04,
          1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  5.9113e-01,  8.0658e-01,  1.9867e-01,
          9.8007e-01,  6.3203e-02,  9.9800e-01,  1.9999e-02,  9.9980e-01,
          6.3245e-03,  9.9998e-01,  2.0000e-03,  1.0000e+00,  6.3246e-04,
          1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  8.1265e-01,  5.8275e-01,  2.9552e-01,
          9.5534e-01,  9.4726e-02,  9.9550e-01,  2.9995e-02,  9.9955e-01,
          9.4867e-03,  9.9995e-01,  3.0000e-03,  1.0000e+00,  9.4868e-04,
          1.0000e+00],
        [-7.5680e-01

## Positional Encoding in Transformers

The __init__ code below just combines all the individual steps discussed above into a single initialization function.

In addition to the actual implementation of Positional Encoding class, we also add Dropout layer and apply it in the <br>
forward function. This is because dropout is applied to the output of Positional Encoding layer in the actual <br>
translation model from 'Attention Is All You Need' paper.

In [None]:
# Now, we will combine the above steps and put the logic into a module to use the Positional Encoding in the Transformer model.
# Refer 'understanding_pytorch/modules/using_modules.ipynb notebook' (https://github.com/MB1151/understanding_pytorch) to 
# understand more about modules in pytorch.
class PositionalEncoding(nn.Module):
    # d_model above is the same as encoding_size here.
    def __init__(self, encoding_size: int, dropout_prob: float, max_len: int = 5000):
        """Creates the positional encodings.

        Args:
            encoding_size (int): Size of the positional encoding vector that represents the position of the token.
            dropout_prob (float): Probability of an element to be zeroed or dropped.
            max_len (int, optional): Largest position for which the positional encoding vector is generated. Defaults to 5000.
                                     By default, it generates positional encodings for the first 5000 positions.
        """
        super().__init__()
        # Refer to step_7_drop_out.ipynb to understand more about dropout.
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)
        # Compute the positional encodings in log space.
        positional_encoding = torch.zeros(size=(max_len, encoding_size), dtype=torch.float)
        positional_encoding_numerators = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        numerators_in_exponent = torch.arange(0, encoding_size, 2, dtype=torch.float)
        positional_encoding_denominators = torch.exp(numerators_in_exponent * (-math.log(10000.0) / encoding_size))
        positional_encoding[:, 0::2] = torch.sin(positional_encoding_numerators * positional_encoding_denominators)
        positional_encoding[:, 1::2] = torch.cos(positional_encoding_numerators * positional_encoding_denominators)
        # Refer to 'understanding_pytorch/tensor_manipulations/understanding_tensor_manipulations_part_1.ipynb' (https://github.com/MB1151/understanding_pytorch/blob/main/tensor_manipulations/understanding_tensor_manipulations_part_1.ipynb) 
        # to understand more about unsqueeze operation in pytorch.
        # In transformer model, we receive 3D tensors as input to this module. Each 1D tensor in the last dimension 
        # is an embedding for the token. Each 2D tensor is a sentence. The entire 3D tensor is a batch of sentences. 
        # To work with 3D tensors in the forward method, we convert the positional encoding to a 3D tensor.
        positional_encoding = positional_encoding.unsqueeze(0)
        # Refer to 'understanding_pytorch/tensor_manipulations/using_modules.ipynb' (https://github.com/MB1151/understanding_pytorch/blob/main/modules/using_modules.ipynb) 
        # to understand more about buffers in pytorch. This tells the module to not update the positional encoding 
        # tensor during the training. It is not a trainable parameter but it is still part of the state of the model.
        self.register_buffer('positional_encoding', positional_encoding)
    
    def forward(self, input: Tensor) -> Tensor:
        """Adds the positional encodings to the input tensor.
        Args:
            input (Tensor): The input tensor containing the embeddings of the tokens.
                            shape: [batch_size, sentence_length, d_model]

        Returns:
            Tensor: Input with the positional encodings added to it.
                    shape: [batch_size, sentence_length, d_model]
        """
        # Refer to 'understanding_pytorch/tensor_manipulations/understanding_tensor_manipulations_part_5.ipynb' (https://github.com/MB1151/understanding_pytorch/blob/main/tensor_manipulations/understanding_tensor_manipulations_part_5.ipynb) 
        # to understand more about broadcasting in python.
        #
        # There are two important bits of information condensed in this operation:
        # 1) We pre-calculate positional encodings for a fixed number (max_len=100 here) of positions. However the input 
        #    tensor might be of length 56 in which case, we only need the positional encoding vectors for the first 56
        #    positions. This is the first part.
        # 2) The input tensor is a 3D tensor of shape (batch_size, sentence_length, encoding_size) in the translation 
        #    model. However, the shape of pre-calculated positional encoding is (1, max_len, encoding_size). So, after
        #    step 1 is done, the positional encoding tensor is broadcasted for the addition operation with input tensor.
        #                                                  (broadcasted)
        # positional_encoding: (1, max_len, encoding_size) -------------> (1, sentence_length, encoding_size) 
        #       -- Extracts the positional encodings for the sentence_length from the positional_encoding tensor.
        #
        # (batch_size, sentence_length, encoding_size) --> input
        # (batch_size, sentence_length, encoding_size) --> Resultant tensor shape after broadcasting.
        # requires_grad_(False) is not needed since the positional encoding is already registered
        # as a Buffer and not a trainable parameter. It is just included for clarity.
        input = input + self.positional_encoding[:, :input.size(1)].requires_grad_(False)
        return self.dropout(input)


In [41]:
positional_encoding_layer = PositionalEncoding(encoding_size=16, dropout_prob=0.2)
print(positional_encoding_layer)

PositionalEncoding(
  (dropout): Dropout(p=0.2, inplace=False)
)


In [42]:
# Notice that positional_encoding is a not part of the parameters in the module. This is because
# we registered the positional_encoding tensor as a buffer and not as a parameter.
print(positional_encoding_layer.named_parameters)

<bound method Module.named_parameters of PositionalEncoding(
  (dropout): Dropout(p=0.2, inplace=False)
)>


In [43]:
input_1 = torch.arange(start=0, end=128, dtype=torch.float).reshape(2, 4, 16)
print(input_1)
print(input_1.shape)

tensor([[[  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
           11.,  12.,  13.,  14.,  15.],
         [ 16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,  26.,
           27.,  28.,  29.,  30.,  31.],
         [ 32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,
           43.,  44.,  45.,  46.,  47.],
         [ 48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,
           59.,  60.,  61.,  62.,  63.]],

        [[ 64.,  65.,  66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,
           75.,  76.,  77.,  78.,  79.],
         [ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
           91.,  92.,  93.,  94.,  95.],
         [ 96.,  97.,  98.,  99., 100., 101., 102., 103., 104., 105., 106.,
          107., 108., 109., 110., 111.],
         [112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122.,
          123., 124., 125., 126., 127.]]])
torch.Size([2, 4, 16])


In [39]:
# Lets analyze a single output element:
# output[0][0][1] = 2.5
# input[0][0][1] = 1
# positional_encoding[0][0][1] = 1
# input[0][0][1] + positional_encoding[0][0][1] = 2
#
# Now, the dropout is applied to the output element. So, the element
# is scaled up by 1 / (1 - dropout_prob) = 1 / (1 - 0.2) = 1 / 0.8 = 1.25
# 
# output[0][0][1] = 2 * 1.25 = 2.5
# Of course, this analysis might not be valid (values might change but the logic holds good) if this cell 
# is run again on the input since Dropout's behavior is probablistic.
#
# So, the module behaves as expected.
output = positional_encoding_layer(input_1)
print(output)
print(output.shape)

tensor([[[  0.0000,   2.5000,   0.0000,   5.0000,   0.0000,   0.0000,   7.5000,
           10.0000,  10.0000,   0.0000,  12.5000,  15.0000,  15.0000,   0.0000,
            0.0000,   0.0000],
         [ 21.0518,  21.9254,  22.8887,  24.9380,  25.1248,  27.4938,  27.5395,
           29.9994,  30.0125,  32.4999,   0.0000,  35.0000,  35.0013,  37.5000,
           37.5004,  40.0000],
         [ 41.1366,  40.7298,  43.2389,   0.0000,   0.0000,  47.4751,  47.5790,
           49.9975,  50.0250,  52.4998,   0.0000,  55.0000,  55.0025,  57.5000,
            0.0000,  60.0000],
         [ 60.1764,  60.0125,  63.5158,  64.4784,  65.3694,   0.0000,   0.0000,
           69.9944,  70.0375,  72.4994,   0.0000,  74.9999,  75.0037,  77.5000,
           77.5012,   0.0000]],

        [[ 80.0000,  82.5000,   0.0000,  85.0000,  85.0000,  87.5000,  87.5000,
           90.0000,  90.0000,  92.5000,  92.5000,  95.0000,  95.0000,  97.5000,
            0.0000, 100.0000],
         [101.0518, 101.9254,   0.0000, 104