In [21]:
# In this notebook, you learn:
#
# 1) How to build Encoder for the Transformer model?
#       -- The idea is to use all the building blocks you learned in the previous notebooks to build the Encoder.

In [22]:
# Resources I referred to build this notebook:
#
# 1) https://nlp.seas.harvard.edu/annotated-transformer/
#       -- Refer to the EncoderLayer code part of the blog post.
# 2) https://jalammar.github.io/illustrated-transformer/
#       -- It is a great blog post to understand the Transformer model. I highly recommend reading it.

In [1]:
from torch import nn, Tensor
from typing import Optional, Tuple

import copy
import math
import torch

In [2]:
# CONSTANTS TO BE USED IN THIS NOTEBOOK.
# They are deliberately tiny so that the printed tensors stay readable.
# Number of sentences in a batch.
batch_size = 3
# Number of tokens in a sentence.
seq_len = 4
# Dimension of the word embeddings. Must be divisible by num_heads (asserted in MultiHeadedAttention).
d_model = 8
# Number of heads in the MultiHeadedAttention layer.
num_heads = 2
# Number of neurons in the hidden layer (that expands the input) in the feed forward neural network.
d_feed_forward = 16
# Probability with which the nodes are dropped in the dropout layers.
dropout_prob = 0.1

In [3]:
# Encoder is a stack of 6 identical EncoderLayers. Let's first dive deep into the EncoderLayer.
#
# Every EncoderLayer has two main transformations:
# 1) Multi-head self-attention mechanism
# 2) Position-wise feedforward neural network
#
# Each of these two transformations is wrapped in the same way: dropout is applied to its output, the
# input is added back to that output (residual connection), and the sum is passed through a Layer
# Normalization layer. So, the overall structure of the encoder layer (as implemented below) is:
#
# Input
#   -> Multi-head self-attention
#   -> Dropout
#   -> Add Input To Output (residual connection)
#   -> Layer Normalization
#   -> Position-wise feedforward neural network
#   -> Dropout
#   -> Add Input To Output (residual connection)
#   -> Layer Normalization
# Output
#   -> This is the output of one encoder layer

<img src="../../Data/Images/EncoderLayer.png" alt="Encoder Layer" width="450" height="400">

In [4]:
# credits: The above image is taken from this blog post: https://jalammar.github.io/illustrated-transformer/

In [27]:
# The EncoderLayer class can be viewed as a combination of two sublayers:
# 1) MultiHeadedAttention + LayerNormalization
# 2) PositionwiseFeedforward + LayerNormalization
#
# Both the MultiHeadedAttention and PositionwiseFeedForward are two operations that take an input and 
# produce an output. We don't need to differentiate between these two operations while using them in
# the EncoderLayer class. We can treat them as black boxes that take an input and produce an output.
# So, we will create a single class called 'SubLayerWrapper' that takes in an operation 
# (MultiHeadedAttention or PositionwiseFeedForward), applies to the input, then applies a dropout, 
# adds the input back to the transformed input, does normalization and returns the output. 

In [28]:
# SKIP THIS CELL IF YOU ALREADY LOOKED INTO THE STEP_9 AND STEP_10 NOTEBOOKS. JUST RUN IT BLINDLY.
#
def clone_module(module: nn.Module, num_clones: int) -> nn.ModuleList:
    """Creates `num_clones` independent deep copies of `module`.

    Args:
        module (nn.Module): Module to be copied. It is not modified and is not part of the result.
        num_clones (int): Number of copies to create.

    Returns:
        nn.ModuleList: List holding the copies. Every copy starts with the same parameter values 
                       as `module`, but owns its own parameter tensors (nothing is shared).
    """
    clones = nn.ModuleList()
    for _ in range(num_clones):
        clones.append(copy.deepcopy(module))
    return clones

# Refer to 'step_11_feed_forward_neural_network.ipynb' to understand more about the FeedForwardNN class.
class FeedForwardNN(nn.Module):
    """Position-wise feed forward network: Linear (expand) -> ReLU -> Dropout -> Linear (compress)."""

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = 0.1):
        """Creates the two linear layers and the dropout layer.

        Args:
            d_model (int): Size of the input and output vectors.
            d_feed_forward (int): Size of the hidden (expanded) vectors.
            dropout_prob (float, optional): Probability with which hidden activations are dropped. 
                                            Defaults to 0.1.
        """
        super().__init__()
        # Expands every token vector from d_model to d_feed_forward.
        self.linear_layer_1 = nn.Linear(d_model, d_feed_forward)
        # Compresses every token vector back from d_feed_forward to d_model.
        self.linear_layer_2 = nn.Linear(d_feed_forward, d_model)
        self.dropout_layer = nn.Dropout(dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Transforms every token vector independently of the other tokens.

        Args:
            input (Tensor): Output forwarded from the Attention layer.
                            input: [batch_size, seq_len, d_model]

        Returns:
            Tensor: Transformed token vectors.
                    output: [batch_size, seq_len, d_model]
        """
        # Expand to the higher dimension and apply the ReLU non-linearity.
        expanded = torch.relu(self.linear_layer_1(input))
        # Dropout on the hidden activations to prevent overfitting.
        regularized = self.dropout_layer(expanded)
        # Compress back to the original dimension so the output can be added to the input
        # by the residual connection around this layer.
        return self.linear_layer_2(regularized)
    

# Refer to 'step_11_multi_headed_attention.ipynb' to understand more about the MultiHeadedAttention class.
# This function is just copied from that notebook to use it here.
def construct_attention_heads(queries: Tensor, keys: Tensor, values: Tensor, mask: Optional[Tensor]=None, dropout_layer: Optional[nn.Module]=None) -> Tuple[Tensor, Tensor]:
    """Calculates the attention scores for each token in the sequence with every other token in the sequence.
       Applies the mask if provided and then normalizes the scores using softmax. It then calculates the 
       attention heads for each token in the sequence.

    Args:
        queries (Tensor): [batch_size, num_heads, seq_len, d_k]
        keys (Tensor): [batch_size, num_heads, seq_len, d_k]
        values (Tensor): [batch_size, num_heads, seq_len, d_k]
        mask (Optional[Tensor], optional): [batch_size, 1, seq_len, seq_len]. Positions where the mask is 
                                           False are masked out. Defaults to None (nothing is masked).
        dropout_layer (Optional[nn.Module], optional): Dropout module applied to the normalized attention 
                                                       scores. Defaults to None (no dropout).

    Returns:
        Tuple[Tensor, Tensor]: Returns the attention heads and the attention scores.
                               attention_heads: [batch_size, num_heads, seq_len, d_k]
                               attention_scores: [batch_size, num_heads, seq_len, seq_len]
                               The returned attention_scores are the ones actually used to build the 
                               heads, i.e., after dropout when a dropout_layer is provided.
    """
    # Size of the vectors for each token for each head in the sequence.
    d_k = queries.shape[-1]
    # Calculate the attention scores for each token in the sequence with every other token in the sequence.
    # The last two dimensions of keys are swapped so that the matmul yields a [seq_len, seq_len] matrix
    # per head. The division by sqrt(d_k) is the scaling used in scaled dot-product attention.
    attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(d_k)
    # Mask the attention scores if a mask is provided. Mask is used in two different ways:
    # 1) To prevent the model from attending to the padding tokens --> This applies for both src and tgt sentences.
    # 2) To prevent the model from attending to the future tokens in the sequence --> This applies only for tgt sentences.
    if mask is not None:
        # Please do not set the masked values to float('-inf') as it sometimes (not in every case) causes softmax
        # to return nan (for example, when every position in a row is masked).
        attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
    # Normalize the attention scores using softmax.
    attention_scores = attention_scores.softmax(dim=-1)
    # Apply dropout regularization to prevent overfitting problems. nn.Dropout returns a new tensor 
    # (it does not modify its input in place by default), so the result has to be assigned back. 
    # Otherwise the dropout has no effect at all.
    if dropout_layer is not None:
        attention_scores = dropout_layer(attention_scores)
    # Calculate the attention heads for each token in the sequence. The head for each token is calculated by
    # taking the weighted average (averaged by attention scores) of the values for all the tokens in the 
    # sequence for the token of interest.
    attention_heads = torch.matmul(attention_scores, values)
    return attention_heads, attention_scores


# Refer to 'using_modules.ipynb' (Add link to the notebook) to understand more about Pytorch modules.
class MultiHeadedAttention(nn.Module):
    """Multi-Headed Attention layer: projects the inputs to queries, keys and values, runs scaled 
       dot-product attention independently in every head and mixes the concatenated heads with a 
       final linear layer.
    """

    def __init__(self, num_heads: int, d_model: int, dropout_prob: float = 0.1):
        """Creates the projection layers and the dropout layer.

        Args:
            num_heads (int): Number of attention heads. d_model must be divisible by it.
            d_model (int): Size of the token vectors entering and leaving this layer.
            dropout_prob (float, optional): Probability used by the dropout layer that is handed to 
                                            the attention computation. Defaults to 0.1.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."
        self.num_heads = num_heads
        self.d_model = d_model
        # Size of the query / key / value vector of a single head.
        self.d_k = d_model // num_heads
        # We use dropout to prevent overfitting.
        self.dropout_layer = nn.Dropout(p=dropout_prob)
        # Four linear layers: indices 0, 1 and 2 create the queries, keys and values respectively and
        # the last one creates the output from the concatenated attention heads. They are deep copies 
        # of one freshly created nn.Linear, so all four start with the same initial weights.
        self.linear_layers = clone_module(module=nn.Linear(in_features=d_model, out_features=d_model), num_clones=4)

    def forward(self, query_input: Tensor, key_input: Tensor, value_input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Multi-Headed Attention layer.

        Args:
            query_input (Tensor): Input to be used for query creation.
                                  query_input: [batch_size, seq_len, d_model]
            key_input (Tensor): Input to be used for key creation.
                                key_input  : [batch_size, seq_len, d_model]
            value_input (Tensor): Input to be used for value creation.
                                  value_input: [batch_size, seq_len, d_model]
            mask (Optional[Tensor], optional): Mask to be applied to the attention scores. The same mask 
                                               is applied to all the heads. Defaults to None.
                                               mask: [batch_size, 1, seq_len, seq_len]

        Returns:
            Tensor: Output of the Multi-Headed Attention layer. One output vector is generated for each 
                    token in the sequence, for each sequence in the batch.
                    output: [batch_size, seq_len, d_model]
        """
        # Both sizes are read from query_input and reused for the keys and the values as well.
        batch_size, seq_len = query_input.shape[0], query_input.shape[1]

        def split_into_heads(projected: Tensor) -> Tensor:
            # The vectors of all the heads come out of the linear layer concatenated along the last 
            # dimension. Separate them and move the head dimension next to the batch dimension.
            # [batch_size, seq_len, d_model] -> [batch_size, seq_len, num_heads, d_k]
            #                                -> [batch_size, num_heads, seq_len, d_k]
            per_head = projected.view(batch_size, seq_len, self.num_heads, self.d_k)
            return per_head.transpose(dim0=1, dim1=2)

        # Projections of the inputs. Shape of each: [batch_size, seq_len, d_model]
        projected_queries = self.linear_layers[0](query_input)
        projected_keys = self.linear_layers[1](key_input)
        projected_values = self.linear_layers[2](value_input)
        # attention_heads: [batch_size, num_heads, seq_len, d_k]. The attention scores are not needed here.
        attention_heads, _ = construct_attention_heads(
            queries=split_into_heads(projected_queries),
            keys=split_into_heads(projected_keys),
            values=split_into_heads(projected_values),
            mask=mask,
            dropout_layer=self.dropout_layer,
        )
        # Put the heads of every token next to each other again and concatenate them.
        # concatenated_heads: [batch_size, seq_len, d_model]
        concatenated_heads = attention_heads.transpose(dim0=1, dim1=2).reshape(batch_size, seq_len, self.d_model)
        # The last linear layer generates the output of the Multi-Headed Attention layer.
        return self.linear_layers[-1](concatenated_heads)

In [29]:
# Notice that dropout and layer_norm are child modules of the SubLayerWrapper class, i.e., they are 
# registered with it and any parameters they hold are trained together with it. However, 'sublayer' 
# (argument to the forward function) is not a child module of the SubLayerWrapper class. It is only 
# passed as an argument to the forward method of the SubLayerWrapper class.
class SubLayerWrapper(nn.Module):
    """Wraps an operation (MultiHeadedAttention or PositionwiseFeedForward) with dropout, a residual 
       connection and layer normalization. The operation itself is supplied on every forward call.
    """

    def __init__(self, d_model: int, dropout_prob: float):
        """Creates the dropout and the layer normalization layers used around the sublayer.

        Args:
            d_model (int): Dimension of the vectors used in the Attention model.
            dropout_prob (float): probability with which nodes can be dropped.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        self.layer_norm = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, input: Tensor, sublayer: nn.Module) -> Tensor:
        """Computes layer_norm(input + dropout(sublayer(input))).

        Args:
            input (Tensor): Input to be transformed by the sublayer.
                            shape: [batch_size, seq_len, d_model]
            sublayer (nn.Module): Operation applied to the input. It is called with the input as its 
                                  only argument and must return a tensor of the same shape.

        Returns:
            Tensor: Output of the sublayer transformation.
                    shape: [batch_size, seq_len, d_model]
        """
        transformed = sublayer(input)
        # Residual connection: the input is added back to the (regularized) transformed input.
        residual_sum = input + self.dropout(transformed)
        return self.layer_norm(residual_sum)

In [30]:
# The MultiHeadedAttention (self_attention here) and FeedForward modules are also common to the DecoderLayer
# (common meaning they have the same implementation and instantiation mechanism, not that they share weights).
# Hence, we create them in a common way at the top level and pass them as arguments to the EncoderLayer and 
# DecoderLayer classes. Passing them as arguments is more of a design choice than a necessity. Since 
# EncoderLayer is a common abstraction that can act on any kind of layers, it is reasonable to create the 
# encoder layer as a container and pass the layers as arguments to the container. 
class EncoderLayer(nn.Module):
    """One layer of the Encoder: self-attention followed by a feed forward network, each wrapped in a 
       SubLayerWrapper (dropout + residual connection + layer normalization).
    """

    def __init__(self, self_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        """Stores the two sublayers and creates one SubLayerWrapper for each of them.

        Args:
            self_attention (MultiHeadedAttention): Attention module used for the self-attention step. 
                                                   It is stored as-is (not copied).
            feed_forward (FeedForwardNN): Feed forward module applied after the self-attention step. 
                                          It is stored as-is (not copied).
            d_model (int): Dimension of the vectors used in the Attention model.
            dropout_prob (float): probability with which nodes can be dropped in the SubLayerWrappers.
        """
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # These modules are now the child modules of the EncoderLayer, so their parameters are part of 
        # the parameters of the EncoderLayer.
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        # We need two instances of the SubLayerWrapper class. One for the self_attention and the other for the feed_forward.
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=self.d_model, dropout_prob=self.dropout_prob), num_clones=2)

    def forward(self, input: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """This method is the forward pass of the EncoderLayer class.

        Args:
            input (Tensor): Source sentence provided as input to the EncoderLayer. These are the embeddings of the source 
                            sentence for the first EncoderLayer.
                            SHAPE: [batch_size, seq_len, d_model]
            mask (Optional[Tensor], optional): Boolean mask to be applied to the input during attention scores 
                                               calculation. Defaults to None, in which case nothing is masked.
                                               SHAPE: [batch_size, 1, seq_len, seq_len]
        Returns:
            Tensor: Output of the EncoderLayer.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # SubLayerWrapper calls its sublayer with a single argument, whereas self_attention needs the same 
        # tensor as query, key and value input plus the mask. This closure adapts the call and is executed 
        # by sublayer_wrappers[0] when needed.
        def attend_to_self(tokens: Tensor) -> Tensor:
            return self.self_attention(query_input=tokens, key_input=tokens, value_input=tokens, mask=mask)

        output = self.sublayer_wrappers[0](input, attend_to_self)
        return self.sublayer_wrappers[1](output, self.feed_forward)

In [31]:
# Generating random input to experiment with the EncoderLayer and the Encoder.
def generate_batch_of_input_data(batch_size: int, seq_len: int, d_model: int) -> Tensor:
    """Creates a random batch that plays the role of the token embeddings.

    Args:
        batch_size (int): Number of sentences in the batch.
        seq_len (int): Number of tokens in a sentence.
        d_model (int): Size of the vector of each token.

    Returns:
        Tensor: Values sampled from the standard normal distribution.
                shape: [batch_size, seq_len, d_model]
    """
    shape = (batch_size, seq_len, d_model)
    return torch.randn(*shape)

In [32]:
# Random batch of shape [batch_size, seq_len, d_model] that stands in for the embedded source sentences.
# No random seed is set, so the printed values differ from run to run.
input_data = generate_batch_of_input_data(batch_size, seq_len, d_model)
print("shape: ", input_data.shape)
print("input_data: ", input_data)

shape:  torch.Size([3, 4, 8])
input_data:  tensor([[[ 0.2582, -1.3831, -1.9438,  0.5316, -0.1784,  0.0083,  0.2884,
          -0.8376],
         [ 0.2480,  0.8048, -2.7526,  1.5818, -0.2004,  0.5739,  0.5272,
           0.0656],
         [ 1.1624,  1.2678,  0.1227,  0.2279, -0.5776, -0.5855, -0.9168,
           0.0203],
         [ 1.4978,  1.0155,  0.4399, -0.8802, -0.1166,  0.3307,  0.1024,
          -1.5923]],

        [[ 1.3564,  1.2711, -0.0802, -0.3287, -1.6290, -0.1725, -0.8015,
          -1.1178],
         [-0.8002, -0.6339,  0.3804,  2.0195,  0.7491, -0.3428,  0.5965,
          -0.6687],
         [-0.8388, -1.0904,  0.9625, -0.5597,  1.2343,  0.6819,  1.2447,
          -0.3270],
         [ 0.4437, -1.3110,  0.3671, -2.0343, -1.1671, -1.5498, -0.4165,
          -0.5960]],

        [[-0.5625, -0.5839,  2.0064, -0.8538, -0.2274,  0.7378,  0.3497,
           1.1398],
         [-0.3905, -0.1371, -1.7092, -0.3213, -0.9406, -0.9600,  0.3203,
           1.0399],
         [-0.4192,  0.2

In [33]:
multiheaded_attention = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)
print("multiheaded_attention: ", multiheaded_attention)
print("----------------------------------------------")
feed_forward_nn = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)
print("feed_forward_nn: ", feed_forward_nn)
print("----------------------------------------------")
# We are using the deepcopy function to create a new instance of the multiheaded_attention and feed_forward_nn.
encoder_layer = EncoderLayer(self_attention=copy.deepcopy(multiheaded_attention), 
                             feed_forward=copy.deepcopy(feed_forward_nn), 
                             d_model=d_model, 
                             dropout_prob=dropout_prob)
print("encoder_layer: ", encoder_layer)

multiheaded_attention:  MultiHeadedAttention(
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (linear_layers): ModuleList(
    (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
  )
)
----------------------------------------------
feed_forward_nn:  FeedForwardNN(
  (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
  (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
  (dropout_layer): Dropout(p=0.1, inplace=False)
)
----------------------------------------------
encoder_layer:  EncoderLayer(
  (self_attention): MultiHeadedAttention(
    (dropout_layer): Dropout(p=0.1, inplace=False)
    (linear_layers): ModuleList(
      (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
    )
  )
  (feed_forward): FeedForwardNN(
    (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
    (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
    (dropout_layer): Dropout(p=0.1, inplace=False)
  )
  (sublayer_wrappers):

In [34]:
# Run a single EncoderLayer on the batch without a mask (mask=None), so every token can attend to every 
# other token. The module was never switched to eval mode, so the dropout layers are active here.
encoder_layer_output = encoder_layer(input=input_data, mask=None)
print("encoder_layer_output: ", encoder_layer_output)

encoder_layer_output:  tensor([[[ 0.6012, -1.4467, -1.3647,  1.5490,  0.0944,  0.6453,  0.6146,
          -0.6933],
         [ 0.0136, -0.0800, -2.1195,  1.6702,  0.2798,  0.5962,  0.1458,
          -0.5061],
         [ 1.7685,  1.0970, -0.1746, -0.1778, -0.4999, -0.9963, -1.4695,
           0.4526],
         [ 2.1072,  0.2719,  0.2543, -1.1158, -0.1703,  0.3268, -0.2718,
          -1.4023]],

        [[ 1.9583,  0.6615, -0.1043,  0.3946, -1.5033, -0.1120, -1.1217,
          -0.1731],
         [-0.6857, -1.2708,  0.6155,  1.9099,  0.9417, -0.6115, -0.1152,
          -0.7838],
         [-0.8260, -1.8154,  0.6749, -0.2955,  1.4778,  0.6743,  0.7010,
          -0.5910],
         [ 1.0786, -1.0121,  1.6652, -1.1747,  0.0932, -1.0857, -0.2165,
           0.6519]],

        [[-1.2896, -1.1925,  1.5293, -0.1240, -0.5423,  1.0088, -0.4238,
           1.0341],
         [-1.0181, -0.2743, -1.5114,  1.0017, -0.1723, -0.2879,  0.4495,
           1.8127],
         [-1.2134, -0.2620, -0.6588,  1.634

In [35]:
# Now let's look at the Encoder itself. The Encoder is a stack of 6 identical EncoderLayers.
# The output of one EncoderLayer is passed as input to the next EncoderLayer.

<img src="../../Data/Images/Encoder.png" alt="Encoder" width="450" height="500">

In [36]:
class Encoder(nn.Module):
    """Stack of EncoderLayers followed by a final Layer Normalization layer."""

    def __init__(self, encoder_layer: EncoderLayer, num_layers: int):
        """Builds the stack by cloning the given EncoderLayer.

        Args:
            encoder_layer (EncoderLayer): Template layer. The stack holds deep copies of it, so every 
                                          layer starts with the same parameter values as the template 
                                          but does not share them with it or with the other layers.
            num_layers (int): Number of EncoderLayers in the stack.
        """
        super().__init__()
        self.encoder_layers = clone_module(module=encoder_layer, num_clones=num_layers)
        # The normalized dimension is read from the template layer.
        self.layer_norm = nn.LayerNorm(encoder_layer.d_model)

    def forward(self, input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Encoder. The EncoderLayers are applied one after the other, each one 
           receiving the output of the previous one (the first one receives `input`). The same mask is 
           given to every layer. The output of the last EncoderLayer is passed through a Layer 
           Normalization layer and returned as the final output of the Encoder.

        Args:
            input (Tensor): Input to the Encoder i.e., embeddings of the tokenized src sequences.
                            input: [batch_size, seq_len, d_model]
            mask (Optional[Tensor], optional): Boolean mask to be applied during attention scores calculation.
                                               mask: [batch_size, 1, seq_len, seq_len]. Defaults to None.

        Returns:
            Tensor: Output of the Encoder i.e., encoded src sentences.
                    output: [batch_size, seq_len, d_model]
        """
        hidden_state = input
        for layer in self.encoder_layers:
            hidden_state = layer(input=hidden_state, mask=mask)
        normalized_output = self.layer_norm(hidden_state)
        return normalized_output

In [37]:
# Build the full Encoder with 6 layers. Each layer is a deep copy of 'encoder_layer' (see clone_module), 
# so the layers start with identical weights but do not share them.
encoder = Encoder(encoder_layer=encoder_layer, num_layers=6)
print("encoder: ", encoder)

encoder:  Encoder(
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attention): MultiHeadedAttention(
        (dropout_layer): Dropout(p=0.1, inplace=False)
        (linear_layers): ModuleList(
          (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
        )
      )
      (feed_forward): FeedForwardNN(
        (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
        (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
        (dropout_layer): Dropout(p=0.1, inplace=False)
      )
      (sublayer_wrappers): ModuleList(
        (0-1): 2 x SubLayerWrapper(
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
)


In [38]:
# We are just creating a random mask for testing here. This is not how masks are created to be used 
# in the transformers. Refer to 'step_7_data_batching_and_masking.ipynb' to understand how masks 
# are created in the transformers.
def construct_random_mask(batch_size: int, seq_len: int) -> Tensor:
    """Creates a random boolean mask, meant only for testing.

    Args:
        batch_size (int): Number of sentences in the batch.
        seq_len (int): Number of tokens in a sentence.

    Returns:
        Tensor: Boolean mask in which an entry is True when the corresponding standard normal sample 
                is greater than 0.5. Entries that are False get masked out by the attention.
                shape: [batch_size, 1, seq_len, seq_len]
    """
    mask_shape = (batch_size, 1, seq_len, seq_len)
    random_values = torch.randn(size=mask_shape)
    # If some index is set to False, then it will be masked out.
    return torch.gt(random_values, 0.5).bool()

In [39]:
# One random mask per sentence. Its second dimension has size 1, so the same mask is used for all the heads.
mask = construct_random_mask(batch_size=batch_size, seq_len=seq_len)
print("shape: ", mask.shape)
print("mask: \n", mask)

shape:  torch.Size([3, 1, 4, 4])
mask: 
 tensor([[[[ True, False, False, False],
          [False,  True,  True, False],
          [ True, False, False, False],
          [False, False,  True,  True]]],


        [[[False, False, False, False],
          [False, False, False,  True],
          [ True,  True, False, False],
          [False,  True,  True,  True]]],


        [[[ True, False, False,  True],
          [False,  True,  True,  True],
          [False, False, False, False],
          [False, False, False, False]]]])


In [40]:
# Run the whole Encoder on the batch with the random mask. The same mask is passed to every EncoderLayer.
encoder_output = encoder(input=input_data, mask=mask)
print("encoder_output: ", encoder_output)

encoder_output:  tensor([[[-0.3792, -1.9030,  0.3590,  1.7789,  0.0416,  0.8241, -0.3721,
          -0.3493],
         [ 0.2684, -0.6450,  0.9577,  0.3035,  1.5067, -1.0332, -1.7380,
           0.3798],
         [-1.2583, -0.2144,  0.2131,  0.3853,  1.7022, -0.4989, -1.3844,
           1.0554],
         [ 0.3593, -0.3572,  1.4368, -0.2228,  0.5058, -1.0850, -1.7314,
           1.0946]],

        [[-0.0035, -0.9030,  1.2484, -0.0305,  1.2161, -1.0708, -1.4315,
           0.9749],
         [-0.3114, -0.7708,  1.2795,  0.1511,  1.1976, -1.1423, -1.3911,
           0.9874],
         [-0.0956, -1.5132,  1.5087,  0.6951,  0.8065, -0.5209, -1.3437,
           0.4631],
         [-0.4436, -0.9031,  1.2466, -0.0507,  1.5930, -1.0334, -1.1376,
           0.7290]],

        [[-1.7342, -0.9609,  0.1137,  1.1219, -0.4349,  1.4785, -0.2019,
           0.6177],
         [-1.8125, -0.2758, -0.6063,  1.5421,  0.0272,  1.3618, -0.1888,
          -0.0477],
         [-0.8641, -0.9186, -0.5245,  1.7321, -0.