In [1]:
# In this notebook, you learn:
#
# 1) How to build Decoder for the Transformer model?
#       -- The idea is to use all the building blocks you learned in the previous notebooks to 
#          build the Decoder.
#
# IF YOU UNDERSTOOD 'step_12_encoder.ipynb' notebook, IT SHOULD BE VERY EASY TO UNDERSTAND THIS
# CURRENT NOTEBOOK. IT IS ALMOST THE SAME AS THE ENCODER, EXCEPT FOR A FEW CHANGES (SOURCE_ATTENTION). 
# HOWEVER, BE CAREFUL WITH SHAPES IN DECODER ESPECIALLY DURING THE SOURCE ATTENTION CALCULATION.

In [2]:
# Resources I referred to build this notebook:
#
# 1) https://nlp.seas.harvard.edu/annotated-transformer/
#       -- Refer to the DecoderLayer / Decoder part of the blog post.
# 2) https://jalammar.github.io/illustrated-transformer/
#       -- It is a great blog post to understand the Transformer model. I highly recommend reading it.

In [3]:
from torch import nn, Tensor
from typing import Optional, Tuple

import copy
import math
import torch

In [4]:
# CONSTANTS TO BE USED IN THIS NOTEBOOK.
# They are deliberately tiny so that every tensor printed below stays readable.
#
# Number of sentences in a batch.
batch_size = 3
# Number of tokens in a sentence. The same length is used for the src and tgt data in this notebook.
seq_len = 4
# Dimension of the word embeddings. Must be divisible by num_heads (MultiHeadedAttention asserts this),
# which gives d_k = d_model // num_heads = 4 features per head here.
d_model = 8
# Number of heads in the MultiHeadedAttention layer.
num_heads = 2
# Number of neurons in the hidden layer (that expands the input) in the feed forward neural network.
d_feed_forward = 16
# Probability with which the nodes are dropped in the dropout layer.
dropout_prob = 0.1

In [5]:
# Decoder is a stack of 6 identical DecoderLayers. Lets first deep dive into the DecoderLayer.
#
# Every DecoderLayer has three main transformations:
# 1) Multi-headed Attention mechanism on target sentences.
#       -- This is self attention on the tgt sentences i.e., queries, keys and values, all 
#          come from the tgt sentences. 
# 2) Multi-Headed Attention mechanism on the target sentence with the keys, values taken 
#    from the Encoder output (encoded src sentences).
#       -- This is shown as Encoder-Decoder attention in the below image.
#       -- This is similar to the self attention mechanism, but the keys and values come
#          from the output of the Encoder (encoded src sentences) and queries come from the 
#          tgt sentences.
# 3) Position-wise feedforward neural network
#
# Each of these three transformations is wrapped in the same way: dropout is applied to the 
# output of the transformation, the input of the transformation is added back to it (residual 
# connection) and Layer Normalization is applied to the sum.
# So, the overall structure of the decoder layer is:
#
# Input
#   -> Self Attention (Multi-Headed Attention)
#       -> Dropout
#   -> Add Input To Output + Layer Normalization
#   -> Source Attention (Multi-Headed Attention)
#       -> Dropout
#   -> Add Input To Output + Layer Normalization
#   -> Position-wise feedforward neural network
#       -> Dropout
#   -> Add Input To Output + Layer Normalization
# Output
#   -> This is the output of one Decoder Layer

<img src="../../Data/Images/DecoderLayer.png" alt="Decoder Layer" width="450" height="400">

In [None]:
# credits: The above image is taken from this blog post: https://jalammar.github.io/illustrated-transformer/

In [6]:
# SKIP THIS CELL IF YOU ALREADY LOOKED INTO THE STEP_9 AND STEP_10 NOTEBOOKS. ALL THIS CODE IN THIS CELL 
# IS COPIED FROM THE PREVIOUS NOTEBOOKS. JUST RUN IT BLINDLY.
#
def clone_module(module: nn.Module, num_clones: int) -> nn.ModuleList:
    """Create `num_clones` independent deep copies of `module`.

    Args:
        module (nn.Module): Template module to be copied. It is not added to the result itself.
        num_clones (int): Number of copies to create.

    Returns:
        nn.ModuleList: List holding the copies. Every copy starts with the same parameter values
                       as the template but owns its own parameter tensors.
    """
    clones = nn.ModuleList()
    for _ in range(num_clones):
        # deepcopy gives every clone its own parameters, so the clones do not share weights.
        clones.append(copy.deepcopy(module))
    return clones

# Refer to 'step_11_feed_forward_neural_network.ipynb' to understand more about the FeedForwardNN class.
class FeedForwardNN(nn.Module):
    """Position-wise feed forward network: Linear -> ReLU -> Dropout -> Linear.

    The two linear layers act on the last dimension only, so every token vector is transformed
    independently of the other tokens in the sequence.
    """

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = 0.1):
        """
        Args:
            d_model (int): Size of the input and output vectors.
            d_feed_forward (int): Size of the hidden (expanded) representation.
            dropout_prob (float, optional): Dropout probability applied to the hidden 
                                            representation. Defaults to 0.1.
        """
        super().__init__()
        self.linear_layer_1 = nn.Linear(in_features=d_model, out_features=d_feed_forward)
        self.linear_layer_2 = nn.Linear(in_features=d_feed_forward, out_features=d_model)
        self.dropout_layer = nn.Dropout(p=dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Transforms every token vector of the input.

        Args:
            input (Tensor): Output of the preceding attention sublayer.
                            SHAPE: [batch_size, seq_len, d_model]

        Returns:
            Tensor: Transformed token vectors.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        # Expand to the hidden dimension and apply the non-linearity.
        expanded = torch.relu(self.linear_layer_1(input))
        # Regularize the hidden representation.
        regularized = self.dropout_layer(expanded)
        # Project back to the model dimension so the output can be added to the residual stream.
        return self.linear_layer_2(regularized)
    

# Refer to 'step_11_multi_headed_attention.ipynb' to understand more about the MultiHeadedAttention class.
# This function is just copied from that notebook to use it here.
def construct_attention_heads(queries: Tensor, keys: Tensor, values: Tensor, mask: Optional[Tensor]=None, dropout_layer: Optional[nn.Module]=None) -> Tuple[Tensor, Tensor]:
    """Calculates scaled dot-product attention for every head of every sequence in the batch.

    The attention scores between every query token and every key token are computed, optionally masked,
    normalized with softmax, optionally passed through dropout and finally used to take a weighted 
    average of the values.

    Args:
        queries (Tensor): [batch_size, num_heads, query_len, d_k]
        keys (Tensor): [batch_size, num_heads, key_len, d_k]
        values (Tensor): [batch_size, num_heads, key_len, d_k]
                         query_len and key_len are equal for self attention. They can differ for source
                         attention, where the queries come from the tgt sentences and the keys and values
                         come from the encoded src sentences.
        mask (Optional[Tensor], optional): Positions that compare equal to False / 0 are masked out. Must be 
                                           broadcastable to [batch_size, num_heads, query_len, key_len], 
                                           e.g. [batch_size, 1, query_len, key_len]. Defaults to None.
        dropout_layer (Optional[nn.Module], optional): Module (typically nn.Dropout) applied to the normalized
                                                       attention scores. Defaults to None.

    Returns:
        Tuple[Tensor, Tensor]: Returns the attention heads and the attention scores.
                               attention_heads: [batch_size, num_heads, query_len, d_k]
                               attention_scores: [batch_size, num_heads, query_len, key_len]
                               The returned scores are the ones actually used to build the heads, i.e.,
                               after masking, softmax and dropout.
    """
    # Size of the vectors for each token for each head in the sequence.
    d_k = queries.shape[-1]
    # Dot product of every query with every key, scaled by sqrt(d_k) to keep the softmax inputs in a 
    # reasonable range.
    attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(d_k)
    # Mask is used in two different ways:
    # 1) To prevent the model from attending to the padding tokens --> This applies for both src and tgt sentences.
    # 2) To prevent the model from attending to the future tokens in the sequence --> This applies only for tgt sentences.
    if mask is not None:
        # A large negative number is used instead of float('-inf') because a fully masked row would
        # otherwise make softmax return nan.
        attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
    # Normalize the attention scores using softmax.
    attention_scores = attention_scores.softmax(dim=-1)
    if dropout_layer is not None:
        # nn.Dropout is not in-place by default, so its result has to be assigned back. Calling the
        # layer and discarding the result silently disables the regularization.
        attention_scores = dropout_layer(attention_scores)
    # The head for each query token is the weighted average (weighted by the attention scores) of the 
    # values of all the key tokens.
    attention_heads = torch.matmul(attention_scores, values)
    return attention_heads, attention_scores


class MultiHeadedAttention(nn.Module):
    """Multi-Headed Attention layer usable for both self attention and source (encoder-decoder) attention.

    For self attention the same tensor is passed as query_input, key_input and value_input. For source
    attention the queries come from the tgt sentences while the keys and values come from the encoded 
    src sentences, so the query length and the key / value length are allowed to differ.
    """

    def __init__(self, num_heads: int, d_model: int, dropout_prob: float = 0.1):
        """
        Args:
            num_heads (int): Number of attention heads. Must divide d_model.
            d_model (int): Size of the input and output vectors.
            dropout_prob (float, optional): Dropout probability applied to the attention scores. 
                                            Defaults to 0.1.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."
        self.num_heads = num_heads
        self.d_model = d_model
        # Size of the query / key / value vector of a single head.
        self.d_k = d_model // num_heads
        # We use dropout to prevent overfitting.
        self.dropout_layer = nn.Dropout(p=dropout_prob)
        # The first three linear layers generate the queries, keys and values for each token in the sequence.
        # The fourth linear layer generates the output of the layer from the concatenated attention heads.
        self.linear_layers = clone_module(module=nn.Linear(in_features=d_model, out_features=d_model), num_clones=4)


    def forward(self, query_input: Tensor, key_input: Tensor, value_input: Tensor, mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Multi-Headed Attention layer. 

        Args:
            query_input (Tensor): Input to be used for query creation.
                                  SHAPE: [batch_size, query_len, d_model]
            key_input (Tensor): Input to be used for key creation.
                                SHAPE: [batch_size, key_len, d_model]
            value_input (Tensor): Input to be used for value creation.
                                  SHAPE: [batch_size, key_len, d_model]
            mask (Optional[Tensor], optional): Mask to be applied to the attention scores. The same mask is 
                                               applied to all the heads. Must be broadcastable to 
                                               [batch_size, num_heads, query_len, key_len], e.g. 
                                               [batch_size, 1, query_len, key_len]. Defaults to None.

        Returns:
            Tensor: Output of the Multi-Headed Attention layer. Generates one output vector for each 
                    query token. Does this for each sequence in the batch.
                    SHAPE: [batch_size, query_len, d_model]
        """
        batch_size = query_input.shape[0]
        query_len = query_input.shape[1]
        # Generates the queries, keys and values for each token. zip stops after the first three linear
        # layers; the fourth one is used for the output below.
        # queries: [batch_size, query_len, d_model]; keys, values: [batch_size, key_len, d_model]
        projections = [linear_layer(input) for linear_layer, input in zip(self.linear_layers, (query_input, key_input, value_input))]
        # Separating the vectors of the individual heads. The sequence dimension is inferred with -1 
        # instead of reusing the query length, because the keys and values can have a different length
        # than the queries during source attention.
        # Shape after view:      [batch_size, len, num_heads, d_k]
        # Shape after transpose: [batch_size, num_heads, len, d_k]
        queries, keys, values = [data.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2) for data in projections]
        # Calculate the attention heads for each token in the sequence. The attention scores are not needed here.
        # attention_heads: [batch_size, num_heads, query_len, d_k]
        attention_heads, _ = construct_attention_heads(queries=queries, keys=keys, values=values, mask=mask, dropout_layer=self.dropout_layer)
        # Concatenate the attention heads for each token from all the heads.
        # attention_heads: [batch_size, query_len, d_model]
        attention_heads = attention_heads.transpose(1, 2).reshape(batch_size, query_len, self.d_model)
        # Generate the output of the Multi-Headed Attention layer.
        return self.linear_layers[-1](attention_heads)
    

# This class is the same as the SubLayerWrapper in the 'step_14_encoder.ipynb' notebook.
class SubLayerWrapper(nn.Module):
    """Residual wrapper shared by the attention and feed forward sublayers.

    It computes layer_norm(input + dropout(sublayer(input))).
    """

    def __init__(self, d_model: int, dropout_prob: float):
        """
        Args:
            d_model (int): Size of the vectors that are normalized.
            dropout_prob (float): Dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input: Tensor, sublayer: nn.Module) -> Tensor:
        """Runs the sublayer and wraps it with dropout, the residual connection and normalization.

        Args:
            input (Tensor): Input to be transformed by the sublayer.
                            SHAPE: [batch_size, seq_len, d_model]
            sublayer (nn.Module): Transformation to apply. It is simply called with `input`, so any 
                                  callable works; the DecoderLayer passes closures around its attention
                                  modules as well as the feed forward module itself.

        Returns:
            Tensor: Normalized sum of the input and the transformed input.
                    SHAPE: [batch_size, seq_len, d_model]
        """
        transformed = sublayer(input)
        # Residual connection: the untouched input is added to the regularized sublayer output.
        residual_sum = input + self.dropout(transformed)
        return self.layer_norm(residual_sum)

In [7]:
# THE MAIN PART (THAT IS DIFFERENT FROM EARLIER NOTEBOOKS) OF THIS NOTEBOOK STARTS HERE. HOWEVER,
# IT IS STILL VERY SIMILAR TO THE ENCODER PART. SO, IT SHOULD BE STRAIGHT FORWARD TO UNDERSTAND.

In [8]:
# (seq_len - 1) in the decoder input and mask comes from the fact that we remove the last token from the 
# decoder input when we create batches and masks. Refer to 'step_5_data_batching_and_masking.ipynb' notebook
# to understand this better.
class DecoderLayer(nn.Module):
    """One layer of the Decoder: self attention, source attention and a feed forward network.

    Each of the three transformations is run through its own SubLayerWrapper, i.e., dropout, 
    residual connection and layer normalization are applied around it.
    """

    def __init__(self, self_attention: MultiHeadedAttention, src_attention: MultiHeadedAttention, feed_forward: FeedForwardNN, d_model: int, dropout_prob: float):
        """
        Args:
            self_attention (MultiHeadedAttention): Attention module used on the tgt sentences.
            src_attention (MultiHeadedAttention): Attention module used between the tgt sentences 
                                                  (queries) and the encoded src sentences (keys, values).
            feed_forward (FeedForwardNN): Position-wise feed forward network.
            d_model (int): Size of the vectors flowing through the layer.
            dropout_prob (float): Dropout probability used inside the SubLayerWrappers.
        """
        super().__init__()
        self.d_model = d_model
        self.dropout_prob = dropout_prob
        # Assigning the modules as attributes registers them (and their parameters) as children of 
        # the DecoderLayer.
        self.self_attention = self_attention
        self.src_attention = src_attention
        self.feed_forward = feed_forward
        # One wrapper per transformation, so each one gets its own LayerNorm parameters.
        self.sublayer_wrappers = clone_module(module=SubLayerWrapper(d_model=d_model, dropout_prob=dropout_prob), num_clones=3)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the DecoderLayer.

        Args:
            input (Tensor): Target sentences provided as input to the DecoderLayer. For the first 
                            DecoderLayer these are the embeddings of the target sentences.
                            SHAPE: [batch_size, seq_len - 1, d_model]
            encoded_src (Tensor): Encoded source sentences, i.e., the output of the Encoder. The keys 
                                  and values of the source attention are created from it.
                                  SHAPE: [batch_size, seq_len, d_model] 
            tgt_mask (Tensor): Mask used in the self attention. It keeps a target token from attending 
                               to the tokens that come after it and to the padding tokens.
                               SHAPE: [batch_size, 1, seq_len - 1, seq_len - 1]
            src_mask (Optional[Tensor], optional): Mask used in the source attention. It keeps the target 
                                                   tokens from attending to the padding tokens of the 
                                                   src sentences. It is applied to scores of shape
                                                   [batch_size, num_heads, seq_len - 1, seq_len], so it 
                                                   has to be broadcastable to that shape.
                                                   Defaults to None.

        Returns:
            Tensor: Output of the DecoderLayer, i.e., the wrapped output of the feed forward network.
                    SHAPE: [batch_size, seq_len - 1, d_model]
        """
        def attend_to_target(tgt: Tensor) -> Tensor:
            # Queries, keys and values all come from the target sentences.
            return self.self_attention(query_input=tgt, key_input=tgt, value_input=tgt, mask=tgt_mask)

        def attend_to_source(tgt: Tensor) -> Tensor:
            # Intuition: every target token asks (query) which source tokens matter for predicting 
            # its output. The keys and values are not handed over ready-made; the attention module 
            # creates them from the encoded source sentences.
            return self.src_attention(query_input=tgt, key_input=encoded_src, value_input=encoded_src, mask=src_mask)

        self_attention_wrapper, src_attention_wrapper, feed_forward_wrapper = self.sublayer_wrappers
        # First sublayer: Self-Attention on the target sentences. Hence, it uses the tgt_mask.
        after_self_attention = self_attention_wrapper(input=input, sublayer=attend_to_target)
        # Second sublayer: Attention on the source sentences. Hence, it uses the src_mask.
        after_src_attention = src_attention_wrapper(input=after_self_attention, sublayer=attend_to_source)
        # Third sublayer: Positionwise FeedForward Neural Network.
        return feed_forward_wrapper(input=after_src_attention, sublayer=self.feed_forward)
    

# Here, let's try to understand how the shapes change when we calculate source_attention above.
# queries from tgt:    [batch_size, num_heads, seq_len - 1, d_k]
# keys from encoder:   [batch_size, num_heads, seq_len, d_k]
# values from encoder: [batch_size, num_heads, seq_len, d_k]
#
# attention_scores = queries * keys^{transpose}  --> * here represents matrix multiplication.
# attention_scores: [batch_size, num_heads, seq_len - 1, seq_len]
#
# attention_heads = attention_scores * values  --> * here represents matrix multiplication.
# attention_heads: [batch_size, num_heads, seq_len - 1, d_k]
#
# The heads are then concatenated and passed through the output linear layer.
# output of source attention layer: [batch_size, seq_len - 1, d_model]

In [9]:
# Now lets look at the Decoder itself. The Decoder is a stack of 6 identical DecoderLayers.
# The output of one DecoderLayer is passed as input to the next DecoderLayer.

<img src="../../Data/Images/Decoder.png" alt="Decoder" width="450" height="500">

In [10]:
class Decoder(nn.Module):
    """Stack of DecoderLayers followed by a final Layer Normalization."""

    def __init__(self, decoder_layer: DecoderLayer, num_layers: int):
        """
        Args:
            decoder_layer (DecoderLayer): Template layer. It is deep-copied `num_layers` times, so 
                                          all the layers start from the same parameter values but 
                                          do not share parameters.
            num_layers (int): Number of DecoderLayers in the stack (6 in this notebook).
        """
        super().__init__()
        self.decoder_layers = clone_module(module=decoder_layer, num_clones=num_layers)
        self.layer_norm = nn.LayerNorm(decoder_layer.d_model)

    def forward(self, input: Tensor, encoded_src: Tensor, tgt_mask: Tensor, src_mask: Optional[Tensor]=None) -> Tensor:
        """Forward pass of the Decoder. 

        The output of each DecoderLayer is the input of the next one. The same encoded_src, tgt_mask 
        and src_mask are handed to every DecoderLayer. The output of the last DecoderLayer is passed 
        through a Layer Normalization layer and returned.

        Args:
            input (Tensor): Input to the Decoder i.e., embeddings of the tokenized tgt sequences.
                            SHAPE: [batch_size, seq_len - 1, d_model]
            encoded_src (Tensor): Output of the encoder i.e., encoded src sequences.
                                  SHAPE: [batch_size, seq_len, d_model]
            tgt_mask (Tensor): Boolean mask to be applied during self attention scores calculation.
                               SHAPE: [batch_size, 1, seq_len - 1, seq_len - 1].
            src_mask (Optional[Tensor], optional): Boolean mask to be applied during src attention scores 
                                                   calculation. It has to be broadcastable to the src 
                                                   attention scores of shape 
                                                   [batch_size, num_heads, seq_len - 1, seq_len]. 
                                                   Defaults to None.

        Returns:
            Tensor: Output of the Decoder.
                    SHAPE: [batch_size, seq_len - 1, d_model]
        """
        hidden = input
        for layer in self.decoder_layers:
            # Feed the running representation through the next layer of the stack.
            hidden = layer(input=hidden, encoded_src=encoded_src, tgt_mask=tgt_mask, src_mask=src_mask)
        normalized = self.layer_norm(hidden)
        return normalized

In [11]:
# Generates input to experiment with the pipeline.
def generate_batch_of_input_data(batch_size: int, seq_len: int, d_model: int) -> Tensor:
    """Creates a random batch that stands in for embedded sentences.

    Args:
        batch_size (int): Number of sentences in the batch.
        seq_len (int): Number of tokens in every sentence.
        d_model (int): Size of the vector of every token.

    Returns:
        Tensor: Values drawn from the standard normal distribution.
                SHAPE: [batch_size, seq_len, d_model]
    """
    shape = (batch_size, seq_len, d_model)
    return torch.randn(*shape)

# We are just creating a random mask for testing here. This is not how masks are created to be used 
# in the transformers. Refer to 'step_7_data_batching_and_masking.ipynb' to understand how src and 
# tgt masks are created in the transformers.
def construct_random_mask(batch_size: int, seq_len: int) -> Tensor:
    """Creates a random boolean mask for experimenting with the pipeline.

    Args:
        batch_size (int): Number of sentences in the batch.
        seq_len (int): Number of tokens in every sentence.

    Returns:
        Tensor: Boolean mask. Positions set to False will be masked out.
                SHAPE: [batch_size, 1, seq_len, seq_len]
    """
    noise = torch.randn(size=(batch_size, 1, seq_len, seq_len))
    # A position is kept (True) only where the standard normal sample exceeds 0.5, so most 
    # positions end up masked out. The comparison already produces a boolean tensor.
    return noise.gt(0.5)

In [12]:
# Please note that the transformer expects src_data and tgt_data in a specific format. However, 
# for the sake of simplicity, I am just generating random data here. The actual format of the data
# is shown in the 'step_7_data_batching_and_masking.ipynb' notebook. We will also explore it in
# the future notebooks.
#
# NOTE: The src and tgt tensors are created with the same seq_len here, so the 
# [batch_size, 1, seq_len, seq_len] src_mask happens to match the shape of the source attention 
# scores. No random seed is set, so the printed values change on every run.
src_data = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, d_model=d_model)
print("shape: ", src_data.shape)
print("src_data: ", src_data)
print("-----------------------------------------------------")
src_mask = construct_random_mask(batch_size=batch_size, seq_len=seq_len)
print("shape: ", src_mask.shape)
print("src_mask: ", src_mask)
print("-----------------------------------------------------")
tgt_data = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, d_model=d_model)
print("shape: ", tgt_data.shape)
print("tgt_data: ", tgt_data)
print("-----------------------------------------------------")
tgt_mask = construct_random_mask(batch_size=batch_size, seq_len=seq_len)
print("shape: ", tgt_mask.shape)
print("tgt_mask: ", tgt_mask)
print("-----------------------------------------------------")
# In the actual model, we first run the encoder on the src data and then pass the output of the 
# encoder to the decoder. However, for the sake of simplicity, I am just generating another random 
# tensor that stands in for the encoder output (src_data itself is not passed to the decoder).
# We will explore this connection in the future notebooks.
encoder_output = generate_batch_of_input_data(batch_size=batch_size, seq_len=seq_len, d_model=d_model)
print("shape: ", encoder_output.shape)
print("encoder_output: ", encoder_output)

shape:  torch.Size([3, 4, 8])
src_data:  tensor([[[-1.1403e+00, -9.2024e-01,  8.0869e-01,  5.7408e-01, -1.1810e+00,
          -2.8915e-01,  2.4586e-01, -1.3560e+00],
         [-6.8936e-01,  2.0820e-01, -2.3100e-01,  2.7594e-01, -7.0491e-01,
          -3.3236e+00, -1.6732e-02, -4.9242e-01],
         [-8.3267e-01, -1.2729e-01,  1.3480e-01, -7.5081e-01, -1.6244e-01,
           1.7945e+00,  6.8148e-01,  4.4234e-01],
         [ 1.6017e+00, -6.5314e-01,  1.2028e+00,  1.2371e+00, -1.5162e-01,
           1.8575e-03, -1.2741e+00, -1.6410e-01]],

        [[-4.8748e-02,  4.0851e-01, -6.6606e-01,  1.1173e-01, -5.0911e-01,
           1.0118e+00,  5.2170e-01, -1.0424e-01],
         [ 8.1525e-02,  3.4766e-01,  7.4616e-01,  1.2160e+00,  1.5857e+00,
          -2.8427e-02, -9.0159e-01, -1.2319e+00],
         [-1.6478e+00, -6.8876e-01,  4.7857e-01,  8.3703e-01, -7.3230e-01,
          -1.2180e+00, -5.0042e-02, -2.2114e+00],
         [-1.2035e-01,  1.2297e+00, -5.9811e-01,  6.7815e-03,  1.1323e-01,
       

In [13]:
# Template building blocks. They are not used directly by the decoder; deep copies of them are
# handed to the DecoderLayer in the next cell.
multiheaded_attention = MultiHeadedAttention(num_heads=num_heads, d_model=d_model, dropout_prob=dropout_prob)
print("multiheaded_attention: ", multiheaded_attention)
print("----------------------------------------------")
feed_forward_nn = FeedForwardNN(d_model=d_model, d_feed_forward=d_feed_forward, dropout_prob=dropout_prob)
print("feed_forward_nn: ", feed_forward_nn)
print("----------------------------------------------")

multiheaded_attention:  MultiHeadedAttention(
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (linear_layers): ModuleList(
    (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
  )
)
----------------------------------------------
feed_forward_nn:  FeedForwardNN(
  (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
  (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
  (dropout_layer): Dropout(p=0.1, inplace=False)
)
----------------------------------------------


In [14]:
# The self attention and the source attention each get their own deep copy of the template 
# attention module, so they do not share parameters with each other.
decoder_layer = DecoderLayer(self_attention=copy.deepcopy(multiheaded_attention), 
                             src_attention=copy.deepcopy(multiheaded_attention), 
                             feed_forward=copy.deepcopy(feed_forward_nn), 
                             d_model=d_model, 
                             dropout_prob=dropout_prob)
print("decoder_layer: ", decoder_layer)

decoder_layer:  DecoderLayer(
  (self_attention): MultiHeadedAttention(
    (dropout_layer): Dropout(p=0.1, inplace=False)
    (linear_layers): ModuleList(
      (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
    )
  )
  (src_attention): MultiHeadedAttention(
    (dropout_layer): Dropout(p=0.1, inplace=False)
    (linear_layers): ModuleList(
      (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
    )
  )
  (feed_forward): FeedForwardNN(
    (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
    (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
    (dropout_layer): Dropout(p=0.1, inplace=False)
  )
  (sublayer_wrappers): ModuleList(
    (0-2): 3 x SubLayerWrapper(
      (dropout): Dropout(p=0.1, inplace=False)
      (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    )
  )
)


In [15]:
# The Decoder deep-copies the given DecoderLayer num_layers times. The six layers therefore start 
# from identical parameter values but are independent of each other.
decoder = Decoder(decoder_layer=decoder_layer, num_layers=6)
print("decoder: ", decoder)

decoder:  Decoder(
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attention): MultiHeadedAttention(
        (dropout_layer): Dropout(p=0.1, inplace=False)
        (linear_layers): ModuleList(
          (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
        )
      )
      (src_attention): MultiHeadedAttention(
        (dropout_layer): Dropout(p=0.1, inplace=False)
        (linear_layers): ModuleList(
          (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
        )
      )
      (feed_forward): FeedForwardNN(
        (linear_layer_1): Linear(in_features=8, out_features=16, bias=True)
        (linear_layer_2): Linear(in_features=16, out_features=8, bias=True)
        (dropout_layer): Dropout(p=0.1, inplace=False)
      )
      (sublayer_wrappers): ModuleList(
        (0-2): 3 x SubLayerWrapper(
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        )
     

In [16]:
# Run the whole decoder stack on the random tgt data. `encoder_output` stands in for the output 
# of a real encoder. The output has the same shape as tgt_data.
# NOTE(review): no `.eval()` call is visible in this notebook, so the modules are presumably still 
# in training mode and dropout is active -- confirm if reproducible numbers are needed.
decoder_output = decoder(input=tgt_data, encoded_src=encoder_output, tgt_mask=tgt_mask, src_mask=src_mask)
print("shape: ", decoder_output.shape)
print("decoder_output: ", decoder_output)

shape:  torch.Size([3, 4, 8])
decoder_output:  tensor([[[-1.1061, -0.5500,  0.0733, -0.7903,  1.2729,  1.9033, -0.0283,
          -0.7748],
         [-0.1326, -0.5112, -0.0748, -0.2682,  1.7345,  1.4551, -0.8880,
          -1.3149],
         [-0.5470, -1.0540, -0.3246, -0.5026,  0.9811,  2.0944,  0.2564,
          -0.9038],
         [-0.1037, -0.6275, -0.6672,  0.0567,  0.9113,  2.1837, -0.8015,
          -0.9518]],

        [[-0.1121,  0.2154, -0.3915,  0.7804,  1.0024,  1.4002, -1.5558,
          -1.3390],
         [ 0.3801,  0.4446, -0.3432,  0.5393,  0.6576,  1.3945, -1.8112,
          -1.2616],
         [ 0.1352, -0.0923, -0.4717,  0.6608,  0.6897,  1.7826, -1.3983,
          -1.3059],
         [ 0.1704,  0.0198, -0.4958,  0.4451,  0.7853,  1.7894, -1.4684,
          -1.2458]],

        [[ 0.5008,  0.4563, -0.4886,  1.0461,  0.4811,  1.0075, -1.9767,
          -1.0265],
         [ 0.7084,  0.4482, -0.3441,  0.9553,  0.2433,  1.0916, -1.8691,
          -1.2336],
         [-0.2425, 