In [1]:
# In this notebook, you will learn:
#
# 1) How the FeedForward Neural Network is used in Transformers.

In [None]:
import torch

from torch import nn, Tensor

In [None]:
# In the transformer model, the output of Multi-Headed Attention layer is passed to a simple FeedForward Neural 
# Network. We will create a module for this layer and show the input transformation within this layer.

In [4]:
# Hyperparameters shared by the cells below.
batch_size = 2       # sentences per batch
d_model = 8          # size of each input vector
d_feed_forward = 32  # neurons in the intermediate layer of the feed-forward network

In this notebook, we will just show how to create a FeedForward neural network in PyTorch and pass a random input
through the network.

In [None]:
class FeedForwardNN(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    The input is widened from `d_model` to `d_feed_forward` features, passed through
    ReLU and dropout, and then projected back down to `d_model` features.
    """

    def __init__(self, d_model: int, d_feed_forward: int, dropout_prob: float = 0.1):
        """Creates the two linear layers and the dropout layer.

        Args:
            d_model (int): Number of features in the input (and in the output).
            d_feed_forward (int): Number of features in the intermediate layer.
            dropout_prob (float): Probability handed to `nn.Dropout`. Defaults to 0.1.
        """
        super().__init__()
        # Expansion: d_model -> d_feed_forward.
        self.linear_layer_1 = nn.Linear(d_model, d_feed_forward)
        # Compression: d_feed_forward -> d_model, so the output width equals the input width.
        self.linear_layer_2 = nn.Linear(d_feed_forward, d_model)
        # Dropout that is applied to the activated intermediate features in `forward`.
        self.dropout_layer = nn.Dropout(dropout_prob)

    def forward(self, input: Tensor) -> Tensor:
        """Runs the input through linear -> ReLU -> dropout -> linear.

        Args:
            input (Tensor): The output of the Multi-Headed Attention layer.
                            shape: [batch_size, seq_len, d_model]

        Returns:
            Tensor: The transformed features, with the same shape as the input.
                    shape: [batch_size, seq_len, d_model]
        """
        # Widen to d_feed_forward features and zero out the negative activations.
        hidden = torch.relu(self.linear_layer_1(input))
        # Dropout is only active in training mode; in eval mode it is the identity.
        hidden = self.dropout_layer(hidden)
        # Project back to d_model features.
        return self.linear_layer_2(hidden)

In [8]:
# Build the network from the notebook-level hyperparameters and print its layer summary.
feed_forward_nn = FeedForwardNN(
    d_model=d_model,
    d_feed_forward=d_feed_forward,
    dropout_prob=0.1,
)
print(feed_forward_nn)

FeedForwardNN(
  (linear_layer_1): Linear(in_features=8, out_features=32, bias=True)
  (linear_layer_2): Linear(in_features=32, out_features=8, bias=True)
  (dropout_layer): Dropout(p=0.1, inplace=False)
)


In [5]:
# Generating input to experiment with the FeedForward Neural Network
def generate_batch_of_input_data(batch_size: int, seq_len: int, d_model: int) -> Tensor:
    """Returns a random tensor drawn from a standard normal distribution.

    Args:
        batch_size (int): Size of the first dimension.
        seq_len (int): Size of the second dimension.
        d_model (int): Size of the last dimension.

    Returns:
        Tensor: Random values of shape [batch_size, seq_len, d_model].
    """
    shape = (batch_size, seq_len, d_model)
    return torch.randn(shape)

In [9]:
# Random batch standing in for the output of the Multi-Headed Attention layer.
# The feature size is taken from the notebook-level `d_model` instead of repeating the literal 8,
# so the batch always matches the input width the FeedForward network was built with.
# seq_len=3 is an arbitrary small sequence length chosen for this demonstration.
input = generate_batch_of_input_data(batch_size=batch_size, seq_len=3, d_model=d_model)
print("shape: ", input.shape)
print("input: ", input)

shape:  torch.Size([2, 3, 8])
input:  tensor([[[ 0.9430,  0.6256, -0.3153, -1.1407, -1.0138, -2.4470,  0.5556,
           0.0917],
         [ 0.4587,  0.0568, -0.6081, -1.0623, -0.7745,  0.1406,  1.7913,
          -1.2390],
         [-2.2765,  0.4605,  0.3626,  0.0594,  0.0383,  0.1687,  1.0082,
           0.2421]],

        [[ 1.8294,  1.4184,  0.3092, -0.6906, -0.0684, -0.7176,  0.6720,
          -1.5491],
         [-0.8510, -0.7837,  0.1592, -0.3168,  0.7195, -0.3016, -1.9717,
          -1.1617],
         [-1.2917,  0.9663, -0.1424, -0.0375,  1.2998, -0.0973, -0.1307,
           1.8511]]])
