In [75]:
# In this notebook, you will learn:
#
# 1) What label smoothing is and how to use it.

In [None]:
# Resources to understand label smoothing:
#
# 1) https://www.youtube.com/watch?v=wmUiOAra_-M
# 2) https://towardsdatascience.com/label-smoothing-make-your-model-less-over-confident-b12ea6f81a9a

In [76]:
# LABEL SMOOTHING:
#   
# Label smoothing is a regularization technique that is used to prevent the model from becoming too 
# confident about its predictions. We just add a small amount of noise to the labels.
# In the case of classification, the target is a one-hot vector and looks like [0, 0, 1, 0, 0, 0] i.e.,
# the probability of the correct class is 1 and the probability of all other classes is 0.
# Label smoothing replaces the 1 with (1 - epsilon) and the 0 of every other class with (epsilon / (num_classes - 1)),
# so the smoothed target still sums to 1.
# For example, if epsilon = 0.1 and num_classes = 6, then the target after label smoothing is applied 
# will be [0.02, 0.02, 0.9, 0.02, 0.02, 0.02].
#
#--------------------------------------------------------------------------------------------------------------------------------
# LABEL SMOOTHING IN TRANSFORMERS:
#
# In the target output for transformers, we also have a padding token. We don't want to apply label
# smoothing to the padding token. So, we will only apply label smoothing to the non-padding tokens.
# Let's say the target for a specific token is [0, 0, 0, 0, 1, 0] and the padding token is 2. In 
# other words, the expected output is 4 (the token at index 4 in the vocabulary). Now, when
# we apply label smoothing to this target, we do not give any of the 'smoothing' share to the padding token.
# We set the probability of the padding token to 0 and the probability of the correct token to
# (1 - epsilon). The probability of every other token will be (epsilon / (num_classes - 2)). Note that we
# subtract 2 because the padding token and the correct token are both excluded from this share i.e., 
# we ignore two classes. In this case, with epsilon = 0.1, the target after label smoothing is applied will be 
# [0.025, 0.025, 0.0, 0.025, 0.9, 0.025].

In [77]:
from torch import nn, Tensor
from typing import Optional

import torch

In [78]:
# Shape of the toy batch used in this walk-through: 2 sentences, each holding 8 tokens.
batch_size, seq_len = 2, 8
# Number of classes the model chooses between. In transformers this is the size of the
# vocabulary, and the padding token is counted as one of the classes.
num_classes = 6
# Class label (index in the vocabulary) that represents the padding token.
padding_idx = 2
# Probability mass that is split among the tokens other than the correct token and the
# padding token.
smoothing = 0.1
# Probability mass that stays on the correct token.
confidence = 1 - smoothing

In [None]:
# Toy target labels: 2 sentences x 8 tokens, where every label is one of the 6 classes (the padding
# token included). In the transformer, these labels are compared against the output of the linear
# layer (TokenPredictor layer) that follows the Decoder.
#
# Every entry is the index of the correct token in the target vocabulary:
# targets[0][0] = 0 --> token 0 of sentence 1 should be the vocabulary entry at index 0.
# targets[0][1] = 3 --> token 1 of sentence 1 should be the vocabulary entry at index 3.
# ...
# targets[0][7] = 2 --> token 7 of sentence 1 is the padding token (index 2).
# targets[1][0] = 1 --> token 0 of sentence 2 should be the vocabulary entry at index 1.
# ...
targets = torch.tensor(
    data=[
        [0, 3, 4, 5, 5, 1, 2, 2],  # sentence 1: the last two tokens are padding
        [1, 5, 3, 3, 4, 0, 0, 2],  # sentence 2: the last token is padding
    ],
    dtype=torch.long,
)
print("shape: ", targets.shape)
print("targets: \n", targets)

shape:  torch.Size([2, 8])
targets: 
 tensor([[0, 3, 4, 5, 5, 1, 2, 2],
        [1, 5, 3, 3, 4, 0, 0, 2]])


In [80]:
# Allocate an all-zero tensor holding one row of 'num_classes' entries per target token, i.e. a tensor
# with the same shape as the predicted probabilities. The following cells turn each row into the
# smoothed probability distribution of the corresponding target token.
smoothed_probs = torch.zeros(batch_size, seq_len, num_classes, dtype=torch.float32)
print("shape: ", smoothed_probs.shape)
print("smoothed_probs: \n", smoothed_probs)

shape:  torch.Size([2, 8, 6])
smoothed_probs: 
 tensor([[[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]]])


In [81]:
# Start by writing the smoothing share, smoothing / (num_classes - 2), into every position of the
# tensor. The '- 2' is there because two classes never receive this share: the correct token and
# the padding token. Their entries are overwritten in the following cells.
# 'fill_' modifies the tensor in place, so the result does not need to be assigned again.
smoothed_probs.fill_(smoothing / (num_classes - 2))
print("shape: ", smoothed_probs.shape)
print("smoothed_probs: \n", smoothed_probs)

shape:  torch.Size([2, 8, 6])
smoothed_probs: 
 tensor([[[0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250]],

        [[0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250]]])


In [82]:
# 'targets' ([[0, 3, 4, 5, 5, 1, 2, 2], [1, 5, 3, 3, 4, 0, 0, 2]]) holds the correct class label
# (an index into the vocabulary) for each of the 8 tokens of each of the 2 sentences in the batch.
# These labels tell us which position of every row in 'smoothed_probs' has to receive the confidence
# probability. The 'scatter_' call in the next step needs the index tensor to have as many
# dimensions as 'smoothed_probs', so a trailing dimension of size 1 is added here.
unsqueezed_targets = torch.unsqueeze(targets, dim=-1)
print("shape: ", unsqueezed_targets.shape)
print("unsqueezed_targets: \n", unsqueezed_targets)

shape:  torch.Size([2, 8, 1])
unsqueezed_targets: 
 tensor([[[0],
         [3],
         [4],
         [5],
         [5],
         [1],
         [2],
         [2]],

        [[1],
         [5],
         [3],
         [3],
         [4],
         [0],
         [0],
         [2]]])


In [83]:
# Every row of 'smoothed_probs' must hold the confidence probability at the position of its correct
# class label (the expected output token).
#
# The correct class labels for sentence 1 of the batch are [0, 3, 4, 5, 5, 1, 2, 2], so:
# - Token 0 has label 0 (targets[0][0]): position 0 of smoothed_probs[0][0] is set to the
#   confidence probability (0.9).
# - Token 1 has label 3 (targets[0][1]): position 3 of smoothed_probs[0][1] is set to the
#   confidence probability (0.9).
# - ... and likewise for every other token.
#
# 'unsqueezed_targets' serves as the index tensor and 'scatter_' writes the value in place along the
# last dimension (arguments: dim, index, value).
# Refer to 'understanding_tensor_manipulations_part_4.ipynb' to understand torch.scatter_ function.
smoothed_probs.scatter_(-1, unsqueezed_targets, confidence)
print("shape: ", smoothed_probs.shape)
print("smoothed_probs: \n", smoothed_probs)

shape:  torch.Size([2, 8, 6])
smoothed_probs: 
 tensor([[[0.9000, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.9000, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.9000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.9000, 0.0250, 0.0250, 0.0250]],

        [[0.0250, 0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0250, 0.0250, 0.9000, 0.0250],
         [0.9000, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.9000, 0.0250, 0.0250, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.9000, 0.0250, 0.0250, 0.0250]]])


In [84]:
# The model should never predict the padding token. Hence, inside the distribution of every target
# token, the entry that belongs to the padding class must be 0. The ellipsis selects all sentences
# and all tokens, so the whole column at index 2 (padding_idx) is zeroed in place.
smoothed_probs[..., padding_idx] = 0.0
print("shape: ", smoothed_probs.shape)
print("smoothed_probs: \n", smoothed_probs)

shape:  torch.Size([2, 8, 6])
smoothed_probs: 
 tensor([[[0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250]],

        [[0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250]]])


In [85]:
# Padding tokens are appended at the end of the target sentences. They are dummy tokens whose only
# purpose is to bring all the sentences in the batch to the same length, and they must not
# contribute to the loss calculation. So, the entire rows that correspond to padding tokens are
# going to be set to 0.
#
# To find those rows, the labels are compared with padding_idx and the result is repeated 6 times
# (num_classes) along the last dimension. This gives a boolean tensor of the same shape as
# 'smoothed_probs' that is True for the rows of padding tokens and False everywhere else.
#
# mask[0][6] and mask[0][7] are True because the 6th and 7th tokens in the 0th sentence are padding tokens.
# mask[1][7] is True because the 7th token in the 1st sentence is a padding token.
mask = (unsqueezed_targets == padding_idx).repeat(1, 1, num_classes)
print("shape: ", mask.shape)
print("mask: \n", mask)

shape:  torch.Size([2, 8, 6])
mask: 
 tensor([[[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [ True,  True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True,  True]],

        [[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [ True,  True,  True,  True,  True,  True]]])


In [86]:
# Apply the mask: every position where the mask is True is replaced by 0, which zeroes the entire
# rows that belong to padding tokens:
# smoothed_probs[0][6] = 0.0
# smoothed_probs[0][7] = 0.0
# smoothed_probs[1][7] = 0.0
# 'masked_fill' (arguments: mask, value) returns a new tensor, so the result is assigned back.
# This will be used in the next step to calculate the loss.
smoothed_probs = smoothed_probs.masked_fill(mask, 0.0)
print("shape: ", smoothed_probs.shape)
print("smoothed_probs: \n", smoothed_probs)

shape:  torch.Size([2, 8, 6])
smoothed_probs: 
 tensor([[[0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]])


In [None]:
# Combining the above steps into a module to be used in the transformer implementation.
class LabelSmoothing(nn.Module):
    """Turns target class labels into label-smoothed probability distributions.

    For every non-padding target token the correct class receives (1 - smoothing), the padding class
    receives 0 and each of the remaining (vocab_size - 2) classes receives
    smoothing / (vocab_size - 2). Target tokens that are themselves padding get an all-zero row.
    """

    def __init__(self, tgt_vocab_size: int, padding_idx: int, smoothing: float=0.1):
        """Stores the smoothing configuration.

        Args:
            tgt_vocab_size (int): Number of classes in the classification problem. It is the size of the
                                  target vocabulary in transformers (padding token included).
            padding_idx (int): Index of the padding token i.e., the class label for the padding token.
            smoothing (float, optional): Amount of probability to be shared among the tokens excluding the
                                         correct token and the padding token. Defaults to 0.1.

        Raises:
            ValueError: If tgt_vocab_size is not greater than 2, if padding_idx is not a valid class
                        index or if smoothing is outside [0, 1].
        """
        super().__init__()
        # The smoothing share is divided by (vocab_size - 2), so at least one class must remain besides
        # the correct token and the padding token.
        if tgt_vocab_size <= 2:
            raise ValueError(f"tgt_vocab_size must be greater than 2, got {tgt_vocab_size}")
        if not 0 <= padding_idx < tgt_vocab_size:
            raise ValueError(f"padding_idx must be in [0, {tgt_vocab_size - 1}], got {padding_idx}")
        if not 0.0 <= smoothing <= 1.0:
            raise ValueError(f"smoothing must be in [0, 1], got {smoothing}")
        # Number of classes in the classification problem. It is the size of the vocabulary in transformers.
        self.vocab_size = tgt_vocab_size
        # Index of the padding token or the class label for the padding token.
        self.padding_idx = padding_idx
        # Amount of probability to be shared among the tokens excluding correct token and padding tokens.
        self.smoothing = smoothing
        # Amount of probability shared with the correct token.
        self.confidence = 1 - smoothing

    def forward(self, targets: Tensor) -> Tensor:
        """Calculates the smoothed probabilities for each of the target tokens within each sentence.

        Args:
            targets (Tensor): The target tensor containing the correct class labels (expected token indices from the
                              vocab) for each token in the batch. An example target tensor for a batch of 2 sentences
                              each with 8 tokens and 6 possible classes for prediction (including the padding token)
                              would be: [[0, 3, 4, 5, 5, 1, 2, 2], [1, 5, 3, 3, 4, 0, 0, 2]]
                              SHAPE: [batch_size, tgt_seq_len - 1] (any other shape, e.g. a flattened 1D tensor
                              of labels, is handled the same way).

        Returns:
            Tensor: A smoothed probability distribution (1D tensor) for each target token in the batch, created
                    on the same device as 'targets'.
                    SHAPE: [batch_size, tgt_seq_len - 1, vocab_size] (in general: targets.shape + [vocab_size])
        """
        # The above description showing the shape as (tgt_seq_len - 1) is because the first token is removed from the
        # target tensor while calculating the loss. 'tgt_seq_len' there is the number of tokens in each target
        # sequence in the batch before the first token was removed to form the expected decoder output.
        #
        # Creating the tensor that holds the smoothed probabilities for each target token, already filled with the
        # smoothing share. The probabilities of the correct token and of the padding token are dealt with below.
        # The tensor is allocated on the device of 'targets' so that the in-place 'scatter_' below does not fail
        # with a device mismatch when the targets live on an accelerator.
        smoothed_probs = torch.full(
            size=(*targets.shape, self.vocab_size),
            fill_value=self.smoothing / (self.vocab_size - 2),
            dtype=torch.float32,
            device=targets.device,
        )
        # Bringing the targets tensor to contain the same number of dimensions as the smoothed_probs tensor to
        # use it as the index of the 'scatter_' function and as the padding mask.
        unsqueezed_targets = targets.unsqueeze(dim=-1)
        # Replacing the probabilities in the smoothed_probs tensor with the confidence probability at the
        # positions that correspond to the correct class labels (expected output tokens in the target).
        smoothed_probs.scatter_(dim=-1, index=unsqueezed_targets, value=self.confidence)
        # The padding token should not be predicted at all by the model. So, the probability associated with the
        # class label that correspond to the padding token within each target token distribution should be 0.
        smoothed_probs[..., self.padding_idx] = 0
        # The target tensor is appended with the padding tokens at the end. These are just dummy tokens added to bring
        # all the sentences in the batch to the same length. We don't want the model to consider these tokens at all
        # in the loss calculation. So, we set the probabilities of the entire rows corresponding to the padding tokens
        # to 0. More about why this setup works is explained in the next notebook 'step_17_loss_computation.ipynb'.
        # The mask keeps its trailing dimension of size 1; 'masked_fill' broadcasts it over the vocabulary dimension.
        padding_rows = unsqueezed_targets == self.padding_idx
        return smoothed_probs.masked_fill(mask=padding_rows, value=0.0)

In [92]:
# Same toy labels as in the step-by-step walk-through: 2 sentences x 8 tokens, where label 2 is the
# padding token.
transformer_targets = torch.tensor(
    data=[
        [0, 3, 4, 5, 5, 1, 2, 2],
        [1, 5, 3, 3, 4, 0, 0, 2],
    ],
    dtype=torch.long,
)
print("shape: ", transformer_targets.shape)
print("transformer_targets: \n", transformer_targets)

shape:  torch.Size([2, 8])
transformer_targets: 
 tensor([[0, 3, 4, 5, 5, 1, 2, 2],
        [1, 5, 3, 3, 4, 0, 0, 2]])


In [93]:
# Build the module from the same settings that were used in the step-by-step cells
# (num_classes, padding_idx and smoothing).
label_smoothing = LabelSmoothing(
    tgt_vocab_size=num_classes,
    padding_idx=padding_idx,
    smoothing=smoothing,
)
print(label_smoothing)

LabelSmoothing()


In [94]:
# Calling the module runs its 'forward' method on the target labels and returns one smoothed
# distribution per target token.
transformer_smoothed_probabilties = label_smoothing(transformer_targets)
print("shape: ", transformer_smoothed_probabilties.shape)
print("smoothed_probabilties: \n", transformer_smoothed_probabilties)

shape:  torch.Size([2, 8, 6])
smoothed_probabilties: 
 tensor([[[0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.0250, 0.9000, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.0250, 0.9000],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.9000, 0.0250, 0.0250],
         [0.0250, 0.0250, 0.0000, 0.0250, 0.9000, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.9000, 0.0250, 0.0000, 0.0250, 0.0250, 0.0250],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]