In [4]:
# In this notebook, you will learn:
# 
# 1) What is Dropout Regularization?
# 2) How to use Dropout with PyTorch?
# 3) How does the Dropout module work within Neural Networks?
#
# Dropout is used along with Positional Encoding in the Transformer model to prevent 
# overfitting. This will be explained in the step_9_positional_encoding.ipynb notebook.

In [5]:
# Resources to go through to understand Regularization and Dropout before continuing 
# further in this notebook:
#
# https://www.youtube.com/watch?v=6g0t3Phly2M&t=1s
#       -- Explains what Regularization is and L2 Regularization in particular.
# https://www.youtube.com/watch?v=NyG-7nRpsW8
#       -- Gives intuition on why Regularization works i.e., why it prevents the model from overfitting?
# https://www.youtube.com/watch?v=D8PJAL-MZv8
#       -- What is Dropout Regularization and how to implement it (Inverted Dropout)?
# https://www.youtube.com/watch?v=ARq74QuavAo
#       -- Gives intuition on why Dropout works.

## [torch.nn.Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html#dropout)

In [7]:
import torch
from torch import nn, Tensor

In [8]:
# p=0.5 implies that each element of the input is zeroed out with a probability of 0.5
# (this happens only while the module is in training mode, which is the default for a 
# newly created module).
# inplace=False means the result is returned as a new tensor instead of overwriting 
# the input tensor.
dropout_module = nn.Dropout(p=0.5, inplace=False)
print(dropout_module)

Dropout(p=0.5, inplace=False)


In [12]:
# A (3, 4, 5) float tensor holding the values 0..59, so that every element can be 
# easily tracked before and after Dropout is applied.
input_1 = torch.arange(60, dtype=torch.float).view(3, 4, 5)
print(input_1)
print(input_1.shape)

tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.],
         [15., 16., 17., 18., 19.]],

        [[20., 21., 22., 23., 24.],
         [25., 26., 27., 28., 29.],
         [30., 31., 32., 33., 34.],
         [35., 36., 37., 38., 39.]],

        [[40., 41., 42., 43., 44.],
         [45., 46., 47., 48., 49.],
         [50., 51., 52., 53., 54.],
         [55., 56., 57., 58., 59.]]])
torch.Size([3, 4, 5])


In [13]:
# Notice that roughly half of the values in the input tensor have been zeroed out.
# Also, notice that the remaining values are scaled up by a factor of 1 / (1 - p), 
# i.e., each surviving element is divided by (1 - 0.5) = 0.5 and is therefore doubled. 
# As explained in one of the above videos (Inverted Dropout), this scaling keeps the 
# expected value of each element the same as it would be without Dropout, so the 
# layers after Dropout receive inputs of the same expected magnitude.
output_1 = dropout_module(input_1)
print(output_1)

tensor([[[  0.,   2.,   4.,   0.,   8.],
         [ 10.,   0.,   0.,  16.,   0.],
         [ 20.,  22.,   0.,  26.,   0.],
         [  0.,  32.,  34.,   0.,  38.]],

        [[ 40.,   0.,   0.,  46.,   0.],
         [ 50.,  52.,   0.,  56.,   0.],
         [ 60.,   0.,  64.,   0.,   0.],
         [ 70.,  72.,   0.,  76.,   0.]],

        [[ 80.,  82.,   0.,   0.,   0.],
         [  0.,  92.,  94.,  96.,   0.],
         [100., 102.,   0., 106.,   0.],
         [  0., 112.,   0.,   0., 118.]]])


## Dropout in Neural Network

In [None]:
# Summary of how Dropout works in neural Networks: 
#
# Dropout is a regularization technique used to prevent overfitting in Neural Networks. 
# The neurons are randomly dropped with some probability which has an effect of 
# training the model with a smaller network (since neurons have been deleted) that 
# prevents overfitting.
#
# In practice, Dropout is implemented using the Inverted Dropout technique. Inverted 
# Dropout creates a binary mask (same size as input) that holds information whether 
# the output of any neuron is to be propagated or dropped. This mask is then 
# multiplied (element wise multiplication) to the neuron outputs (activations) and 
# the resultant output is passed to the next layer.
#
# Back Propagation with Dropout works the same way as it works in neural networks 
# without Dropout i.e., there is no additional step required to handle gradients with 
# dropout. It gets taken care of by the mask variable in the gradient computation. In 
# the end, for a single input, the effect of dropout in gradient computation is that 
# the gradients wrt the weights attached (coming in and going out) to the dropped 
# neuron are all zeros. 
#
# In mini-batch gradient descent, the neurons are dropped independently for each input 
# from the batch i.e., the binary mask is generated independently for every input in 
# the batch. However, the gradients in this case are calculated based on the average 
# loss (Loss averaged over all the inputs in the batch). So, the gradients of the 
# weights associated with dropped neurons (different for different inputs) are not 
# always zero. To elaborate, if neuron 'n' is dropped for input 5 but is not dropped 
# for input 7 in the batch, the gradient calculation wrt the weights associated 
# (weights coming in or going out of 'n') with neuron 'n' will include contributions 
# from both input 5 and input 7. The contribution from input 5 is zero (neuron 'n' is 
# dropped for it), but since neuron 'n' is not dropped for input 7, the gradient for 
# the corresponding weight might be a non-zero value (gradient contribution from 
# input 7).

In [29]:
# Refer to this notebook (link to using_modules.ipynb) to understand more about pytorch modules.
class SimpleNeuralNetwork(nn.Module):
    """Small regression network used to show Dropout acting on hidden activations.

    Data flows through: Linear -> ReLU -> Dropout -> Linear (one output value).

    Args:
        input_size: Number of features in each input.
        hidden_size: Number of neurons in the hidden layer.
        dropout_prob: Probability with which each hidden activation is dropped.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout_prob: float):
        super().__init__()
        # Kept as a plain attribute for reference; the Dropout module stores its own copy.
        self.dropout_prob = dropout_prob
        # Refer to this notebook (link to understanding_nn_linear.ipynb) to understand more about pytorch Linear module.
        self.layer_1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        # The Dropout module is created once here and reused on every forward pass.
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)
        self.layer_2 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, input: Tensor) -> Tensor:
        """Run the forward pass, printing the hidden activations before and after Dropout.

        Args:
            input: Tensor whose last dimension has 'input_size' features.

        Returns:
            Output of the second Linear layer (last dimension of size 1).
        """
        hidden_activations = self.relu(self.layer_1(input))
        print("output before applying dropout: \n\n", hidden_activations)
        # Each hidden activation is randomly dropped with probability 'self.dropout_prob'.
        after_dropout = self.dropout(hidden_activations)
        print("\n\n output after applying dropout: \n\n", after_dropout)
        return self.layer_2(after_dropout)

In [30]:
# Number of features in each input given to the network.
input_size: int = 10
# Number of neurons in the hidden layer (the layer on which Dropout is applied).
hidden_size: int = 40
# Setting it to a higher value so that the effect of Dropout is clearly visible in the example.
dropout_prob: float = 0.5

In [31]:
# Create the network. Printing a module lists the sub-modules registered on it.
my_neural_net = SimpleNeuralNetwork(input_size=input_size, hidden_size=hidden_size, dropout_prob=dropout_prob)
print(my_neural_net)

SimpleNeuralNetwork(
  (layer_1): Linear(in_features=10, out_features=40, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (layer_2): Linear(in_features=40, out_features=1, bias=True)
)


In [32]:
# Refer to this notebook (link to building_simple_neural_network_using_modules.ipynb) to understand more
# about building a neural network and training it.
#
# Mean Squared Error between the model predictions and the targets.
loss_function = nn.MSELoss()
learning_rate = 0.03
# SGD with momentum, updating all the parameters of the network.
sgd_optimizer = torch.optim.SGD(params=my_neural_net.parameters(), lr=learning_rate, momentum=0.9)
print(loss_function)
print(sgd_optimizer)

MSELoss()
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.03
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)


In [33]:
# Create dummy inputs and targets to see the network running.
# This means we have 5 inputs, each containing 10 features.
model_inputs = torch.randn(size=(5, 10))
# We have 5 targets, one for each corresponding input.
input_targets = torch.rand(size=(5, 1))
print(model_inputs, model_inputs.shape)
print(input_targets, input_targets.shape)

tensor([[ 0.7289, -2.7069, -0.9212,  0.1576, -1.8981,  0.8384,  0.8872,  1.0395,
          1.9443, -0.2546],
        [-0.6895, -0.6642, -0.2122, -0.5916, -1.7462,  0.5320,  0.3308,  0.6395,
         -0.3654,  2.6568],
        [-0.3839,  2.0729, -0.4598, -1.9365,  0.8274, -0.3351,  0.0230, -0.3876,
         -0.5108, -0.2736],
        [-0.0928,  2.0255, -1.1254, -0.3370, -1.5516,  0.8941,  0.7879, -1.0145,
          0.9762, -1.1817],
        [ 0.0563,  0.1312,  0.0361,  1.9600,  2.5043,  0.0831, -1.2609,  0.3114,
         -0.2559, -1.2298]]) torch.Size([5, 10])
tensor([[0.6992],
        [0.9715],
        [0.8541],
        [0.1547],
        [0.5483]]) torch.Size([5, 1])


In [34]:
sgd_optimizer.zero_grad()
# Observe the following things from the printed outputs below (output before applying 
# dropout and output after applying dropout):
# 
# 1) A lot of the output values (output after applying dropout) are set to zero and the 
#    non-zero values are scaled up (by 2) after Dropout is applied.
# 2) Each 1D tensor below corresponds to the hidden layer outputs produced by a single 
#    input.
# 3) For each input, the neurons (hidden layer) are dropped independently of the other 
#    inputs, using the same dropout probability.
#       -- 2nd Neuron is dropped for input 0.
#           -- output[0][1] = 1.3293 (output before applying dropout)
#           -- output[0][1] = 0.0000 (output after applying dropout)
#           -- Of course, this data (probably) won't be valid if you run this cell again.
#       -- 2nd Neuron is active for input 2 whereas 4th neuron is dropped for input 2.
#           -- output[2][1] = 0.2353 (output before applying dropout)
#           -- output[2][1] = 0.4705 (output after applying dropout, i.e., roughly 2 x 0.2353)
#           -- output[2][3] = 0.2969 (output before applying dropout)
#           -- output[2][3] = 0.0000 (output after applying dropout)
#           -- Of course, this data (probably) won't be valid if you run this cell again. 
#
# The forward pass below is what prints the two tensors, since the print statements 
# live inside SimpleNeuralNetwork.forward.
model_predictions = my_neural_net(model_inputs)
loss = loss_function(model_predictions, input_targets)
# Compute the gradients of the loss and update the parameters once.
loss.backward()
sgd_optimizer.step()

output before applying dropout: 

 tensor([[0.0000, 1.3293, 0.5765, 0.1958, 0.0000, 0.8037, 0.0000, 0.0000, 0.3233,
         0.7857, 0.1811, 0.0974, 0.5224, 0.5422, 0.6177, 0.0000, 0.4496, 0.0153,
         0.0000, 2.0704, 0.4691, 0.6185, 0.0000, 0.6780, 1.1849, 0.0000, 0.9923,
         0.0000, 0.0000, 0.0000, 0.0452, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0290, 0.9173, 0.0000, 0.0000],
        [0.0000, 0.5209, 0.0122, 1.0171, 0.5432, 0.2443, 0.0000, 0.5771, 0.0227,
         0.1358, 0.7629, 0.0967, 0.1740, 0.0000, 0.0000, 0.0000, 1.0735, 0.8084,
         0.0000, 0.8091, 1.2554, 0.0461, 0.0000, 0.3882, 0.0000, 1.1974, 0.0029,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6189, 0.3849,
         1.1420, 0.0000, 0.0134, 0.0000],
        [0.1213, 0.2353, 0.0000, 0.2969, 0.0000, 0.0000, 1.0119, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.1121, 0.0204, 1.2546, 0.4286, 0.1234,
         0.7397, 0.0000, 0.2635, 0.0000, 0.7562, 0.0000, 0.0000, 0.8936

In [27]:
def PrintModelParameters(model):
    """Print every named parameter of 'model' and, for trainable ones, its gradient.

    For each (name, parameter) pair returned by 'model.named_parameters()', the 
    parameter values are printed first. The gradient is printed only if the 
    parameter has 'requires_grad' set.

    Args:
        model: Object exposing 'named_parameters()' (e.g. an nn.Module).
    """
    for param_name, tensor in model.named_parameters():
        print("\nPrinting Model Parameters:\n\n", f"{param_name}: {tensor.data}")
        if not tensor.requires_grad:
            # Parameters that are not trained have no gradient to show.
            continue
        print("\nPrinting Parameter Gradients:\n\n", f"{param_name}: {tensor.grad}")

In [28]:
# Print every parameter of the network along with its gradient. Run this after the 
# training step (loss.backward()) so that the gradients are populated; before any 
# backward pass the gradients are printed as None.
PrintModelParameters(model=my_neural_net)


Printing Model Parameters:

 layer_1.weight: tensor([[ 0.1481,  0.0321,  0.1532,  0.2778, -0.1383, -0.1684, -0.2570, -0.3082,
         -0.0702,  0.1710],
        [-0.0975, -0.0337, -0.1405, -0.1601, -0.2358,  0.1764,  0.3005, -0.1157,
         -0.0870, -0.0054],
        [ 0.0391, -0.0073,  0.2941, -0.3008,  0.1426,  0.2562, -0.1783,  0.2085,
         -0.1614, -0.1403],
        [ 0.1527, -0.0482, -0.1914, -0.0590,  0.0636,  0.1869, -0.0845, -0.2904,
          0.2017,  0.0700],
        [ 0.1266,  0.1643,  0.1210,  0.1053, -0.0793, -0.0570, -0.1535,  0.1699,
          0.2627,  0.3099],
        [-0.0246, -0.0234,  0.0109,  0.3076, -0.0882,  0.1236, -0.1018, -0.2685,
          0.1116,  0.2856],
        [ 0.0079,  0.3016, -0.1408, -0.1778,  0.0037, -0.0810,  0.1429, -0.0711,
          0.1075, -0.1725],
        [-0.0417, -0.2125,  0.3024, -0.2208, -0.2260, -0.0640,  0.1855, -0.1335,
          0.0368,  0.1503],
        [-0.1350, -0.1790, -0.1752,  0.1196,  0.1866,  0.2226,  0.2406, -0.2032,
 