In [1]:
# In this notebook, you learn:
# 
# 1) How to use Adam optimization in PyTorch.
# 
# Resources to go through before continuing with this notebook:
# 1) https://www.youtube.com/watch?v=lAq96T8FkTw&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=18
#       -- Part 1 of the videos on exponential weighted averages.
#       -- Explains how exponential weighted averages are calculated.
# 2) https://www.youtube.com/watch?v=NxTFlzBjS-4&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=19
#       -- Part 2 of the videos on exponential weighted averages.
#       -- Explains why exponential weighted averages work.
# 3) https://www.youtube.com/watch?v=lWzo8CajF5s&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=19
#       -- Part 3 of the videos on exponential weighted averages.
#       -- Explains bias correction in exponential weighted averages.
# 4) https://www.youtube.com/watch?v=k8fTYJPd3_I&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=20
#       -- Explains Gradient Descent with Momentum.
# 5) https://www.youtube.com/watch?v=_e-LFe_igno&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=22
#       -- Explains RMS prop.
# 6) https://www.youtube.com/watch?v=JXQT_vxqwIs&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=22
#       -- Explains Adam optimization algorithm.
# 7) https://www.linkedin.com/pulse/getting-know-adam-optimization-comprehensive-guide-kiran-kumar/
#       -- Reiterates the concepts of Adam optimization.

In [2]:
import torch
from torch import nn

In [3]:
# Optimizers are used to update the weights of the neural network in order to minimize the loss function. 

## [torch.optim.Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam)

In [4]:
# Constants used in the notebook.
# Seed for PyTorch's global random number generator. Seeding here (before any random layer initialisation or
# random tensor creation) makes the notebook produce the same numbers on every 'Restart Kernel -> Run All'.
random_seed = 42
torch.manual_seed(random_seed)
# Number of input features for our neural network (linear layer).
in_features = 4
# Number of output features for our neural network (linear layer).
out_features = 6
# Number of samples in our dataset.
num_samples = 2
# Number of iterations to train the neural network.
num_iterations = 2
# Learning rate for the optimizer.
learning_rate = 0.01
# Hyperparameter to calculate the first moment (m1) in the optimizer. This roughly corresponds to averaging
# over the last 10 (1/(1-beta_1)) sets of gradients. This comes from 'Gradient Descent with Momentum' algorithm.
beta_1 = 0.9
# Hyperparameter to calculate the second moment (m2) in the optimizer. This roughly corresponds to averaging
# over the last 1000 (1/(1-beta_2)) sets of squared gradients. This comes from 'RMS prop' algorithm.
beta_2 = 0.999
# Small value to avoid division by zero in the optimizer.
epsilon = 1e-8

In [6]:
# Creating a linear layer which will serve as the neural network for this experiment.
linear_layer = nn.Linear(in_features=in_features, out_features=out_features, bias=True)
print(linear_layer)
print("-" * 150)
# This is what we pass to the optimizer to update the weights of the neural network.
# parameters() returns a generator, so printing it shows the generator object and not the tensors.
weights_or_params = linear_layer.parameters()
print(weights_or_params)
print("-" * 150)
# Peeking into the actual parameters of the linear layer.
for name, param in linear_layer.named_parameters():
    print("parameters: ", name, " - ", param.data.shape, " - ", param.data)
    # Since we have not yet called the backward function, the gradients are not yet calculated. So, this will
    # print None. The check must be 'is not None': truth-testing a tensor with more than one element raises
    # a RuntimeError, which would crash this cell if it is re-run after gradients have been computed.
    grad_shape = param.grad.shape if param.grad is not None else "no shape to print "
    print("gradients_of_parameters: ", name, " - ", grad_shape, " - ", param.grad)
    print("-" * 150)

Linear(in_features=4, out_features=6, bias=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
<generator object Module.parameters at 0x7ffa60245000>
------------------------------------------------------------------------------------------------------------------------------------------------------
parameters:  weight  -  torch.Size([6, 4])  -  tensor([[ 0.1006,  0.2277, -0.2466, -0.0159],
        [-0.2504,  0.1124,  0.4456,  0.3830],
        [-0.3520, -0.0375,  0.1437,  0.2367],
        [-0.0943, -0.1823, -0.3903, -0.1696],
        [ 0.3875,  0.3849,  0.3608,  0.2921],
        [-0.3984, -0.4341,  0.1094, -0.2876]])
gradients_of_parameters:  weight  -  no shape to print   -  None
------------------------------------------------------------------------------------------------------------------------------------------------------
parameters:  bias  -  torch.Size([6])  -  tensor([-0.3

In [7]:
# Draw a random batch of inputs first and a random batch of regression targets second
# (one row per sample), then show the shape and contents of each tensor.
input_data = torch.randn((num_samples, in_features), dtype=torch.float32)
output_data = torch.randn((num_samples, out_features), dtype=torch.float32)
for label, tensor in (("input_data: \n", input_data), ("output_data: \n", output_data)):
    print("shape: ", tensor.shape)
    print(label, tensor)
    print("-" * 150)

shape:  torch.Size([2, 4])
input_data: 
 tensor([[ 0.0391,  0.2567, -1.8606,  0.2611],
        [ 1.2128, -0.2215, -1.3360, -0.4727]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape:  torch.Size([2, 6])
output_data: 
 tensor([[ 0.5449, -0.4881, -0.3103, -1.7100,  0.1560, -0.0139],
        [ 0.9330,  0.0412, -1.0586,  0.5111,  1.5341,  0.0960]])
------------------------------------------------------------------------------------------------------------------------------------------------------


In [8]:
# Mean squared error between the predicted output and the actual output is the training objective.
loss_fn = nn.MSELoss()
print(loss_fn)
print("-" * 150)
# Adam optimizer over the parameters of the linear layer, configured with the hyperparameters defined above.
adam_optimizer = torch.optim.Adam(
    linear_layer.parameters(),
    lr=learning_rate,
    betas=(beta_1, beta_2),
    eps=epsilon,
)
print(adam_optimizer)
print("-" * 150)
# No optimizer step has been taken yet, so the 'state' entry of the state_dict is still empty. The moving
# averages of the gradients and of the squared gradients only show up after the optimizer has stepped.
print(adam_optimizer.state_dict())

MSELoss()
------------------------------------------------------------------------------------------------------------------------------------------------------
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
------------------------------------------------------------------------------------------------------------------------------------------------------
{'state': {}, 'param_groups': [{'lr': 0.01, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}


In [9]:
# Full-batch training: the whole dataset is processed in every iteration, so there are no mini-batches.
for epoch in range(num_iterations):
    # Clear the gradients left over from the previous iteration.
    adam_optimizer.zero_grad()
    # Forward pass: predictions of the neural network for the whole dataset.
    predictions = linear_layer(input_data)
    # Loss between the predictions and the target outputs.
    loss = loss_fn(predictions, output_data)
    # Backward pass: fills param.grad for every parameter of the neural network.
    loss.backward()
    # One Adam update of the weights and biases.
    adam_optimizer.step()
    # After each step the optimizer state holds, per parameter, the step count, the moving average of the
    # gradients ('exp_avg') and the moving average of the squared gradients ('exp_avg_sq').
    print(adam_optimizer.state_dict())
    print("-" * 150)
    # Inspect the updated parameters and the gradients that produced the update.
    #
    # What Adam does for every parameter (m1, m2 are the first and second moments of the gradients and
    # both start at 0):
    #   m1     = beta_1 * m1 + (1 - beta_1) * param.grad
    #   m2     = beta_2 * m2 + (1 - beta_2) * (param.grad)^2
    #   m1_hat = m1 / (1 - (beta_1)^(epoch + 1))
    #   m2_hat = m2 / (1 - (beta_2)^(epoch + 1))
    #   param.data = param.data - learning_rate * m1_hat / (sqrt(m2_hat) + epsilon)
    # The gradients are then cleared by zero_grad() at the start of the next iteration.
    for name, param in linear_layer.named_parameters():
        for label, tensor in (("param.data: ", param.data), ("param.grad: ", param.grad)):
            print("shape: ", tensor.shape, " name: ", name)
            print(label, tensor)
            print("-" * 150)

{'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[-0.0174,  0.0016,  0.0297,  0.0051],
        [-0.0140,  0.0035,  0.0085,  0.0065],
        [ 0.0055, -0.0009, -0.0067, -0.0021],
        [ 0.0019,  0.0100, -0.0732,  0.0100],
        [-0.0460,  0.0035,  0.0842,  0.0128],
        [-0.0061,  0.0003,  0.0125,  0.0015]]), 'exp_avg_sq': tensor([[3.0114e-05, 2.6479e-07, 8.8434e-05, 2.6514e-06],
        [1.9494e-05, 1.2561e-06, 7.2729e-06, 4.2039e-06],
        [3.0697e-06, 8.6436e-08, 4.4467e-06, 4.2999e-07],
        [3.7330e-07, 9.9159e-06, 5.3531e-04, 1.0093e-05],
        [2.1145e-04, 1.2465e-06, 7.0866e-04, 1.6451e-05],
        [3.6670e-06, 6.8866e-09, 1.5578e-05, 2.1814e-07]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([-0.0200, -0.0079,  0.0049,  0.0394, -0.0558, -0.0081]), 'exp_avg_sq': tensor([3.9857e-05, 6.1839e-06, 2.3710e-06, 1.5537e-04, 3.1100e-04, 6.5399e-06])}}, 'param_groups': [{'lr': 0.01, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize'