In [18]:
# In this notebook, you learn:
# 1) What a learning rate is and how it is used in PyTorch.
#
# Resources:
# 1) https://www.jeremyjordan.me/nn-learning-rate/
#       -- Explains what a learning rate is and why it is used.
# 2) https://www.youtube.com/watch?v=QzulmoOg2JE&list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc&index=23
#       -- Explains what learning rate decay is and why it is used.
# 3) https://www.youtube.com/watch?v=81NJgoR5RfY
#       -- Gives a walk through of how to use different learning_rate schedulers in pytorch.

In [19]:
from torch import nn

import torch

## Setup

In [None]:
# Seed for PyTorch's global random number generator. Seeding here, before any random tensor or layer is
# created, makes the random data and the initial layer weights identical on every 'Restart Kernel -> Run All'.
random_seed = 42
torch.manual_seed(random_seed)
# Number of input features for the linear layer.
in_features = 4
# Number of output features for the linear layer.
out_features = 6
# Number of samples in the dataset.
num_samples = 4
# Hyperparameter to calculate the m1 moment in the optimizer. This roughly corresponds to averaging over the
# last 10 (1/(1-beta_1)) sets of gradients. This comes from 'Gradient Descent with Momentum' algorithm.
beta_1 = 0.9
# Hyperparameter to calculate the m2 moment in the optimizer. This roughly corresponds to averaging over the
# last 1000 (1/(1-beta_2)) sets of gradients. This comes from 'RMS prop' algorithm.
beta_2 = 0.999
# Small value to avoid division by zero in the optimizer.
epsilon = 1e-8
# Learning rate with which the training is started. This gets updated by the learning rate scheduler periodically.
initial_learning_rate = 0.1
# Number of iterations for which the training is run.
num_iterations = 2
# Factor used to decay the learning rate.
decay_factor = 0.1

In [21]:
# Random features fed to the linear layer: one row per sample, one column per input feature.
input_data = torch.randn(num_samples, in_features, dtype=torch.float32)
print("shape: ", input_data.shape)
print("input_data: \n", input_data)
print("-" * 150)
# Random targets the layer is trained to reproduce: one row per sample, one column per output feature.
output_data = torch.randn(num_samples, out_features, dtype=torch.float32)
print("shape: ", output_data.shape)
print("output_data: \n", output_data)

shape:  torch.Size([4, 4])
input_data: 
 tensor([[ 0.3768, -2.0741,  0.9403,  0.1040],
        [ 0.3636, -1.2057,  2.1092,  0.8630],
        [-1.2130,  1.3250, -0.5797, -0.5042],
        [ 0.2132, -0.9422,  0.3078,  0.1567]])
------------------------------------------------------------------------------------------------------------------------------------------------------
shape:  torch.Size([4, 6])
output_data: 
 tensor([[ 2.2342,  0.8298, -0.4668,  0.6896, -0.3985,  0.4498],
        [-0.2488,  0.1556,  0.6068,  0.7403, -0.0575, -0.0032],
        [ 1.2860,  2.0274,  0.7297, -0.0376,  1.1107,  0.6648],
        [ 0.8990, -0.4358,  1.3365,  1.8619, -1.1116, -0.9941]])


In [None]:
# Build the layer from the sizes declared in the setup cell (rather than repeating the literals 4 and 6) so the
# layer always stays in sync with the shapes of input_data and output_data.
linear_layer = nn.Linear(in_features=in_features, out_features=out_features)
print(linear_layer)
print("-" * 150)
# Adam optimizer is a combination of 'Momentum' and 'RMS prop' algorithms. It uses the m1 and m2 moments to update the weights.
# Refer to miscellaneous/optimizers.ipynb notebook for more details about adam_optimizer. This will be passed to the
# learning rate scheduler to update the learning rate.
adam_optimizer = torch.optim.Adam(params=linear_layer.parameters(), lr=initial_learning_rate, betas=(beta_1, beta_2), eps=epsilon)
print(adam_optimizer)
print("-" * 150)
# The learning_rate is '0.1' as it should be since we passed it to the optimizer.
print(adam_optimizer.state_dict())
print("-" * 150)
# Loss function to be used in the neural network (linear layer) back propagation. This is the mean squared error loss function.
loss_fn = nn.MSELoss()
print(loss_fn)

Linear(in_features=4, out_features=6, bias=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    weight_decay: 0
)
------------------------------------------------------------------------------------------------------------------------------------------------------
{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
------------------------------------------------------------------------------------------------------------------------------------------------------
MSELoss()


## [torch.optim.lr_scheduler.LambdaLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.LambdaLR.html#torch.optim.lr_scheduler.LambdaLR)

In [23]:
# Defined with 'def' instead of binding a lambda to a name so that the function has a real name (visible in the
# print below and in tracebacks) and can carry a docstring.
def learning_rate_updater(epoch: int) -> float:
    """Return the factor by which the initial learning rate is multiplied at `epoch`.

    The lr_scheduler calls this function itself and passes the current epoch, so we never
    need to pass it explicitly. The epoch starts from 0.

    Args:
        epoch: Index of the current epoch (an integer, starting from 0).

    Returns:
        decay_factor ** epoch, i.e. 1.0 at epoch 0 and decay_factor times smaller on every later epoch.
    """
    return decay_factor ** epoch


# We will use this function in the below cells.
print(learning_rate_updater)

<function <lambda> at 0x7f9130bc9fc0>


In [24]:
# This cell exists only to show when LambdaLR first touches the optimizer's 'lr'. The computations made here
# are not used in the next cells.


def example_learning_rate_updater(epoch):
    # Multiplier applied to the initial learning rate: 2 at epoch 0, 4 at epoch 1, and so on.
    return (epoch + 1) + (epoch + 1)


example_adam_optimizer = torch.optim.Adam(
    linear_layer.parameters(),
    lr=initial_learning_rate,
    betas=(beta_1, beta_2),
    eps=epsilon,
)
# Before any scheduler is attached, 'lr' is 0.1, exactly the value passed to the optimizer.
print(example_adam_optimizer.state_dict())
print("-" * 150)
# The scheduler receives the optimizer and the updater function, and recomputes the learning rate on every
# step() call (which we trigger ourselves). The important detail: the scheduler's constructor already updates
# the 'lr' inside the optimizer once. So 'lr' does NOT stay at the value given to the optimizer until the first
# scheduler step() call. Every later update of 'lr', however, happens only when the scheduler's step() is called.
example_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
    example_adam_optimizer,
    lr_lambda=example_learning_rate_updater,
)
# The 'lr' printed now is new_lr = lr * ((epoch + 1) + (epoch + 1)) = 0.1 * ((0 + 1) + (0 + 1)) = 0.1 * 2 = 0.2.
# This calculation will be explained in the next cell.
print(example_adam_optimizer.state_dict())
print("-" * 150)

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
------------------------------------------------------------------------------------------------------------------------------------------------------
{'state': {}, 'param_groups': [{'lr': 0.2, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}
------------------------------------------------------------------------------------------------------------------------------------------------------


In [None]:
# On each epoch LambdaLR sets the learning rate to initial_learning_rate * learning_rate_updater(epoch). Note that
# the multiplier is always applied to the 'initial_learning_rate', never to the learning rate of the previous step.
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(adam_optimizer, lr_lambda=learning_rate_updater)
print("lr_scheduler: ", lr_scheduler)
# get_last_lr() reports the most recently computed learning rate, which is the one used in the next step.
print("last_computed_lr: ", lr_scheduler.get_last_lr())
print("lr_scheduler_state: ", lr_scheduler.state_dict())
print("-" * 150)
# At epoch 0 the multiplier is decay_factor ** epoch = 0.1 ** 0 = 1.0, so the 'lr' inside the optimizer is still
# equal to initial_learning_rate.
print("adam_optimizer_state: ", adam_optimizer.state_dict())

lr_scheduler:  <torch.optim.lr_scheduler.LambdaLR object at 0x7f9133b607f0>
last_computed_lr:  [0.1]
lr_scheduler_state:  {'base_lrs': [0.1], 'last_epoch': 0, 'verbose': False, '_step_count': 1, '_get_lr_called_within_step': False, '_last_lr': [0.1], 'lr_lambdas': [None]}
------------------------------------------------------------------------------------------------------------------------------------------------------
adam_optimizer_state:  {'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [26]:
# Full-batch training: every iteration processes the whole dataset at once (no mini-batches for now).
for epoch in range(num_iterations):
    # Clear the gradients left over from the previous iteration.
    adam_optimizer.zero_grad()
    # Forward pass: compute the predictions of the linear layer.
    predictions = linear_layer(input_data)
    # Mean squared error between the predictions and the targets.
    loss = loss_fn(input=predictions, target=output_data)
    # Backward pass: compute the gradients of the loss with respect to the layer's parameters.
    loss.backward()
    # Apply the Adam update to the weights.
    adam_optimizer.step()
    # The optimizer step leaves 'lr' untouched; only the m1 and m2 moments change in the optimizer state.
    print("adam_optimizer_state: ", adam_optimizer.state_dict())
    print("-" * 150)
    # The scheduler step recomputes 'lr' and writes the new value into the adam_optimizer.
    lr_scheduler.step()
    print("lr_scheduler_state: ", lr_scheduler.state_dict())
    print("-" * 150)
    # The 'lr' shown here should match the value in the 'lr_scheduler_state' printed just above.
    print("adam_optimizer_state: ", adam_optimizer.state_dict())
    print("-" * 150)

adam_optimizer_state:  {'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 0.0099,  0.0179, -0.0017,  0.0072],
        [ 0.0142, -0.0119,  0.0083,  0.0084],
        [ 0.0099, -0.0045, -0.0106, -0.0031],
        [ 0.0083, -0.0040,  0.0062,  0.0040],
        [ 0.0043, -0.0131,  0.0059,  0.0028],
        [-0.0202,  0.0641, -0.0579, -0.0205]]), 'exp_avg_sq': tensor([[9.8607e-06, 3.1888e-05, 2.9107e-07, 5.1254e-06],
        [2.0112e-05, 1.4069e-05, 6.9440e-06, 7.0562e-06],
        [9.7399e-06, 2.0047e-06, 1.1210e-05, 9.5614e-07],
        [6.9679e-06, 1.5704e-06, 3.8173e-06, 1.6343e-06],
        [1.8718e-06, 1.7196e-05, 3.4965e-06, 7.6975e-07],
        [4.0630e-05, 4.1142e-04, 3.3472e-04, 4.1839e-05]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([-0.0326, -0.0092, -0.0254, -0.0169,  0.0087, -0.0278]), 'exp_avg_sq': tensor([1.0655e-04, 8.4636e-06, 6.4480e-05, 2.8613e-05, 7.5459e-06, 7.7210e-05])}}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgr