#Learning Rate Scheduler

- A learning rate scheduler adjusts the learning rate during training
- Learning rate scheduling should be applied after the optimizer's update, i.e. call scheduler.step() after optimizer.step()

- Documentation: https://pytorch.org/docs/stable/optim.html

#Importing Libraries:

In [None]:
import torch
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler

#Lambda LR:

In [None]:
# Base learning rate handed to every optimizer created below.
lr = 0.1
# Minimal model (10 inputs -> 1 output); it only supplies parameters for the optimizers.
model = nn.Linear(in_features=10, out_features=1)

In [None]:
# Adam optimizer over the model's parameters, starting at the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Multiplicative factor as a function of the epoch index: epoch / 10.
lambda1 = lambda epoch: epoch / 10

# A lambda is a small anonymous function: it can take any number of arguments
# but consists of a single expression.
# Syntax: lambda args: expression

In [None]:
print(lambda1(5))  # 5 / 10 = 0.5

0.5


In [None]:
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

# At every epoch the learning rate is set to: initial lr (0.1) * lambda1(epoch)
# epoch 0: (0/10) * 0.1 = 0.0
# epoch 1: (1/10) * 0.1 = 0.01
# epoch 2: (2/10) * 0.1 = 0.02
# ...

In [None]:
# Inspect the optimizer after attaching the scheduler: in the recorded output
# 'lr' is already 0.0 (lambda1(0) = 0) while 'initial_lr' keeps the configured 0.1.
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.0, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders,
# so this only shows the call order and the learning rate after each epoch.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])
for epoch in range(5):
    # loss.backward()
    optimizer.step()
    # validate(...)
    scheduler.step()  # called after optimizer.step()
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.0
Epoch: 1, Learning Rate: 0.010000000000000002
Epoch: 2, Learning Rate: 0.020000000000000004
Epoch: 3, Learning Rate: 0.03
Epoch: 4, Learning Rate: 0.04000000000000001
Epoch: 5, Learning Rate: 0.05


#Multiplicative LR:

In [None]:
# Fresh Adam optimizer so this scheduler starts again from the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Constant multiplicative factor: returns 0.95 regardless of the epoch.
lambda1 = lambda epoch: 0.95

# A lambda is a small anonymous function: it can take any number of arguments
# but consists of a single expression.
# Syntax: lambda args: expression

In [None]:
print(lambda1(5))  # always 0.95, whatever the epoch

0.95


In [None]:
scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda1)

# At every epoch the previous learning rate is multiplied by lambda1(epoch) (0.95)
# e1: 0.95 * 0.1   = 0.095
# e2: 0.95 * 0.095 = 0.09025
# ...

In [None]:
# Inspect the optimizer before any scheduler.step(): in the recorded output
# 'lr' still equals 'initial_lr' (0.1).
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders,
# so this only shows the call order and the learning rate after each epoch.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])
for epoch in range(5):
    # loss.backward()
    optimizer.step()
    # validate(...)
    scheduler.step()  # called after optimizer.step()
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.1
Epoch: 1, Learning Rate: 0.095
Epoch: 2, Learning Rate: 0.09025
Epoch: 3, Learning Rate: 0.0857375
Epoch: 4, Learning Rate: 0.08145062499999998
Epoch: 5, Learning Rate: 0.07737809374999999


#Step LR:

In [None]:
# Fresh Adam optimizer so this scheduler starts again from the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=3,  # decay once every 3 epochs
    gamma=0.1,    # factor applied at each decay
)

# Every 3 epochs the current learning rate is multiplied by gamma (0.1)
# e3: 0.1 * 0.1   = 0.01
# e6: 0.1 * 0.01  = 0.001
# e9: 0.1 * 0.001 = 0.0001

In [None]:
# Inspect the optimizer before any scheduler.step(): in the recorded output
# 'lr' still equals 'initial_lr' (0.1).
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders,
# so this only shows the call order and the learning rate after each epoch.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])
for epoch in range(10):
    # loss.backward()
    optimizer.step()
    # validate(...)
    scheduler.step()  # called after optimizer.step()
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.1
Epoch: 1, Learning Rate: 0.1
Epoch: 2, Learning Rate: 0.1
Epoch: 3, Learning Rate: 0.010000000000000002
Epoch: 4, Learning Rate: 0.010000000000000002
Epoch: 5, Learning Rate: 0.010000000000000002
Epoch: 6, Learning Rate: 0.0010000000000000002
Epoch: 7, Learning Rate: 0.0010000000000000002
Epoch: 8, Learning Rate: 0.0010000000000000002
Epoch: 9, Learning Rate: 0.00010000000000000003
Epoch: 10, Learning Rate: 0.00010000000000000003


#Multi-Step LR:

In [None]:
# Fresh Adam optimizer so this scheduler starts again from the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
scheduler = lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[2, 7],  # epochs at which the decay is applied
    gamma=0.1,          # factor applied at each milestone
)

# When a milestone epoch (2, 7) is reached, the current learning rate is multiplied by gamma (0.1)
# e2: 0.1 * 0.1  = 0.01
# e7: 0.1 * 0.01 = 0.001

In [None]:
# Inspect the optimizer before any scheduler.step(): in the recorded output
# 'lr' still equals 'initial_lr' (0.1).
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders,
# so this only shows the call order and the learning rate after each epoch.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])
for epoch in range(10):
    # loss.backward()
    optimizer.step()
    # validate(...)
    scheduler.step()  # called after optimizer.step()
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.1
Epoch: 1, Learning Rate: 0.1
Epoch: 2, Learning Rate: 0.010000000000000002
Epoch: 3, Learning Rate: 0.010000000000000002
Epoch: 4, Learning Rate: 0.010000000000000002
Epoch: 5, Learning Rate: 0.010000000000000002
Epoch: 6, Learning Rate: 0.010000000000000002
Epoch: 7, Learning Rate: 0.0010000000000000002
Epoch: 8, Learning Rate: 0.0010000000000000002
Epoch: 9, Learning Rate: 0.0010000000000000002
Epoch: 10, Learning Rate: 0.0010000000000000002


#Exponential LR:

In [None]:
# Fresh Adam optimizer so this scheduler starts again from the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
scheduler = lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.1,  # factor applied at every epoch
)

# At every epoch the current learning rate is multiplied by gamma (0.1);
# equivalent to StepLR with step_size=1
# e1: 0.1 * 0.1  = 0.01
# e2: 0.1 * 0.01 = 0.001
# ...

In [None]:
# Inspect the optimizer before any scheduler.step(): in the recorded output
# 'lr' still equals 'initial_lr' (0.1).
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.1, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders,
# so this only shows the call order and the learning rate after each epoch.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])
for epoch in range(5):
    # loss.backward()
    optimizer.step()
    # validate(...)
    scheduler.step()  # called after optimizer.step()
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.1
Epoch: 1, Learning Rate: 0.010000000000000002
Epoch: 2, Learning Rate: 0.0010000000000000002
Epoch: 3, Learning Rate: 0.00010000000000000003
Epoch: 4, Learning Rate: 1.0000000000000004e-05
Epoch: 5, Learning Rate: 1.0000000000000004e-06


#Reduce LR on Plateau:

In [None]:
# Fresh Adam optimizer so this scheduler starts again from the base learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=3,
)

# Reduces the learning rate when a monitored metric has stopped improving
# mode = 'min': reduce lr when the metric has stopped decreasing
# mode = 'max': reduce lr when the metric has stopped increasing
# factor = 0.1: the learning rate is multiplied by 0.1 on each reduction
# patience = 3: number of epochs with no improvement that are tolerated;
#               the lr is reduced on the next epoch without improvement after those

In [None]:
# Inspect the optimizer before any scheduler.step(): 'lr' is the configured 0.1
# (the recorded output shows no 'initial_lr' entry for this scheduler).
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.1, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}


In [None]:
# Demonstration loop: loss.backward() and validate(...) are left as placeholders;
# a hand-written metric stands in for a validation loss.
print("Initial Learning Rate:", optimizer.param_groups[0]['lr'])

metric = 0.2

for epoch in range(15):
    # loss.backward()
    optimizer.step()
    # validate(...)
    # The metric stays flat at 0.2, then improves to 0.1 from the 9th epoch (epoch == 8) on.
    # That improvement resets the patience count, so the lr is not reduced at epoch 9.
    metric = 0.2 if epoch < 8 else 0.1
    scheduler.step(metric)  # this scheduler needs the monitored metric
    print(f"Epoch: {epoch + 1}, Learning Rate:", optimizer.param_groups[0]['lr'])

Initial Learning Rate: 0.1
Epoch: 1, Learning Rate: 0.1
Epoch: 2, Learning Rate: 0.1
Epoch: 3, Learning Rate: 0.1
Epoch: 4, Learning Rate: 0.1
Epoch: 5, Learning Rate: 0.010000000000000002
Epoch: 6, Learning Rate: 0.010000000000000002
Epoch: 7, Learning Rate: 0.010000000000000002
Epoch: 8, Learning Rate: 0.010000000000000002
Epoch: 9, Learning Rate: 0.010000000000000002
Epoch: 10, Learning Rate: 0.010000000000000002
Epoch: 11, Learning Rate: 0.010000000000000002
Epoch: 12, Learning Rate: 0.010000000000000002
Epoch: 13, Learning Rate: 0.0010000000000000002
Epoch: 14, Learning Rate: 0.0010000000000000002
Epoch: 15, Learning Rate: 0.0010000000000000002
