|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>Gradient clipping and learning rate scheduler</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup


# vector plots: ask the inline matplotlib backend to emit figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Simple demo of gradient clipping

In [None]:
# a small "weights" tensor that tracks its gradients
w = torch.tensor([[-1, 3.3, 2, -5, 3, -2, -4, -5, 1.5]], requires_grad=True)

# L2-style loss: the sum of the squared weights
loss = w.pow(2).sum()

# backprop populates w.grad
loss.backward()

# report the raw gradients and their norm
print('BEFORE CLIPPING:')
print(f' Gradient vals: {w.grad[0].tolist()}')
print(f' Gradient norm: {w.grad.norm():.3f}')

# keep an independent copy of the unclipped gradients
# (clip_grad_norm_ modifies w.grad in place)
preClipVals = w.grad[0].detach().clone()

# rescale the gradients so that their norm is at most 1
nn.utils.clip_grad_norm_([w], max_norm=1)

# report the gradients and their norm once more
print('\nAFTER CLIPPING:')
print(f' Gradient vals: {w.grad[0].tolist()}')
print(f' Gradient norm: {w.grad.norm():.3f}')

In [None]:
# correlation between the gradient values before and after clipping
clippedVals = w.grad.detach().squeeze()
r = torch.corrcoef(torch.stack((preClipVals,clippedVals),dim=0))

# scatter the clipped values against the pre-clipped values
_,ax = plt.subplots()
ax.plot(preClipVals,clippedVals,'ko',markersize=10,markerfacecolor=[.9,.7,.9])
ax.set(xlabel='Pre-clipped values',ylabel='Clipped values',title=f'r = {r[0,1]:.2f}')
ax.grid(color=[.9,.9,.9])
plt.show()

# Gradient clipping demo during learning

In [None]:
# training time
training_steps = 80

# gradient-clipping settings
max_grad_norm = 1             # gradients are rescaled so that their norm does not exceed this value
clip_during_training = False  # set to True (after visualizing the unclipped run) to clip before each weight update

# define a weight matrix (requires_grad=True to track gradients)
w = torch.tensor([[-4,.2]], requires_grad=True)

# target category
target = torch.tensor([0])

# optimizer and loss function
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD([w],lr=.03)

# storage: the weights after every step (row 0 holds the initial weights),
# and the gradient norms (column 0: pre-clipping, column 1: post-clipping)
allWeights = torch.zeros((training_steps+1,2))
allWeights[0,:] = w.detach()
grad_norms = np.zeros((training_steps,2))

# training loop
for i in range(training_steps):

  # reset gradients
  optimizer.zero_grad()

  # model outputs (simulating a full model forward pass ;)  )
  modeloutput = F.log_softmax(w,dim=1)

  # loss
  loss = loss_function(modeloutput,target)

  # backprop: calculate gradient of loss wrt w
  loss.backward()

  # store the raw (pre-clipping) gradient norm
  grad_norms[i,0] = torch.norm(w.grad)

  # optionally clip now, so that the clipped gradient drives the weight update
  if clip_during_training:
    nn.utils.clip_grad_norm_([w], max_norm=max_grad_norm)

  # gradient descent: adjust w
  optimizer.step()

  # clip the gradient and record its post-clipping norm. This happens after the
  # weight update and the gradients are reset at the top of the next iteration,
  # so this call is for measurement only and does not influence learning.
  # (If the gradient was already clipped above, it has essentially no further effect.)
  nn.utils.clip_grad_norm_([w], max_norm=max_grad_norm)
  grad_norms[i,1] = torch.norm(w.grad)

  # store the new weights
  allWeights[i+1,:] = w.detach()

In [None]:
# visualize the weight trajectories (top panel) and the gradient norms (bottom panel)
_,(ax_weights,ax_norms) = plt.subplots(2,1,figsize=(9,7))
step_limits = [-3,training_steps+3]  # a little padding on both sides of the x-axis

# top panel: one series per weight
ax_weights.plot(allWeights[:,0],'ks',markerfacecolor=[.9,.7,.7],markersize=6,label='Weight 0 (target)')
ax_weights.plot(allWeights[:,1],'ko',markerfacecolor=[.7,.7,.9],markersize=6,label='Weight 1 (non-target)')
ax_weights.set(xlabel='Training epochs',ylabel='Weight value',xlim=step_limits)
ax_weights.legend()

# bottom panel: gradient norm before vs. after clipping
ax_norms.plot(grad_norms[:,0],'ko',markerfacecolor=[.7,.9,.7],markersize=6,alpha=.7,label='Pre-clipping')
ax_norms.plot(grad_norms[:,1],'rs',markerfacecolor=[.9,.9,.7],markersize=6,alpha=.7,label='Post-clipping')
ax_norms.set(xlabel='Training epoch',ylabel='Gradient norm',xlim=step_limits)
ax_norms.legend()

plt.tight_layout()
plt.show()

# Simple demo of optimizer scheduler

In [None]:
# number of training steps, and how many of them are warm-up (the first 10%)
training_steps = 200
warmup_steps = int(training_steps*.1)

# a stand-in "model" and an optimizer over its parameters
model = nn.Linear(10,10)
optimizer = torch.optim.AdamW(model.parameters(),lr=3e-5)

# cosine learning rate scheduler with warm-up
scheduler = get_cosine_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = warmup_steps,
    num_training_steps = training_steps,
    num_cycles = .5 # number of cosine cycles over the entire training course
)

In [None]:
# list the names the scheduler object exposes (e.g., the step and get_last_lr methods used in this notebook)
dir(scheduler)

In [None]:
# quick test to see the learning rates
lrs = np.zeros(training_steps)
for i in range(training_steps):
  # record the learning rate that this step's update uses. It must be read
  # BEFORE scheduler.step(); reading it afterwards gives the rate for the
  # next step, which shifts the curve by one and drops the initial warm-up value.
  lrs[i] = scheduler.get_last_lr()[0]
  optimizer.step() # update the optimizer
  scheduler.step() # advance the scheduler to the next step's learning rate

# plot!
plt.figure(figsize=(10,3))
plt.plot(lrs,'ko',markersize=5,markerfacecolor=[.7,.7,.9],alpha=.3)

plt.gca().set(xlabel='Training epoch',ylabel='Learning rate')
plt.show()

# Demo of scheduler in a learning model

In [None]:
# redefine training time
training_steps = 5000

# define a weight matrix (requires_grad=True to track gradients)
w = torch.tensor([[-1,.2]], requires_grad=True)

# target category
target = torch.tensor([0])

# optimizer and loss function
loss_function = nn.NLLLoss()
optimizer = torch.optim.AdamW([w],lr=.0005)

### --- pick one of these two --- ###
scheduler_type = 'cosine' # 'cosine' or 'linear'

# first 20% of training is warm-up (for both schedulers)
warmup_steps = int(training_steps*.2)

if scheduler_type == 'cosine':
  scheduler = get_cosine_schedule_with_warmup(optimizer,
      num_warmup_steps = warmup_steps,
      num_training_steps = training_steps,
      num_cycles = 2) # note the increase in cycles compared to previous

elif scheduler_type == 'linear':
  # the schedule is 15% shorter than the training run, so the last 15% of
  # training has no learning; also try changing the - to +
  scheduler = get_linear_schedule_with_warmup(optimizer,
      num_warmup_steps = warmup_steps,
      num_training_steps = training_steps-int(training_steps*.15))

else:
  raise ValueError(f"scheduler_type must be 'cosine' or 'linear', got {scheduler_type!r}")
### ---------------------------- ###

# storage: the weights after every step (row 0 holds the initial weights)
# and the learning rate used at every step
allWeights = torch.zeros((training_steps+1,2))
allWeights[0,:] = w.detach()
lrs = torch.zeros(training_steps)

# training loop
for i in range(training_steps):

  # reset gradients
  optimizer.zero_grad()

  # model outputs (simulating a full model forward pass ;)  )
  modeloutput = F.log_softmax(w,dim=1)

  # loss
  loss = loss_function(modeloutput,target)

  # backprop: calculate gradient of loss wrt w
  loss.backward()

  # record the learning rate that this step's update uses. It must be read
  # BEFORE scheduler.step(); reading it afterwards gives the rate for the next step.
  lrs[i] = scheduler.get_last_lr()[0]

  # gradient descent
  optimizer.step() # adjust w
  scheduler.step() # advance the scheduler to the next step's learning rate

  # store the new weights
  allWeights[i+1,:] = w.detach()

In [None]:
# visualize the weight trajectories (top panel) and the learning rates (bottom panel)
_,(ax_weights,ax_lr) = plt.subplots(2,1,figsize=(9,7))
step_limits = [-3,training_steps+3]  # a little padding on both sides of the x-axis

# top panel: one line per weight
for col,weight_label in enumerate(('Weight 0 (target)','Weight 1 (non-target)')):
  ax_weights.plot(allWeights[:,col],linewidth=2,label=weight_label)
ax_weights.set(xlabel='Training epochs',ylabel='Weight value',xlim=step_limits)
ax_weights.legend()

# bottom panel: the recorded learning rates
ax_lr.plot(lrs,'k',linewidth=2)
ax_lr.set(xlabel='Training epoch',ylabel='Learning rate',xlim=step_limits)

plt.tight_layout()
plt.show()