|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Numerical scaling issues in DL models</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

# vector figs: ask the inline backend to emit figures as SVG so they stay
# sharp at any zoom level (must run before the first figure is drawn)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Explore an example

In [None]:
# two 50x50 matrices of standard-normal noise (q is drawn first, then k)
q = np.random.standard_normal(size=(50,50))
k = np.random.standard_normal(size=(50,50))

# dp[i,j] is the dot product between row i of q and row j of k
dp = np.matmul(q,k.T)

In [None]:
# spread of the inputs and of their pairwise dot products
# (population std over all elements, i.e. ddof=0 on the flattened array)
std_q  = q.std()
std_k  = k.std()
std_dp = dp.std()

# report next to sqrt(d), where d is the length of each dotted vector
print(f'Standard deviation of q:  {std_q:.4f}')
print(f'Standard deviation of k:  {std_k:.4f}')
print(f'Standard deviation of dp: {std_dp:.4f}')
print(f'Square root of dimension: {np.sqrt(q.shape[1]):.4f}')

In [None]:
# strip plot: one horizontal band per matrix, with small vertical jitter
# so overlapping values remain visible
fig,ax = plt.subplots(figsize=(8,3))

q_flat,k_flat,dp_flat = (mat.flatten() for mat in (q,k,dp))
n = q_flat.size

# (values, band height, marker format, marker face color), drawn top to bottom
bands = [
  (q_flat, 2,'ks',[.9,.7,.7]),
  (k_flat, 1,'ko',[.7,.9,.7]),
  (dp_flat,0,'k^',[.7,.7,.9]),
]
for values,height,fmt,face in bands:
  jitter = np.random.normal(height,.02,n)
  ax.plot(values,jitter,fmt,markerfacecolor=face,alpha=.4,markersize=9)

ax.set(yticks=[0,1,2],yticklabels=['$QK^T$','$K$','$Q$',],ylim=[-.75,2.75],
       xlabel='Value')
plt.show()

# Exercise 2: A parametric experiment

In [None]:
# dimensionalities (vector sizes) to test
vector_lengths = np.arange(2,100)

# one empirical standard deviation per dimensionality
dp_stds = np.zeros(len(vector_lengths))

for idx,dim in enumerate(vector_lengths):

  # two noise matrices: 50 vectors, each of length `dim`
  q = np.random.randn(50,dim)
  k = np.random.randn(50,dim)

  # all pairwise dot products, and their spread
  dps = q@k.T
  dp_stds[idx] = np.std(dps)


# empirical spread against the sqrt(dimension) expectation
fig,ax = plt.subplots(figsize=(8,4))
ax.plot(vector_lengths,np.sqrt(vector_lengths),'k',linewidth=2,label='Expectation')
ax.plot(vector_lengths,dp_stds,'ko',markersize=10,markerfacecolor=[.7,.7,.9],alpha=.5,label='Empirical')

ax.legend()
ax.set(xlabel='Vector size',ylabel='Standard deviation',title='Standard deviation of multiplied Gaussian noise')
plt.show()

# Exercise 3: Implications for softmax

In [None]:
# create two random matrices (each row is a 50-dimensional vector)
q = np.random.randn(50,50)
k = np.random.randn(50,50)

# all pairwise dot products
dps = q@k.T


_,axs = plt.subplots(2,2,figsize=(12,6))
for i in range(2):

  # top row (i=0): raw dot products; bottom row (i=1): scaled by sqrt(dimension).
  # the in-place division is safe on re-run because dps is recreated above
  if i==1:
    dps /= np.sqrt(q.shape[1])

  # softmax over all dot products, computed in log-space;
  # subtracting the max first keeps np.exp from overflowing on large logits
  dps_flat = dps.flatten()
  shifted = dps_flat - np.max(dps_flat)
  log_softmax = shifted - np.log(np.sum(np.exp(shifted)))
  softmax = np.exp(log_softmax)

  # negative log-likelihood is -log(p), not -1/log(p)
  nll = -log_softmax

  # and plot
  axs[i,0].plot(softmax,'ko',markerfacecolor=[.9,.7,.7],alpha=.6)
  axs[i,0].set(xlabel='Data index',ylabel='Softmax prob.',title='Softmaxified logits ('+['unscaled)','scaled)'][i])

  axs[i,1].plot(nll,'ko',markerfacecolor=[.7,.7,.9],alpha=.6)
  axs[i,1].set(xlabel='Data index',ylabel='Negative log prob.',title='Negative log-softmax ('+['unscaled)','scaled)'][i])

plt.tight_layout()
plt.show()

# Exercise 4: Check GPT2's layernorm parameters

In [None]:
# load the pretrained GPT-2 causal language model by its name ('gpt2')
# NOTE(review): GPT2Tokenizer is imported but not used in the visible cells —
# confirm nothing else needs it before removing
from transformers import AutoModelForCausalLM,GPT2Tokenizer
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')

In [None]:
# gather all layernorm parameters into vectors.
# chunks are collected in lists and concatenated once at the end, because
# np.append copies the whole accumulated array on every call
ln_weight_chunks = []
ln_bias_chunks = []

for name,mat in gpt2.named_parameters():

  # keep only parameters whose name contains 'ln'
  if 'ln' not in name:
    continue

  # explicit tensor -> flat numpy vector (detached from autograd, on the CPU)
  values = mat.detach().cpu().numpy().ravel()

  if 'weight' in name:
    ln_weight_chunks.append(values)
  elif 'bias' in name:
    ln_bias_chunks.append(values)

# 1D float64 vectors (the same dtype np.append onto np.array([]) would give);
# fall back to an empty array if nothing matched
all_ln_weights = np.concatenate(ln_weight_chunks).astype(np.float64) if ln_weight_chunks else np.array([])
all_ln_biases = np.concatenate(ln_bias_chunks).astype(np.float64) if ln_bias_chunks else np.array([])

In [None]:
# calculate their distributions (counts per bin and the bin edges)
yW,xW = np.histogram(all_ln_weights,bins=np.linspace(0,3,100))
yB,xB = np.histogram(all_ln_biases,bins=np.linspace(-5,5,100))

# log-counts; empty bins become NaN (a gap in the line) instead of
# log(0) = -inf with a divide-by-zero RuntimeWarning
logW = np.full(yW.shape,np.nan)
np.log(yW,out=logW,where=yW>0)
logB = np.full(yB.shape,np.nan)
np.log(yB,out=logB,where=yB>0)

# plot each count at the center of its bin rather than at its left edge
centersW = (xW[:-1]+xW[1:])/2
centersB = (xB[:-1]+xB[1:])/2

# and plot
_,axs = plt.subplots(1,2,figsize=(12,4))
axs[0].plot(centersW,logW,'ks-',markerfacecolor=[.9,.7,.7])
axs[0].set(xlabel='Stretching parameter value',ylabel='Count (log)',title='Stretch parameter learned in GPT2')

axs[1].plot(centersB,logB,'ko-',markerfacecolor=[.7,.7,.9])
axs[1].set(xlabel='Shifting parameter value',ylabel='Count (log)',title='Shift parameter learned in GPT2')

plt.tight_layout()
plt.show()