|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Qualitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Visualize single-token predictions</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# vector plots
# Ask the notebook's inline matplotlib backend to emit figures in the 'svg'
# format, so plots are drawn as vector graphics.
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Tokenize and forward-pass in GPT2-small and -large

In [None]:
# Pretrained GPT-2 checkpoints (small and large), each switched to
# evaluation mode right after loading (.eval() returns the model itself).
gpt2_small = AutoModelForCausalLM.from_pretrained('gpt2').eval()
gpt2_large = AutoModelForCausalLM.from_pretrained('gpt2-large').eval()

# One tokenizer is loaded (from the 'gpt2' checkpoint) and used for both models.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# The passage to evaluate, written one sentence per literal
# (adjacent string literals are concatenated into a single string).
text = (
  'The goal of a correlation analysis is to compute a correlation coefficient. '
  'This coefficient is indicated using r, and is a number that encodes the normalized strength of the linear relationship between two variables. '
  'The normalization imposes boundaries of -1 to +1. '
  'Negative, zero, and positive correlation coefficients have distinct interpretations.'
)

# Encode to token ids; return_tensors='pt' requests a PyTorch tensor,
# which is indexed below as tokens[0,position].
tokens = tokenizer.encode(text, return_tensors='pt')


In [None]:
# Run the same token sequence through both models.
# Gradients are not needed for evaluation, so tracking is disabled.
with torch.no_grad():
  outputs_small, outputs_large = [net(tokens) for net in (gpt2_small, gpt2_large)]

# Exercise 2: Calculate and visualize softmax per-token

In [None]:
# Log-softmax that each model assigned to the token that actually occurred.
#   tokenSM[:,0] -> GPT2 small,  tokenSM[:,1] -> GPT2 large
# Row 0 is left at its initial zero: the loop starts at the second token
# because the first token has no previous position to predict it.
num_tokens = len(tokens[0])
tokenSM = np.zeros((num_tokens,2))

for modeli,model_out in enumerate((outputs_small,outputs_large)):
  for toki in range(1,num_tokens):

    # logits at the PREVIOUS position are the prediction for the current token;
    # convert them to log-probabilities over the vocabulary
    logprobs = F.log_softmax(model_out.logits[0,toki-1,:],dim=-1)

    # keep only the log-probability of the token that is really there
    tokenSM[toki,modeli] = logprobs[tokens[0,toki]].item()



In [None]:
# Compare the per-token prediction strengths of the two models.
#
# Row 0 of tokenSM is only the zero placeholder from its initialization (the
# first token is never predicted), so it is excluded everywhere in this cell.
# Including it would draw a fake probability of exp(0)=1 in the left panel and
# add a spurious (0,0) point to the correlation that the scatter plot omits.
tokidx = np.arange(1,len(tokenSM))  # positions that have a real prediction
logsm = tokenSM[1:,:]               # log-softmax, columns = [small, large]
probs = np.exp(logsm)               # back to softmax probabilities

_,axs = plt.subplots(1,2,figsize=(12,5))

# plot the softmax probs
axs[0].plot(tokidx,probs[:,0],'o',label='GPT2 small')
axs[0].plot(tokidx,probs[:,1],'s',label='GPT2 large')

# lines connecting the models' predictions (drawn behind the markers)
for i,(p_small,p_large) in zip(tokidx,probs):
  axs[0].plot([i,i],[p_small,p_large],color=[.7,.7,.7],zorder=-10)

axs[0].legend()
axs[0].set(xlabel='Token index',ylabel='Softmax prob',title='Softmax prediction strengths')

# compare the two models: correlation over the same points shown in the scatter
R = np.corrcoef(logsm.T)
axs[1].plot(logsm[:,0],logsm[:,1],'ko',markerfacecolor=[.7,.7,.7],alpha=.5,markersize=10)

# unity line; reaches at least -10 and further if any log-softmax is smaller
unity_min = min(-10,logsm.min())
axs[1].plot([unity_min,0],[unity_min,0],'k--',linewidth=.5)
axs[1].set(xlabel='GPT2 small (log-sm)',ylabel='GPT2 large (log-sm)',title=f'Consistency of predictions (r = {R[0,1]:.2f})')

plt.tight_layout()
plt.show()

# Exercise 3: Visualize token predictions as heatmap

In [None]:
# Measure how wide one monospace character is, so that tokens can later be
# placed next to each other according to their number of characters.
fig,ax = plt.subplots(figsize=(10,2))

# a throwaway single-letter text object, using the font settings of the later plot
probe = ax.text(0,0,'n',fontsize=12,fontfamily='monospace')

# its bounding box, measured in display coordinates
renderer = fig.canvas.get_renderer()
extent = probe.get_window_extent(renderer=renderer)

# map the lower-left and upper-right corners from display to axis coordinates
display_to_axes = ax.transAxes.inverted()
corners_axes = display_to_axes.transform([[extent.x0,extent.y0], [extent.x1,extent.y1]])

# letter width = right edge minus left edge (x is column 0)
en_width = corners_axes[1,0] - corners_axes[0,0]

# the figure was only needed for the measurement
plt.close(fig)

In [None]:
# Min/max-scale each model's log-softmax values to the range [0,1].
# Row 0 is ignored (its placeholder 0 corresponds to a probability of 1)
# and therefore remains 0 in the scaled array.
scored = tokenSM[1:,:]
col_min = scored.min(axis=0)  # per-model minimum
col_max = scored.max(axis=0)  # per-model maximum

logsmScale = np.zeros_like(tokenSM)
logsmScale[1:,:] = (scored-col_min) / (col_max-col_min)

In [None]:
# Write the text twice, token by token: top panel shaded in reds by the scaled
# log-softmax of GPT2 small, bottom panel shaded in blues by GPT2 large.

tokens_per_line = 20  # start a new line of text after this many tokens
token_gap = .015      # small horizontal gap between neighbouring tokens
line_step = .2        # how far to move down for each new line

# setup the figure: one panel per model, axes hidden
fig, axs = plt.subplots(2,1,figsize=(10,6))
for panel in axs:
  panel.axis('off')

# each panel with its colormap; the pair's index is the column of logsmScale
panels = list(zip(axs,(mpl.cm.Reds,mpl.cm.Blues)))

x_pos,y_pos = 0,1  # left edge of the next token, height of the current line

for toki in range(len(tokens[0])):

  # text of this token and the width it occupies
  toktext = tokenizer.decode([tokens[0,toki]])
  token_width = en_width*len(toktext)

  # same token in both panels; background color matches the "activation"
  for modeli,(panel,cmap) in enumerate(panels):
    box_style = dict(boxstyle='round,pad=.3', facecolor=cmap(logsmScale[toki,modeli]), edgecolor='none', alpha=.8)
    panel.text(x_pos+token_width/2, y_pos, toktext, fontsize=12, ha='center', va='center',
               fontfamily='monospace', bbox=box_style)

  # end of the line: go back to the left edge and move down;
  # otherwise advance past this token plus the gap
  if (toki+1) % tokens_per_line == 0:
    x_pos = 0
    y_pos -= line_step
  else:
    x_pos += token_width + token_gap

plt.show()