|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying circuits and components</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Laminar profile of attention head weights</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import scipy.stats as stats

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

# have the inline backend render figures as SVG (vector graphics)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Model, hooks, tokens, activations

In [None]:
# use the first GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Eleuther's Pythia-2.8b: the tokenizer and the pretrained causal LM
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-2.8b')
model = AutoModelForCausalLM.from_pretrained('EleutherAI/pythia-2.8b')

# move the weights to the device and switch to evaluation mode
# (eval() is the last expression, so the notebook also prints the architecture)
model = model.to(device)
model.eval()

In [None]:
# display the model configuration; later cells read num_attention_heads, hidden_size, and num_hidden_layers from it
model.config

In [None]:
# attention geometry, taken from the model configuration
nheads   = model.config.num_attention_heads
head_dim = model.config.hidden_size // nheads

# sqrt(head_dim) is the scaling denominator in the attention equation
sqrtD = torch.tensor(head_dim).sqrt()

print(f'There are {nheads} heads, each with {head_dim} dimensions.')

In [None]:
# hook the output of each layer's fused QKV projection (queries, keys, and values in one tensor)
activations = {}

def implant_hook(layer_number):
  """Return a forward hook that stores a module's output in `activations['attn_<layer_number>']`."""
  def hook(module, input, output):
    # keep a detached CPU copy so the stored tensor holds no autograd graph
    activations[f'attn_{layer_number}'] = output.detach().cpu()
  return hook

# remove hooks left over from an earlier run of this cell, so re-running it doesn't stack duplicates
for handle in globals().get('hook_handles',[]):
  handle.remove()

# implant the hooks (the handles are kept so that the hooks can be removed later)
hook_handles = []
for layeri in range(model.config.num_hidden_layers):
  qkv_module = model.gpt_neox.layers[layeri].attention.query_key_value
  hook_handles.append( qkv_module.register_forward_hook(implant_hook(layeri)) )

In [None]:
# text source: https://en.wikipedia.org/wiki/Fiji
txt = "The majority of Fiji's islands were formed by volcanic activity starting around 150 million years ago. Some geothermal activity still occurs today on the islands of Vanua Levu and Taveuni."

# convert the text to a (1 x ntokens) tensor of token indices
tokens  = tokenizer.encode(txt, return_tensors='pt')
ntokens = tokens.shape[1]

# forward pass without gradients; the model output is discarded
# because the hooks store the activations of interest
with torch.no_grad():
  model(tokens.to(device))

In [None]:
# checking sizes: the dictionary has one 'attn_<layer>' entry per hooked layer,
# and each entry is the output of that layer's query_key_value module
print(activations.keys(),'\n')
print(activations['attn_4'].shape)

In [None]:
# Check the code for splitting into heads

# GPT-NeoX (Pythia) packs the query_key_value output head by head:
#   [ q_0 k_0 v_0 | q_1 k_1 v_1 | ... ]   (each piece is head_dim wide)
# and NOT as three contiguous Q|K|V blocks. So first expose the head axis...
qkv = activations['attn_13'][0,:,:].reshape(-1,nheads,3*head_dim)

# ...then cut each head's slice into its Q, K, and V parts (each is tokens x heads x head_dim),
# and flatten back to tokens x hidden_size with the heads side by side
Q,K,V = [ m.reshape(-1,model.config.hidden_size) for m in torch.split(qkv,head_dim,dim=-1) ]

# now split into heads
Q_h = torch.split(Q,head_dim,dim=1)

print(f'There are {len(Q_h)} heads')
print(f'Each head has size {Q_h[2].shape}')

# Exercise 2: Laminar profile of attention weight distributions

In [None]:
# initializations
smx = np.linspace(0,1,300) # grid of softmax values at which the density estimates are evaluated
head_distributions = np.zeros((model.config.num_hidden_layers,len(smx),2)) # layers x grid x [final-to-prev, self]

# causal mask (True above the diagonal, i.e., future tokens); identical for all layers and heads
futuremask = torch.triu(torch.ones(ntokens,ntokens,dtype=torch.bool),diagonal=1)


# loop over layers
for layeri in range(model.config.num_hidden_layers):

  # GPT-NeoX (Pythia) packs the query_key_value output head by head,
  # [ q_0 k_0 v_0 | q_1 k_1 v_1 | ... ], so expose the head axis before separating Q, K, and V
  qkv = activations[f'attn_{layeri}'][0,:,:].reshape(ntokens,nheads,3*head_dim)
  Q,K,V = torch.split(qkv,head_dim,dim=-1) # each is tokens x heads x head_dim

  # one (tokens x head_dim) matrix per head
  Q_h = torch.unbind(Q,dim=1)
  K_h = torch.unbind(K,dim=1)


  # collect the per-head results in lists (concatenated once after the head loop)
  final2prev_heads = []
  selfAttend_heads = []


  # loop over heads
  for qi in range(nheads):

    # raw attention scores with mask
    # NOTE(review): the model applies rotary position embeddings to Q and K after this projection,
    # so these scores approximate, but need not equal, the model's internal attention -- confirm
    attn_scores = (Q_h[qi] @ K_h[qi].t()) / sqrtD
    attn_scores = attn_scores.masked_fill(futuremask,-torch.inf)

    # softmax
    attn_sm = F.softmax( attn_scores ,dim=-1)

    # the final token with all previous tokens (including the first but excluding self-attn)
    final2prev_heads.append( attn_sm[-1,:-1] )

    # matching tokens are self-attention (exclude the first token in the sequence)
    selfAttend_heads.append( torch.diag(attn_sm[1:,1:]) )


  ### head loop is complete; pool over heads (as float64 numpy arrays) and get kde's
  final2prev = torch.cat(final2prev_heads).double().numpy()
  selfAttend = torch.cat(selfAttend_heads).double().numpy()

  # each density is scaled to a peak of 1 so that layers are comparable
  y = stats.gaussian_kde(final2prev)(smx)
  head_distributions[layeri,:,0] = y / y.max()

  y = stats.gaussian_kde(selfAttend)(smx)
  head_distributions[layeri,:,1] = y / y.max()



In [None]:
## visualize one layer
# (final2prev and selfAttend still hold the values from the last iteration of the layer loop)
fig,ax = plt.subplots(figsize=(10,4))

# horizontal jitter around each category's x-position so the markers don't overlap
x_final = np.random.randn(len(final2prev))/70 - .1
x_self  = np.random.randn(len(selfAttend))/70 + .1

ax.plot(x_final,final2prev,'ko',markerfacecolor=[.7,.9,.7,.7],markersize=8)
ax.plot(x_self, selfAttend,'ks',markerfacecolor=[.9,.7,.7,.7],markersize=8)

ax.set(xticks=[-.1,.1],xlim=[-.3,.3],xticklabels=['Final to\nprev','Self-\nattention'],
       ylabel='Softmax attention weight',title='Softmax attention weights from final layer')

plt.show()

In [None]:
# show all lines in one plot
plt.figure(figsize=(10,4))

nlayers = model.config.num_hidden_layers

# plot all the lines
for i in range(nlayers):

  # color intensity scales with layer depth (normalized by the number of layers)
  shade = i/nlayers

  # only the final layer's lines are labeled, so the legend has one entry per condition
  islast = i==nlayers-1

  plt.plot(smx,head_distributions[i,:,0],color=mpl.cm.Reds(shade),
           label='Final to previous' if islast else None)
  plt.plot(smx,head_distributions[i,:,1],color=mpl.cm.Blues(shade),
           label='Self-attention' if islast else None)


plt.legend()
plt.gca().set(xlim=[0,1],xlabel='Softmax attention weight',ylabel='Proportion (norm.)',
              title='Probability density estimates (each line is a layer)')
plt.show()

In [None]:
# heatmaps of layers and probabilities
fig,axs = plt.subplots(1,2,figsize=(12,4))

# one panel per condition: (index into the last axis of head_distributions, upper color limit, title)
panels = [
  (0, .008, 'Attention weights for final to previous'), # final2prev
  (1, .08,  'Attention weights for self'),              # self-attention
]

for ax,(condi,cmax,paneltitle) in zip(axs,panels):

  # transpose so that layers run along the x-axis and the softmax grid along the y-axis
  h = ax.imshow(head_distributions[:,:,condi].T,aspect='auto',origin='lower',vmin=0,vmax=cmax,cmap='hot',
                extent=[0,model.config.num_hidden_layers,smx[0],smx[-1]])
  fig.colorbar(h,ax=ax,pad=.01)
  ax.set(ylabel='Softmax probability',xlabel='Layer',title=paneltitle)

plt.tight_layout()
plt.show()