|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 6:</h2>|<h1>Intervention (causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Interfering with attention</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Token prediction after head ablations</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.gridspec import GridSpec

from tqdm import tqdm

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch.nn.functional as F

import matplotlib_inline.backend_inline
# draw inline figures in vector (SVG) format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Model, hook, tokens

In [None]:
# pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# pretrained GPT-2 tokenizer and language model (model is moved to the chosen device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# switch to evaluation mode (last expression, so the cell also displays the model)
model.eval()

In [None]:
# model dimensions used throughout the notebook
n_emb = model.config.n_embd     # embedding width
nheads = model.config.n_head    # attention heads per layer
head_dim = n_emb // nheads      # width of one head's slice of the embedding

In [None]:
def implant_hook(layer_number):
  """Build a forward pre-hook that ablates one attention head in one layer.

  The returned hook reads these notebook-level variables each time it is called:
    layer2ablate, head2ablate : which layer/head to ablate. The hook does nothing
                                unless `layer_number==layer2ablate` and
                                `head2ablate` is a valid head index.
    replaceWithZeros          : True  -> the head's activations are set to 0
                                False -> they are set to the head's mean activation
    nheads, head_dim          : used to split the last dimension into heads.

  With mean imputation, the imputed value is also stored in the global
  `observedHeadMean` as a Python float, so it can be written into NumPy
  arrays regardless of the device the model is on.

  Note: the input tensor is modified in place (the head-indexed tensor is a view of it).

  Args:
    layer_number: index of the layer this hook instance is attached to.

  Returns:
    A function with the `(module,input)` forward-pre-hook signature that returns
    the (possibly modified) input tuple.
  """
  def hook4attn(module,input):
    global observedHeadMean

    # print some useful information
    # print(len(input),type(input),input[0].shape)

    # modify the activation only for this layer (and only for a valid head index)
    if layer_number==layer2ablate and head2ablate in range(nheads):

      # reshape so we can index heads; the leading dimensions come from the
      # tensor itself, so the hook works for any batch size / sequence length
      activation = input[0]
      head_tensor = activation.view(*activation.shape[:-1],nheads,head_dim)

      # specify the value to replace
      if replaceWithZeros:
        value2replace = 0
      else:
        value2replace = head_tensor[...,head2ablate,:].mean()
        # keep a plain float (not a device tensor) for later bookkeeping
        observedHeadMean = value2replace.item()

      # then replace
      head_tensor[...,head2ablate,:] = value2replace

      # print confirmation
      # print(f'Zeroed out L{layer_number}, H{head2ablate}')

      # reshape back to the original layout
      head_tensor = head_tensor.view(activation.shape)

      # return a tuple matching the original
      input = (head_tensor,*input[1:])

    return input
  return hook4attn


# If this cell was run before, remove the previously registered hooks first;
# otherwise every re-run would stack another hook on each layer and the old
# handles would be lost.
try:
  for h in handles:
    h.remove()
except NameError:
  pass # first run: nothing has been registered yet

# attach one pre-hook to the attention output projection (c_proj) of every layer
handles = []
for layeri in range(model.config.n_layer):
  h = model.transformer.h[layeri].attn.c_proj.register_forward_pre_hook(implant_hook(layeri))
  handles.append(h)

In [None]:
# tokenize the prompt; keep the batch and sequence sizes, then move to the device
tokens = tokenizer.encode('Berlin is the capital of',return_tensors='pt')
nbatches,ntokens = tokens.shape
tokens = tokens.to(device)

# list each token as a plain integer index (not a tensor repr) with its text
for i,tok in enumerate(tokens[0].tolist()):
  print(f'Token position {i:2} is index {tok} and is "{tokenizer.decode(tok)}"')

In [None]:
# target and semantically related nontarget
nontarget_tokens = tokenizer.encode(' France')
target_tokens = tokenizer.encode(' Germany')

# confirm single-tokens (taking [0] alone would silently truncate a multi-token word)
assert len(nontarget_tokens)==1 and len(target_tokens)==1, 'Target and nontarget must each be a single token'

nontarget_idx = nontarget_tokens[0]
target_idx = target_tokens[0]

nontarget_idx,target_idx

# Exercise 2: Confirm accuracy and get clean logits

In [None]:
# indices that match no layer/head, so none of the hooks modifies anything
layer2ablate = 1000
head2ablate = 1000

# clean (unmodified) forward pass
with torch.no_grad():
  out = model(tokens)

# softmax over the vocab for the final token, expressed in percent
final_logits = out.logits[0,-1,:]
sm_clean = 100 * F.softmax(final_logits,dim=-1).detach().cpu().numpy()

In [None]:
fig,ax = plt.subplots(figsize=(10,4))

# log-softmax of every vocab element (sm_clean is in percent, so rescale first)
logsm_clean = np.log(sm_clean/100)
ax.plot(logsm_clean,'k.',markersize=2,alpha=.3)

# highlight the target and nontarget tokens
ax.plot(target_idx,logsm_clean[target_idx],'gs',label='Germany')
ax.plot(nontarget_idx,logsm_clean[nontarget_idx],'ro',label='France')

# labels, limits, and the model's top prediction in the title
ax.set(xlabel='Vocab elements',ylabel='Log softmax',xlim=[0,model.config.vocab_size],
       title=f'Predicted next token is "{tokenizer.decode(np.argmax(sm_clean))}"')
ax.legend()

plt.show()

# Exercise 3: Zero-out attention heads for all token indices

In [None]:
# ablation mode read by the hook: True -> the ablated head's activations are set to 0
replaceWithZeros = True

In [None]:
# results per (layer, head): [target prob (%), nontarget prob (%), predicted token index]
nlayers = model.config.n_layer
resultsZero = np.zeros((nlayers,nheads,3))

# ablate one head at a time; the hooks read the loop variables directly
for layer2ablate in tqdm(range(nlayers),desc='Layers...'):
  for head2ablate in range(nheads):

    # forward pass with the current head ablated
    with torch.no_grad():
      out = model(tokens)

    # final-token softmax probabilities in percent
    sm = 100 * F.softmax(out.logits[0,-1,:],dim=-1).detach().cpu().numpy()

    # store target prob, nontarget prob, and the predicted next token
    resultsZero[layer2ablate,head2ablate,:] = sm[target_idx],sm[nontarget_idx],np.argmax(sm)

In [None]:
fig,axs = plt.subplots(1,2,figsize=(10,4))

# symmetric color limit (in percentage points)
clim = 5

# one panel per token: (column in resultsZero, vocab index, word used in the title)
panels = [ (0,target_idx,'target'), (1,nontarget_idx,'non-target') ]

for ax,(col,tokidx,label) in zip(axs,panels):

  # change in probability relative to the clean model; transposed so heads are rows
  h = ax.imshow(resultsZero[:,:,col].T - sm_clean[tokidx],vmin=-clim,vmax=clim,cmap=mpl.cm.plasma,aspect='auto')

  # raw string so that \Delta is passed to mathtext without an invalid '\D' escape
  ax.set(xlabel='Layer',ylabel='Head',yticks=range(0,nheads,2),title=rf'%$\Delta$ in prob. for {label} word')
  fig.colorbar(h,ax=ax,pad=.01)

plt.suptitle('Change in token selection probability from clean model',fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# tally how often each token was the top prediction across all ablations
tokenIds,counts = np.unique(resultsZero[:,:,2],return_counts=True)
nruns = counts.sum()

for tokenId,count in zip(tokenIds,counts):
  print(f'In {count}/{nruns} runs, the model selected token "{tokenizer.decode(int(tokenId))}"')

# Exercise 4: Repeat with head mean imputation

In [None]:
# ablation mode read by the hook: False -> the ablated head's activations are set to that head's mean
replaceWithZeros = False

In [None]:
# results per (layer, head):
#   [target prob (%), nontarget prob (%), imputed head mean, predicted token index]
resultsMean = np.zeros((model.config.n_layer,nheads,4))

# loop over layers and heads (the hooks read layer2ablate/head2ablate directly)
for layer2ablate in tqdm(range(model.config.n_layer),desc='Layers...'):
  for head2ablate in range(nheads):

    # forward pass
    with torch.no_grad():
      out = model(tokens)

    # softmax probability (in percent) for the final token
    sm = 100 * F.softmax(out.logits[0,-1,:],dim=-1).detach().cpu().numpy()

    # probabilities for target and nontarget
    resultsMean[layer2ablate,head2ablate,0] = sm[target_idx]
    resultsMean[layer2ablate,head2ablate,1] = sm[nontarget_idx]

    # the empirical mean value that was imputed; float() also handles the case
    # where the hook stored a (possibly GPU) tensor, which NumPy cannot convert
    resultsMean[layer2ablate,head2ablate,2] = float(observedHeadMean)

    # and the predicted next token
    resultsMean[layer2ablate,head2ablate,3] = np.argmax(sm)



In [None]:
fig,axs = plt.subplots(1,2,figsize=(10,4))

# symmetric color limit (in percentage points)
clim = 5

# one panel per token: (column in resultsMean, vocab index, word used in the title)
panels = [ (0,target_idx,'target'), (1,nontarget_idx,'non-target') ]

for ax,(col,tokidx,label) in zip(axs,panels):

  # change in probability relative to the clean model; transposed so heads are rows
  h = ax.imshow(resultsMean[:,:,col].T - sm_clean[tokidx],vmin=-clim,vmax=clim,cmap=mpl.cm.plasma,aspect='auto')

  # raw string so that \Delta is passed to mathtext without an invalid '\D' escape
  ax.set(xlabel='Layer',ylabel='Head',yticks=range(0,nheads,2),
         title=rf'%$\Delta$ in prob. for {label} word')
  fig.colorbar(h,ax=ax,pad=.01)

plt.suptitle('Change in token selection probability from clean model',fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# tally how often each token was the top prediction across all mean-imputed runs
tokenIds,counts = np.unique(resultsMean[:,:,3],return_counts=True)
nruns = counts.sum()

for tokenId,count in zip(tokenIds,counts):
  print(f'In {count}/{nruns} runs, the model selected token "{tokenizer.decode(int(tokenId))}"')

In [None]:
# head-averaged activations
# the mean activation that was imputed for each (layer, head)
headMeans = resultsMean[:,:,2]

fig,(axScatter,axImage) = plt.subplots(1,2,figsize=(12,5))

# left: all head means in flattened (layer-major) order
axScatter.plot(headMeans.flatten(),'ko',markerfacecolor=[.9,.7,.7,.6])
axScatter.set(xlabel='Heads $\\times$ layer (index)',ylabel='Head mean',title='As scatter plot')

# right: the same values as a heads-by-layers image
h = axImage.imshow(headMeans.T,vmin=-.05,vmax=.05,cmap=mpl.cm.plasma,aspect='auto')
axImage.set(xlabel='Layer',ylabel='Head',yticks=range(0,nheads,2),title='As image')
fig.colorbar(h,ax=axImage,pad=.02,fraction=.05)

plt.suptitle('Head activation averages',fontweight='bold')
plt.tight_layout()
plt.show()

# Exercise 5: Comparisons

In [None]:
# setup the figure
fig = plt.figure(figsize=(13,4))
gs  = GridSpec(1,3,figure=fig)
axs = [ fig.add_subplot(gs[:2]) , fig.add_subplot(gs[-1]) ]


### histograms
nbins = 20

y,x = np.histogram(resultsZero[:,:,0].flatten() - sm_clean[target_idx],nbins)
axs[0].plot(x[:-1],y,'.-',linewidth=2,markersize=10,label='Zero target')

y,x = np.histogram(resultsMean[:,:,0].flatten() - sm_clean[target_idx],nbins)
axs[0].plot(x[:-1],y,'.-',linewidth=2,markersize=10,label='Mean target')

y,x = np.histogram(resultsZero[:,:,1].flatten() - sm_clean[nontarget_idx],nbins)
axs[0].plot(x[:-1],y,linewidth=2,label='Zero nontarget')

y,x = np.histogram(resultsMean[:,:,1].flatten() - sm_clean[nontarget_idx],nbins)
axs[0].plot(x[:-1],y,linewidth=2,label='Mean nontarget')

axs[0].set(xlabel='Token probability ($\Delta$ from clean model)',ylabel='Count',ylim=[-1,None],
           title='Histograms of $\Delta$ softmax')
axs[0].legend(fontsize=15)


# difference heat map
h = axs[1].imshow(resultsMean[:,:,0].T - resultsZero[:,:,0].T,vmin=-1,vmax=1,cmap=mpl.cm.plasma,aspect='auto')
axs[1].set(xlabel='Layer',ylabel='Head',yticks=range(0,nheads,2),title='$\Delta$ target: (mean - zero)')
fig.colorbar(h,ax=axs[1],pad=.02,fraction=.05)

plt.tight_layout()
plt.show()