|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 6:</h2>|<h1>Intervention (causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Interfering with attention</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Does GPT2 like pineapple pizza?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import scipy.stats as stats

from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib as mpl

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch.nn.functional as F

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Model, hooks, tokens

In [None]:
# use the first CUDA GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# tokenizer, and the pretrained GPT2-large model placed on the selected device
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2-large').to(device)

# evaluation mode; as the last expression of the cell this also displays the model
model.eval()

In [None]:
# model dimensions used throughout the notebook
cfg = model.config
nheads,nlayers,n_emb = cfg.n_head,cfg.n_layer,cfg.n_embd

# number of embedding dimensions per attention head
head_dim = n_emb // nheads

In [None]:
def hook_silencer(layer_number):
  """Build a forward pre-hook that zeroes one attention head at one token position.

  The returned hook reads the module-level variables `layer2silence`,
  `token2silence`, and `head2silence` each time it is called, so changing those
  variables retargets the ablation without re-registering any hooks. It also
  relies on the module-level `nheads` and `head_dim`.

  Parameters
  ----------
  layer_number : int
    Index of the transformer block this hook instance is attached to.

  Returns
  -------
  callable
    A `hook(module, args)` function for `register_forward_pre_hook`. It returns
    the positional-argument tuple, unchanged unless `layer_number` equals
    `layer2silence`.
  """
  def hook2zero(module,args):

    # every layer except the targeted one passes its input through untouched
    if layer_number!=layer2silence:
      return args

    # take the sizes from the activation itself instead of from notebook
    # globals, so the hook keeps working if the prompt length or batch size
    # differs from the values stored in `nbatches`/`ntokens`
    acts = args[0]
    batch_size,seq_len,emb_size = acts.shape

    # reshape so we can index heads (a view: the edit below writes into `acts`)
    head_tensor = acts.view(batch_size,seq_len,nheads,head_dim)

    # ablate the selected head at the selected token position
    head_tensor[:,token2silence,head2silence,:] = 0

    # merge the heads back into a single embedding axis
    head_tensor = head_tensor.view(batch_size,seq_len,emb_size)

    # return a tuple matching the original positional arguments
    return (head_tensor,*args[1:])
  return hook2zero


# If this cell is re-run, detach the hooks registered by the previous run first;
# otherwise every re-run would stack another set of hooks on the model and the
# old handles would be lost when `handles` is reset below.
for old_handle in globals().get('handles',[]):
  old_handle.remove()

# attach one silencing hook to the attention output projection of every block,
# keeping the handles so the hooks can be removed later
handles = []
for layeri in range(nlayers):
  h = model.transformer.h[layeri].attn.c_proj.register_forward_pre_hook(hook_silencer(layeri))
  handles.append(h)

# Tokens (my apologies to the Italians :P )

In [None]:
# tokenize the prompt and move it to the same device as the model
prompt = 'Peanut butter and pineapple taste great on pizza'
tokens = tokenizer.encode(prompt,return_tensors='pt').to(device)
nbatches,ntokens = tokens.shape

# list every position with its vocabulary index and decoded text
for i in range(ntokens):
  tok = tokens[0,i]
  print(f'Token position {i} is index {tok:6} and is "{tokenizer.decode(tok)}"')

In [None]:
# the prompt's token ids as plain Python ints
token_list = tokens[0].tolist()

# position in the prompt of the first token of each target word
butter_idx,pineap_idx,pizza_idx = (
  token_list.index(tokenizer.encode(word)[0])
  for word in (' butter',' pineapple',' pizza')
)

butter_idx,pineap_idx,pizza_idx

# Exercise 2: Does GPT2 prefer peanut butter or pineapple on pizza?

In [None]:
# 1000 matches no block index, so none of the silencing hooks modifies anything
layer2silence = head2silence = 1000

# unmodified ("clean") forward pass, keeping the hidden states
with torch.no_grad():
  out_clean = model(tokens,output_hidden_states=True)

# check hidden states sizes
hs_clean = out_clean.hidden_states
hs_clean[4].shape

In [None]:
# cosine similarity with " pizza" in every block: column 0 = butter, column 1 = pineapple
cs_clean = np.zeros((nlayers,2))

for layeri in range(nlayers):

  # hidden states stored at index layeri+1, and the " pizza" vector within them
  hs = out_clean.hidden_states[layeri+1]
  pizza_vec = hs[0,pizza_idx,:]

  # one similarity per comparison token
  for coli,tokidx in enumerate((butter_idx,pineap_idx)):
    cs_clean[layeri,coli] = F.cosine_similarity(hs[:,tokidx,:],pizza_vec).item()


In [None]:
plt.figure(figsize=(8,5))

# x-offset, results column, marker style, marker face color, legend label
series = (
  (-.1,0,'ko',[.9,.7,.7,.8],'Peanut butter'),
  (+.1,1,'ks',[.7,.9,.7,.8],'Pineapple'),
)

# the two series are nudged left/right so their markers don't overlap
xpos = np.arange(nlayers)
for offset,col,fmt,facecolor,label in series:
  plt.plot(xpos+offset,cs_clean[:,col],fmt,markerfacecolor=facecolor,markersize=10,label=label)

plt.gca().set(xlabel='Transformer block',ylabel='Cosine similarity',
              title='Cosine similarity to " pizza"')
plt.legend()
plt.show()

# Exercise 3: Can we disrupt GPT's pizza preference?

In [None]:
# ablation target: head 5 at the pineapple token
token2silence = pineap_idx
head2silence = 5

# column 0 = butter, column 1 = pineapple
cs_manip = np.zeros((nlayers,2))

# silence the head in one block at a time (the hooks read `layer2silence`)
for layer2silence in range(nlayers):

  # forward pass with the hook active in block `layer2silence`
  with torch.no_grad():
    out_manip = model(tokens,output_hidden_states=True)

  # hidden states stored at index layer2silence+1, and the " pizza" vector within them
  hs = out_manip.hidden_states[layer2silence+1]
  pizza_vec = hs[0,pizza_idx,:]

  # cosine similarities
  for coli,tokidx in enumerate((butter_idx,pineap_idx)):
    cs_manip[layer2silence,coli] = F.cosine_similarity(hs[:,tokidx,:],pizza_vec).item()

In [None]:
fig,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: cosine similarity to " pizza" after silencing, one value per block
axs[0].plot(np.arange(nlayers)-.1,cs_manip[:,0],'ko',markerfacecolor=[.9,.7,.7,.6],markersize=10,label='Peanut butter')
axs[0].plot(np.arange(nlayers)+.1,cs_manip[:,1],'ks',markerfacecolor=[.7,.9,.7,.6],markersize=10,label='Pineapple')
axs[0].set(xlabel='Transformer block',ylabel='Cosine similarity',title=f'Cosine similarities after silencing head {head2silence}')
axs[0].legend()

# right panel: change relative to the clean model (silenced minus clean)
axs[1].plot(np.arange(nlayers)-.1,cs_manip[:,0]-cs_clean[:,0],'ko',markerfacecolor=[.9,.7,.7,.6],markersize=10,label='Peanut butter')
axs[1].plot(np.arange(nlayers)+.1,cs_manip[:,1]-cs_clean[:,1],'ks',markerfacecolor=[.7,.9,.7,.6],markersize=10,label='Pineapple')
# raw string: "\D" is not a valid Python escape sequence and triggers a warning
# in a normal string literal; the rendered label is unchanged
axs[1].set(xlabel='Transformer block',ylabel=r'$\Delta$ cosine similarity',title='Silenced — clean')
# zero line drawn behind the markers as a visual reference for "no change"
axs[1].axhline(0,color='k',zorder=-30,linewidth=.5)
axs[1].legend()

plt.tight_layout()
plt.show()

# Exercise 4: Layer-specific silencing

In [None]:
# silence at the pineapple token again
token2silence = pineap_idx

# results matrix: layers x heads x (butter, pineapple)
cs_manip = np.zeros((nlayers,nheads,2))

# each (layer, head) combination is silenced in its own forward pass
for layer2silence in tqdm(range(nlayers),desc='Layers...'):
  for head2silence in range(nheads):

    # forward pass with the hook active for this layer and head
    with torch.no_grad():
      out_manip = model(tokens,output_hidden_states=True)

    # hidden states stored at index layer2silence+1, and the " pizza" vector within them
    hs = out_manip.hidden_states[layer2silence+1]
    pizza_vec = hs[0,pizza_idx,:]

    # cosine similarities
    for coli,tokidx in enumerate((butter_idx,pineap_idx)):
      cs_manip[layer2silence,head2silence,coli] = F.cosine_similarity(hs[:,tokidx,:],pizza_vec).item()

In [None]:
fig,axs = plt.subplots(1,2,figsize=(12,4))

# Bonferroni-corrected threshold: one t-test per transformer block.
# Derived from nlayers rather than hard-coded, so it stays correct if a
# model with a different number of blocks is loaded.
alpha_corrected = .05/nlayers

for i in range(nlayers):

  # all heads of block i share the same x position
  xpos = np.zeros(nheads)+i

  # left panel: pineapple-pizza similarity per silenced head, with the clean value as a bar
  axs[0].plot(xpos,cs_manip[i,:,1],'ko',markerfacecolor=[.9,.7,.7,.5])
  axs[0].plot([i-.6,i+.6],np.ones(2)*cs_clean[i,1],'k-',linewidth=2)

  # change in similarity caused by silencing each head in this block
  delta = cs_manip[i,:,1]-cs_clean[i,1]

  # one-sample t-test of the per-head changes against zero
  tres = stats.ttest_1samp(delta,0)

  # right panel: filled circles when the block passes the corrected threshold, red crosses otherwise
  if tres.pvalue<alpha_corrected:
    axs[1].plot(xpos,delta,'ko',markerfacecolor=[.9,.7,.7,.5])
  else:
    axs[1].plot(xpos,delta,'rx')


axs[0].set(xlabel='Transformer block',ylabel='Cosine similarity',title='Cosine similarity in clean and silenced models')

# raw string: "\D" is not a valid Python escape sequence and triggers a warning
# in a normal string literal; the rendered label is unchanged
axs[1].set(xlabel='Transformer block',ylabel=r'$\Delta$ cosine similarity',title='Silenced - clean')
axs[1].axhline(0,color='k',linewidth=.8,zorder=-10)

plt.tight_layout()
plt.show()