|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Negation tuning in attention neurons</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.gridspec import GridSpec

import statsmodels.api as sm

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, GPT2Tokenizer

import requests

# vector plots: ask the inline matplotlib backend to render figures as SVG (vector graphics)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1a: Import the model and implant attention hooks

In [None]:
# pick the compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# tokenizer comes from the 'gpt2' checkpoint; the model from 'gpt2-large', placed on the device
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)

# switch to evaluation mode
# (this is the cell's last expression, so the notebook displays its return value)
model.eval()

In [None]:
# storage for hooked outputs, keyed by 'attn_<layer_number>'
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores a module's output in `activations`.

  Args:
    layer_number: Value inserted into the dictionary key `attn_{layer_number}`.

  Returns:
    A function with the `(module, input, output)` forward-hook signature.
    Each call overwrites `activations['attn_{layer_number}']` with `output`,
    detached from the autograd graph and moved to the CPU. The function
    returns None.
  """
  storage_key = f'attn_{layer_number}'

  def _save_output(module, input, output):
    # rebind the key to the newest output (the previously stored tensor is not modified)
    activations[storage_key] = output.detach().cpu()

  return _save_output

# attach one hook to the c_attn module of every transformer block
# NOTE: the handles returned by register_forward_hook are not kept, so re-running
# this cell registers an additional set of hooks rather than replacing the old ones
for layeri,block in enumerate(model.transformer.h):
  block.attn.c_attn.register_forward_hook(implant_hook(layeri))

In [None]:
# number of c_attn "neurons": size of the last dimension of the first block's c_attn weight
# (the plotting cell splits this axis into three groups of model.config.n_embd: Q, K, V)
nneurons = model.transformer.h[0].attn.c_attn.weight.size(-1)

# Exercise 1b: Get text, find negations and affirmations

In [None]:
# https://gutenberg.org/ebooks/32154
# download the plain-text book; a timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of tokenizing an error page
response = requests.get('https://gutenberg.org/cache/epub/32154/pg32154.txt',timeout=60)
response.raise_for_status()
text = response.text

# tokenize the whole text as one sequence; tokens[0] holds the token ids
tokens = tokenizer.encode(text,return_tensors='pt')
num_tokens = len(tokens[0])
print(f'There are {num_tokens:,} tokens, of which {len(np.unique(tokens[0])):,} are unique.')

In [None]:
# context window kept around each target token:
# context_pre tokens before the target and context_pst tokens after it
context_pre,context_pst = 90,10

In [None]:
# negation words; a token counts only when it equals one of these exactly
negation_words = ['not','cannot','nor','never']

# one flag per token (1 = negation target, 0 = anything else)
isNegation = np.zeros(num_tokens,dtype=int)

# visit only the tokens that have a full context window on both sides
for ti in range(context_pre,num_tokens-context_pst):

  # text of this token (trimmed, lower-cased) and of the token that follows it
  currtok = tokenizer.decode(tokens[0,ti]).strip().lower()
  nexttok = tokenizer.decode(tokens[0,ti+1])

  # the token carries a 't contraction (any token containing n't also contains 't)
  hasContraction = "'t" in currtok

  # the whole token is a negation word (c.f., not->noted, nor->enormous)
  isNegationWord = currtok in negation_words

  # the following token begins with a space, i.e., starts a new word (c.f., not->connotative)
  nextStartsWord = nexttok[0] == ' '

  # flag the token as a negation target
  if nextStartsWord and (hasContraction or isNegationWord):
    isNegation[ti] = 1


# locations and count of the targets
negationsIdx = np.flatnonzero(isNegation)
numNegationTokens = isNegation.sum()

In [None]:
# affirmation words; a token counts only when it equals one of these exactly
affirmation_words = ['agree','always','allow','can','certainly','could','definitely','may','might','shall','should']

# one flag per token (1 = affirmation target, 0 = anything else)
isAffirmation = np.zeros(num_tokens,dtype=int)

# visit only the tokens that have a full context window on both sides
for ti in range(context_pre,num_tokens-context_pst):

  # text of this token (trimmed, lower-cased) and of the token that follows it
  currtok = tokenizer.decode(tokens[0,ti]).strip().lower()
  nexttok = tokenizer.decode(tokens[0,ti+1])

  # the whole token is an affirmation word (partial matches inside longer words don't count)
  isAffirmationWord = currtok in affirmation_words

  # the affirmation is not immediately negated (e.g., "could not")
  notNegated = nexttok != ' not'

  # the following token begins with a space, i.e., starts a new word
  nextStartsWord = nexttok[0] == ' '

  # flag the token as an affirmation target
  if isAffirmationWord and notNegated and nextStartsWord:
    isAffirmation[ti] = 1

# locations and count of the targets
affirmationsIdx = np.flatnonzero(isAffirmation)
numAffirmationTokens = isAffirmation.sum()

# Exercise 1c: Create batches and get activations

In [None]:
# position of every window element relative to its target token
# (context_pre tokens before, the target itself, context_pst tokens after)
window_offsets = torch.arange(-context_pre,context_pst+1)

# gather all windows at once: one row per target, with the target token in column context_pre
negation_positions = torch.as_tensor(negationsIdx)[:,None] + window_offsets
batch_negations = tokens[0][negation_positions].long()

affirmation_positions = torch.as_tensor(affirmationsIdx)[:,None] + window_offsets
batch_affirmations = tokens[0][affirmation_positions].long()

# report the batch sizes
print('Shape of negations batch:',batch_negations.shape)
print('Shape of affirmations batch:',batch_affirmations.shape)

In [None]:
# run both batches through the model without tracking gradients; the forward hooks
# refill `activations` on every pass, so a snapshot is taken after each one
with torch.no_grad():

  # negation (target) sequences
  model(batch_negations.to(device))
  # a shallow copy is enough: the hook rebinds each key to a new tensor on the next
  # pass instead of modifying the tensors stored here
  negations_activations = dict(activations)

  # affirmation (comparison) sequences
  model(batch_affirmations.to(device))
  affirmations_activations = dict(activations)

In [None]:
# list the stored layer keys, followed by a blank line
print(affirmations_activations.keys(),'\n')

# size of one layer's stored activations (displayed as the cell output);
# later cells index it as [sequence, token position, neuron]
affirmations_activations['attn_5'].size()

# Exercise 2: Laminar profile of classification

In [None]:
# class labels reused by every classifier: 0 for each affirmation sequence,
# followed by 1 for each negation sequence
category_labels = np.concatenate([np.zeros(numAffirmationTokens),np.ones(numNegationTokens)])

In [None]:
# Per-neuron logistic regression: for every layer and every c_attn neuron, predict the
# category (0 = affirmation, 1 = negation) from the neuron's activation at the target token.

# initialize matrices to store the classifier results (layers x neurons)
n_layers = model.config.n_layer
pvalues  = np.ones((n_layers,nneurons)) # initialize to 1's to ignore in subsequent mask
betas    = np.zeros((n_layers,nneurons))
accuracy = np.zeros((n_layers,nneurons))

# count the fits that fail, so failures are reported instead of silently ignored
num_failed_fits = 0


# loop over layers
for layeri in range(n_layers):

  # activations at the target token (position context_pre), extracted once per layer;
  # rows are sequences (affirmations first, matching category_labels), columns are neurons
  targs = negations_activations[f'attn_{layeri}'][:,context_pre,:].numpy()
  comps = affirmations_activations[f'attn_{layeri}'][:,context_pre,:].numpy()
  layer_data = np.vstack((comps,targs))

  # loop over neurons for per-neuron analysis
  for neuroni in range(nneurons):

    # build and run the model
    try: # sometimes crashes for linear-algebra reasons
      design = sm.add_constant(layer_data[:,neuroni])
      result = sm.Logit(category_labels,design).fit(maxiter=3000,disp=0)

      # extract the results (p-value, beta, and accuracy); index 1 is the neuron's
      # coefficient, index 0 is the intercept added by add_constant
      pvalues[layeri,neuroni]  = result.pvalues[1]
      betas[layeri,neuroni]    = result.params[1]
      accuracy[layeri,neuroni] = 100*((result.predict()>.5)==category_labels).mean()

    # Exception (not a bare except) so that KeyboardInterrupt/SystemExit still stop the loop;
    # a failed fit keeps its initial values (p=1, beta=0, accuracy=0)
    except Exception:
      num_failed_fits += 1

  print(f'Finished layer {layeri+1:2}/{model.config.n_layer}')

print(f'{num_failed_fits:,} of {pvalues.size:,} fits failed and keep their initial values.')

In [None]:
# create two masks: Bonferroni-corrected significance (within a layer) and positive beta
pvalue_mask = pvalues<.05/nneurons
posbet_mask = betas>0

# get accuracy only from masked neurons
# gratuitously confusingly, np.ma.masked_where() actually keeps the False values, and masks *out* True values
masked_accuracy = np.ma.masked_where(~(pvalue_mask & posbet_mask),accuracy)


# split the accuracy matrix into the Q/K/V matrices (three consecutive groups of n_embd columns).
# The split is done by slicing the masked array: converting it with torch.tensor() would
# drop the mask, and the averages below would then include the excluded neurons.
n_embd = model.config.n_embd
q = masked_accuracy[:,:n_embd]
k = masked_accuracy[:,n_embd:2*n_embd]
v = masked_accuracy[:,2*n_embd:]

# per-layer average over the unmasked neurons; a layer without any unmasked neuron
# becomes NaN, which matplotlib leaves out of the plot
q_mean = np.ma.filled(q.mean(axis=1),np.nan)
k_mean = np.ma.filled(k.mean(axis=1),np.nan)
v_mean = np.ma.filled(v.mean(axis=1),np.nan)


# make the plot
fig,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: percentage of neurons per layer that pass the significance threshold
axs[0].plot(100*np.mean(pvalue_mask,axis=1),'kH',markerfacecolor=[.7,.7,.7],markersize=9)
axs[0].set(xlabel='Layer',ylabel='Percent significant tests (%)',title='Laminar profile of significance')

# right panel: average accuracy per layer, with each label matching the data it shows
axs[1].plot(q_mean,'ko',markerfacecolor=[.9,.7,.7],markersize=9,label='Q')
axs[1].plot(k_mean,'ks',markerfacecolor=[.7,.9,.7],markersize=9,label='K')
axs[1].plot(v_mean,'k^',markerfacecolor=[.7,.7,.9],markersize=9,label='V')
axs[1].legend()

axs[1].set(xlabel='Layer',ylabel='Prediction accuracy (%)',title='Laminar profile of prediction accuracy')
plt.show()