|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>Extracting activations using "hooks"</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# --- imports (all grouped at the top of the cell) ---
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline

import torch
from transformers import GPT2Model, GPT2Tokenizer

# --- plotting configuration: draw inline figures as vector graphics (SVG) ---
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# --- pretrained 'gpt2' checkpoint: the tokenizer and the matching model ---
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

In [None]:
# switch the model to evaluation mode; being the last expression in the cell,
# the value returned by eval() is also displayed as the cell output
model.eval()

In [None]:
# display the model configuration (its n_embd field is read in a later cell)
model.config

# Manually access the activations of a layer

In [None]:
# embedding dimensionality, read from the model configuration
embed_dim = model.config.n_embd

# stand-in for the output of a previous layer: Gaussian noise for 1 batch of 20 tokens
inputActs = torch.randn(1,20,embed_dim)
print(f'Inputs into layer is {inputActs.shape}')

# the concatenated QKV projection of the transformer block at index 3 (model.h[3])
qkv_projection = model.h[3].attn.c_attn

# push the (only) sequence in the batch through that projection;
# indexing with [0] drops the batch dimension
outputActs = qkv_projection(inputActs[0])
print(f'Outputs from QKV is {outputActs.shape}')

# the leading embed_dim entries along the last axis are the Q outputs
Q = outputActs[...,:embed_dim]
print(f'Q activations is {Q.shape}')

# Implanting a hook in the model

In [None]:
## references:
# https://pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html
# https://stackoverflow.com/questions/78279823/how-exactly-the-forward-and-backward-hooks-work-in-pytorch

In [None]:
# Define a hook function to store query vectors
# (module-level dictionary that every hook writes into)
activations = {}

def implant_hook(layer_number): # need an "outer" function here to specify the layer_number
  """Create a forward hook for one attention module.

  Args:
    layer_number: label of the hooked layer; used only to build the dictionary keys.

  Returns:
    A hook with the (module, inputs, output) signature used by `register_forward_hook`.
    The hook writes two entries into the module-level `activations` dictionary and
    returns None, so the hooked module's output is passed on unchanged.
  """

  def _detached(x):
    # tensors are detached from the autograd graph; tuples are handled element-wise;
    # anything else (e.g., None) is passed through untouched
    if torch.is_tensor(x):
      return x.detach()
    if isinstance(x, tuple):
      return tuple(_detached(item) for item in x)
    return x

  def hook(module, inputs, output):

    # module: the layer the hook is attached to
    # inputs: a tuple of the positional inputs passed into that layer during the forward pass
    #         (named `inputs` rather than `input` to avoid shadowing the built-in)
    # output: output of that layer

    # pass the inputs into the attention block (QKV matrices concatenated)
    qkvActs = module.c_attn(inputs[0])  # [batch, seq, 3*embed_dim]

    # isolate the Q activations before they're further processed during attention
    embed_dim = qkvActs.shape[-1] // 3
    qActs = qkvActs[:,:,:embed_dim].detach()

    # store in the dictionary
    activations[f'attn_{layer_number}_q'] = qActs

    ### note: Here we're not taking the output of the entire layer; we're selectively grabbing
    #         the Q activations before they're combined during self-attention.
    #         That's why we re-calculate and extract. For comparison, the following line stores
    #         the layer output itself (post-attention-computations). It is stored detached so
    #         that the dictionary does not keep the autograd graph of the forward pass alive.
    activations[f'attn_output_{layer_number}'] = _detached(output)

  return hook


# pick the layers to hook
layers2hook = [5,7]


# surgery ;)
# One hook per requested layer, however many are listed above. The handles are kept
# in the same order as layers2hook so that handles[i].remove() detaches the hook
# that was implanted in layer layers2hook[i].
handles = [ model.h[layeri].attn.register_forward_hook(implant_hook(layeri)) for layeri in layers2hook ]

In [None]:
# see the hook in the model
# (_forward_hooks is an underscore-prefixed attribute of the module; it is only read here)
model.h[layers2hook[0]].attn._forward_hooks

# "Hook" the activations

In [None]:
text = 'This is an example sentence.'
# token ids, returned as a PyTorch tensor ('pt')
tokens = tokenizer.encode(text,return_tensors='pt')

# forward pass to trigger the hook
# (each hooked attention module calls its hook, which writes into `activations`)
outputs = model(tokens)

In [None]:
# activations are not in the outputs...
# (dir() lists the attribute names of the object returned by the model)
dir(outputs)

In [None]:
# ... they're stored in the dictionary
# (the hook writes one 'attn_<layer>_q' and one 'attn_output_<layer>' entry per hooked layer)
activations

In [None]:
# check stored activations
# note: qq refers to the tensor stored by the most recent forward pass; a later forward
#       pass stores a NEW tensor under the same key and does not update qq
qq = activations['attn_5_q']
print(f'Size of query matrix: {qq.shape}')

# Hooked data are replaced at each forward pass

In [None]:
# run again to see replacement

# what the dictionary holds before the new forward pass
print('First run:\n',activations['attn_5_q'])

# note: `tokens` and `outputs` are overwritten here with those of the new sentence
tokens = tokenizer.encode('This is a different sentence',return_tensors='pt')
outputs = model(tokens)

print('\nSecond run:\n',activations['attn_5_q']) # same dictionary key, but now holding the activations from this second forward pass

In [None]:
# see the next hook implantation for how to save all previous activations!

# Some visualizations

In [None]:
# visualization

# Re-read the stored activations so that they come from the same forward pass as `tokens`.
# A qq grabbed before the model was re-run on another sentence would still hold the
# activations of the earlier sentence, and the axis labels below would name the wrong tokens.
qq = activations['attn_5_q']
assert qq.shape[1]==tokens.shape[1], 'activations and tokens come from different forward passes'

# each dot is one Q dimension: its activation to token 3 (x-axis) vs. token 2 (y-axis)
plt.plot(qq[0,3,:],qq[0,2,:],'ko',markerfacecolor=[.9,.8,.7],alpha=.6)
plt.gca().set(xlabel=f'Activation to "{tokenizer.decode(tokens[0,3])}"',
              ylabel=f'Activation to "{tokenizer.decode(tokens[0,2])}"',
              title='Activations of Q neurons in layer 5')
plt.show()

In [None]:
# correlation matrix
# np.squeeze drops the singleton batch dimension, leaving one row per token;
# np.corrcoef treats each row as a variable, so the result is a token-by-token
# matrix of correlations computed across the Q dimensions
plt.imshow(np.corrcoef(np.squeeze(qq)),vmin=-.8,vmax=.8)

# label the image axes before the colorbar is added
plt.gca().set(xlabel='Token position',ylabel='Token position',
              title='Correlations between Q activations in layer 5')
plt.colorbar(label='Correlation coefficient')
plt.show()

# Removing hooks

In [None]:
# Demonstration: a removed hook no longer updates its dictionary entry.

# (message, dictionary key) pairs for the readout taken before the removal
before = (('Preserved hook before removal:\n','attn_5_q'),
          ('Removed hook before removal:\n','attn_7_q'))

# first six Q activations of the last token, as currently stored
for message,key in before:
  print(message,activations[key][0,-1,:6])

# take out the hook registered second (handles[1]); handles[0] stays implanted
handles[1].remove()

# push a new sentence through the model
tokens = tokenizer.encode('I wish coffee tasted like toothpaste',return_tensors='pt')
outputs = model(tokens)

# same readout after the removal
after = (('\nPreserved hook after removal:\n','attn_5_q'),
         ('Removed hook after removal:\n','attn_7_q'))

for message,key in after:
  print(message,activations[key][0,-1,:6])

In [None]:
# remove the other hook
# (handles[1] was removed in the previous cell, so no handle in `handles` is attached after this)
handles[0].remove()

# Appending instead of replacing activations

In [None]:
# hook the MLP's output
# (this rebinds `activations`: it was a dictionary in the earlier cells and is a list from here on)
activations = []

# note: don't need an "outer function" call here b/c we specify the hook layer below
def mlp_hook(module, inp, out):
  """Forward hook that appends the hooked module's output to the `activations` list.

  Args:
    module: the module the hook is attached to (unused).
    inp: tuple of positional inputs to the module (unused).
    out: the module's output tensor.

  Returns:
    None, so the module's output is passed on unchanged.
  """
  # store a detached tensor: the list grows with every forward pass, and a non-detached
  # output would keep the autograd graph of each of those passes alive
  activations.append(out.detach())

# hook the MLP in layer 4
# keep the returned handle so that this hook can be taken out again with mlp_handle.remove()
# (assigning it also means the handle is no longer echoed as the cell output)
mlp_handle = model.h[4].mlp.c_proj.register_forward_hook(mlp_hook)

In [None]:
# run some text through the model
# (the trailing ; suppresses display of the returned value; the hook still runs during the call)
model( tokenizer.encode('I like chocolate.',return_tensors='pt') );

In [None]:
# note: just a list, not a dictionary!
# (the hook appends one element each time the hooked module runs)
activations

In [None]:
# container type and number of stored elements, then the shape of every element
print(f'"activations" is a {type(activations)} that contains {len(activations)} elements \n')
for i,stored in enumerate(activations):
  print(f'Element {i} has shape {stored.shape}')

In [None]:
# run the model THREE more times (one forward pass per sentence; nothing is displayed)
more_sentences = (
  'I like chocolate.',
  'You know the shape my breath will take before I let it out.',
  'Four score and seven years ago.',
)

for sentence in more_sentences:
  model( tokenizer.encode(sentence,return_tensors='pt') )

In [None]:
# same report as before, now after the additional forward passes
print(f'"activations" is a {type(activations)} that contains {len(activations)} elements \n')
for i,shape in enumerate(stored.shape for stored in activations):
  print(f'Element {i} has shape {shape}')