|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 6:</h2>|<h1>Intervention (causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>How to modify activations</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: replacing attention, MLP, and hidden states</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

import matplotlib_inline.backend_inline
# render inline figures in SVG format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# load the pretrained 'gpt2' model and its matching tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# switch the model to evaluation mode
model.eval()

# Exercise 1: Zero-out the third attention head in K

In [None]:
# tokenize the sentence; return_tensors='pt' returns a PyTorch tensor with a leading batch dimension
tokens = tokenizer.encode('I wonder how many tokens are in pomegranate.',return_tensors='pt')

# list every token index next to the text it decodes to
for t in tokens[0]:
  print(f'Token index {t.item():5} is "{tokenizer.decode(t)}"')

In [None]:
# some useful variables
nheads = model.config.n_head   # number of attention heads per layer
n_emb = model.config.n_embd    # embedding dimensionality
head_dim = n_emb // nheads     # number of embedding dimensions owned by each head

# find the start and end index of the 3rd head
whichHead_idx = 2
h3_start = whichHead_idx * head_dim
h3_end = h3_start + head_dim   # exclusive bound, so it can be used directly as a slice end

print(f'Attention head {whichHead_idx} starts at \nindex {h3_start} and ends at index {h3_end}.')

In [None]:
# initialize activations dictionary
activations = {}


def implant_hook(layer_number):
  """Create a forward hook that silences one attention head in the K vectors.

  The hook expects the output of a fused QKV projection, i.e. a tensor whose
  last dimension is the concatenation [Q | K | V], each of width `n_emb`.
  It relies on the notebook-level variables `n_emb`, `h3_start` and `h3_end`.

  Args:
    layer_number: index of the layer the hook is attached to. It is accepted
      for bookkeeping by the caller; the hook body itself does not use it.

  Returns:
    A function with the (module, input, output) signature required by
    `register_forward_hook`. Its return value replaces the module's output.
  """
  def hook(module, input, output):

    # split the output into QKV (each is [B,S,H])
    q,k,v = output.split(n_emb,dim=2)

    # make an editable copy of k vectors
    # (split returns views of `output`; editing a clone leaves the original untouched)
    k_copy = k.clone()

    # zero-out the data only from the specified head
    k_copy[:,:,h3_start:h3_end] = 0

    # recombine q with modified k and v
    QKV = torch.cat((q,k_copy,v),dim=2)

    # store the activations (detached from the graph so they can be converted to numpy)
    activations['qkv'] = QKV.detach().numpy()

    # output the QKV matrix so it replaces the original
    return QKV

  return hook


# attach the hook to the fused QKV projection (attn.c_attn) of the chosen transformer block
layer2modify = 3
hookHandle = model.transformer.h[layer2modify].attn.c_attn.register_forward_hook(implant_hook(layer2modify))

In [None]:
# confirm: run a forward pass so the hook fires and fills activations['qkv']
model(tokens)

# plot the Q, K and V segments of the stored matrix for one token (index 5);
# the zeroed head should appear as a flat run of zeros inside the K segment
plt.figure(figsize=(12,4))
plt.plot(range(n_emb),activations['qkv'][0,5,:n_emb],'ks',markerfacecolor=[.7,.7,.9,.5],label='Q')
plt.plot(range(n_emb,2*n_emb),activations['qkv'][0,5,n_emb:2*n_emb],'ko',markerfacecolor=[.7,.9,.7,.5],label='K')
plt.plot(range(2*n_emb,3*n_emb),activations['qkv'][0,5,2*n_emb:],'k^',markerfacecolor=[.9,.7,.7,.5],label='V')

plt.legend()
plt.gca().set(xlim=[-5,n_emb*3+4],xlabel='Index into QKV matrix',ylabel='Activation value',
              title=f'Activations to the token "{tokenizer.decode(tokens[0,5])}" in layer {layer2modify}')
plt.show()

In [None]:
# remove the hook (otherwise it keeps modifying K in every later forward pass)
hookHandle.remove()

# Exercise 2: Replace even-indexed MLP neurons with noise

In [None]:
# initialize activations dictionary
activations = {}

def hook(module, input, output):
  """Forward hook that overwrites every even-indexed unit of the last dimension with Gaussian noise.

  Args:
    module: the module the hook is attached to (unused).
    input: the module's inputs (unused).
    output: the module's output tensor, indexed as [batch, token, unit].

  Returns:
    The same `output` tensor after in-place modification; returning it
    replaces the module's output in the forward pass.
  """

  # create random noise of the same size
  noise = torch.randn_like(output)

  # Note: Because the modification is done directly on the tensor and not on a view of it,
  #       you can edit it in-place as shown below. Making a copy (as in the video) is also fine :D

  # replace the even-indexed units; odd-indexed units keep their original values
  output[:,:,::2] = noise[:,:,::2]

  # store the activations (after the replacement)
  activations['mlp'] = output.detach().numpy()

  # and return the modified version
  return output


# attach the hook to the MLP expansion layer (mlp.c_fc) of the same block used in Exercise 1
hookHandle = model.transformer.h[layer2modify].mlp.c_fc.register_forward_hook(hook)

In [None]:
# confirm: run a forward pass so the hook fires and fills activations['mlp']
model(tokens)

plt.figure(figsize=(12,4))

# separate the replaced (even) from the untouched (odd) neurons for one token (index 5)
mlpActs = activations['mlp'][0,5,:]
neuron_idx = np.arange(len(mlpActs))
plt.plot(neuron_idx[0::2],mlpActs[0::2],'ks',markerfacecolor=[.7,.7,.9,.5],label='Even (noise)')
plt.plot(neuron_idx[1::2],mlpActs[1::2],'ro',markerfacecolor=[.9,.7,.7,.5],label='Odd (original)')

plt.legend()
plt.gca().set(xlabel='MLP neuron index',ylabel='Activation value',
              title=f'MLP activations to the token "{tokenizer.decode(tokens[0,5])}"')
plt.show()

In [None]:
# remove the hook referenced by hookHandle so it no longer runs in later forward passes
hookHandle.remove()

# Exercise 3: Scale the hidden-state activations

In [None]:
# scaling factor
# (the hook looks this up each time it runs, so reassigning the variable later
#  changes the scaling without re-registering the hook)
scaling_factor = .1

def hook(module, input, output):
  """Forward hook that multiplies a block's hidden states by `scaling_factor`.

  Args:
    module: the module the hook is attached to (unused).
    input: the module's inputs (unused).
    output: the module's output; element 0 holds the hidden states and any
      further elements are passed through unchanged.

  Returns:
    A tuple with the scaled hidden states followed by the remaining elements
    of `output`; it replaces the module's output in the forward pass.
  """

  # extract the hidden states
  hs = output[0]

  # scaling via matrix-scalar multiplication (creates a new tensor; the original is not edited)
  hs = hs * scaling_factor

  # reconstruct and output
  return (hs,*output[1:])

# attach the hook to the transformer block at index 8
hookHandle = model.transformer.h[8].register_forward_hook(hook)

In [None]:
# confirm: request the hidden states from every layer
out = model(tokens,output_hidden_states=True)

hs = out.hidden_states
print(f'There are {len(hs)} hidden_states.')
print(f'Each hidden state is of size {list(hs[0].shape)}')

In [None]:
_,axs = plt.subplots(1,2,figsize=(10,3.5))

for i in range(len(hs)):

  # data from this transformer block for one token (token #4 of the first sequence)
  thisBlock = hs[i][0,4,:].detach().numpy()

  # plot all the data (every activation value at x-position i)
  axs[0].plot(np.full(thisBlock.shape,i),thisBlock,'ko',markerfacecolor=[.7,.7,.9],alpha=.3)

  # plot the norm
  axs[1].plot(i,np.linalg.norm(thisBlock),'ks',markerfacecolor=[.9,.7,.7],markersize=8)

axs[0].set(xlabel='Hidden state layer',ylabel='Activation value',title='Hidden state activations for token #4')
axs[1].set(xlabel='Hidden state layer',ylabel='Matrix norm',title='Hidden state norms from token #4')

plt.tight_layout()
plt.show()

# Exercise 4: Scale up

In [None]:
# now scale by 10x
# (the hook from Exercise 3 is still attached and reads scaling_factor when it runs)
scaling_factor = 10
out = model(tokens,output_hidden_states=True)
hs = out.hidden_states


_,axs = plt.subplots(1,2,figsize=(10,3.5))

for i in range(len(hs)):

  # data from this transformer block for one token (token #4 of the first sequence)
  thisBlock = hs[i][0,4,:].detach().numpy()

  # plot all the data (every activation value at x-position i)
  axs[0].plot(np.full(thisBlock.shape,i),thisBlock,'ko',markerfacecolor=[.7,.7,.9],alpha=.3)

  # plot the norm
  axs[1].plot(i,np.linalg.norm(thisBlock),'ks',markerfacecolor=[.9,.7,.7],markersize=8)

axs[0].set(xlabel='Hidden state layer',ylabel='Activation value',title='Hidden state activations')
axs[1].set(xlabel='Hidden state layer',ylabel='Vector norm',title='Hidden state norms')

plt.tight_layout()
plt.show()

In [None]:
# remove the hook referenced by hookHandle so it no longer runs in later forward passes
hookHandle.remove()

# Bonus! Example of output variable with more than just transformer outputs

In [None]:
# just a hook to print

def hook(module, input, output):
  """Report the type and length of `output`, then the shape of each element.

  Nothing is returned, so the module's output is left as it is.
  """

  # summary line for the container itself
  n_elements = len(output)
  print(f'output is type {type(output)} and has {n_elements} element(s).')

  # one line per element with its size as a plain list
  for i, element in enumerate(output):
    print(f'Element {i} has size {list(element.shape)}')

# attach the printing hook to the transformer block at index 8
hookHandle = model.transformer.h[8].register_forward_hook(hook)

In [None]:
# a batch of three sentences
text = [ 'Here is the first sentence', 'Here is another one of a different length.', 'Shall we go for three?' ]
# use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# tokenize the batch with padding enabled, returning PyTorch tensors
# NOTE: this rebinds `tokens`, which earlier cells assigned from tokenizer.encode(...)
tokens = tokenizer(text,padding=True,return_tensors='pt')
tokens

In [None]:
# forward pass on the batch (the hook prints its report); the trailing ; suppresses the cell output
model(**tokens);

In [None]:
# repeat the forward pass with attention weights and hidden states requested via the config,
# to see how that changes what the hook reports about `output`
model.config._attn_implementation = 'eager'
model.config.output_attentions = True
model.config.output_hidden_states = True
model(**tokens);

# remove the printing hook, as done at the end of the other exercises
hookHandle.remove()