|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Context-modulated activation in MLP</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import requests
import textwrap

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Exercise 1: Import the model and the nouns, implant the hook

In [None]:
# choose the compute device first: the GPU when one is visible, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# EleutherAI's tokenizer for GPT-Neo
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')

# load pretrained GPT-Neo, place it on the chosen device, and switch to eval mode
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
model = model.to(device)
model.eval();

# number of MLP 'expansion' units (first dimension of the c_fc weight matrix in block 8)
nneurons = model.transformer.h[8].mlp.c_fc.weight.size(0)

In [None]:
# container the hook writes into: one entry per hooked layer,
# replaced on every forward pass through that layer
activations = {}

def implant_hook(layer_number):
  """Build a forward hook that records the c_fc output of an MLP module.

  The returned function applies `module.c_fc` to the first positional input
  of the hooked module and stores the result in the global `activations`
  dictionary under the key f'mlp_{layer_number}_x'. It returns None, so the
  module's own output is left untouched.
  """
  store_key = f'mlp_{layer_number}_x'

  def hook(module, inputs, output):
    # first positional input -> c_fc projection
    # (expected to be [batch, seq, 4xembed_dim] when hooked onto the MLP)
    activations[store_key] = module.c_fc(inputs[0])

  return hook


# pick the layer to hook
layer2hook = 8
hookName = f'mlp_{layer2hook}_x'

# If this cell was already run, detach the previous hook first. Otherwise every
# re-run stacks another hook on the model, and changing `layer2hook` would leave
# the old layer's hook attached as well.
if 'hook_handle' in globals():
  hook_handle.remove()

# surgery ;)  (keep the handle so the hook can be removed later via hook_handle.remove())
hook_handle = model.transformer.h[layer2hook].mlp.register_forward_hook(implant_hook(layer2hook))

In [None]:
# import the nouns
url = 'https://raw.githubusercontent.com/david47k/top-english-wordlists/refs/heads/master/top_english_nouns_lower_10000.txt'

# fail loudly on a bad download (HTTP error or stalled connection) instead of
# silently treating an error page as the word list
response = requests.get(url, timeout=30)
response.raise_for_status()

# split into lines, drop blank entries (an empty string would tokenize to an
# empty sequence), and keep the first 100 words
nouns = [line.strip() for line in response.text.split('\n') if line.strip()][:100]

# Exercise 2: Activations for words with and without spaces

In [None]:
# storage for the activations: [no-space / with-space, word, neuron]
all_activations = np.zeros((2,len(nouns),nneurons))


# loop over the words
for i,word in enumerate(nouns):

  # variant 0: the bare word; variant 1: the same word with a preceding space
  for variant,text in enumerate((word,f' {word}')):

    # forward pass (the hook captures the MLP activations as a side effect)
    token_ids = tokenizer.encode(text,return_tensors='pt').to(device)
    with torch.no_grad():
      model(token_ids)

    # average over the token dimension, then store as a numpy vector
    token_mean = activations[hookName].mean(dim=1).squeeze()
    all_activations[variant,i,:] = token_mean.detach().cpu().numpy()



In [None]:
# convenience variables (one long vector per condition)
noSpace  = all_activations[0].flatten()
yesSpace = all_activations[1].flatten()

# dot colors: square root of the absolute difference, scaled so the maximum is 1
diffs = np.sqrt(np.abs(noSpace-yesSpace))
diffs /= diffs.max()


_,axs = plt.subplots(1,2,figsize=(12,5))

# left panel: scatter plot of the paired activations
r_value = np.corrcoef(noSpace,yesSpace)[0,1]
axs[0].scatter(noSpace,yesSpace,s=50,c=diffs,alpha=.5,cmap=mpl.cm.plasma_r)
axs[0].set(xlabel='No space',ylabel='With space',title=f'Correlation r = {r_value:.3f}')


# right panel: density histograms drawn as lines (x-values are the left bin edges)
for values,label in ((yesSpace,'With space'),(noSpace,'No space')):
  density,edges = np.histogram(values,bins=100,density=True)
  axs[1].plot(edges[:-1],density,linewidth=2,label=label)

axs[1].legend()
axs[1].set(xlim=[-4,4],xlabel='Activation value',ylabel='Density',title='Distributions of MLP activations')

plt.tight_layout()
plt.show()

# Exercise 3: Get activations from generated tokens

In [None]:
# tokenize the prompt and let the model continue it with sampling
prompt = 'I think the world could be better if'
prompt_ids = tokenizer.encode(prompt,return_tensors='pt').to(device)
gentoks = model.generate(prompt_ids, max_length=200, do_sample=True)

# let's see what the model thinks :o  (wrapped to 60 characters per line)
generated_text = tokenizer.decode(gentoks[0])
print(textwrap.fill(generated_text,width=60))

In [None]:
# push the entire generated sequence through the model in one pass;
# the hook captures the MLP activations as a side effect
with torch.no_grad():
  model(gentoks)

# size of the captured activations tensor
activations[hookName].shape

In [None]:
# hold on to the full-text activations (on the CPU) before later forward passes replace the dictionary entry
fulltext_activations = activations[hookName].to('cpu')

# Exercise 4: Compare in-text and individual tokens

In [None]:
# storage: [0 = taken from the full-text pass / 1 = taken from single-token passes, neuron, token]
allacts = np.zeros((2,nneurons,len(gentoks[0])))

for ti,tok in enumerate(gentoks[0]):

  # forward pass for just this token, explicitly shaped [batch=1, seq=1]
  with torch.no_grad(): model(tok.reshape(1,1).to(device))

  # get the two activations
  allacts[0,:,ti] = fulltext_activations[0,ti,:].numpy()

  # The single-token pass has a sequence length of 1, so its only position is
  # index 0. Indexing with `ti` here raises an IndexError for every ti >= 1.
  allacts[1,:,ti] = activations[hookName][0,0,:].cpu().numpy()



In [None]:
# one long vector per condition
fromText = allacts[0].flatten()
fromToks = allacts[1].flatten()

# dot colors: square root of the absolute difference, scaled so the maximum is 1
diffs = np.sqrt(np.abs(fromText-fromToks))
diffs /= diffs.max()


_,axs = plt.subplots(1,2,figsize=(12,5))

# left panel: scatter plot of the paired activations
r_value = np.corrcoef(fromText,fromToks)[0,1]
axs[0].scatter(fromText,fromToks,s=50,c=diffs,alpha=.5,cmap=mpl.cm.plasma_r)
axs[0].set(xlabel='From text',ylabel='Individual tokens',title=f'Correlation r = {r_value:.3f}')


# right panel: density histograms drawn as lines (x-values are the left bin edges)
curves = (
  (fromToks,'Individual tokens'),
  (fromText,'From text'),
  (fromToks-fromText,'Difference'),
)
for values,label in curves:
  density,edges = np.histogram(values,bins=100,density=True)
  axs[1].plot(edges[:-1],density,linewidth=2,label=label)

axs[1].legend()
axs[1].set(xlim=[-7,7],xlabel='Activation value',ylabel='Density',title='Distributions of MLP activations')

plt.tight_layout()
plt.show()