|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>Activation maximization via data sampling</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np

import requests

In [None]:
# Pretrained GPT-Neo (125M parameters) from EleutherAI: tokenizer first
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')

# Use the first CUDA device when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Load the causal-LM weights and place them on the chosen device in one step
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)

# Switch to evaluation mode; as the last expression of the cell, the returned model is also displayed
model.eval()

# Import and process texts

In [None]:
# Alice's Adventures in Wonderland (Project Gutenberg ebook #11)
# A timeout and an explicit status check make the cell fail fast on a stalled
# connection or an HTTP error, instead of tokenizing an error page as the book.
response = requests.get('https://www.gutenberg.org/cache/epub/11/pg11.txt',timeout=30)
response.raise_for_status()
text = response.text

# encode the full text into one tensor of token ids
tokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# summary
print(f'Alice in Wonderland has  {len(tokens):7,} tokens.')

In [None]:
# Sampling configuration: each batch holds `batch_size` windows,
# and each window is `seq_len` consecutive tokens long
seq_len, batch_size = 256, 32

# One example

In [None]:
# Draw random window start positions for one batch.
# torch.randint excludes its upper bound, so the bound is len(tokens)-seq_len+1:
# that makes the last full-length window (start = len(tokens)-seq_len) reachable.
ix = torch.randint(len(tokens)-seq_len+1,size=(batch_size,))

# (batch_size,1) starts + (seq_len,) offsets broadcast to (batch_size,seq_len) token indices
X  = tokens[ix[:,None] + torch.arange(seq_len)].to(device)

# forward pass without gradient tracking, requesting the hidden-state activations
with torch.no_grad():
  outputs = model(X,output_hidden_states=True)

# number of hidden-state tensors returned, and the shape of one of them
len(outputs.hidden_states), outputs.hidden_states[4].shape

In [None]:
# Activation to inspect: `layer` indexes outputs.hidden_states,
# `dim` indexes the last axis of that hidden-state tensor
layer, dim = 2, 345

In [None]:
# Activations of the chosen layer/dimension, one value per token position in X.
# No .detach() is needed because the forward pass ran under torch.no_grad().
layeracts = outputs.hidden_states[layer][:,:,dim].cpu()

# Flat index of the largest activation, taken with the tensor's own argmax rather than
# handing a torch tensor to NumPy. layeracts and X share the same (batch,position)
# layout, so the same flat index selects the corresponding token id in X.
maxtokenidx = X.flatten()[ layeracts.argmax().item() ].item()
maxtoken = tokenizer.decode([maxtokenidx])

print(f'Token with max activation is "{maxtoken}"')

# Lots of examples

In [None]:
# (~2 min)

num_samples = 1000

# initialize the results: one max-activating token id per sampled batch
maxtokens = np.zeros(num_samples,dtype=int)

# loop over random batches (the model is only evaluated here; nothing is trained)
for sampli in range(num_samples):

  # create a batch of data
  # (+1 because torch.randint excludes its upper bound; this keeps the final window reachable)
  ix = torch.randint(len(tokens)-seq_len+1,size=(batch_size,))
  X  = tokens[ix[:,None] + torch.arange(seq_len)].to(device)

  # forward pass and hidden-state activations
  with torch.no_grad():
    outputs = model(X,output_hidden_states=True)

  # activations of the chosen layer/dimension for every token in the batch
  # (no .detach() needed under torch.no_grad())
  layeracts = outputs.hidden_states[layer][:,:,dim].cpu()

  # token id at the position of the largest activation; the tensor's own argmax
  # replaces np.argmax on a torch tensor, and X shares layeracts' flat ordering
  maxtokens[sampli] = X.flatten()[ layeracts.argmax().item() ].item()


In [None]:
# Tally how many batches each token id won (u: unique token ids, c: their counts)
u,c = np.unique(maxtokens,return_counts=True)

# indices that order the counts from most to least frequent
sidx = np.argsort(c)[::-1]

# report count, percentage of all samples, and the decoded token, most frequent first
for i in sidx:
  pct = c[i]*100/num_samples
  print(f'{c[i]:3} ({pct:4.1f}%) max-acts for token "{tokenizer.decode([u[i]])}"')