|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>Activation maximization (code implementation)</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Tokenizer

# vector plots: ask the inline matplotlib backend to emit figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained GPT2 model and its tokenizer
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# move the model to the selected device
model = model.to(device)
model.eval() # we're training the embeddings, not the model!

# eval() does not disable gradient tracking, so freeze the weights explicitly:
# only the input embeddings are optimized, and without this every backward pass
# would also compute (and accumulate) unused gradients for all model parameters
model.requires_grad_(False)

# an independent copy of the original embeddings
# (.clone() is needed because .detach().cpu() alone shares memory with the
#  model weights whenever the model already lives on the CPU)
embeddings = model.wte.weight.detach().clone().cpu()

# Initialize a random embeddings pattern

In [None]:
# length of the token sequence
seq_len = 5

# standard deviation of the real embeddings matrix, as a plain Python float
embeddings_std = torch.std(embeddings).item()

# allocate the trainable embeddings (leaf tensor with gradient tracking);
# the values are filled in on the next line, so no random draw is needed here
optimized_embeddings = torch.empty((1, seq_len, embeddings.shape[1]), requires_grad=True, device=device)

# fill in-place with Gaussian noise whose std matches the real embeddings matrix
torch.nn.init.normal_(optimized_embeddings, mean=0.0, std=embeddings_std)

# check the shape
optimized_embeddings.shape

In [None]:
# histogram counts (y) and bin edges (x) for the real and the random embeddings
ye,xe = np.histogram(embeddings.flatten(),bins=80)
yo,xo = np.histogram(optimized_embeddings.flatten().detach().cpu(),bins=80)

# draw both distributions, each scaled so that its peak equals 1
# (counts are plotted against the left edge of each bin)
plt.figure(figsize=(10,4))
for edges,counts,label in [(xe,ye,'Embeddings matrix'),(xo,yo,'Random matrix')]:
  plt.plot(edges[:-1],counts/np.max(counts),linewidth=2,label=label)

plt.legend()
plt.gca().set(xlabel='Weight values',ylabel='Frequency (max-norm)',xlim=xe[[0,-1]])
# plt.yscale('log') # optional, gives a better appreciation of the tails
plt.show()

# How to input embeddings in the model

In [None]:
# target of the activation maximization
dim_idx = 91  # which hidden-state dimension to maximize
layer_idx = 8 # 8th transformer block with index 7
              # (presumably hidden_states[0] is the embeddings output -- confirm in the HF docs)

In [None]:
# how to use the maximized embeddings
# (this pass is only a demonstration and is never backpropagated, so skip
#  building the autograd graph to save memory)
with torch.no_grad():
  outputs = model(
      inputs_embeds = optimized_embeddings, # instead of input_ids
      output_hidden_states = True # request all activations exported
      )

# the output
print(f'Size of outputs.hidden_states: {len(outputs.hidden_states)}')
print(f'e.g., size of activation from layer {layer_idx}: {outputs.hidden_states[layer_idx].shape}')

# Now for training

In [None]:
# hyperparameters
n_steps   = 500  # number of optimization steps
lr        = 1e-3 # learning rate
lambda_l2 = 1e-2 # weight of the L2 penalty on the embeddings

# Adam updates only the embeddings tensor
optimizer = torch.optim.Adam(params=[optimized_embeddings], lr=lr)

In [None]:
# pre-allocate per-step records of the target activation and the gradient norm
activationVal = np.zeros(n_steps)
gradientNorm = np.zeros(n_steps)


# optimization loop
for step in range(n_steps):

  # reset the gradient left over from the previous step
  optimizer.zero_grad()

  # forward pass with the embeddings fed in directly (instead of token ids)
  outputs = model(inputs_embeds=optimized_embeddings, output_hidden_states=True)

  # activation of the target dimension, averaged over the token positions
  allActivations = outputs.hidden_states[layer_idx]
  dim_activation = allActivations[0,:,dim_idx].mean()

  # L2 regularization term: scaled sum of squared embedding values
  L2 = lambda_l2 * optimized_embeddings.pow(2).sum()

  # minimizing this loss maximizes the activation (minus the penalty)
  loss = L2 - dim_activation
  activationVal[step] = dim_activation.item()


  # backpropagate to get the gradient of the loss w.r.t. the embeddings
  loss.backward()

  # record the gradient norm before the update
  gradientNorm[step] = optimized_embeddings.grad.norm().item()

  # apply the parameter update
  optimizer.step()

  # periodic progress report, including the neighboring dimension for comparison
  if not step % 23:
    neighbor_activation = allActivations[0,:,dim_idx+1].mean().item()
    print(f'Step {step:4}/{n_steps}, Target activation: {activationVal[step]:6.2f} (vs. neighbor: {neighbor_activation:.2f})')

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3))

# one entry per panel: (values, marker color, y-axis label, title)
panels = [
    (activationVal,[.7,.6,.9],'Dimension activation','"Inverse loss" optimization'),
    (gradientNorm, [.9,.6,.7],'Embedding gradient norm','Norm of gradients'),
]

# left: activation magnitudes; right: gradient norms
for ax,(values,color,ylabel,title) in zip(axs,panels):
  ax.plot(values,'o',markersize=4,markerfacecolor=color,markeredgecolor='none')
  ax.set(xlabel='Training steps',ylabel=ylabel,title=title)

plt.show()

In [None]:
# redraw the histograms, now including the optimized embeddings

# histogram counts and bin edges of the optimized embeddings
yo2,xo2 = np.histogram(optimized_embeddings.flatten().detach().cpu(),bins=80)

# (bin edges, counts, legend label) for each distribution, in drawing order
curves = [
    (xe, ye, 'Embeddings matrix'),
    (xo, yo, 'Random matrix'),
    (xo2,yo2,'Optimized matrix'),
]

plt.figure(figsize=(10,4))
for edges,counts,label in curves:
  plt.plot(edges[:-1],counts/np.max(counts),linewidth=2,label=label)

plt.legend()
plt.gca().set(xlabel='Weight values',ylabel='Frequency (max-norm)',
              xlim=[-1,1])
plt.show()

In [None]:
# the optimized embeddings as a 2D array on the CPU, detached from the graph
embeddings_image = optimized_embeddings.squeeze().detach().cpu()

# show them as an image with a fixed, symmetric color range
plt.figure(figsize=(10,4))
plt.imshow(embeddings_image,origin='lower',aspect='auto',vmin=-.3,vmax=.3)

plt.gca().set(xlabel='Embedding dim.',ylabel='Token position')
plt.show()

# Find closest tokens

In [None]:
# the optimized embedding vector at the first token position
oneemb = optimized_embeddings[0][0].detach().cpu()

# cosine similarity with all embedding vectors
cs = F.cosine_similarity(oneemb.unsqueeze(0), embeddings)

# index of the token with max cossim, as a plain Python int
# (torch.argmax avoids relying on numpy's implicit handling of torch tensors)
maxtok = torch.argmax(cs).item()

# and visualize
plt.figure(figsize=(10,4))
plt.plot(cs.numpy(),'ko',markerfacecolor=[.9,.7,.8,.6])
plt.gca().set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Cosine similarity',
              title=f'Similarities to all token embeddings (top token is "{tokenizer.decode(maxtok)}")')
plt.show()

In [None]:
# decode embeddings to closest tokens
optimized_tokens = []

# detach and move to the CPU once, then loop over the token positions
for emb in optimized_embeddings[0].detach().cpu():

  # cosine similarity with embedding weights
  similarities = F.cosine_similarity(emb.unsqueeze(0), embeddings)

  # index of the most similar token, stored as a plain Python int
  # (torch.argmax avoids relying on numpy's implicit handling of torch tensors)
  maxtok = torch.argmax(similarities).item()
  optimized_tokens.append(maxtok)

print('Optimized token sequence:\n',tokenizer.decode(optimized_tokens))