|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying latent factors</h1>|
|<h2>Lecture:</h2>|<h1><b>Generalized eigendecomposition separates "him" from "her" in MLP</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# for the generalized eigendecomposition
import scipy.linalg

import torch
from transformers import GPT2Model, GPT2Tokenizer

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained model and its tokenizer from the same checkpoint
checkpoint = 'gpt2'
model = GPT2Model.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

# Hook the MLP

In [None]:
# container that the forward hook fills in on every forward pass
activations = {}

def mlp_hook(module, input, output):
  """Forward hook: store the hooked module's output in activations['mlp'].

  The output tensor is detached from the autograd graph and converted to a
  NumPy array; `module` and `input` are supplied by PyTorch but not used.
  """
  detached = output.detach()
  activations['mlp'] = detached.numpy()

# attach the hook to the `c_fc` submodule of the MLP in the middle transformer block
layer2hook = model.config.n_layer//2
hooked_module = model.h[layer2hook].mlp.c_fc
handle = hooked_module.register_forward_hook(mlp_hook)

In [None]:
# generated by Claude.ai
# Each sentence contains the target token " him" exactly once.
him_versions = [
    "I saw him at the market.",
    "She gave him the book.",
    "They asked him for advice.",
    "We invited him to dinner.",
    "The dog followed him home.",
    "They asked him to join.",
    "He saw him at the park yesterday.",
    "Did you give him your address?",
    "I haven't seen him in ages.",
    "I told him the truth.",
    "They congratulated him on his success.",
    "She recognized him immediately.",
    "The teacher praised him for his work.",
    "I met him last summer.",
    "The child hugged him tightly.",
    "They warned him about the danger.",
    "She drove him to the airport.",
    "We waited for him for hours.",
    "The cat scratched him accidentally.",
    "They surprised him with a gift.",
    "She called him on the phone.",
    "The jury found him not guilty.",
    "I remembered him from school.",
    "They elected him as president.",
    "She forgave him for his mistake.",
    "The police questioned him yesterday.",
    "I helped him with his homework.",
    "They spotted him in the crowd.",
    "She visited him in the hospital.",
    "The manager promoted him last week.",
    "I trusted him completely.",
    "They respected him for his honesty.",
    "She taught him how to swim.",
    "The bird attacked him suddenly.",
    "I greeted him warmly.",
    "They supported him through difficult times.",
    "She ignored him at the party.",
    "The judge sentenced him to community service.",
    "I photographed him during the event.",
    "They believed him despite the evidence.",
    "She surprised him on his birthday.",
    "The guard stopped him at the entrance.",
    "I missed him terribly.",
    "They watched him leave the building.",
    "She accompanied him to the concert.",
    "The crowd cheered him enthusiastically.",
    "I described him to the police.",
    "They thanked him for his help.",
    "She admired him for his courage.",
    "The committee nominated him for the award.",
    "I married him last spring.",
    "They informed him about the changes.",
    "She introduced him to the parents.",
    "The author based the character on him.",
]

## same sentences but with "her"
# (only the target word is swapped; everything else, including any "his", stays as is)
her_versions = [s.replace(' him',' her') for s in him_versions]

# full list: all "him" sentences first, then the matching "her" sentences
sentences = him_versions + her_versions

# indices of him/her sentences
nPerGroup = len(sentences)//2
him_sentences = np.arange(nPerGroup)
her_sentences = np.arange(nPerGroup,len(sentences))

print(f'There are {len(sentences)} sentences.')

In [None]:
# token IDs of the two target words (note the leading space in each string)
target_token_him,target_token_her = (tokenizer.encode(word) for word in (' him',' her'))
print(f'The target token indices are {target_token_him} and {target_token_her}\n')

# a padding token must be defined before padding; reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# tokenize all sentences into one padded batch of PyTorch tensors
tokens = tokenizer(sentences,padding=True,return_tensors='pt')

# padded sequence length (number of columns of the batch)
seqlength = tokens['input_ids'].shape[1]

# show the token IDs and the attention mask of one example sentence
for field in ('input_ids','attention_mask'):
  print(tokens[field][10])

# Get activations for all "him" and "her" targets

In [None]:
# Run one forward pass over the whole batch with gradient tracking disabled.
# The model's return value is not kept: the purpose of the call is to trigger the
# registered forward hook, which writes the hooked output into activations['mlp'].
with torch.no_grad():
  model(**tokens)

In [None]:
# the last axis of the hooked activations indexes the MLP hidden units
mlp_shape = activations['mlp'].shape
nhidden = mlp_shape[-1]
print(f'There are {nhidden} hidden units.')

# display the full shape of the activations array
mlp_shape

In [None]:
# get target activations: one row per sentence, taken at the position of the target token

# token IDs that mark the target position (tokenizer.encode returned lists, so join them)
target_ids = np.concatenate((target_token_him,target_token_her))

# convert the token tensor once instead of on every loop iteration
input_ids = tokens['input_ids'].numpy()

acts = np.zeros((len(sentences),nhidden))

for senti in range(len(sentences)):

  # find the index of either of the target tokens
  targidx = np.where(np.isin(input_ids[senti],target_ids))[0]

  # each sentence must contain exactly one target; fail loudly otherwise instead of
  # relying on a shape mismatch in the assignment below
  if len(targidx)!=1:
    raise ValueError(f'Sentence {senti} has {len(targidx)} target tokens (expected exactly 1): "{sentences[senti]}"')

  # then get the activation (all hidden units at the target position)
  acts[senti,:] = activations['mlp'][senti,targidx[0],:]

acts.shape

# Some data visualizations

In [None]:
# heatmap of the target-token activations (rows: sentences, columns: hidden units)
fig,ax = plt.subplots(figsize=(12,5))
img = ax.imshow(acts,aspect='auto',vmin=-2,vmax=2)
ax.set(xlabel='MLP hidden dimension ("neurons")',ylabel='Sentence',title='MLP activations for "him" and "her"')
fig.colorbar(img,ax=ax,pad=.02)
plt.show()

In [None]:
# covariance matrices
himcov = np.cov(acts[him_sentences,:].T)
hercov = np.cov(acts[her_sentences,:].T)

_,axs = plt.subplots(1,3,figsize=(12,5))
axs[0].imshow(himcov[::10,::10],extent=[0,nhidden,nhidden,0],vmin=-.1,vmax=.1)
axs[0].set(title='Covariance matrix for "him"',xlabel='MLP hidden unit',ylabel='MLP hidden unit')

axs[1].imshow(hercov[::10,::10],extent=[0,nhidden,nhidden,0],vmin=-.1,vmax=.1)
axs[1].set(title='Covariance matrix for "her"',xlabel='MLP hidden unit',ylabel='MLP hidden unit')

axs[2].imshow(himcov[::10,::10]-hercov[::10,::10],extent=[0,nhidden,nhidden,0],vmin=-.1,vmax=.1)
axs[2].set(title='Difference of the matrices',xlabel='MLP hidden unit',ylabel='MLP hidden unit')

plt.tight_layout()
plt.show()

# Try a GED on the full matrices...

In [None]:
# NOTE: intentionally left commented out.
# Each covariance matrix is nhidden x nhidden but is estimated from only
# len(sentences)//2 sentences, so it is presumably rank-deficient. scipy.linalg.eigh
# needs the second matrix to be positive definite, so this call is expected to raise
# an error. Uncomment it to see the failure; the two-stage (PCA->GED) approach in the
# following cells works around it.
# evals,evecs = scipy.linalg.eigh(himcov,hercov)

# Two-stage (PCA->GED)

In [None]:
# PCA of the average covariance
d,V = scipy.linalg.eigh( (himcov+hercov)/2 )

# sort the values and vectors
idx = d.argsort()[::-1]
d = d[idx]
V = V[:,idx] # sort the columns, not the rows!

In [None]:
# transform the eigenvalues to cumulative % variance explained
varExplained = d*100/np.sum(d)
cumVarExplained = np.cumsum(varExplained)

# how many components to explain 99% of the variability?
numComps2keep = np.where(cumVarExplained>99)[0][0]


# and visualize
_,axs = plt.subplots(1,2,figsize=(11,4))
axs[0].plot(varExplained,'ko-',markerfacecolor=[.9,.7,.7])
axs[0].axvline(x=numComps2keep,color='k',linestyle='--')
axs[0].set(xlim=[-1,numComps2keep+10],xlabel='PC (sorted)',ylabel='Variance explained (%)',title='PCA of covariance average')

axs[1].plot(cumVarExplained,'ko-',markerfacecolor=[.9,.7,.9])
axs[1].axvline(x=numComps2keep,color='k',linestyle='--')
axs[1].set(xlim=[-1,numComps2keep+10],xlabel='PC (sorted)',ylabel='Cumulative variance explained (%)',
           title=f'Cumulative variance explained (99% after {numComps2keep} components)')

plt.tight_layout()
plt.show()

In [None]:
# project down to numComps2keep dimensions
acts_lowD = acts @ V[:,:numComps2keep]
acts_lowD.shape

In [None]:
# covariance matrices (low-D this time)
himLowDcov = np.cov(acts_lowD[him_sentences,:].T)
herLowDcov = np.cov(acts_lowD[her_sentences,:].T)

_,axs = plt.subplots(1,3,figsize=(12,5))
axs[0].imshow(himLowDcov,vmin=-1,vmax=1)
axs[0].set(title='Covariance matrix for "him"',xlabel='PC dimension',ylabel='PC dimension')

axs[1].imshow(herLowDcov,vmin=-1,vmax=1)
axs[1].set(title='Covariance matrix for "her"',xlabel='PC dimension',ylabel='PC dimension')

axs[2].imshow(himLowDcov-herLowDcov,vmin=-1,vmax=1)
axs[2].set(title='Difference of the matrices',xlabel='PC dimension',ylabel='PC dimension')

plt.tight_layout()
plt.show()

In [None]:
# shrinkage regularization

# regularization amount
regu_gam = .01

def shrink_cov(covmat,gamma):
  """Shrink a symmetric covariance matrix towards a scaled identity matrix.

  Returns (1-gamma)*covmat + gamma*mean(eigenvalues)*I.

  The eigenvalues come from np.linalg.eigvalsh, which is meant for symmetric
  matrices and returns real values. The general-purpose np.linalg.eig can return
  complex numbers with tiny imaginary parts, which would then have to be cast away.
  """
  meanEig = np.mean(np.linalg.eigvalsh(covmat))
  return (1-gamma)*covmat + gamma*meanEig*np.eye(covmat.shape[0])

himLowDcovS = shrink_cov(himLowDcov,regu_gam)
herLowDcovS = shrink_cov(herLowDcov,regu_gam)


# examine the impact
print(f'Ranks of original matrices (size: {himLowDcov.shape})')
print(f'"him": {np.linalg.matrix_rank(himLowDcov)}, "her": {np.linalg.matrix_rank(herLowDcov)}\n')

print(f'Ranks of regularized matrices  (size: {himLowDcovS.shape}):')
print(f'"him": {np.linalg.matrix_rank(himLowDcovS)}, "her": {np.linalg.matrix_rank(herLowDcovS)}\n\n\n')

In [None]:
# generalized eigendecompositions (GED) in both directions;
# the first matrix is the unregularized covariance, the second the shrinkage-regularized one

# HIM>HER: decompose, then sort by descending eigenvalue (eigenvectors are columns)
evalsHim,evecsHim = scipy.linalg.eigh(himLowDcov,herLowDcovS)
idx = np.argsort(evalsHim)[::-1]
evalsHim,evecsHim = evalsHim[idx],evecsHim[:,idx]

# HER>HIM: same procedure with the roles of the two matrices swapped
evalsHer,evecsHer = scipy.linalg.eigh(herLowDcov,himLowDcovS)
idx = np.argsort(evalsHer)[::-1]
evalsHer,evecsHer = evalsHer[idx],evecsHer[:,idx]


# eigenspectra, each scaled by its own maximum; the second curve is shifted by .2 on the
# x-axis so the markers do not overlap
compidx = np.arange(numComps2keep)
fig,ax = plt.subplots(figsize=(7,3))
ax.plot(evalsHim/evalsHim.max(),'ko-',markerfacecolor=[.9,.7,.7],label='him > her')
ax.plot(compidx+.2,evalsHer/evalsHer.max(),'ks-',markerfacecolor=[.7,.9,.7],label='her > him')
ax.legend()
ax.set(xlim=[-1,21],xlabel='GED component (sorted)',ylabel='Eigenvalue (max-norm)',
       xticks=range(0,20,2),title='GED eigenspectra')
plt.show()

In [None]:
# one value per sentence: projection of the low-D activations onto the top vector of each GED
ged_proj_him = acts_lowD @ evecsHim[:,0]
ged_proj_her = acts_lowD @ evecsHer[:,0]

# one panel per GED; in each, "him" and "her" sentences get different markers
panels = (
  (ged_proj_him,'GED of him > her'),
  (ged_proj_her,'GED of her > him'),
)

fig,axs = plt.subplots(1,2,figsize=(10,3))
for ax,(proj,panel_title) in zip(axs,panels):
  ax.plot(him_sentences,proj[him_sentences],'ko',markerfacecolor=[.9,.7,.7],label='him')
  ax.plot(her_sentences,proj[her_sentences],'ks',markerfacecolor=[.7,.9,.7],label='her')
  ax.set(xlim=[-1,len(sentences)+2],xlabel='Sentence',ylabel='Activation',title=panel_title)
  ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# patterns: regularized covariance times the top GED eigenvector,
#  mapped from PC space into MLP space through the retained PCA eigenvectors
pcBasis = V[:,:numComps2keep]
mlpVect_him = pcBasis @ himLowDcovS @ evecsHim[:,0]
mlpVect_her = pcBasis @ herLowDcovS @ evecsHer[:,0]

# correlation between the two patterns across hidden units
pattern_r = np.corrcoef(mlpVect_him,mlpVect_her)[0,1]

# scatter one pattern against the other (one dot per hidden unit)
fig,ax = plt.subplots(figsize=(5,5))
ax.plot(mlpVect_him,mlpVect_her,'k.',markerfacecolor=[.9,.9,.7,.4],markersize=7)
ax.set(xlabel='"him" axis',ylabel='"her" axis',
       title=f'MLP pattern (r = {pattern_r:.2f})')
plt.show()

# Random permutation to explore the statistical validity

In [None]:
# randomize the sentence order to create shuffled (fake) sentence labels
# (unseeded on purpose, so every run draws a new permutation; seed NumPy if an
#  exactly reproducible figure is needed)
perm_labels = np.random.permutation(len(sentences))
nHalf = len(sentences)//2


# permuted covariance matrices: first half of the shuffled sentences vs. second half
himProjcovP = np.cov(acts_lowD[perm_labels[:nHalf],:].T)
herProjcovP = np.cov(acts_lowD[perm_labels[nHalf:],:].T)

# Same pipeline as the real-label GED: S is left unregularized and only R is shrunk.
# eigvalsh is used because the matrix is symmetric and its eigenvalues are then
# guaranteed to be real (np.linalg.eig can return complex values).
S = himProjcovP
R = (1-regu_gam)*herProjcovP + regu_gam*np.mean(np.linalg.eigvalsh(herProjcovP))*np.eye(numComps2keep)

# eig and sort
evalsP,evecsP = scipy.linalg.eigh(S,R)
idx = evalsP.argsort()[::-1]
evalsP = evalsP[idx]
evecsP = evecsP[:,idx] # sort the columns, not the rows!

# project the data onto the top GED vector
ged_proj_himP = acts_lowD @ evecsP[:,0]


# plotting
_,axs = plt.subplots(1,2,figsize=(12,3.5))

axs[0].plot(evalsP,'ko-',markerfacecolor=[.9,.7,.7])
axs[0].set(xlim=[-1,21],xlabel='GED component (sorted)',ylabel='Eigenvalue',title='Random permutation GED component')

# visualize the token activations (markers still show the TRUE him/her labels)
axs[1].plot(him_sentences,ged_proj_himP[him_sentences],'ko',markerfacecolor=[.9,.7,.7],label='him')
axs[1].plot(her_sentences,ged_proj_himP[her_sentences],'ks',markerfacecolor=[.7,.9,.7],label='her')
axs[1].set(xlim=[-1,len(sentences)+2],xlabel='Sentence',ylabel='Activation',title='GED of him > her')
axs[1].legend()

plt.tight_layout()
plt.show()

# Per-token activations in HIM and HER components

In [None]:
# number of him/her sentence pairs; the "her" version of sentence i is at index i+nPairs
nPairs = len(sentences)//2

# pick a sentence index between 0 and nPairs-1
sidx = 2

# get the activations for all neurons from this sentence (all token positions)
acts4him = activations['mlp'][sidx,:,:]
acts4her = activations['mlp'][sidx+nPairs,:,:]

# project onto original GED vectors (via PCA projection)
pcBasis = V[:,:numComps2keep]
loD_projs_him_HimGED = acts4him @ pcBasis @ evecsHim[:,0]
loD_projs_her_HimGED = acts4her @ pcBasis @ evecsHim[:,0]

loD_projs_him_HerGED = acts4him @ pcBasis @ evecsHer[:,0]
loD_projs_her_HerGED = acts4her @ pcBasis @ evecsHer[:,0]


# x-axis: one position per token of the padded sequence, labeled with the decoded
# tokens of the "him" sentence (padding positions included)
tokenPos = np.arange(seqlength)
tokenLabels = [tokenizer.decode(t) for t in tokens['input_ids'][sidx]]

# one panel per GED filter
panels = (
  (loD_projs_him_HimGED,loD_projs_her_HimGED,'GED projections for HIM GED'),
  (loD_projs_him_HerGED,loD_projs_her_HerGED,'GED projections for HER GED'),
)

_,axs = plt.subplots(1,2,figsize=(12,3))
for ax,(projHim,projHer,panel_title) in zip(axs,panels):

  # the two sentences are offset slightly so their markers do not overlap
  ax.plot(tokenPos-.1,projHim,'ko',label='him sentence')
  ax.plot(tokenPos+.1,projHer,'rs',label='her sentence')

  # thin line connecting the two sentences at each token position
  for i in range(seqlength):
    ax.plot([i-.1,i+.1],[projHim[i],projHer[i]],'k',zorder=-3)

  ax.legend()

  # tick count follows the padded sequence length so it always matches the labels
  ax.set(xticks=range(seqlength),xticklabels=tokenLabels,title=panel_title)

plt.suptitle('Token-level activations in GED filters',fontweight='bold')
plt.tight_layout()
plt.show()