|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying circuits and components</h1>|
|<h2>Lecture:</h2>|<h1><b>Are circuits clustered in low-dimensional space?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN,KMeans

import scipy.stats as stats

import torch
from transformers import GPT2Model, GPT2Tokenizer

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained tokenizer and model
# NOTE(review): the tokenizer is loaded from 'gpt2' but the model from 'gpt2-medium';
# presumably both use the same vocabulary -- confirm
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2-medium')

# number of MLP neurons = size of the 2nd dimension of the c_fc weights (read from block 3)
nneurons = model.h[3].mlp.c_fc.weight.size(1)

# switch to evaluation mode (kept as the last expression of the cell)
model.eval()

# Implanting a hook in the model

In [None]:
# storage for the hooked MLP (c_fc) outputs; keys have the form 'mlp_<layer number>'
activations = {}

def implant_hook(layer_number):
  """
  Create a forward hook that stores a module's output in `activations`.

  Parameters
  ----------
  layer_number : used to build the dictionary key f'mlp_{layer_number}'

  Returns
  -------
  hook : function with the (module, input, output) signature expected by
         register_forward_hook; it returns None, so the module output is unchanged.
  """
  def hook(module, input, output):

    # for development, not for application :P
    print(output.shape)

    # store a detached copy of the reference so that the dictionary never
    # keeps an autograd graph alive (e.g., if the model is run without no_grad)
    activations[f'mlp_{layer_number}'] = output.detach()

  return hook

# surgery ;)
# the hook goes onto the mlp.c_fc module of one transformer block
whichlayer = 5
hookedModule = model.h[whichlayer].mlp.c_fc
hookedModule.register_forward_hook(implant_hook(whichlayer))

# dictionary key under which the hook stores the output of this layer
keyName = f'mlp_{whichlayer}'

# Forward pass and get activations

In [None]:
# generated by Claude.ai
# base sentences; each one contains the target word "him" exactly once
himList = [
    "I saw him at the market.",
    "She gave him the book.",
    "They asked him for advice.",
    "We invited him to dinner.",
    "The dog followed him home.",
    "They asked him to join.",
    "He saw him at the park yesterday.",
    "Did you give him your address?",
    "I haven't seen him in ages.",
    "I told him the truth.",
    "They congratulated him on his success.",
    "She recognized him immediately.",
    "The teacher praised him for his work.",
    "I met him last summer.",
    "The child hugged him tightly.",
    "They warned him about the danger.",
    "She drove him to the airport.",
    "We waited for him for hours.",
    "The cat scratched him accidentally.",
    "They surprised him with a gift.",
    "She called him on the phone.",
    "The jury found him not guilty.",
    "I remembered him from school.",
    "They elected him as president.",
    "She forgave him for his mistake.",
    "The police questioned him yesterday.",
    "I helped him with his homework.",
    "They spotted him in the crowd.",
    "She visited him in the hospital.",
    "The manager promoted him last week.",
    "I trusted him completely.",
    "They respected him for his honesty.",
    "She taught him how to swim.",
    "The bird attacked him suddenly.",
    "I greeted him warmly.",
    "They supported him through difficult times.",
    "She ignored him at the party.",
    "The judge sentenced him to community service.",
    "I photographed him during the event.",
    "They believed him despite the evidence.",
    "She surprised him on his birthday.",
    "The guard stopped him at the entrance.",
    "I missed him terribly.",
    "They watched him leave the building.",
    "She accompanied him to the concert.",
    "The crowd cheered him enthusiastically.",
    "I described him to the police.",
    "They thanked him for his help.",
    "She admired him for his courage.",
    "The committee nominated him for the award.",
    "I married him last spring.",
    "They informed him about the changes.",
    "She introduced him to the parents.",
    "The author based the character on him.",
]

## same sentences but with "her"
# only the target word is swapped; everything else (including "his") stays as it is
herList = [ s.replace(' him',' her') for s in himList ]

# full list: all "him" sentences first, then all "her" sentences
sentences = himList + herList

# indices of him/her sentences
nPerGroup = len(sentences)//2
him_sentences = np.arange(nPerGroup)
her_sentences = np.arange(nPerGroup,len(sentences))

print(f'There are {len(sentences)} sentences.')

In [None]:
# need to specify a padding token
tokenizer.pad_token = tokenizer.eos_token

# identify the target tokens (note the preceding space in both words)
target_token_him = tokenizer.encode(' him')
target_token_her = tokenizer.encode(' her')
print(f'The target token indices are {target_token_him} and {target_token_her}\n')

# tokenize all sentences into one padded batch of PyTorch tensors
tokens = tokenizer(sentences,padding=True,return_tensors='pt')

# Forward pass and get the activations

In [None]:
# push the whole batch through the model without tracking gradients;
# the model output is not kept because the forward hook stores the activations
with torch.no_grad():
  model(**tokens)

In [None]:
# confirm that the hook stored an entry, and inspect the size of the stored tensor
print(activations.keys())
print(activations[keyName].shape)

In [None]:
# loop through sentences to get target activations

# flat array with all target token indices (.encode() returns one list per word)
targetIDs = np.concatenate([target_token_him,target_token_her])

# convert the hooked tensor to numpy once, instead of once per sentence
layerActs = activations[keyName].detach().numpy()

# output matrix: one row per sentence, one column per MLP neuron
acts = np.zeros((len(sentences),layerActs.shape[2]))

for senti in range(len(sentences)):

  # find the index of either of the target tokens
  targBool = np.isin(tokens['input_ids'][senti].numpy(),targetIDs)
  targidx = np.where(targBool)[0]

  # the analysis needs exactly one target token per sentence; fail with a clear message otherwise
  if len(targidx)!=1:
    raise ValueError(f'Sentence {senti} has {len(targidx)} target tokens (expected exactly 1): "{sentences[senti]}"')

  # then get the activation
  acts[senti,:] = layerActs[senti,targidx[0],:]

In [None]:
# visualization: three views of the sentences-by-neurons activation matrix
fig,axs = plt.subplots(1,3,figsize=(13,4))
axImg,axHist,axSent = axs

# (1) the full matrix as an image
h = axImg.imshow(acts,aspect='auto',vmin=-2,vmax=2)
axImg.set_xlabel('MLP neuron index')
axImg.set_ylabel('Sentence')
axImg.set_title('All activations')
fig.colorbar(h,ax=axImg,pad=.01)

# (2) distribution of the per-neuron averages
neuronMeans = np.mean(acts,axis=0)
axHist.hist(neuronMeans,bins=60,edgecolor='k',linewidth=.5,facecolor='gray')
axHist.set_xlabel('Activation value')
axHist.set_ylabel('Count')
axHist.set_title('Histogram of activations\naveraged over sentences')

# (3) per-sentence averages; the dashed line separates the two halves of the sentence list
sentenceMeans = np.mean(acts,axis=1)
axSent.plot(sentenceMeans,'ko',markerfacecolor=[.7,.9,.7,.5])
axSent.set_xlabel('Sentence index')
axSent.set_ylabel('Activation')
axSent.set_title('Activations averaged over neurons')
axSent.axvline(len(sentences)/2-.5,color='k',linestyle='--',zorder=-3)

plt.tight_layout()
plt.show()

In [None]:
# pick a random neuron to show
randidx = np.random.randint(0,nneurons) # check 453,712

# activations of this one neuron across all sentences
plt.figure(figsize=(10,4))
ax = plt.gca()
ax.plot(acts[:,randidx],'ko',markerfacecolor=[.7,.7,.7],markersize=10)

# dashed line at the boundary between the two halves of the sentence list
ax.axvline(len(sentences)/2-.5,label='"him" left\n"her" right',color='k',linestyle='--',zorder=-3)

ax.set_xlabel('Sentence index')
ax.set_ylabel('Activation')
ax.set_title(f'Activation of neuron {randidx}')

ax.legend()
plt.show()

# T-tests on neurons

In [None]:
# independent-samples t-test ("him" vs. "her" sentences), one test per neuron
tres = stats.ttest_ind(acts[him_sentences],acts[her_sentences])

# Bonferroni correction: divide the p-value threshold by the number of tests
bonfThresh = .05/nneurons
sigPvals = tres.pvalue < bonfThresh

# neuron indices of the significant and non-significant tests
sigIdx = np.flatnonzero(sigPvals)
nonsigIdx = np.flatnonzero(~sigPvals)

# and plot
plt.figure(figsize=(10,4))
ax = plt.gca()

ax.plot(sigIdx,tres.statistic[sigIdx],'ko',markerfacecolor=[.9,.7,.9,.4],markersize=6)
ax.plot(nonsigIdx,tres.statistic[nonsigIdx],'rx',markersize=2,alpha=.6)

ax.set_xlabel('Dimension index')
ax.set_ylabel('T-value')
ax.set_xlim([-10,nneurons+10])
ax.set_title(f'Statistical tests for "him" vs. "her" from all neurons')

plt.show()

# Dimension compression with T-SNE

In [None]:
# keep only the columns (neurons) whose t-test passed the threshold
sigActs = acts[:,sigPvals]

# number of significant neurons = number of True entries in the mask
nSigNeurons = np.count_nonzero(sigPvals)

# matrix sizes before and after the selection
acts.shape, sigActs.shape

In [None]:
# min-max scale the significant t-values to [0,1] (used for coloring)
sigTvals = tres.statistic[sigPvals]
minVal,maxVal = sigTvals.min(),sigTvals.max()

tscaled = (sigTvals-minVal) / (maxVal-minVal)

In [None]:
# Reduce dimensions to 2D with t-SNE
# Each neuron is one sample (hence the transpose); its features are the activations over sentences.
# - perplexity must be smaller than the number of samples, so it is capped for small selections
# - a fixed random_state makes the map (and the clustering that is run on it) reproducible
tsnePerplexity = min(30,max(1,sigActs.shape[1]-1))
tsne = TSNE(n_components=2,perplexity=tsnePerplexity,random_state=0)
tsne_result = tsne.fit_transform(sigActs.T)

# plot results
_,axs = plt.subplots(1,2,figsize=(13,5))

# neuron-by-neuron Gram matrix (dot products of the activation columns)
h = axs[0].imshow(sigActs.T@sigActs,origin='lower',vmin=-80,vmax=80)
axs[0].set(title=f'Gram matrix',xlabel='Neurons',ylabel='Neurons')
plt.colorbar(h,ax=axs[0],fraction=.046,pad=.01)

# 2D map of the neurons, colored by their scaled t-value
axs[1].scatter(tsne_result[:,0], tsne_result[:,1], c=mpl.cm.RdBu(tscaled),edgecolor='k')
axs[1].set(title='T-SNE visualization of embeddings',xlabel='T-SNE dim 1',ylabel='T-SNE dim 2')

plt.tight_layout()
plt.show()

# Clustering with DBSCAN

In [None]:
# dbscan on the 2D t-SNE coordinates
clustmodel = DBSCAN(eps=1,min_samples=3).fit(tsne_result)
groupidx = clustmodel.labels_

# number of clusters
nclust = max(groupidx)+1 # +1 for indexing

plt.figure(figsize=(8,6))

# points with label -1 are drawn as black crosses
unassigned = groupidx==-1
plt.plot(tsne_result[unassigned,0],tsne_result[unassigned,1],'k+')

# all other points are drawn in the color of their cluster
for i in range(nclust):
  inCluster = groupidx==i
  plt.plot(tsne_result[inCluster,0],tsne_result[inCluster,1],'o',markerfacecolor=mpl.cm.tab20b(i/nclust))

# axis labels and title
plt.gca().set(xlabel='tSNE axis 1',ylabel='tSNE axis 2',title=f'Result of dbscan clustering (k={nclust})')

plt.show()

In [None]:
# print cluster info (ignoring cluster=-1)

# t-values of the significant neurons only (same order as the cluster labels)
sigTstats = tres.statistic[sigPvals]

for cidx in range(nclust):

  # members of this group
  inGroup = groupidx==cidx
  dimsInGroup = np.flatnonzero(inGroup)

  # average t-value and average t-value magnitude in this group
  groupT = sigTstats[inGroup]
  aveTvals = groupT.mean()
  aveTmags = np.abs(groupT).mean()

  print(f'Group "{cidx:2}" has {len(dimsInGroup):3} units (average t = {aveTvals:>6.2f}, average |t| = {aveTmags:>6.2f})')

In [None]:
# reorder the neurons (columns) so that members of the same cluster are adjacent
sortOrder = np.argsort(groupidx)
actsSorted = sigActs[:,sortOrder]

# cosine similarity = dot product of unit-norm columns
colNorms = np.linalg.norm(actsSorted,axis=0,keepdims=True)
actsSortedN = actsSorted / colNorms
cossims = actsSortedN.T @ actsSortedN


# and plot
_,axs = plt.subplots(1,3,figsize=(12,4))
axGram,axCos,axHist = axs

# unnormalized dot products
h = axGram.imshow(actsSorted.T@actsSorted,origin='lower',vmin=-80,vmax=80)
axGram.set_title('Gram matrix of sorted neurons')
axGram.set_xlabel('Neuron index')
axGram.set_ylabel('Neuron index')
plt.colorbar(h,ax=axGram,fraction=.046,pad=.04)

# normalized dot products
h = axCos.imshow(cossims,origin='lower',vmin=-1,vmax=1)
axCos.set_title('Cosine similarities, sorted')
axCos.set_xlabel('Neuron index')
axCos.set_yticks([])
plt.colorbar(h,ax=axCos,fraction=.046,pad=.04)

# distribution over all elements of the cosine similarity matrix
axHist.hist(cossims.ravel(),bins=80,color='gray',edgecolor='k')
axHist.set_title('Histogram of cosine similarities')
axHist.set_yticks([])
axHist.set_xlabel('Cosine similarity value')

plt.tight_layout()
plt.show()

# Cluster with k-means

In [None]:
k = 3 # how many clusters?
kmeans = KMeans(n_clusters=k)
kmeans = kmeans.fit(tsne_result)

# group labels
# NOTE: this overwrites the DBSCAN labels that were stored in groupidx
groupidx = kmeans.predict(tsne_result)
# centroids
cents = kmeans.cluster_centers_



# and plot
_,axs = plt.subplots(1,2,figsize=(12,4))

# draw a line from each data point to the centroid of its cluster
# (colors are recycled if there are more clusters than colors)
lineColors = 'rkbgm'
for i in range(len(tsne_result)):
  axs[0].plot([ tsne_result[i,0], cents[groupidx[i],0] ],[ tsne_result[i,1], cents[groupidx[i],1] ],
              lineColors[groupidx[i]%len(lineColors)])

# now draw the raw data in different colors
# (loop over the k k-means clusters; nclust is the cluster count from DBSCAN and does not apply here)
for i in range(k):
  axs[0].plot(tsne_result[groupidx==i,0],tsne_result[groupidx==i,1],'o',markerfacecolor=lineColors[i%len(lineColors)])

# and now plot the centroid locations
axs[0].plot(cents[:,0],cents[:,1],'ko',markerfacecolor='y',markersize=10)
axs[0].set(xlabel='tSNE axis 1',ylabel='tSNE axis 2',title=f'Result of k-means clustering (k={k})')


### image the cluster-sorted cosine similarities
# sort the dimension by cluster membership
actsSorted = sigActs[:,np.argsort(groupidx)]

# and calculate cosine similarities (dot products of unit-norm columns)
actsSortedN = actsSorted / np.linalg.norm(actsSorted,axis=0,keepdims=True)
cossims = actsSortedN.T @ actsSortedN

h = axs[1].imshow(cossims,origin='lower',vmin=-1,vmax=1)
axs[1].set(title='Cosine similarities, sorted',xlabel='Neuron index',yticks=[])
plt.colorbar(h,ax=axs[1],fraction=.046,pad=.02)


plt.tight_layout()
plt.show()