|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating layers</h1>|
|<h2>Lecture:</h2>|<h1><b>Grouping and RSA in Q and K matrices</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

In [None]:
# load the tokenizer and the pretrained GPT2-medium model (switched to evaluation mode)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = AutoModelForCausalLM.from_pretrained('gpt2-medium').eval()

In [None]:
# Define a hook function to store the Q and K vectors
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores the Q and K activations of one layer.

  The returned hook writes two numpy arrays into the module-level
  `activations` dict, under the keys 'attn_{layer_number}_q' and
  'attn_{layer_number}_k'. The V activations are not stored.
  """
  def hook(module, input, output):

    # get and separate the QKV: three equal-width chunks along the last
    # dimension of the 3D output (width taken from the tensor itself)
    q,k,_ = output.detach().chunk(3,dim=2)

    # move to the CPU (no-op if already there), numpyify, and store
    activations[f'attn_{layer_number}_q'] = q.cpu().numpy()
    activations[f'attn_{layer_number}_k'] = k.cpu().numpy()
  return hook


# pick the layer to hook
layer2hook = 5

# if this cell is re-run, detach the previously implanted hook first
# (otherwise every re-run adds another copy of the hook to the model)
try:
  hook_handle.remove()
except NameError:
  pass # first run: nothing to detach yet

# surgery ;)
# keep the returned handle so the hook can be removed with hook_handle.remove()
hook_handle = model.transformer.h[layer2hook].attn.c_attn.register_forward_hook(implant_hook(layer2hook))

# Tokenize and pass into model

In [None]:
# list of words for RSA (one row per semantic category)
words = [ 'galaxy','asteroid','comet','cosmos','space','sun','planet','moon','star','orbit',
          'ceiling','sofa','couch','carpet','door','window','lamp','chair','table','rug','bed','floor','wall',
          'pear','grape','banana','cherry','peach','apple','seed','jelly','orange','lime','fruit'
        ]

# confirm that all words are one token:
# each full prompt must be exactly one token longer than the shared prefix
prefix_len = len(tokenizer.encode('The next word is'))

for w in words:
  tokens = tokenizer.encode(f'The next word is {w}')
  print(tokens)
  assert len(tokens)==prefix_len+1, f'"{w}" is not a single token'

In [None]:
# create a group mask based on word order:
# category labels 1/2/3, each repeated once per word in that category (row vector)
group = np.repeat([1,2,3],[10,13,11])[None,:]

# outer product of the labels with themselves, keeping only the strict upper triangle
# (entries are products of two labels; squares 1/4/9 mark same-category pairs)
groupmask = np.triu( group.T@group ,k=1)
# show the mask as an image
fig,ax = plt.subplots()
ax.imshow(groupmask)
plt.show()

In [None]:
# create a batch with all target tokens at the end
# (one row per word; the row length comes from the tokenization instead of
#  being hard-coded, and all prompts must have the same number of tokens)
batch = torch.tensor([ tokenizer.encode(f'The next word is {w}') for w in words ],dtype=torch.long)

batch

In [None]:
# push the batch through the model (gradient tracking disabled)
with torch.no_grad():
  model(batch)

In [None]:
# stored keys, and the shape of the Q activations from the hooked layer
activations.keys(),activations[f'attn_{layer2hook}_q'].shape

# Calculate cosine similarities

In [None]:
# extract activations (final token of each sequence) and calculate cossim
actsQ = activations[f'attn_{layer2hook}_q'][:,-1,:]
actsK = activations[f'attn_{layer2hook}_k'][:,-1,:]

# normalize each vector to its norm (unit length)
# note: the slices above are views into the stored activations, so the division
# is done out-of-place to leave the arrays in `activations` untouched
actsQ = actsQ / np.linalg.norm(actsQ,axis=1,keepdims=True)
actsK = actsK / np.linalg.norm(actsK,axis=1,keepdims=True)

# cosine similarity matrices (dot products between unit-length vectors)
cs_actsQ = actsQ @ actsQ.T
cs_actsK = actsK @ actsK.T


# visualize!
fig,axs = plt.subplots(1,3,figsize=(12,4))

# one similarity matrix per panel: (axis, matrix, color limits, title)
panels = [ (axs[0],cs_actsQ,(.6,.85),'Q cossim matrix'),
           (axs[1],cs_actsK,(.82,.95),'K cossim matrix') ]

for ax,csmat,(lo,hi),title in panels:

  # show the similarity matrix
  h = ax.imshow(csmat,vmin=lo,vmax=hi)

  # label every other word, offset by one between the x and y axes
  ax.set(xticks=range(0,len(words),2),xticklabels=words[::2],
         yticks=range(1,len(words),2),yticklabels=words[1::2],
         title=title)
  ax.tick_params(axis='x',labelrotation=90)
  plt.colorbar(h,ax=ax,pad=.02,fraction=.046)

# histograms here
# the strict upper triangle is selected by position, so a similarity that
# happens to be exactly zero is not dropped
iu = np.triu_indices(len(cs_actsQ),k=1)
yQ,xQ = np.histogram(cs_actsQ[iu],bins=20)
yK,xK = np.histogram(cs_actsK[iu],bins=20)

# np.histogram returns the bin edges; plot the counts at the bin centers
axs[2].plot((xQ[:-1]+xQ[1:])/2,yQ,label='Q')
axs[2].plot((xK[:-1]+xK[1:])/2,yK,label='K')
axs[2].set(xlabel='Similarity',ylabel='Count',ylim=[0,None],title='Distributions of similarities')
axs[2].legend()

plt.tight_layout()
plt.show()

In [None]:
# then extract the average means within-category, vs across-category
def selectivity_index(csmat):
  """Ratio of mean within-category to mean across-category similarity.

  Uses the module-level `groupmask`, whose upper-triangular entries are
  products of two category labels (1, 2, 3): the squares 1/4/9 mark pairs from
  the same category, and 2/3/6 mark pairs from two different categories.
  Each category (or category pair) is averaged first, then the three averages
  are averaged with equal weight.
  """
  within = sum( csmat[groupmask==v].mean() for v in (1,4,9) ) /3
  across = sum( csmat[groupmask==v].mean() for v in (2,3,6) ) /3
  return within/across

selectIndexQ = selectivity_index(cs_actsQ)
selectIndexK = selectivity_index(cs_actsK)

print(f'Selectivity index is {selectIndexQ:.3f} for Q, and {selectIndexK:.3f} for K.')

# RSA

In [None]:
# extract the upper-triangular elements
# (selected by position, so Q and K always contribute the same word pairs,
#  even if a similarity happens to be exactly zero)
iu = np.triu_indices_from(cs_actsQ,k=1)
unique_Q = cs_actsQ[iu]
unique_K = cs_actsK[iu]


# Pearson correlation
r = np.corrcoef(unique_Q,unique_K)[0,1]

# plot
plt.figure(figsize=(6,5))
plt.plot(unique_Q,unique_K,'ks',markerfacecolor=[.9,.7,.7,.5])
plt.gca().set(xlabel='Q cosine similarities',ylabel='K cosine similarities',
              title=f'Correlation (RSA score): r = {r:.3f}')
plt.grid(linestyle='--',color=[.8,.8,.8])
plt.show()

# RSA per semantic group

In [None]:
fig,axs = plt.subplots(1,3,figsize=(12,3.5))

catlabels = [ 'space','interior','fruit' ]

# one panel per category
for i,(ax,label) in enumerate(zip(axs,catlabels)):

  # within-category pairs are marked in groupmask by the squared label (1, 4, 9)
  within = groupmask==(i+1)**2
  within_Q = cs_actsQ[within]
  within_K = cs_actsK[within]

  # Pearson correlation
  r = np.corrcoef(within_Q,within_K)[0,1]

  # marker color: semi-transparent gray with the i-th RGB channel boosted
  facecolor = [ .9 if ch==i else .7 for ch in range(3) ] + [.5]

  # plot
  ax.plot(within_Q,within_K,'ks',markerfacecolor=facecolor)
  ax.set(xlabel='Q cosine similarities',ylabel='K cosine similarities',
         title=f'RSA ({label}): r = {r:.3f}')
  ax.grid(linestyle='--',color=[.8,.8,.8])


plt.tight_layout()
plt.show()