|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating layers</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Laminar profile of RSA and category selectivity</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# vector matplotlib plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Hook all layers and get activations

In [None]:
# name of the pretrained GPT2 checkpoint, shared by the tokenizer and the model
checkpoint = 'gpt2-large'

# load the tokenizer and the language model from that checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# switch the model to evaluation mode
# (kept as the last expression so the cell displays the returned model)
model.eval()

In [None]:
# Define a hook function to store the Q, K, and V activations
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores one layer's Q, K, and V activations.

  The hook expects a module output that concatenates Q, K, and V along the
  last dimension as three equal-width blocks (in that order). Each block is
  stored as a NumPy array in the global `activations` dictionary under the
  keys 'attn_{layer_number}_q', 'attn_{layer_number}_k', and
  'attn_{layer_number}_v'.

  Parameters
  ----------
  layer_number : int
    Layer index; used only to build the dictionary keys.

  Returns
  -------
  hook : callable
    Function with the (module, input, output) forward-hook signature.
    It returns None, so the module's output is passed on unchanged.
  """
  def hook(module, input, output):

    # separate Q, K, and V: three equal-width blocks along the last dimension
    # (taken from the output itself, so the hook needs no global model config)
    q,k,v = output.detach().chunk(3,dim=-1)

    # move to the CPU (no-op if already there), then numpyify and store
    activations[f'attn_{layer_number}_q'] = q.cpu().numpy()
    activations[f'attn_{layer_number}_k'] = k.cpu().numpy()
    activations[f'attn_{layer_number}_v'] = v.cpu().numpy()
  return hook


# surgery ;)

# Remove hooks left over from a previous run of this cell, so that re-running
# the cell does not stack duplicate hooks on the same modules.
for handle in globals().get('hook_handles',[]):
  handle.remove()

# Attach one hook to the c_attn module of every transformer block, and keep
# the returned handles so the hooks can be removed again later.
hook_handles = []
for layeri in range(model.config.n_layer):
  attn_proj = model.transformer.h[layeri].attn.c_attn
  hook_handles.append( attn_proj.register_forward_hook(implant_hook(layeri)) )

In [None]:
# words for RSA, organized by semantic category
space_words     = [ 'galaxy','asteroid','comet','cosmos','space','sun','planet','moon','star','orbit' ]
furniture_words = [ 'ceiling','sofa','couch','carpet','door','window','lamp','chair','table','rug','bed','floor','wall' ]
fruit_words     = [ 'pear','grape','banana','cherry','peach','apple','seed','jelly','orange','lime','fruit' ]
word_categories = [ space_words, furniture_words, fruit_words ]

# flat list of all words (category order is preserved)
words = [ w for category in word_categories for w in category ]

# category label (1, 2, 3, ...) of each word as a 1 x len(words) row vector;
# derived from the lists above so it cannot fall out of sync with `words`
group = np.repeat( np.arange(1,len(word_categories)+1),
                   [ len(category) for category in word_categories ] )[None,:]

# group mask: product of the two words' labels for each unique word pair
# (upper triangle without the diagonal; everything else is 0).
# With labels 1,2,3: 1, 4, 9 mark within-category pairs and 2, 3, 6 mark across-category pairs.
groupmask = np.triu( group.T@group ,1)

In [None]:
# one prompt per word, with the target word at the end
prompts = [ f'The next word is {w}' for w in words ]

# token matrix: one row per prompt, 5 tokens per row
# (the row assignment fails if a prompt does not tokenize to exactly 5 tokens)
batch = torch.zeros((len(words),5),dtype=torch.long)
for rowi,prompt in enumerate(prompts):
  batch[rowi,:] = torch.tensor(tokenizer.encode(prompt))

# push through the model; the returned output is discarded
with torch.no_grad():
  model(batch)

In [None]:
# sanity check: show the stored dictionary keys and the shape of one example array (layer 5, Q)
activations.keys(),activations['attn_5_q'].shape

# Exercise 2: Cosine similarities in all layers

In [None]:
def _cosine_similarity_matrix(acts):
  """Pairwise cosine similarities between the rows of a 2D array.

  Parameters
  ----------
  acts : np.ndarray, rows x features
    One activation vector per row.

  Returns
  -------
  np.ndarray, rows x rows
    Cosine similarity between every pair of rows.

  The normalization creates a new array, so `acts` is left untouched
  (an in-place division would overwrite the stored activations when
  `acts` is a view into them).
  """
  acts_unit = acts / np.linalg.norm(acts,axis=1,keepdims=True)
  return acts_unit @ acts_unit.T


def _selectivity_index(cs,mask):
  """Mean within-category similarity divided by mean across-category similarity.

  Parameters
  ----------
  cs : np.ndarray, words x words
    Cosine similarity matrix.
  mask : np.ndarray, words x words
    Product of the two words' category labels (1,2,3) for each unique word
    pair: 1, 4, 9 mark within-category pairs; 2, 3, 6 mark across-category pairs.

  Returns
  -------
  float
    Average of the three within-category means divided by the average of
    the three across-category means.
  """
  within = sum( cs[mask==val].mean() for val in (1,4,9) ) / 3
  across = sum( cs[mask==val].mean() for val in (2,3,6) ) / 3
  return within/across


# initializations
# selectIndices rows: Q, K, V;  RSA rows: Q-K, Q-V, K-V
selectIndices = np.zeros((3,model.config.n_layer))
RSA = np.zeros((3,model.config.n_layer))

# row/column indices of the unique word pairs (upper triangle without the diagonal).
# Indexing by position keeps every pair, including any whose similarity is exactly 0,
# so the Q, K, and V vectors below are always aligned and equally long.
pair_idx = np.triu_indices_from(groupmask,k=1)


for layeri in range(model.config.n_layer):

  ### cosine similarity matrices (from the final token of each sequence)
  cs_actsQ,cs_actsK,cs_actsV = [
      _cosine_similarity_matrix( activations[f'attn_{layeri}_{qkv}'][:,-1,:] )
      for qkv in 'qkv' ]

  ### selectivity indices
  for rowi,cs in enumerate((cs_actsQ,cs_actsK,cs_actsV)):
    selectIndices[rowi,layeri] = _selectivity_index(cs,groupmask)

  ### RSA
  # extract the upper-triangular elements
  unique_Q = cs_actsQ[pair_idx]
  unique_K = cs_actsK[pair_idx]
  unique_V = cs_actsV[pair_idx]

  # Pearson correlations to get RSA
  RSA[0,layeri] = np.corrcoef(unique_Q,unique_K)[0,1]
  RSA[1,layeri] = np.corrcoef(unique_Q,unique_V)[0,1]
  RSA[2,layeri] = np.corrcoef(unique_K,unique_V)[0,1]


# Exercise 3: Laminar profiles of selectivity and RSA

In [None]:
# two side-by-side panels: selectivity (left) and RSA (right)
_,axs = plt.subplots(1,2,figsize=(12,4))

# (marker format, marker face color, legend label) for each row of the results
select_styles = [ ('kv',[.9,.7,.7],'Q'),
                  ('ko',[.7,.9,.7],'K'),
                  ('ks',[.7,.7,.9],'V') ]
rsa_styles    = [ ('ko',[.9,.7,.7],'Q-K'),
                  ('ks',[.7,.9,.7],'Q-V'),
                  ('kv',[.7,.7,.9],'K-V') ]

# left panel: selectivity index per layer
for layer_profile,(fmt,facecolor,label) in zip(selectIndices,select_styles):
  axs[0].plot(layer_profile,fmt,markersize=8,markerfacecolor=facecolor,label=label)
axs[0].legend()
axs[0].set(xlabel='Transformer layer',ylabel='Selectivity index')

# right panel: RSA per layer
for layer_profile,(fmt,facecolor,label) in zip(RSA,rsa_styles):
  axs[1].plot(layer_profile,fmt,markersize=8,markerfacecolor=facecolor,label=label)
axs[1].set(xlabel='Transformer layer',ylabel='RSA')
axs[1].legend()

plt.tight_layout()
plt.show()