|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating token embeddings</h1>|
|<h2>Lecture:</h2>|<h1><b>Calculating rotations of embeddings vectors</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# for the stats
import scipy.stats as stats

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# vector graphs: ask the inline backend to emit figures in 'svg' format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# fetch the pretrained 'gpt2-xl' tokenizer and the matching causal language model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
model = AutoModelForCausalLM.from_pretrained('gpt2-xl')

# put the model into evaluation mode (the returned module is echoed as the cell output)
model.eval()

# Sentences with "her" as target

In [None]:
# generated by Claude.ai
# Each sentence contains the word "her" preceded by a space; the analysis cells
# below locate that token in every sentence via `target_token`.
# NOTE(review): several sentences pair "her" with "his" (e.g. "...praised her for his work");
# this looks like a leftover from a him->her substitution -- confirm whether it is intended.
sentences = [
    "I saw her at the market.",
    "She gave her the book.",
    "They asked her for advice.",
    "We invited her to dinner.",
    "The dog followed her home.",
    "They asked her to join.",
    "He saw her at the park yesterday.",
    "Did you give her your address?",
    "I haven't seen her in ages.",
    "I told her the truth.",
    "They congratulated her on his success.",
    "She recognized her immediately.",
    "The teacher praised her for his work.",
    "I met her last summer.",
    "The child hugged her tightly.",
    "They warned her about the danger.",
    "She drove her to the airport.",
    "We waited for her for hours.",
    "The cat scratched her accidentally.",
    "They surprised her with a gift.",
    "She called her on the phone.",
    "The jury found her not guilty.",
    "I remembered her from school.",
    "They elected her as president.",
    "She forgave her for his mistake.",
    "The police questioned her yesterday.",
    "I helped her with his homework.",
    "They spotted her in the crowd.",
    "She visited her in the hospital.",
    "The manager promoted her last week.",
    "I trusted her completely.",
    "They respected her for his honesty.",
    "She taught her how to swim.",
    "The bird attacked her suddenly.",
    "I greeted her warmly.",
    "They supported her through difficult times.",
    "She ignored her at the party.",
    "The judge sentenced her to community service.",
    "I photographed her during the event.",
    "They believed her despite the evidence.",
    "She surprised her on his birthday.",
    "The guard stopped her at the entrance.",
    "I missed her terribly.",
    "They watched her leave the building.",
    "She accompanied her to the concert.",
    "The crowd cheered her enthusiastically.",
    "I described her to the police.",
    "They thanked her for his help.",
    "She admired her for his courage.",
    "The committee nominated her for the award.",
    "I married her last spring.",
    "They informed her about the changes.",
    "She introduced her to the parents.",
    "The author based the character on her."
]

# ID of the first token produced by encoding ' her' (note the leading space);
# presumably ' her' is a single token in this vocabulary -- TODO confirm
target_token = tokenizer.encode(' her')[0]
print(f'There are {len(sentences)} sentences.')

In [None]:
# a padding token must be specified before batching; reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# tokenize the whole batch into padded PyTorch tensors
tokens = tokenizer(sentences,return_tensors='pt',padding=True)

# forward pass without gradient tracking, requesting the hidden states
# (~1 min for gpt2-xl in cpu)
with torch.no_grad():
  outputs = model(output_hidden_states=True,**tokens)

In [None]:
# how many hidden-state tensors the model returned
n_hiddens = len(outputs.hidden_states)

# size of dimension 1 of one hidden-state tensor (used below as the sequence length)
seq_len = outputs.hidden_states[3].size(1)

# show the count and one tensor's shape as the cell output
n_hiddens,outputs.hidden_states[3].shape

In [None]:
# get angle of token vector from previous to current layer
#
# `angles` is layer-transitions x sentences x 3, where the last axis is:
#   0 = target token, 1 = token immediately before the target, 2 = shuffled baseline

def _vector_angle(v,u):
  """Return the angle (in radians, as a Python float) between two 1D tensors."""
  cos_theta = torch.dot(v,u) / (torch.linalg.norm(v) * torch.linalg.norm(u))
  # floating-point rounding can push the cosine marginally outside [-1,1]
  # (e.g., for identical vectors), where acos returns NaN; clamp first
  return torch.acos(torch.clamp(cos_theta,-1,1)).item()


angles = np.zeros((n_hiddens-1,len(sentences),3))

for senti in range(len(sentences)):

  # find the index of the target token (convert to list, then .index to find)
  # note: .index raises ValueError if the target token is absent from a sentence
  targidx = tokens['input_ids'][senti].tolist().index(target_token)

  for layeri in range(1,n_hiddens):

    # hidden states of this sentence in the previous and current layers
    prev_layer = outputs.hidden_states[layeri-1][senti]
    curr_layer = outputs.hidden_states[layeri  ][senti]

    # TARGET: calculate the angle between this and the previous layer activations
    angles[layeri-1,senti,0] = _vector_angle(prev_layer[targidx,:],curr_layer[targidx,:])

    # NON-TARGET: same calculation for the token preceding the target
    # (a target at position 0 would wrap around to the last position)
    angles[layeri-1,senti,1] = _vector_angle(prev_layer[targidx-1,:],curr_layer[targidx-1,:])

    # SHUFFLED: angle between random pairs of vectors and layers
    # positions are drawn over the full padded length, so they may land on padding tokens
    v = outputs.hidden_states[torch.randint(1,n_hiddens,(1,)).item()][senti,torch.randint(seq_len,(1,)).item(),:]
    u = outputs.hidden_states[torch.randint(1,n_hiddens,(1,)).item()][senti,torch.randint(seq_len,(1,)).item(),:]
    angles[layeri-1,senti,2] = _vector_angle(v,u)

angles.shape

In [None]:
# optional: express the angles in degrees rather than radians
# (this overwrites `angles`, so run it only once per computation of `angles`)
angles = (180 * angles) / np.pi

In [None]:
plt.figure(figsize=(11,4))

# x-axis values: index of the later layer in each (previous,current) pair
xticks = np.arange(1,n_hiddens)

# plot all the individual sentences
plt.plot(xticks,angles[:,:,0],color=[.9,.7,.7],alpha=.5)
plt.plot(xticks,angles[:,:,1],color=[.7,.7,.9],alpha=.5)

# and the average over sentences
plt.plot(xticks,angles[:,:,0].mean(axis=1),'r',linewidth=3,label='Target')
plt.plot(xticks,angles[:,:,1].mean(axis=1),'b',linewidth=3,label='Non-target')

# completely shuffled angles (nanmean skips any undefined angles)
plt.plot(xticks,angles[:,:,2],color=[.7,.9,.7],alpha=.5)
plt.plot(xticks,np.nanmean(angles[:,:,2],axis=1),'g',linewidth=3,label='Shuffled')

plt.legend()
# the angles were converted from radians to degrees in the preceding cell,
# so the axis label reports degrees (matching the degree symbol in the title)
plt.gca().set(xlabel='Transformer layer',ylabel='Angle (deg.)',xlim=[0,n_hiddens],
              title=r'$\Delta v^{\circ}$ relative to previous layer')

plt.show()

# Statistical evaluation

In [None]:
# paired t-tests (target vs. non-target), one per layer transition;
# the inputs are transposed to sentences x layers so the test runs across sentences
tres = stats.ttest_rel(angles[:,:,0].T,angles[:,:,1].T)

# extract data and get boolean significance (Bonferroni-corrected for multiple comparisons)
t = tres.statistic
# divide by the number of tests actually performed (one per element of t),
# which is n_hiddens-1 rather than n_hiddens
issig = tres.pvalue < .05/len(t)

# plot!
plt.figure(figsize=(10,4))

# plot the significant values, split by the sign of the effect
plt.plot(xticks[issig & (t>0)],t[issig & (t>0)],'ko',
         markersize=9,markerfacecolor=[.7,.9,.7],label='Target > nontarget')
plt.plot(xticks[issig & (t<0)],t[issig & (t<0)],'ks',
         markersize=9,markerfacecolor=[.7,.7,.9],label='Nontarget > target')

# and the non-significant values
plt.plot(xticks[~issig],t[~issig],'rx',label='Non-sig.')
plt.axhline(0,linestyle='--',color='gray')

# touch-ups
plt.legend()
plt.gca().set(xlabel='Layer',ylabel='T-value',title='Statistical significance of rotation differences')

plt.show()