|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating token embeddings</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: Residual stream decomposition of path lengths</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from statsmodels.stats.multitest import fdrcorrection

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Model, hooks, tokens, activations

In [None]:
# GPT2-large model and tokenizer
# Both are fetched by name with from_pretrained(); `tokenizer` and `model`
# are module-level names used by the cells below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = AutoModelForCausalLM.from_pretrained('gpt2-large')
# switch the model to evaluation mode (PyTorch nn.Module.eval()); because this
# is the last expression in the cell, the notebook also displays its return value
model.eval()

In [None]:
# hook the attention and mlp adjustments (projection vectors)
# `activations` is a module-level dict that the hooks write into. Later cells
# read it with keys of the form 'attn_proj_<layer>' and 'mlp_proj_<layer>'.
activations = {}

def implant_hook_attn(layer_number):
  # Factory: returns a hook closure that remembers `layer_number`, so each
  # layer stores its result under its own key in `activations`.
  def hook(module, input, output):
    # NOTE(review): (module, input, output) is the PyTorch forward-hook
    # signature -- presumably registered via register_forward_hook; confirm.
    # EXERCISE BLANK: the value to store is intentionally left to be filled in.
    activations[f'attn_proj_{layer_number}'] =
  return hook

# EXERCISE BLANK: the MLP hook factory is left unfinished; a later cell reads
# the key 'mlp_proj_0', so it presumably mirrors implant_hook_attn with an
# 'mlp_proj_<layer>' key -- confirm against the course solution.
def implant_hook_mlp


# reminder: "proj" are the adjustments that get added to the embeddings vectors
# One pass per transformer block (model.config.n_layer blocks in total).
for layeri in range(model.config.n_layer):
  # EXERCISE BLANK: both statements stop after the block lookup; the submodule
  # to hook and the hook registration are intentionally left to be filled in.
  model.transformer.h[layeri].
  model.transformer.h[layeri].

# Push text through the model and get hidden_states

In [None]:
# text source: https://en.wikipedia.org/wiki/Friedrich_Nietzsche
# Encode the passage into token ids; return_tensors='pt' requests a PyTorch
# tensor. Later cells use len(tokz[0]) as the number of tokens.
# EXERCISE BLANK: the forward pass that produces `outputs` (used in a later
# cell) is not included here and is left to be added.
tokz = tokenizer.encode("Nietzsche was a widely influential German philosopher. He began his career as a classical philologist, turning to philosophy early in his academic career. In 1869, aged 24, Nietzsche became the youngest professor to hold the Chair of Classical Philology at the University of Basel. Plagued by health problems for most of his life, he resigned from the university in 1879, after which he completed much of his core writing in the following decade. Nietzsche's work spans philosophical polemics, poetry, cultural criticism, and fiction while displaying a fondness for aphorism and irony. Prominent elements of his philosophy include his radical critique of truth in favour of perspectivism; a genealogical critique of religion and Christian morality and a related theory of master–slave morality; the aesthetic affirmation of life in response to both the 'death of God' and the profound crisis of nihilism; the notion of Apollonian and Dionysian forces; and a characterisation of the human subject as the expression of competing wills, collectively understood as the will to power. He also developed influential concepts such as the Übermensch and his doctrine of eternal return. In his later work, he became increasingly preoccupied with the creative powers of the individual to overcome cultural and moral mores in pursuit of new values and aesthetic health. His body of work touched a wide range of topics, including art, philology, history, music, religion, tragedy, culture, and science, and drew inspiration from Greek tragedy as well as figures such as Zoroaster, Arthur Schopenhauer, Ralph Waldo Emerson, Richard Wagner, Fyodor Dostoevsky, and Johann Wolfgang von Goethe.",return_tensors='pt')



In [None]:
# sanity check: show the keys stored by the hooks and the shape of the
# layer-0 MLP entry (requires the hooks to have run during a forward pass)
activations.keys(), activations['mlp_proj_0'].shape

In [None]:
# sanity check: number of hidden-state entries and the shape of the entry at index 33
# NOTE(review): `outputs` is not created anywhere in the visible code --
# presumably it comes from the forward pass the student adds above; confirm.
len(outputs.hidden_states), outputs.hidden_states[33].shape

# Exercise 2: Cosine similarities between attention and MLP

In [None]:
# initialize
# EXERCISE BLANK: np.zeros still needs a shape; the assignment at the bottom
# indexes the result as cossims[layeri,toki], i.e. layers x tokens.
cossims = np.zeros

# loop over layers
# EXERCISE BLANK: the iterable is intentionally missing.
for layeri in

  # cosine similarity between attn and mlp projections within layer
  # EXERCISE BLANK: the iterable over tokens is intentionally missing.
  for toki in

    # extract the two vectors for this token
    # EXERCISE BLANK: both lines still need the indexing that selects token
    # `toki`; the mlp line also needs its dictionary key.
    attn = activations[f'attn_proj_{layeri}']
    mlp  = activations

    # cosine similarity between them
    # EXERCISE BLANK: the similarity expression is intentionally missing.
    cossims[layeri,toki] =


# plotting! (my fav part :D )
# Two panels side by side: a per-layer summary (left) and a histogram (right).
_,axs = plt.subplots(1,2,figsize=(11,4))
# EXERCISE BLANK: the data arguments (x, y, yerr) are intentionally missing;
# the panel title below says this shows the average (+std) across all tokens.
axs[0].errorbar( ,marker='s',color='k',linestyle='none',markerfacecolor=[.7,.7,.7])
# EXERCISE BLANK: axhline is only referenced here, not called.
axs[0].axhline
axs[0].set(xlabel='Transformer layer',ylabel='Cosine similarity',title='Average (+std) across all tokens')

# EXERCISE BLANK: the data argument is intentionally missing; the title says
# the histogram pools all tokens and layers.
axs[1].hist( ,bins=80,color=[.7,.7,.7],edgecolor='k',linewidth=.5)
axs[1].set(xlabel='Cosine similarity',ylabel='Count',title='Distribution of all tokens and layers')

plt.tight_layout()
plt.show()

# Exercise 3: Path lengths of adjustments and hidden states

In [None]:
# initialize
# pathlen is layers x tokens x 3; the last axis holds the attention (0),
# MLP (1), and hidden-state (2) path lengths assigned at the bottom of the loop.
pathlen = np.zeros((model.config.n_layer,len(tokz[0]),3))
# NOTE(review): nextTokenLogits is allocated but never used in the visible code.
nextTokenLogits = np.zeros(len(tokz[0]))

# loop over all tokens in the text
for toki in range(len(tokz[0])):

  # path length from the previous layer
  # The layer loop starts at 1 because each layer is paired with the one
  # before it; pathlen[0] therefore keeps its initial zeros.
  for layeri in range(1,model.config.n_layer):

    # extract the vector pairs
    # EXERCISE BLANK: every line below is intentionally unfinished (the
    # f-string keys are not closed and the right-hand sides are missing).
    currAttn = activations[f'attn_proj_ # from this layer
    prevAttn = activations[f'attn_proj_ # from the previous layer

    currMlp  = # repeat the above for MLP
    prevMlp  =

    currHs   = # and for the hidden layers
    prevHs   =


    # norm the difference vectors
    # EXERCISE BLANK: the torch.norm argument and the two remaining
    # right-hand sides are intentionally missing.
    pathlen[layeri,toki,0] = torch.norm(  ) # attention
    pathlen[layeri,toki,1] = # MLP
    pathlen[layeri,toki,2] = # hidden-states

# show the array dimensions as the cell output
pathlen.shape

In [None]:
# one image panel per path-length type
fig,axs = plt.subplots(1,3,figsize=(14,3))


# NOTE(review): `titles` is defined but not used in the visible code --
# presumably meant for the per-panel titles in the blank line inside the loop.
titles = [ 'Attention','MLP','Hidden state' ]

for i in range(3):
  # EXERCISE BLANK: the image data argument is intentionally missing.
  # The color range is fixed to [0, 50] for all three panels.
  h = axs[i].imshow( ,aspect='auto',origin='lower',vmin=0,vmax=50)

  # attach a colorbar to this panel
  fig.colorbar(h,ax=axs[i],pad=.02)

plt.show()

# Exercise 4: Correlate adjustments and hidden states

In [None]:
# Per-layer correlation results: column 0 is attention vs. hidden state,
# column 1 is MLP vs. hidden state (see the assignments below).
Rs = np.zeros((model.config.n_layer,2))
Ps = np.zeros((model.config.n_layer,2))

for layeri in range(model.config.n_layer):

  # correlation coefficient between attn and hs
  # EXERCISE BLANK: the correlation call and the two extractions are
  # intentionally missing (pearsonr is imported at the top of the notebook,
  # so it is presumably the intended function -- confirm).
  r =
  Rs[layeri,0] = # r value
  Ps[layeri,0] = # p-value


  # repeat for mlp and hs
  r =
  Rs[layeri,1] =
  Ps[layeri,1] =


# p-value threshold based on FDR
# NOTE(review): per the statsmodels documentation, fdrcorrection returns a
# (rejected, corrected p-values) pair, so sigPsA holds both -- confirm how it
# is indexed when plotting.
sigPsA = fdrcorrection(Ps[:,0])
# EXERCISE BLANK: the matching call for the MLP column is intentionally missing.
sigPsM =

In [None]:
# Correlation-by-layer figure: one line plus marker overlays per subblock.
plt.figure(figsize=(10,4))

# attention vs. hidden-state correlations (column 0 of Rs)
plt.plot(Rs[:,0],color=[.9,.7,.7])
# EXERCISE BLANK: the x/y data for both marker overlays are intentionally missing.
plt.plot(,,'ko',markerfacecolor=[.9,.7,.7],markersize=10,label='Att - HS')
plt.plot(,,'ro',markersize=6)


# EXERCISE BLANK: the same three plots for the MLP, data intentionally missing.
plt.plot(,color=[.7,.7,.9])
plt.plot(,'ks',markerfacecolor=[.7,.7,.9],markersize=10,label='MLP - HS')
plt.plot(,'bs',markersize=6)


# dashed reference line at zero correlation
plt.axhline(0,linestyle='--',color='gray',linewidth=.8)

plt.gca().set(xlabel='Transformer layer',ylabel='Correlation',xlim=[0,model.config.n_layer],
              title='Correlations between subblock and HS path lengths')

plt.legend()
plt.show()

In [None]:
# 3x4 grid: 12 panels, one per selected layer
_,axs = plt.subplots(3,4,figsize=(13,8))

for i,ax in enumerate(axs.flatten()):

  # layer number
  # every third layer starting at 1: with 12 panels this gives 1, 4, ..., 34
  lay = i*3 + 1

  # scatter plots
  # EXERCISE BLANK: the x/y data (and the MLP marker format) are intentionally
  # missing; the axis labels below put Att/MLP path length on x and HS on y.
  ax.plot(,'ko',markerfacecolor=[.9,.7,.7,.3],label='Att')
  ax.plot(,label='MLP')

  # axis adjustments
  # tick marks are removed on both axes; only the labels are kept
  ax.set(xticks=[],xlabel='Att or MLP path length',yticks=[],ylabel='HS path length')
  ax.set_title(f'Layer {lay}',fontweight='bold')
  ax.legend()


plt.tight_layout()
plt.show()