|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: Activation histograms by token length</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# Optional setup: uncomment the line below and run this cell first to
# install/upgrade the dataset libraries, then restart the kernel before continuing.
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from datasets import load_dataset

# vector plots: ask the inline backend to render figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Import the model and implant hooks

In [None]:
# checkpoint used for exercises 1-6
modelName = 'EleutherAI/gpt-neo-125m'

# checkpoint for exercise 7 (uncomment to switch to the larger model)
# modelName = 'EleutherAI/gpt-neo-1.3B'

# the tokenizer and the model are always loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForCausalLM.from_pretrained(modelName)

In [None]:
# use GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# move the model to the GPU and switch to eval
# (eval mode disables dropout so that the activations are deterministic)
model = model.to(device).eval()

In [None]:
# hook function
# dictionary that collects the hooked outputs, keyed by 'mlp_<layer number>'
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores a module's output for one layer.

  Parameters
  ----------
  layer_number : int
    Index of the transformer block; used to build the dictionary key.

  Returns
  -------
  hook : callable
    Function with the (module, input, output) signature expected by
    `register_forward_hook`. Each call overwrites `activations[f'mlp_{layer_number}']`.
  """
  def hook(module, input, output):

    # store in the dictionary
    # (detached and moved to the CPU so the stored copy holds no graph or GPU memory)
    activations[f'mlp_{layer_number}'] = output.detach().cpu()

  return hook


# put hooks in all layers
# The hook is attached to the first MLP linear layer (c_fc), whose output is the
# expanded ("MLP neurons") representation before the nonlinearity.
# The handles are kept so the hooks can be removed later with handle.remove().
hookHandles = []
for layeri in range(model.config.num_layers):
  handle = model.transformer.h[layeri].mlp.c_fc.register_forward_hook(implant_hook(layeri))
  hookHandles.append(handle)

In [None]:
# number of MLP expansion neurons
# (read from the layer itself: nn.Linear weights are out_features X in_features)
nneurons = model.transformer.h[0].mlp.c_fc.weight.shape[0]
nneurons

# Exercise 2: Import and tokenize fineweb

In [None]:
# stream the dataset so that nothing is downloaded in full
fineweb = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True)
fw_iterator = iter(fineweb)  # create iterator

# get multiple examples:
for _ in range(5):
  example = next(fw_iterator)

  # show the beginning of each document (full documents can be very long)
  print(example['text'][:200])
  print('\n' + '-'*40 + '\n')

In [None]:
# how many tokens in total
# (a multiple of 256 so the tokens can later be reshaped into equal-length sequences)
desiredTokenCount = 8192

# initialize empty tensor (must be ints!)
allTokens = torch.tensor([],dtype=torch.long)
allTokenLengths = torch.tensor([],dtype=torch.long)

# reinitialize iterator
fw_iterator = iter(fineweb)


# keep importing data until we have enough
while len(allTokens)<desiredTokenCount:

  # import the text
  text = next(fw_iterator)['text']

  # tokenize
  # (encode to a plain list first so that empty documents still give an int tensor)
  tokenIds = tokenizer.encode(text)
  tokens = torch.tensor(tokenIds,dtype=torch.long)

  # get token lengths
  # (number of characters in each decoded token, including any leading space)
  tokenLengths = torch.tensor([ len(tokenizer.decode([t])) for t in tokenIds ],dtype=torch.long)

  # stack the tokens and the lengths
  allTokens = torch.cat((allTokens,tokens))
  allTokenLengths = torch.cat((allTokenLengths,tokenLengths))


# trim the vectors
# (the last document usually overshoots the desired count)
allTokens = allTokens[:desiredTokenCount]
allTokenLengths = allTokenLengths[:desiredTokenCount]

# tokens and lengths must stay aligned element-by-element
assert allTokens.shape==allTokenLengths.shape

print(allTokens.shape)
print(allTokenLengths.shape)

In [None]:
# bar plot of token counts, with median
# u = unique token lengths, c = how many tokens have each length
u,c = np.unique(allTokenLengths.numpy(),return_counts=True)

# torch.median returns an element of the data (the lower of the two middle values
# for even counts), so some tokens are exactly equal to the median
medianTokLength = torch.median(allTokenLengths).item()

# make the bar graph
plt.figure(figsize=(10,4))
plt.bar(u,c)
plt.axvline(medianTokLength,color='r',linestyle='--',label='Median')

plt.legend()
plt.gca().set(xlabel='Token character count',ylabel='Frequency',title='Distribution of token lengths')
plt.show()

In [None]:
# print a summary
# (int() converts the 0-d result of .sum() into a plain number for printing)
nShorter = int((allTokenLengths<medianTokLength).sum())
nLonger  = int((allTokenLengths>medianTokLength).sum())
nEqual   = int((allTokenLengths==medianTokLength).sum())

print(f'There are {nShorter} tokens shorter than the median.')
print(f'There are {nLonger} tokens longer than the median.')
print(f'There are {nEqual} tokens equal to the median.')

# Exercise 3: Get activations

In [None]:
# get a batch of tokens
print(allTokens.shape)

# reshape the token vector into sequences X tokens-per-sequence
seqLength = 256
assert len(allTokens)%seqLength==0, 'The number of tokens must be a multiple of seqLength'

# row-major reshape: flattening the batch again recovers the original token order
batch = allTokens.reshape(-1,seqLength)
batch.shape,type(batch)

In [None]:
# forward pass the batch
# ~1 min on cpu for 125m
# 2 secs on gpu for 1.3B (lol)
# The model output is not needed; the forward hooks fill the activations dictionary.
with torch.no_grad():
  model(batch.to(model.device)) # the tokens must be on the same device as the model

In [None]:
# show which keys are currently stored in the activations dictionary
activations.keys()

In [None]:
# check shape -- should be batch X tokens X nneurons
# (the key 'mlp_10' is only present if the model has at least 11 hooked layers)
activations['mlp_10'].shape

# Exercise 4: Activations distributions by median split

In [None]:
# extract and flatten activations
# (batch X tokens X neurons -> all-tokens X neurons; row order matches allTokenLengths)
acts = activations['mlp_4'].flatten(end_dim=1).cpu()

# boolean masks for the median split
isShort  = allTokenLengths<medianTokLength
isLong   = allTokenLengths>medianTokLength
isMedian = allTokenLengths==medianTokLength

# activations by length split
# (fixed range; adjust if most of the activations fall outside of it)
binedges = torch.linspace(-5,3,101)
yS,_ = torch.histogram(acts[isShort,:].flatten(),bins=binedges,density=True)
yL,_ = torch.histogram(acts[isLong,:].flatten(),bins=binedges,density=True)
yM,_ = torch.histogram(acts[isMedian,:].flatten(),bins=binedges,density=True)

# the histogram values are plotted at the bin centers
bincenters = (binedges[:-1]+binedges[1:])/2

# visualize
plt.figure(figsize=(10,5))
plt.plot(bincenters,yS,linewidth=2,label='Short tokens')
plt.plot(bincenters,yL,linewidth=2,label='Long tokens')
plt.plot(bincenters,yM,linewidth=2,label='Median tokens')

plt.gca().set(xlim=binedges[[0,-1]].tolist(),xlabel='Activations',ylabel='Density',
              title='Distribution of activations by token length')

plt.legend()
plt.show()

# Exercise 5: Activation-length correlations in one layer

In [None]:
# get the flattened activations and numpyify
# (batch X tokens X neurons -> all-tokens X neurons)
acts = activations['mlp_4'].flatten(end_dim=1).cpu().numpy()

# standardize the activations from all neurons
# (each column separately; ddof=1 gives the sample standard deviation)
# note: a neuron with zero variance produces NaNs here
zacts = (acts-acts.mean(axis=0,keepdims=True)) / acts.std(axis=0,ddof=1,keepdims=True)

In [None]:
# confirm the standardization on one arbitrary neuron (index 600):
# shows the matrix shape, the column mean, and the column sample std (ddof=1)
zacts.shape, zacts[:,600].mean(), zacts[:,600].std(ddof=1)

In [None]:
# normalize the token lengths
# (convert the integer tensor to a float numpy vector, then z-score with the sample std)
tokenLens = allTokenLengths.numpy().astype(float)
zTokenLens = (tokenLens-tokenLens.mean()) / tokenLens.std(ddof=1)

# confirm
zTokenLens.mean(), zTokenLens.std(ddof=1)

In [None]:
# confirm one correlation value
# (np.corrcoef returns a 2x2 correlation matrix; the off-diagonal entries are
#  the correlation between the first column of acts and the token lengths)
np.corrcoef(acts[:,0],allTokenLengths)

In [None]:
# covariance of standardized variables
# (the covariance of two z-scored variables equals their correlation coefficient,
#  so this should match the off-diagonal value from np.corrcoef)
np.sum(zacts[:,0]*zTokenLens) / (len(zTokenLens)-1)

In [None]:
# calculate all correlation coefficients
allCorrs = np.zeros(nneurons)

# correlation = sum of products of z-scores, divided by N-1
for ni in range(nneurons):
  allCorrs[ni] = np.sum(zacts[:,ni]*zTokenLens) / (len(zTokenLens)-1)

In [None]:
# and visualize!
plt.figure(figsize=(8,4))

# drop non-finite correlations (e.g., from zero-variance neurons), which plt.hist cannot bin
plt.hist(allCorrs[np.isfinite(allCorrs)],bins=80)

plt.gca().set(xlabel='Correlation coefficient',ylabel='Count',title='Histogram of all correlation coefficients')
plt.show()

# Exercise 6: Correlations in all layers

In [None]:
# correlation matrix: layers X neurons
allCorrs = np.zeros((model.config.num_layers,nneurons))

# loop over all the layers
for layeri in range(model.config.num_layers):

  # get and normalize the activations
  # (flatten to all-tokens X neurons, then z-score each neuron with the sample std)
  acts = activations[f'mlp_{layeri}'].flatten(end_dim=1).cpu().numpy()
  zacts = (acts-acts.mean(axis=0,keepdims=True)) / acts.std(axis=0,ddof=1,keepdims=True)

  # loop over all the neurons and correlate
  # (covariance of z-scored variables = correlation coefficient)
  for ni in range(nneurons):
    allCorrs[layeri,ni] = np.sum(zacts[:,ni]*zTokenLens) / (len(zTokenLens)-1)

In [None]:
# histograms
# symmetric bin edges that cover the largest observed |r| (NaNs are ignored)
rMax = np.nanmax(np.abs(allCorrs))
rEdges = np.linspace(-rMax,rMax,81)
rHistCounts = np.zeros((model.config.num_layers,len(rEdges)-1))

# get histogram of each layer
for layeri in range(model.config.num_layers):

  # keep only finite correlations (zero-variance neurons give NaNs)
  layerCorrs = allCorrs[layeri,:]
  layerCorrs = layerCorrs[np.isfinite(layerCorrs)]

  rHistCounts[layeri,:],_ = np.histogram(layerCorrs,bins=rEdges,density=True)

In [None]:
# and visualize
fig,axs = plt.subplots(1,2,figsize=(12,4))

# the histogram values are plotted at the bin centers
rCenters = (rEdges[:-1]+rEdges[1:])/2

# one line per layer, colored by layer number
for layeri in range(model.config.num_layers):
  axs[0].plot(rCenters,rHistCounts[layeri,:],color=mpl.cm.plasma(layeri/(model.config.num_layers-1)),label=f'MLP h.{layeri}')

axs[0].set(xlim=rEdges[[0,-1]],xlabel='Correlation coefficient',ylabel='Density',
           title='Histograms of correlations by layer')

# colorbar for line color (layer number)
cmap = mpl.colormaps['plasma']
norm = mpl.colors.BoundaryNorm(np.arange(model.config.num_layers), cmap.N)
sm = mpl.cm.ScalarMappable(cmap=cmap, norm=norm)
cbar = fig.colorbar(sm, ax=axs[0], pad=.01)
cbar.set_label('Transformer block')


# image
# (rows are layers, columns are correlation bins; the extent maps the axes to those units)
h = axs[1].imshow(rHistCounts,aspect='auto',origin='lower',
                  extent=[rEdges[0],rEdges[-1],0,model.config.num_layers])
axs[1].set(xlabel='Correlation coefficient',ylabel='Transformer block',title='Image of all histograms')
fig.colorbar(h,ax=axs[1],pad=.01)

plt.tight_layout()
plt.show()