|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying circuits and components</h1>|
|<h2>Lecture:</h2>|<h1><b>Sparse linear probing: theory and code</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# functions for implementing and evaluating the logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained 'gpt2' checkpoint together with its matching tokenizer
gpt2_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name)
model = AutoModelForCausalLM.from_pretrained(gpt2_name)

# switch to evaluation mode; as the last expression this also echoes the model
model.eval()

In [None]:
# hook the post-GELU MLP activations
# (storage for the hooked outputs, keyed by 'mlp_<layer number>')
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores a module's output for one layer.

  Parameters
  ----------
  layer_number : int
    Layer index; it is only used to build the dictionary key.

  Returns
  -------
  callable
    A function with the (module, inputs, output) forward-hook signature that
    writes `output` into `activations[f'mlp_{layer_number}']` as a numpy array.
  """
  def hook(module, inputs, output):
    # detach from the autograd graph and move to host memory before converting:
    # .numpy() only works on CPU tensors (.cpu() changes nothing if already there)
    activations[f'mlp_{layer_number}'] = output.detach().cpu().numpy()
  return hook

# hook the MLP post-gelu activations
layer2hook = 3

# Re-running this cell would otherwise stack another hook on top of the old one,
# so remove the hook registered by a previous run before adding the new one.
if 'hook_handle' in globals():
  hook_handle.remove()

# keep the returned handle so the hook can be removed again later
hook_handle = model.transformer.h[layer2hook].mlp.act.register_forward_hook(implant_hook(layer2hook))

# The data

In [None]:
# import fineweb
!pip install datatrove
from datatrove.pipeline.readers import ParquetReader

# get some data
numDocs = 500 # how many documents to retrive; each doc has ~750 tokens
data_reader = ParquetReader('hf://datasets/HuggingFaceFW/fineweb/data',limit=numDocs)

# join all texts into one token vector
tokens = np.array([],dtype=int)
for t in data_reader():
  tokens = np.append(tokens,tokenizer.encode(t.text))

In [None]:
# find all the "the" and "an" token indices
# encode() returns a list; unpack it into a single id so that the comparison is
# against a scalar (the unpacking fails loudly if a word is not exactly one token)
(the_id,) = tokenizer.encode(' the')
(an_id,)  = tokenizer.encode(' an')

the_tokens = np.where(tokens==the_id)[0]
an_tokens  = np.where(tokens==an_id)[0]

len(the_tokens),len(an_tokens)

In [None]:
# create batches
samplesize = 100

# each sequence is tokens[p-context_pre : p+context_pst] around a target at p,
# so the target token sits at index `context_pre` within its sequence
context_pre = 14
context_pst =  3

def make_batch(target_positions):
  """Collect `samplesize` context windows around whole-word target tokens.

  Parameters
  ----------
  target_positions : iterable of int
    Indices into `tokens` at which the target token occurs.

  Returns
  -------
  np.ndarray of int, shape (samplesize, context_pre+context_pst)
    Row k is `tokens[p-context_pre:p+context_pst]` for the k-th accepted position p.

  Raises
  ------
  ValueError
    If fewer than `samplesize` usable targets are found (the batch would
    otherwise be padded with all-zero rows).
  """
  batch = np.zeros((samplesize,context_pre+context_pst),dtype=int)
  nfound = 0

  for pos in target_positions:

    # stop as soon as the batch is full (no need to decode the remaining targets)
    if nfound==samplesize:
      break

    # skip targets too close to either end of the token vector: the window would
    # be truncated, or there would be no following token to inspect
    if pos<context_pre or pos+max(context_pst,2)>len(tokens):
      continue

    # it's a whole word if the next token starts with a space
    if tokenizer.decode(tokens[pos+1]).startswith(' '):
      batch[nfound,:] = tokens[pos-context_pre:pos+context_pst]
      nfound += 1

  if nfound<samplesize:
    raise ValueError(f'Only {nfound} of {samplesize} requested sequences were found; '
                     'load more documents or reduce samplesize.')
  return batch

# one batch per target word
the_batch = make_batch(the_tokens)
an_batch  = make_batch(an_tokens)

In [None]:
# decode seven randomly chosen sequences from each batch
for header,batch in (('THE sequences',the_batch),('\n\nAN sequences',an_batch)):
  print(header)
  for i in np.random.randint(0,samplesize,7):
    print('::',tokenizer.decode(batch[i]))

# FYI, more about preceding spaces

In [None]:
# the four combinations of (text with/without leading space, add_prefix_space on/off)
cases = [('the',True),('the',False),(' the',True),(' the',False)]
t1,t2,t3,t4 = [tokenizer.encode(txt,add_prefix_space=flag) for txt,flag in cases]

# report the first token id of each encoding and the text it decodes back to
for k,((txt,flag),toks) in enumerate(zip(cases,(t1,t2,t3,t4))):
  lead = '' if k==0 else '\n' # blank line between reports, but not before the first
  print(f"{lead}('{txt}',add_prefix_space={flag}):")
  print(f'  token {toks[0]} -> "{tokenizer.decode(toks)}"')

# Create the dataset for the logistic regression (linear probe)

In [None]:
# forward passes: every call to the model refills `activations` through the hook,
# so the slice at the target position is taken right after the matching pass
hook_key = f'mlp_{layer2hook}'

with torch.no_grad():
  model(torch.tensor(the_batch))
the_activations = activations[hook_key][:,context_pre,:]

with torch.no_grad():
  model(torch.tensor(an_batch))
an_activations = activations[hook_key][:,context_pre,:]

# the last dimension counts the neurons
nneurons = the_activations.shape[-1]

# report both shapes
for name,acts in (('THE',the_activations),('AN',an_activations)):
  print(f'{name} activations have size',acts.shape)

In [None]:
# pool the activations of all neurons and samples into one 100-bin histogram per word
yThe,xThe = np.histogram(the_activations.ravel(),bins=100)
yAn, xAn  = np.histogram(an_activations.ravel(),bins=100)

# counts are drawn against the left edge of each bin, on a log-scaled count axis
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(xThe[:-1],yThe,linewidth=2,label='The')
ax.plot(xAn[:-1], yAn,linewidth=2,label='An')

ax.set(xlabel='Activation',ylabel='Count',yscale='log')
ax.legend()
plt.show()

In [None]:
# data and labels (0 = "the", 1 = "an")
# the label counts follow the actual number of rows in each activation matrix
alldata = np.concatenate((the_activations,an_activations),axis=0)
labels  = np.array([0]*len(the_activations) + [1]*len(an_activations))

# split into train/test (70/30%), keeping the class proportions in both parts;
# random_state pins the shuffle so a fresh run reproduces the same split
X_train, X_test, y_train, y_test = train_test_split(alldata, labels, test_size=.3,
                                                    stratify=labels, random_state=0)

# print sizes
print('Size of train data:',X_train.shape)
print('Size of test data:', X_test.shape)

# The logistic regression

In [None]:
# fit the model (C is 1/lambda)
# the saga solver shuffles the data, so random_state is fixed to obtain the
# same coefficients on every run
logreg = LogisticRegression(penalty='l1', max_iter=1000, solver='saga', C=10, random_state=0)
logreg.fit(X_train,y_train)

# generate predictions for the held-out data and summarize per-class performance
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['the','an']))

In [None]:
# Reminder:
#   sparsity: N(zero coeffs) / N(coeffs)
#   density:  N(nonzero coeffs) / N(coeffs)

In [None]:
# pull the weight vector out of the fitted probe and sort the neuron indices
# into those with an exactly-zero weight and all the others
coeffs = np.squeeze(logreg.coef_)
isZero = coeffs==0
zeroCoeffs = np.flatnonzero(isZero)
nonzeroCoeffs = np.flatnonzero(~isZero)

# percent of coefficients that are non-zero
density_pct = 100*len(nonzeroCoeffs)/nneurons


# figure layout: one wide panel (two of three columns) plus one narrow panel
fig = plt.figure(figsize=(12,4))
gs  = GridSpec(1,3,figure=fig)
ax1 = fig.add_subplot(gs[:2])
ax2 = fig.add_subplot(gs[2])


# wide panel: every coefficient by neuron index (exact zeros are drawn underneath)
ax1.plot(nonzeroCoeffs,coeffs[nonzeroCoeffs],'ko',markerfacecolor=[.7,.9,.7,.7],label='Non-zero')
ax1.plot(zeroCoeffs,coeffs[zeroCoeffs],'rx',zorder=-3,label='Exact zero')
ax1.set(xlabel='MLP neuron',ylabel='Coefficient',xlim=[-5,nneurons+4],
        title=f'L1 probe selected {len(nonzeroCoeffs)}/{nneurons} coefficients ({density_pct:.2f}% dense)')
ax1.legend()

# narrow panel: distribution of the non-zero coefficient values
ax2.hist(coeffs[nonzeroCoeffs],bins=30,color=[.7,.7,.9],edgecolor='k')
ax2.set(xlabel='Coefficient value',ylabel='Count',title='Distribution of non-zero coefficients')

plt.tight_layout()
plt.show()

In [None]:
# activations of the neurons with non-zero coefficients, split by class
nSelected = len(nonzeroCoeffs)

plt.figure(figsize=(12,4))

for i,nidx in enumerate(nonzeroCoeffs):
  # label-0 ("the") samples, jittered just left of the neuron's tick
  y = alldata[labels==0,nidx]
  plt.plot(np.random.randn(len(y))/50+i-.15,y,'ko',markerfacecolor=[.9,.7,.7,.4])

  # label-1 ("an") samples, jittered just right of the neuron's tick
  y = alldata[labels==1,nidx]
  plt.plot(np.random.randn(len(y))/50+i+.15,y,'ks',markerfacecolor=[.7,.9,.7,.4])


# legend entries come from empty lines: nothing is drawn, so no stray marker can
# land inside the axes and the y-axis scaling is determined by the data alone
plt.plot([],[],'ko',markerfacecolor=[.9,.7,.7],label='"The"')
plt.plot([],[],'ks',markerfacecolor=[.7,.9,.7],label='"An"')
plt.legend()

# ticks and limits use the number of selected neurons rather than the loop
# variable, which is stale or undefined when no neuron was selected
plt.gca().set(xlabel='Neurons with non-zero coefficients (index)',ylabel='Activations',
              xticks=range(nSelected),xlim=[-1,nSelected])
plt.show()

# Relation between sparsity and lambda (1/C)

In [None]:
# values of C (= 1/lambda) to test, and the resulting density for each
seas = np.linspace(1,100,17)
densities = np.zeros(len(seas))

for idx,c in enumerate(seas):

  # sweep-specific names, so the probe fitted earlier (`logreg`, `coeffs`) is not
  # overwritten; random_state fixes the solver's shuffling for reproducible results
  sweep_probe = LogisticRegression(penalty='l1', max_iter=1000, solver='saga', C=c, random_state=0)
  sweep_probe.fit(X_train,y_train)

  # density: percent of coefficients that are non-zero
  sweep_coeffs = sweep_probe.coef_.squeeze()
  densities[idx] = 100 * np.count_nonzero(sweep_coeffs)/nneurons

In [None]:
# density of the probe as a function of regularization:
# left panel against lambda (log-scaled axis), right panel against C
_,axs = plt.subplots(1,2,figsize=(12,3))
axs[0].plot(1/seas,densities,'ko-',markerfacecolor=[.7,.7,.9],markersize=10)
# raw string: '\l' is not a valid escape sequence in a regular string literal
axs[0].set(xscale='log',xlabel=r'$\lambda = 1/C$',ylabel='Density (% non-zero neurons)',
           title=r'Thinking about $\lambda$')

axs[1].plot(seas,densities,'ko-',markerfacecolor=[.7,.7,.9],markersize=10)
axs[1].set(xlabel=r'$C = 1/\lambda$',ylabel='Density (% non-zero neurons)',title='Thinking about $C$')

plt.show()