# TransformerLens: Introduction

## Setup

In [None]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
import gdown
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv

# Make sure exercises are in the path
# The part of the current working directory that precedes the first occurrence of the chapter name is used
# as the prefix; if the chapter name does not occur in the cwd at all, the whole cwd is the prefix.
chapter = r"chapter1_transformer_interp"
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = exercises_dir / "part2_intro_to_mech_interp"
# Guarded so that re-running this cell does not append the same entry to sys.path again
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

from arena3.chapter1_transformer_interp.exercises.plotly_utils import imshow, hist, plot_comp_scores, plot_logit_attribution, plot_loss_difference
from part1_transformer_from_scratch.solutions import get_log_probs
import part2_intro_to_mech_interp.tests as tests

# Saves computation time, since we don't need it for the contents of this notebook
# (this switches gradient tracking off globally, for every cell that follows)
t.set_grad_enabled(False)

# Use the GPU when one is available, otherwise fall back to the CPU
device = t.device("cuda" if t.cuda.is_available() else "cpu")

# True when this code runs as the top-level module
MAIN = __name__ == "__main__"

## Loading and Running Models

In [None]:
# Load the pretrained "gpt2-small" checkpoint by name
gpt2_small: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small")

# A model can instead be built from a config object; the toy model later in this notebook is created
# with `HookedTransformer(cfg)`.

# Print the model's configuration
print(gpt2_small.cfg)

In [None]:
# Sample paragraph to score. The literal is passed to the model verbatim, so editing it in any way
# (including fixing wording) changes the reported loss.
model_description_text = '''## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!'''

# return_type="loss" requests the loss on the text rather than the logits
loss = gpt2_small(model_description_text, return_type="loss")
print("Model loss:", loss)

## Tokenisation

In [None]:
# A single string split into its token strings
print(gpt2_small.to_str_tokens("gpt2"))
print()
# The same call on a list of strings
print(gpt2_small.to_str_tokens(["gpt2", "gpt2"]))
print()
# The string converted to token ids
print(gpt2_small.to_tokens("gpt2"))
print()
# Token ids converted back to text
print(gpt2_small.to_string([50256, 70, 457, 17]))

### Exercise: model performance

In [None]:
# Greedy next-token predictions. The logits at position i predict the token at position i + 1, so the
# prediction made at the final position has nothing to be compared against and is dropped.
logits = gpt2_small(model_description_text, return_type="logits")
prediction = logits.argmax(dim=-1).squeeze()[:-1]
# The targets are the same tokens shifted by one: the first token is dropped because no position predicts it.
expected = gpt2_small.to_tokens(model_description_text).squeeze()[1:]
num_correct = (prediction == expected).sum()

# `expected` holds the true next tokens, so its length is the number of predictions being scored.
print(f"Correctly identified {num_correct}/{len(expected)}, or {num_correct/len(expected)*100:.2f}% of tokens. Correct tokens:")
print(f"{gpt2_small.to_str_tokens(prediction[prediction == expected])}")

## Caching all Activations

In [None]:
gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
gpt2_tokens = gpt2_small.to_tokens(gpt2_text)
# Run the model and keep its intermediate activations; with remove_batch_dim=True the cells below index the
# cached tensors without a leading batch axis.
gpt2_logits, gpt2_cache = gpt2_small.run_with_cache(gpt2_tokens, remove_batch_dim=True)

# List the names under which activations were cached
print(gpt2_cache.keys())

In [None]:
# Two ways of addressing the same cached activation
attn_patterns_layer_0 = gpt2_cache["pattern", 0] # shorthand

attn_patterns_layer_0_copy = gpt2_cache["blocks.0.attn.hook_pattern"] # direct cache indexing

# Raises if the two lookups do not give matching tensors
t.testing.assert_close(attn_patterns_layer_0, attn_patterns_layer_0_copy)

### Exercise:   Verify Activations

In [None]:
# Reference value: the layer-0 attention pattern as stored in the cache
layer0_pattern_from_cache = gpt2_cache["pattern", 0]

# Recompute the raw attention scores from the cached queries and keys: both are indexed as
# (seq, head, d_head) here, and the einsum contracts over d_head, giving (head, seqQ, seqK).
layer0_attn_scores = einops.einsum(gpt2_cache["q", 0], gpt2_cache["k", 0], "seqQ n h, seqK n h -> n seqQ seqK")

# Causal mask: True strictly above the diagonal of the last two axes, i.e. wherever seqK > seqQ
mask = t.triu(t.ones_like(layer0_attn_scores, dtype=bool), diagonal=1).to(device)

# The mask is applied before the 1/sqrt(d_head) scaling; the fill value is still hugely negative after the
# division, so the masked positions get (numerically) zero weight from the softmax.
layer0_attn_scores[mask] = -1.0e9
layer0_pattern_from_q_and_k = (layer0_attn_scores / gpt2_small.cfg.d_head**0.5).softmax(dim=-1)

t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
print("Tests passed!")

# Visualising Attention Heads

In [None]:
print(type(gpt2_cache))
# Layer-0 attention patterns, one 2D pattern per head (the batch axis was removed when caching)
attention_pattern = gpt2_cache["pattern", 0]
print(attention_pattern.shape)
gpt2_str_tokens = gpt2_small.to_str_tokens(gpt2_text)

print("Layer 0 Head Attention Patterns:")
display(cv.attention.attention_patterns(
    tokens=gpt2_str_tokens,
    attention=attention_pattern,
    # One name per head, taken from the model config instead of a hard-coded 12
    attention_head_names=[f"L0H{i}" for i in range(gpt2_small.cfg.n_heads)],
))

## Visualising Neuron Activations 

In [None]:
# Collect the cached "post" activations of every layer and stack them along a new axis 1
neuron_activations_for_all_layers = t.stack([
    gpt2_cache["post", layer] for layer in range(gpt2_small.cfg.n_layers)
], dim=1)
# shape = (seq_pos, layers, neurons)

# Left as the last expression of the cell so that the returned visualisation is rendered as the cell output
cv.activations.text_neuron_activations(
    tokens=gpt2_str_tokens,
    activations=neuron_activations_for_all_layers
)

In [None]:
# Reorder (seq, layers, neurons) -> (1, layers, seq, neurons) and convert to NumPy for the visualisation
neuron_activations_for_all_layers_rearranged = utils.to_numpy(einops.rearrange(neuron_activations_for_all_layers, "seq layers neurons -> 1 layers seq neurons"))

# Left as the last expression of the cell so that the returned visualisation is rendered as the cell output
cv.topk_tokens.topk_tokens(
    # Some weird indexing required here ¯\_(ツ)_/¯
    tokens=[gpt2_str_tokens],
    activations=neuron_activations_for_all_layers_rearranged,
    max_k=7,
    first_dimension_name="Layer",
    third_dimension_name="Neuron",
    # One label per layer, taken from the model config instead of a hard-coded 12
    first_dimension_labels=list(range(gpt2_small.cfg.n_layers))
)

# Finding induction heads

## Toy Attention-Only Model

In [None]:
# Configuration of the 2-layer, attention-only toy model used in the rest of this section
cfg = HookedTransformerConfig(
    d_model=768,
    d_head=64,
    n_heads=12,
    n_layers=2,
    n_ctx=2048,
    d_vocab=50278,
    attention_dir="causal",
    attn_only=True, # defaults to False
    tokenizer_name="EleutherAI/gpt-neox-20b", 
    seed=398,
    use_attn_result=True, # later cells read per-head outputs from the cache via cache["result", layer]
    normalization_type=None, # defaults to "LN", i.e. layernorm with weights & biases
    positional_embedding_type="shortformer"
)

In [None]:
from huggingface_hub import hf_hub_download

# Location of the pretrained toy-model weights on the Hugging Face Hub
REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

# Fetch the file; the returned value is the path that is handed to t.load in the next cell
weights_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

In [None]:
# Build an untrained model from the config above, then overwrite its parameters with the downloaded ones
model = HookedTransformer(cfg)
# NOTE(review): t.load unpickles the file, and the file comes from a remote repository. If the installed
# torch version supports it, consider passing weights_only=True so that only tensor data is deserialised.
pretrained_weights = t.load(weights_path, map_location=device)
model.load_state_dict(pretrained_weights)

### Exercise: visualise attention patterns

In [None]:
text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
# Run the toy model on the text, caching activations without a batch axis
logits, cache = model.run_with_cache(text, remove_batch_dim=True)

str_tokens = model.to_str_tokens(text)
# One visualisation per layer, each showing that layer's attention patterns
for layer_idx in range(model.cfg.n_layers):
    attention_pattern = cache["pattern", layer_idx]
    display(cv.attention.attention_heads(tokens=str_tokens, attention=attention_pattern))

### Exercise: write own detectors

In [None]:
# A head is reported when its mean attention on the positions of interest is strictly greater than this value
THRESH = 0.4


def _heads_scoring_above_thresh(cache: ActivationCache, score_fn) -> List[str]:
    '''
    Shared scan used by the three detectors below.

    For every (layer, head) of `model`, `score_fn` is called with that head's attention pattern
    (`cache["pattern", layer][head]`, indexed as [dest_pos, source_pos]) and must return a scalar score.

    Returns the heads whose score is strictly greater than THRESH, in layer-major order, each formatted
    as "layer.head (score)" with the score shown to four decimal places.
    '''
    attn_heads = []
    for layer in range(model.cfg.n_layers):
        # One cache lookup per layer rather than one per head
        layer_patterns = cache["pattern", layer]
        for head in range(model.cfg.n_heads):
            score = score_fn(layer_patterns[head])
            if score > THRESH:
                attn_heads.append(f"{layer}.{head} ({score:.4f})")
    return attn_heads


def current_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2 (0.5123)", "1.4 (0.4310)"] of "layer.head (score)" which you judge to be
    current-token heads

    The score is the mean of the main diagonal of the pattern, i.e. the average attention each position
    pays to itself.
    '''
    return _heads_scoring_above_thresh(cache, lambda pattern: pattern.diagonal().mean())


def prev_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2 (0.5123)", "1.4 (0.4310)"] of "layer.head (score)" which you judge to be
    prev-token heads

    The score is the mean of the first sub-diagonal of the pattern, i.e. the average attention each
    position pays to the position immediately before it.
    '''
    return _heads_scoring_above_thresh(cache, lambda pattern: pattern.diagonal(-1).mean())


def first_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2 (0.5123)", "1.4 (0.4310)"] of "layer.head (score)" which you judge to be
    first-token heads

    The score is the mean of the first column of the pattern, i.e. the average attention paid to
    position 0.
    '''
    return _heads_scoring_above_thresh(cache, lambda pattern: pattern[:, 0].mean())

In [None]:
# Report the heads each detector finds on the first text (`cache` comes from the run on `text`)
print("Heads attending to current token  = ", ", ".join(current_attn_detector(cache)))
print("Heads attending to previous token = ", ", ".join(prev_attn_detector(cache)))
print("Heads attending to first token    = ", ", ".join(first_attn_detector(cache)))

#### Different text:

In [None]:
text2 = "The goal of mechanistic interpretability is to take a trained model and reverse engineer the algorithms the model learned during training from its weights. It is a fact about the world today that we have computer programs that can essentially speak English at a human level (GPT-3, PaLM, etc), yet we have no idea how they work nor how to write one ourselves. This offends me greatly, and I would like to solve this! Mechanistic interpretability is a very young and small field, and there are a lot of open problems - if you would like to help, please try working on one!"
# Same toy model, different text, to check whether the detectors pick out the same heads
logits2, cache2 = model.run_with_cache(text2, remove_batch_dim=True)

# Optional: uncomment to view the attention patterns for the second text (reads from `cache2`, not `cache`)
# str_tokens2 = model.to_str_tokens(text2)
# for layer_idx in range(model.cfg.n_layers):
#     attention_pattern = cache2["pattern", layer_idx]
#     display(cv.attention.attention_heads(tokens=str_tokens2, attention=attention_pattern))

print("Heads attending to current token  = ", ", ".join(current_attn_detector(cache2)))
print("Heads attending to previous token = ", ", ".join(prev_attn_detector(cache2)))
print("Heads attending to first token    = ", ", ".join(first_attn_detector(cache2)))

**The same heads are detected as for the first text**, with similar scores.

## Checking for the induction capability

### Exercise: plot per-token loss on repeated sequence

In [None]:
def generate_repeated_tokens(
    model: HookedTransformer, seq_len: int, batch: int = 1
) -> Int[Tensor, "batch full_seq_len"]:
    '''
    Builds a batch of sequences of the form (BOS, random block, the same random block again).

    The random block holds `seq_len` token ids drawn uniformly from [0, model.cfg.d_vocab); the BOS id is
    taken from the model's tokenizer. Everything is created on the notebook-level `device`.

    Outputs are:
        rep_tokens: [batch, 1+2*seq_len]
    '''
    bos_column = t.full((batch, 1), model.tokenizer.bos_token_id, dtype=t.long, device=device)
    random_block = t.randint(0, model.cfg.d_vocab, (batch, seq_len), dtype=t.long, device=device)
    repeated = t.cat([bos_column, random_block, random_block], dim=-1)
    return repeated.to(device)

def run_and_cache_model_repeated_tokens(model: HookedTransformer, seq_len: int, batch: int = 1) -> Tuple[t.Tensor, t.Tensor, ActivationCache]:
    '''
    Samples a batch of repeated random-token sequences with `generate_repeated_tokens`, runs the model on
    it with caching enabled, and hands back the tokens together with the model outputs.

    Returns, in this order:
        rep_tokens: [batch, 1+2*seq_len]
        rep_logits: [batch, 1+2*seq_len, d_vocab]
        rep_cache: The cache of the model run on rep_tokens
    '''
    rep_tokens = generate_repeated_tokens(model, seq_len, batch)
    rep_logits, rep_cache = model.run_with_cache(rep_tokens)
    return rep_tokens, rep_logits, rep_cache


seq_len = 50
batch = 1
(rep_tokens, rep_logits, rep_cache) = run_and_cache_model_repeated_tokens(model, seq_len, batch)
# The cache was created with a batch axis (of size 1 here); drop it so later cells can index by position
rep_cache.remove_batch_dim()
rep_str = model.to_str_tokens(rep_tokens)
model.reset_hooks()
# presumably one log-prob per predicted token, i.e. 2*seq_len values after squeezing — verify against
# get_log_probs in part1_transformer_from_scratch.solutions
log_probs = get_log_probs(rep_logits, rep_tokens).squeeze()

# Compare the first seq_len entries with the remaining ones
print(f"Performance on the first half: {log_probs[:seq_len].mean():.3f}")
print(f"Performance on the second half: {log_probs[seq_len:].mean():.3f}")

plot_loss_difference(log_probs, rep_str, seq_len)

### Looking for Induction Attention Patterns

In [None]:
# Show the attention patterns of every layer of the toy model on the repeated random sequence
for layer in range(model.cfg.n_layers):
    attention_pattern = rep_cache["pattern", layer]
    display(cv.attention.attention_heads(tokens=rep_str, attention=attention_pattern))

### Exercise: make an induction detector

In [None]:
# Inspect the cached embedding's shape; the induction detector below reads the sequence length from shape[0]
print(rep_cache['hook_embed'].shape)

In [None]:
def induction_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be induction heads

    Remember - the tokens used to generate rep_cache are (bos_token, *rand_tokens, *rand_tokens)

    A head is reported when the mean of its attention pattern along the diagonal at offset
    1 - half_len is strictly greater than 0.4, where half_len is the length of one copy of the random
    tokens, recovered from the cached embedding's first dimension.
    '''
    total_len = cache["hook_embed"].shape[0]
    half_len = (total_len - 1) // 2
    # An induction head attends from a token in the second copy to the token *after* that token's first
    # occurrence, which sits half_len - 1 positions back: hence the "1 -".
    induction_offset = 1 - half_len
    return [
        f"{layer}.{head}"
        for layer in range(model.cfg.n_layers)
        for head in range(model.cfg.n_heads)
        if cache["pattern", layer][head].diagonal(offset=induction_offset).mean() > 0.4
    ]


print("Induction heads = ", ", ".join(induction_attn_detector(rep_cache)))

# TransformerLens: Hooks

### Exercise: calculate induction scores with hooks

In [None]:
# seq_len is read by induction_score_hook (defined below) at the time the hook runs
seq_len = 50
batch = 10
rep_tokens_10 = generate_repeated_tokens(model, seq_len, batch)

# We make a tensor to store the induction score for each head.
# We put it on the model's device to avoid needing to move things between the GPU and CPU, which can be slow.
induction_score_store = t.zeros((model.cfg.n_layers, model.cfg.n_heads), device=model.cfg.device)


def induction_score_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    '''
    Writes one induction score per head into row `hook.layer()` of the notebook-level
    `induction_score_store` tensor.

    The score of a head is its attention averaged over the batch and over the diagonal at offset
    1 - seq_len of the (dest_pos, source_pos) plane, with `seq_len` read from the notebook-level variable.
    Nothing is returned.
    '''
    layer = hook.layer()
    # (batch, head, dest, source) -> (batch, head, positions along the chosen diagonal)
    induction_stripe = pattern.diagonal(offset=1 - seq_len, dim1=-2, dim2=-1)
    induction_score_store[layer, :] = induction_stripe.mean(dim=(0, 2))

# Selects every hook point whose name ends with "pattern"
pattern_hook_names_filter = lambda name: name.endswith("pattern")

# Run with hooks (this is where we write to the `induction_score_store` tensor)
model.run_with_hooks(
    rep_tokens_10, 
    return_type=None, # For efficiency, we don't need to calculate the logits
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

# Plot the induction scores for each head in each layer
imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"}, 
    title="Induction Score by Head", 
    text_auto=".2f",
    width=900, height=400
)

### Exercise: find induction heads in GPT2-small

In [None]:
def visualize_pattern_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    '''
    Prints the layer of the hook point and displays that layer's attention patterns, averaged over the
    batch dimension.

    NOTE(review): the token labels always come from the notebook-level `rep_tokens[0]`, decoded with
    gpt2_small's tokenizer, regardless of which tokens the model was actually run on - confirm that the
    caller runs the model on `rep_tokens`.
    '''
    print("Layer: ", hook.layer())
    token_labels = gpt2_small.to_str_tokens(rep_tokens[0])
    batch_mean_pattern = pattern.mean(0)
    display(cv.attention.attention_patterns(tokens=token_labels, attention=batch_mean_pattern))

In [None]:
seq_len = 50
batch = 10
# Repeated random tokens sampled for gpt2_small this time (its own vocabulary size and BOS id)
tokens = generate_repeated_tokens(gpt2_small, seq_len, batch)
print(f"Tokens: {tokens}")
print(f"Layers: {gpt2_small.cfg.n_layers}")
print(f"Heads: {gpt2_small.cfg.n_heads}")

# Rebind the store to a fresh tensor sized for gpt2_small; induction_score_hook looks the name up when it
# runs, so it writes into this new tensor.
induction_score_store = t.zeros((gpt2_small.cfg.n_layers, gpt2_small.cfg.n_heads), device=gpt2_small.cfg.device)

gpt2_small.run_with_hooks(
    tokens, 
    return_type=None, # For efficiency, we don't need to calculate the logits
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"},
    title="Induction Score by Head", 
    text_auto=".1f",
    width=800
)

Heads 5.1, 5.5 and 6.9 (among others) look strongly induction-like.
Head 9.0 is also worth checking out of interest.

In [None]:
# `rep_tokens` was sampled for the toy model: its ids are drawn from that model's d_vocab and it starts with
# that tokenizer's BOS id, neither of which is guaranteed to be valid for gpt2_small. Sample a fresh
# single-sequence batch from gpt2_small's own vocabulary instead.
gpt2_rep_tokens = generate_repeated_tokens(gpt2_small, seq_len, batch=1)
gpt2_rep_str_tokens = gpt2_small.to_str_tokens(gpt2_rep_tokens[0])


def visualize_gpt2_pattern_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    '''
    Prints the layer of the hook point and displays that layer's batch-averaged attention patterns,
    labelled with the tokens that were actually fed to gpt2_small (`gpt2_rep_tokens`).
    '''
    print("Layer: ", hook.layer())
    display(
        cv.attention.attention_patterns(
            tokens=gpt2_rep_str_tokens,
            attention=pattern.mean(0)
        )
    )


for induction_head_layer in [5, 9]:
    gpt2_small.run_with_hooks(
        gpt2_rep_tokens,
        return_type=None, # For efficiency, we don't need to calculate the logits
        fwd_hooks=[
            (utils.get_act_name("pattern", induction_head_layer), visualize_gpt2_pattern_hook)
        ]
    )

## Building interpretability tools 

### Exercise: build logit attribution tool

In [None]:
def logit_attribution(
    embed: Float[Tensor, "seq d_model"],
    l1_results: Float[Tensor, "seq nheads d_model"],
    l2_results: Float[Tensor, "seq nheads d_model"],
    W_U: Float[Tensor, "d_model d_vocab"],
    tokens: Int[Tensor, "seq"]
) -> Float[Tensor, "seq-1 n_components"]:
    '''
    Splits the logit of each correct next token into the contribution of every model component.

    Inputs:
        embed: the embeddings of the tokens (i.e. token + position embeddings)
        l1_results: the outputs of the attention heads of the first layer, i.e. layer index 0
            (with head as one of the dimensions)
        l2_results: the outputs of the attention heads of the second layer, i.e. layer index 1
            (with head as one of the dimensions)
        W_U: the unembedding matrix
        tokens: the token ids of the sequence

    Returns:
        Tensor of shape (seq_len-1, n_components)
        represents the concatenation (along dim=-1) of logit attributions from:
            the direct path (seq-1,1)
            the first layer's heads, from l1_results (seq-1, n_heads)
            the second layer's heads, from l2_results (seq-1, n_heads)
        so n_components = 1 + 2*n_heads

        Row i holds the contributions to the logit of tokens[i+1] as predicted at position i.
    '''
    # Column i is the unembedding direction of the token that actually follows position i
    W_U_correct_tokens = W_U[:, tokens[1:]]

    # The final position has no next token to predict, so it is dropped from every component.
    # In each einsum the position axis is shared, so position i is only paired with column i.
    embed_out = einops.einsum(embed[:-1], W_U_correct_tokens, "pos d_model, d_model pos -> pos")
    l1_out = einops.einsum(l1_results[:-1], W_U_correct_tokens, "pos nheads d_model, d_model pos -> pos nheads")
    l2_out = einops.einsum(l2_results[:-1], W_U_correct_tokens, "pos nheads d_model, d_model pos -> pos nheads")

    return t.cat([embed_out.unsqueeze(-1), l1_out, l2_out], dim=-1)


text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
logits, cache = model.run_with_cache(text, remove_batch_dim=True)
str_tokens = model.to_str_tokens(text)
# `tokens` keeps its batch axis here, which is why it is indexed as tokens[0] below
tokens = model.to_tokens(text)

with t.inference_mode():
    embed = cache["embed"]
    l1_results = cache["result", 0]
    l2_results = cache["result", 1]
    logit_attr = logit_attribution(embed, l1_results, l2_results, model.W_U, tokens[0])
    # Uses fancy indexing to get a len(tokens[0])-1 length tensor, where the kth entry is the predicted logit for the correct k+1th token
    correct_token_logits = logits[0, t.arange(len(tokens[0]) - 1), tokens[0, 1:]]
    # Summing the attributions over all components should reproduce the model's logit for the correct token
    t.testing.assert_close(logit_attr.sum(1), correct_token_logits, atol=1e-3, rtol=0)
    print("Tests passed!")

Visualise logit attributions:

In [None]:
# Recompute the per-component attributions for `text` (same inputs as the check in the previous cell)
embed = cache["embed"]
l1_results = cache["result", 0]
l2_results = cache["result", 1]
logit_attr = logit_attribution(embed, l1_results, l2_results, model.W_U, tokens[0])

# NOTE(review): `tokens` still has its batch axis here, while the later calls pass 1D token tensors -
# confirm that plot_logit_attribution accepts both.
plot_logit_attribution(model, logit_attr, tokens)

Observations:

1. The highest logit attribution comes from the direct path (embed). High attributions in the direct path are `| super|_7`, `| more|_12`, `| machine|_24` and especially `| manip|_46`. Why? They **often form very probable bigrams** with the next token. Examples are words split in two (e.g., "manipulative"), or words that are often paired with another (e.g., "more likely").

2. The second layer seems to contribute more than the first layer. Why? A head's attribution cannot pick up on that head's effect in composition with another head. Layer 0 attributions therefore include no compositional effects. In contrast, attributions for layer 1 heads include not only the direct paths through those heads, but also the compositional paths through layer 1 heads combined with heads in layer 0.

### Exercise: logit attribution for the induction heads

In [None]:
seq_len = 50

# Activations of the toy model on the single repeated random sequence (batch axis already removed)
embed = rep_cache["embed"]
l1_results = rep_cache["result", 0]
l2_results = rep_cache["result", 1]
# BOS plus the first copy of the random tokens: seq_len + 1 tokens, hence seq_len predictions
first_half_tokens = rep_tokens[0, : 1 + seq_len]
# Starts at the last token of the first copy, so that the seq_len predictions are exactly the tokens of
# the second copy
second_half_tokens = rep_tokens[0, seq_len:]

# (each with a single call to the `logit_attribution` function)
# The activations are sliced with the same index ranges as the tokens they belong to
first_half_logit_attr = logit_attribution(embed[:seq_len+1], l1_results[:seq_len+1], l2_results[:seq_len+1], model.W_U, first_half_tokens)
second_half_logit_attr = logit_attribution(embed[seq_len:], l1_results[seq_len:], l2_results[seq_len:], model.W_U, second_half_tokens)

# One row per predicted token; one column for the direct path plus one per head in each of the two layers
assert first_half_logit_attr.shape == (seq_len, 2*model.cfg.n_heads + 1)
assert second_half_logit_attr.shape == (seq_len, 2*model.cfg.n_heads + 1)

plot_logit_attribution(model, first_half_logit_attr, first_half_tokens, "Logit attribution (first half of repeated sequence)")
plot_logit_attribution(model, second_half_logit_attr, second_half_tokens, "Logit attribution (second half of repeated sequence)")

Interpretation:
1. Logit attribution for the first half of the sequence (first plot) is pretty meaningless. This sequence is random - it has no structure.
2. Previously, we observed that heads 1.4 and 1.10 seemed to be acting as induction heads. This plot gives further evidence that this is the case, since these two heads have large logit attribution scores on sequences in which the only(?) way to get accurate predictions is to use the induction mechanism. Also, as with our attention-score results, 1.10 is a stronger (induction) head than 1.4.