In [1]:
import math, torch, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "gpt2"          #gpt2-medium, llama-2-7b-hf, etc.
DS_NAME = ("wikitext", "wikitext-103-v1")  

In [3]:
model = HookedTransformer.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.to(DEVICE).eval()

Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cpu


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (h

In [4]:
dataset = load_dataset(*DS_NAME, split="test")
sample_text = dataset["text"][0]            # first line
prompt = sample_text 
print("Prompt:", prompt[:120], "..." if len(prompt) > 120 else "")

Prompt:  


In [5]:
import torch, math

def get_qkv_sa(model, prompt: str, device: str = "cpu"):
    """
    Returns
        traces: {layer: {"Q","K","V","S","A"}}
        token_ids: list[int]
    """
    # tokenise (no <BOS> so positions line up with prompt chars)
    toks = model.to_tokens(prompt, prepend_bos=False)        # (1, L)
    if toks.numel() == 0:
        raise ValueError("Prompt produced zero tokens – supply some text.")

    # trim to context window and move to device as int64
    toks = toks[:, -model.cfg.n_ctx :].to(device, dtype=torch.long)

    with torch.no_grad():
        _, cache = model.run_with_cache(toks)

    d_head = model.cfg.d_head
    out = {}
    for layer in range(model.cfg.n_layers):
        q = cache[f"blocks.{layer}.attn.hook_q"][0]          # (H, L, d_h)
        k = cache[f"blocks.{layer}.attn.hook_k"][0]
        v = cache[f"blocks.{layer}.attn.hook_v"][0]
        s = torch.einsum("hqd,hkd->hqk", q, k) / math.sqrt(d_head)
        a = s.softmax(-1)
        out[layer] = {"Q": q, "K": k, "V": v, "S": s, "A": a}

    return out, toks[0].tolist()


In [6]:
traces, token_ids = get_qkv_sa(model, prompt, device=DEVICE)
token_strs = tokenizer.convert_ids_to_tokens(token_ids)


ValueError: Prompt produced zero tokens – supply some text.

In [None]:
np.set_printoptions(precision=4, suppress=True, linewidth=140, threshold=200)

for layer, layer_dict in traces.items():
    print(f"\n================  Layer {layer}  ================\n")
    n_heads = layer_dict["Q"].shape[0]

    for head in range(n_heads):
        print(f"----------  Head {head}  ----------")

        for name in ["Q", "K", "V", "S", "A"]:
            arr = layer_dict[name][head].cpu().numpy()
            print(f"{name}: shape {arr.shape}")
            print(arr, "\n")           # comment out if too verbose

        # labelled attention matrix
        A = layer_dict["A"][head].cpu().numpy()        # (seq, seq)
        df_A = pd.DataFrame(A, index=token_strs, columns=token_strs)
        print("Attention weights (rows = queries, cols = keys):")
        display(df_A.style.background_gradient(cmap="viridis"))


NameError: name 'traces' is not defined