In [1]:
import torch
from transformer_lens import HookedTransformer
from transformers import GPT2TokenizerFast
from torch.nn.functional import softmax

torch.manual_seed(0)


<torch._C.Generator at 0x113c16190>

In [2]:
model = HookedTransformer.from_pretrained(
    "gpt2-small",  fold_ln=False, center_writing_weights=False
)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


Loaded pretrained model gpt2-small into HookedTransformer


In [None]:
prompt = "The quick brown fox jumps over the lazy dog"
tokens = tokenizer(prompt, return_tensors="pt")


with torch.no_grad():
    logits, cache = model.run_with_cache(tokens["input_ids"])


In [9]:
print(model(tokens['input_ids']))

tensor([[[ 3.6957,  4.9717,  1.9038,  ..., -0.5333, -1.3929,  5.0638],
         [ 5.2479,  5.9973,  1.0680,  ..., -0.5668, -0.7610,  3.9789],
         [ 6.4179,  5.9967,  0.5801,  ..., -2.4536, -0.7232,  4.5632],
         ...,
         [ 3.1456,  3.4648,  0.1817,  ...,  0.8195, -0.5292,  1.4608],
         [ 5.1333,  6.4010,  1.6858,  ..., -1.8190,  0.4957,  2.0539],
         [10.9164,  8.9990,  4.4457,  ..., -3.4124,  2.5023,  7.4432]]],
       device='mps:0', grad_fn=<ViewBackward0>)


In [13]:

n_layers = model.cfg.n_layers
d_model  = model.cfg.d_model
W_U = model.W_U       


fox_id = tokenizer.encode("fox")[0]

probs_by_layer = []

for layer in range(n_layers):

    resid_post = cache[f"blocks.{layer}.hook_resid_post"][0]   

    vec = resid_post[1]          
    logits_early = vec @ W_U     
    prob_early  = softmax(logits_early, dim=0)

    probs_by_layer.append(prob_early[fox_id].item())


In [14]:
best_layer  = max(range(n_layers), key=lambda i: probs_by_layer[i])
best_prob   = probs_by_layer[best_layer]

print(f"Highest p('fox') = {best_prob:.15e}  (layer {best_layer})")


Highest p('fox') = 1.623849675595302e-17  (layer 1)
