In [None]:
import torch
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "Qwen/Qwen2.5-1.5B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if (DEVICE == "cuda") else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = HookedTransformer.from_pretrained(
    model_name=MODEL_NAME,
    # hf_model=model,
    device=DEVICE,
    dtype=DTYPE,
    # fold_ln=False,
    tokenizer=tokenizer
)
model.eval()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded pretrained model Qwen/Qwen2.5-1.5B into HookedTransformer


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_att

In [94]:
target_aggregation_token = {
    "zh_period": 1773,
    "en_period": 13,
}

In [108]:
print("Sanity check tokenization")
# text = "A shark ate every pirate."
text = "有一个厨师吃了每个政治家。有很多个厨师。"
# text = "有一个厨师吃了每个政治家。"
# text = "A shark ate every pirate. There was only one shark."
tokens = model.to_tokens(text)  # [batch, seq]

print("original text: ", text)
print(model.to_str_tokens(text))
for i, token in enumerate(tokens.squeeze()):
    print(i, token.item(), tokenizer.decode(token.item()))

Sanity check tokenization
original text:  有一个厨师吃了每个政治家。有很多个厨师。
['有一个', '厨师', '吃了', '每个', '政治', '家', '。', '有很多', '个', '厨师', '。']
0 104133 有一个
1 114396 厨师
2 105705 吃了
3 103991 每个
4 101091 政治
5 45629 家
6 1773 。
7 101194 有很多
8 18947 个
9 114396 厨师
10 1773 。


In [109]:
def get_hidden(
    model: HookedTransformer,
    # tokens: torch.Tensor,
    text: str,
    target_token_id: int,
) -> torch.Tensor:
    
    tokens = model.to_tokens(text)  # [batch, seq]
    target_idx = [token.item() for token in tokens.squeeze()].index(target_token_id)

    logits, cache = model.run_with_cache(tokens)
    n_layers = model.cfg.n_layers

    resid_pre  = [cache[f"blocks.{l}.hook_resid_pre"]  for l in range(n_layers)]
    resid_pre  = torch.stack(resid_pre,  dim=0)

    hidden = resid_pre[:, :, target_idx, :].squeeze(1) # [layer, d_model]

    return hidden

In [110]:
get_hidden(model, text, target_aggregation_token["zh_period"])

tensor([[-0.0081, -0.0420, -0.0364,  ..., -0.0140,  0.0491,  0.0125],
        [ 0.5295, -0.1512, -0.2002,  ..., -0.1003,  0.2558, -0.1710],
        [ 0.5128, -0.0748,  0.0432,  ..., -0.3115,  0.4052, -0.1939],
        ...,
        [ 3.3788, -1.1827,  0.4156,  ..., -4.4232,  0.5269, -2.2891],
        [-0.1353, -1.2426,  0.5987,  ..., -5.0756,  1.7250, -4.4888],
        [-0.1339, -0.1696,  0.2441,  ..., -4.2331,  0.0539, -4.4301]])