# Steering Vectors
- Find and evaluate candidate steering vectors for inverse/surface scope in doubly quantified sentences

## Load Model

In [1]:
import torch
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier; the same string is reused later to build the plot output folder.
MODEL_NAME = "Qwen/Qwen2.5-1.5B"

# Use the GPU with bfloat16 when CUDA is available, otherwise the CPU with float32.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.bfloat16
else:
    DEVICE, DTYPE = "cpu", torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load the checkpoint as a HookedTransformer, handing it the tokenizer created above.
model = HookedTransformer.from_pretrained(model_name=MODEL_NAME, device=DEVICE, dtype=DTYPE, tokenizer=tokenizer)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


Loaded pretrained model Qwen/Qwen2.5-1.5B into HookedTransformer


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_att

## Extract $h_i$

In [2]:
# Token ids of the sentence-final period per language. 1773 is the id the
# tokenization check in this notebook prints for "。"; 13 is the same value as
# EN_PERIOD_ID defined further down.
# NOTE(review): this dict is not referenced by any other visible cell.
target_aggregation_token = dict(zh_period=1773, en_period=13)

In [3]:
print("Sanity check tokenization")
# Alternative probe sentences; swap one in to inspect its tokenization.
# text = "A shark ate every pirate."
text = "有一个厨师吃了每个政治家。有很多个厨师。"
# text = "有一个厨师吃了每个政治家。"
# text = "A shark ate every pirate. There was only one shark."
tokens = model.to_tokens(text)  # [batch, seq]

print("original text: ", text)
print(model.to_str_tokens(text))
# One line per token: position, vocabulary id, decoded piece.
for position, token_id in enumerate(tokens.squeeze().tolist()):
    print(position, token_id, tokenizer.decode(token_id))

Sanity check tokenization
original text:  有一个厨师吃了每个政治家。有很多个厨师。
['有一个', '厨师', '吃了', '每个', '政治', '家', '。', '有很多', '个', '厨师', '。']
0 104133 有一个
1 114396 厨师
2 105705 吃了
3 103991 每个
4 101091 政治
5 45629 家
6 1773 。
7 101194 有很多
8 18947 个
9 114396 厨师
10 1773 。


In [4]:
# TODO batch this
@torch.no_grad()
def get_hidden(
    model: HookedTransformer,
    # tokens: torch.Tensor,
    text: str,
    target_token_id: int,
) -> torch.Tensor:
    """
    Return the residual-stream input of every layer at one token of `text`.

    The text is tokenized with the model's default `to_tokens` settings and the
    FIRST position whose id equals `target_token_id` is used.

    Args:
        model: HookedTransformer to run.
        text: a single sentence (one batch row).
        target_token_id: vocabulary id of the token to read activations at.

    Returns:
        Tensor of shape [n_layers, d_model] holding `blocks.{l}.hook_resid_pre`
        at the selected position for l = 0 .. n_layers - 1.

    Raises:
        ValueError: if `target_token_id` does not occur in the tokenized text.
    """
    tokens = model.to_tokens(text)  # [batch, seq]

    # tokens[0] (rather than squeeze()) keeps a one-token text iterable.
    token_ids = tokens[0].tolist()
    try:
        target_idx = token_ids.index(target_token_id)
    except ValueError:
        raise ValueError(
            f"token id {target_token_id} does not occur in {text!r}"
        ) from None

    # Only the resid_pre hooks are read, so cache nothing else; gradients are
    # disabled by the decorator because the activations are only inspected.
    hook_names = [f"blocks.{l}.hook_resid_pre" for l in range(model.cfg.n_layers)]
    _, cache = model.run_with_cache(tokens, names_filter=hook_names)

    # Row 0 is the single batch element; stacking over layers gives [layer, d_model].
    hidden = torch.stack([cache[name][0, target_idx] for name in hook_names], dim=0)

    return hidden

In [5]:
import pandas as pd
import numpy as np

In [6]:
def get_mu_hidden(df: pd.DataFrame, language: str, scope: str, target_token_id: int) -> torch.Tensor:
    """
    Returns the average activation for all sentences of one language whose
    `preference_mean` label equals `scope`.

    Each selected sentence is passed through `get_hidden` using the
    notebook-level `model`, and the per-sentence results are averaged.

    Args:
        df: frame with "language", "preference_mean" and "sentence" columns.
        language: value to match in the "language" column.
        scope: value to match in the "preference_mean" column.
        target_token_id: token id forwarded to `get_hidden`.

    Returns:
        The element-wise mean over the selected sentences of what `get_hidden`
        returns for each of them.

    Note:
        When no row matches, `torch.stack` is handed an empty list and raises.
    """
    # Build one combined mask. The original chained form, df[a][b], applied a
    # full-length mask to an already filtered frame, which pandas only accepts
    # with a reindexing warning.
    selected = (df["language"] == language) & (df["preference_mean"] == scope)

    activations = [
        get_hidden(model, sentence, target_token_id)
        for sentence in df.loc[selected, "sentence"]
    ]

    return torch.stack(activations).mean(dim=0)

In [7]:
# Token ids of the sentence-final period. 1773 is the id the tokenization check
# in this notebook prints for "。"; 13 is presumably the id of "." for this
# tokenizer (TODO confirm against the tokenizer).
EN_PERIOD_ID = 13
ZH_PERIOD_ID = 1773

def get_boundary_id(lang: str) -> int:
    """
    Return the period token id for a language code.

    Args:
        lang: "en" or "zh".

    Returns:
        EN_PERIOD_ID for "en", ZH_PERIOD_ID for "zh".

    Raises:
        ValueError: for any other code. Previously every non-"en" value fell
            through to the Chinese id, hiding typos and unsupported languages.
    """
    if lang == "en":
        return EN_PERIOD_ID
    if lang == "zh":
        return ZH_PERIOD_ID
    raise ValueError(f"unsupported language {lang!r}; expected 'en' or 'zh'")

def prompt_period_index(model, prompt: str, boundary_id: int) -> int:
    """
    Return the position of the first `boundary_id` token in `prompt`.

    The prompt is tokenized WITHOUT a BOS token, so the returned index counts
    from the first real token of the prompt.

    Args:
        model: object exposing `to_tokens(prompt, prepend_bos=...)`.
        prompt: a single sentence.
        boundary_id: vocabulary id to look for.

    Returns:
        Zero-based index of the first occurrence of `boundary_id`.

    Raises:
        ValueError: if the prompt does not contain `boundary_id`.
    """
    # tolist() yields plain ints, so the search is an ordinary list lookup
    # instead of one tensor comparison per element.
    ids = model.to_tokens(prompt, prepend_bos=False).squeeze(0).tolist()
    try:
        return ids.index(boundary_id)
    except ValueError:
        raise ValueError(
            f"boundary token id {boundary_id} not found in prompt {prompt!r}"
        ) from None

In [8]:
# Scored stimuli, stored as one JSON record per line.
path = "results/qwen2.5/tokenize_no_space_1.5b/scored_wide_ue_Qwen_Qwen2.5-1.5B-2.jsonl"

# IMPORTANT: align rows with activations
# (a fresh 0..N-1 index keeps row i of df in step with row i of any matrix built from it)
df = pd.read_json(path, lines=True).reset_index(drop=True)


def _period_position(row) -> int:
    """Position of the language-specific period token in the row's sentence."""
    boundary = get_boundary_id(row["language"])
    return prompt_period_index(model, row["sentence"], boundary)


df["t_prompt_period"] = df.apply(_period_position, axis=1)
df

Unnamed: 0,model,stimulus_id,concept_id,language,template_id,subj,obj,verb,sentence,tid,...,continuation_text_surface,full_text_inverse,full_text_surface,delta_sum,ratio_sum,delta_mean,ratio_mean,preference_sum,preference_mean,t_prompt_period
0,Qwen_Qwen2.5-1.5B,en-t0_s0_o0_v0,shark|pirate|ate,en,en_0,shark,pirate,ate,Every shark ate a pirate.,0,...,There was only one pirate.,Every shark ate a pirate. There were many pira...,Every shark ate a pirate. There was only one p...,1.534791,4.640356,-0.157645,0.854153,inverse,surface,5
1,Qwen_Qwen2.5-1.5B,en-t0_s0_o0_v1,shark|pirate|helped,en,en_0,shark,pirate,helped,Every shark helped a pirate.,0,...,There was only one pirate.,Every shark helped a pirate. There were many p...,Every shark helped a pirate. There was only on...,1.788671,5.981495,-0.079300,0.923763,inverse,surface,5
2,Qwen_Qwen2.5-1.5B,en-t0_s0_o0_v2,shark|pirate|pushed,en,en_0,shark,pirate,pushed,Every shark pushed a pirate.,0,...,There was only one pirate.,Every shark pushed a pirate. There were many p...,Every shark pushed a pirate. There was only on...,1.597416,4.940250,-0.122437,0.884762,inverse,surface,5
3,Qwen_Qwen2.5-1.5B,en-t0_s0_o0_v3,shark|pirate|chased,en,en_0,shark,pirate,chased,Every shark chased a pirate.,0,...,There was only one pirate.,Every shark chased a pirate. There were many p...,Every shark chased a pirate. There was only on...,-0.460211,0.631151,-0.496891,0.608419,surface,surface,5
4,Qwen_Qwen2.5-1.5B,en-t0_s0_o1_v0,shark|student|ate,en,en_0,shark,student,ate,Every shark ate a student.,0,...,There was only one student.,Every shark ate a student. There were many stu...,Every shark ate a student. There was only one ...,2.499035,12.170742,0.010185,1.010237,inverse,inverse,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Qwen_Qwen2.5-1.5B,zh-t0_s4_o3_v3,袋鼠|游客|追,zh,zh_0,袋鼠,游客,追,每个只袋鼠追了一个游客。,0,...,只有一只游客。,每个只袋鼠追了一个游客。有很多只游客。,每个只袋鼠追了一个游客。只有一只游客。,2.758894,15.782377,-0.447330,0.639333,inverse,surface,7
196,Qwen_Qwen2.5-1.5B,zh-t0_s4_o4_v0,袋鼠|政治家|吃,zh,zh_0,袋鼠,政治家,吃,每个只袋鼠吃了一个政治家。,0,...,只有一只政治家。,每个只袋鼠吃了一个政治家。有很多只政治家。,每个只袋鼠吃了一个政治家。只有一只政治家。,3.729450,41.656200,0.090819,1.095070,inverse,inverse,8
197,Qwen_Qwen2.5-1.5B,zh-t0_s4_o4_v1,袋鼠|政治家|帮助,zh,zh_0,袋鼠,政治家,帮助,每个只袋鼠帮助了一个政治家。,0,...,只有一只政治家。,每个只袋鼠帮助了一个政治家。有很多只政治家。,每个只袋鼠帮助了一个政治家。只有一只政治家。,4.993790,147.494321,0.214477,1.239213,inverse,inverse,8
198,Qwen_Qwen2.5-1.5B,zh-t0_s4_o4_v2,袋鼠|政治家|推,zh,zh_0,袋鼠,政治家,推,每个只袋鼠推了一个政治家。,0,...,只有一只政治家。,每个只袋鼠推了一个政治家。有很多只政治家。,每个只袋鼠推了一个政治家。只有一只政治家。,4.553782,94.990939,0.186490,1.205012,inverse,inverse,8


In [9]:
@torch.no_grad()
def extract_H_one_layer(
    model,
    df: pd.DataFrame,
    *,
    layer: int,
    hook_point: str = "resid_pre",
    batch_size: int = 16,
) -> torch.Tensor:
    """
    Returns H: [N, d_model] for the specified layer.

    Row i of H is the activation of `blocks.{layer}.hook_{hook_point}` for
    sentence i of `df`, read at the position stored in `df["t_prompt_period"]`.

    Args:
        model: HookedTransformer-like object (`to_tokens`, `run_with_cache`).
        df: frame with a "sentence" column and an integer "t_prompt_period"
            column. Those positions are produced by `prompt_period_index`,
            which tokenizes without a BOS token.
        layer: block index to read from.
        hook_point: hook suffix, e.g. "resid_pre".
        batch_size: number of sentences per forward pass.

    Returns:
        CPU tensor of shape [N, d_model], rows in the same order as `df`.

    Raises:
        IndexError: if a stored position lies beyond the tokenized batch.
    """
    prompts = df["sentence"].tolist()
    t_pos = torch.tensor(df["t_prompt_period"].tolist(), dtype=torch.long)

    # The hook name does not depend on the batch, so build it once.
    hook = f"blocks.{layer}.hook_{hook_point}"

    outs = []

    for start in range(0, len(prompts), batch_size):
        end = min(len(prompts), start + batch_size)

        p_batch = prompts[start:end]
        t_batch = t_pos[start:end]

        # Tokenize the same way the positions were computed: no BOS, and pad
        # on the right so padding never shifts a token index. Relying on the
        # model defaults would silently misalign rows if those defaults differ.
        toks = model.to_tokens(p_batch, prepend_bos=False, padding_side="right")

        _, cache = model.run_with_cache(toks, names_filter=[hook])

        acts = cache[hook]              # [B, T, d_model]
        B, T = acts.shape[0], acts.shape[1]

        # Report a stale or misaligned position column clearly rather than
        # through a raw indexing failure.
        if int(t_batch.max()) >= T:
            raise IndexError(
                f"t_prompt_period {int(t_batch.max())} is out of range for "
                f"sequence length {T} (rows {start}-{end - 1})"
            )

        b_idx = torch.arange(B, device=acts.device)

        # Pick one position per row: result is [B, d_model].
        H_batch = acts[b_idx, t_batch.to(acts.device)]
        outs.append(H_batch.detach().cpu())

    return torch.cat(outs, dim=0)       # [N, d_model]

In [10]:
def get_mu_from_H(
    H: torch.Tensor,      # [N, d_model], row i belongs to row i of df
    df: pd.DataFrame,
    *,
    language: str,
    scope: str,           # "surface" | "inverse"
) -> torch.Tensor:
    """
    Mean activation over the rows of one language with one scope label.

    Args:
        H: 2-D activation matrix whose rows are in the same order as `df`.
        df: frame with "language" and "preference_mean" columns.
        language: value to match in "language".
        scope: value to match in "preference_mean".

    Returns:
        Tensor of shape [d_model]: the mean of the selected rows of H. When no
        row matches, a zero vector with H's dtype and device is returned and a
        warning is issued.
    """
    mask = (
        (df["language"] == language) &
        (df["preference_mean"] == scope)
    ).to_numpy()

    if not mask.any():
        import warnings

        _, d_model = H.shape
        # A zero mean makes any difference-of-means collapse to the other
        # class mean, so make the fallback visible instead of silent.
        warnings.warn(
            f"no rows with language={language!r} and preference_mean={scope!r}; "
            "returning a zero vector",
            stacklevel=2,
        )
        # new_zeros keeps H's dtype and device so later arithmetic with the
        # other class mean does not mix dtypes or devices.
        return H.new_zeros(d_model)
    return H[mask].mean(dim=0)

In [11]:
def fit_v_hat(H_train, df_train, *, language: str):
    """
    Unit-length difference-of-means direction (inverse minus surface).

    The class means come from `get_mu_from_H` for the given language. A small
    constant is added to the norm so a zero difference yields a zero vector
    rather than a division by zero.
    """
    class_means = {
        label: get_mu_from_H(H_train, df_train, language=language, scope=label)
        for label in ("surface", "inverse")
    }
    direction = class_means["inverse"] - class_means["surface"]
    length = direction.norm() + 1e-8
    return direction / length

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

def plot_separability_hist(
    H,
    df,
    v_hat,
    *,
    language: str,
    title: str,
    save_path: str | Path | None = None,
):
    """
    Histogram of the projections p = vᵀh, split by scope label.

    Rows of `H` belonging to `language` are projected onto `v_hat`, and the
    projections are drawn as two overlaid histograms, one for rows labelled
    "surface" and one for rows labelled "inverse" in `df["preference_mean"]`.

    Args:
        H: [N, d_model] activations, row-aligned with `df`.
        df: frame with "language" and "preference_mean" columns.
        v_hat: [d_model] direction to project onto.
        language: value of "language" selecting the rows to plot.
        title: figure title.
        save_path: where to write the figure. Missing parent directories are
            created. When None, the figure is shown instead of saved.

    Returns:
        None. The figure is always closed before returning.
    """
    mask = (df["language"] == language).to_numpy()
    H_lang = H[mask]
    df_lang = df[mask]

    # numpy has no bfloat16, so do the projection in float32 before converting.
    p = (H_lang.detach().float().cpu() @ v_hat.detach().float().cpu()).numpy()
    y = df_lang["preference_mean"].to_numpy()

    p_surf = p[y == "surface"]
    p_inv  = p[y == "inverse"]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(p_surf, bins=40, alpha=0.6, label="surface")
    ax.hist(p_inv,  bins=40, alpha=0.6, label="inverse")

    ax.set_title(title)
    ax.set_xlabel("p = vᵀh")
    ax.set_ylabel("count")
    ax.legend()

    if save_path is not None:
        out_path = Path(save_path)
        # savefig does not create folders, so make sure the target one exists.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, bbox_inches="tight")
    else:
        # Without a file target the figure used to be closed unseen; show it.
        plt.show()

    # Close explicitly so figures do not pile up when called in a loop.
    plt.close(fig)


In [40]:
print("counts:")
print(df.groupby(["language", "preference_mean"]).size())

# The output folders do not depend on the layer, so create them once up front.
# The figures are written into per-language subfolders, and savefig does not
# create missing directories, so those subfolders must exist as well.
save_root = Path(f"results/plots/{MODEL_NAME}")
for lang_dir in ("en", "zh"):
    (save_root / lang_dir).mkdir(parents=True, exist_ok=True)

n_layers = model.cfg.n_layers
for layer in range(n_layers):
    print("Extracting layer ", layer)
    # H is overwritten on every iteration; after the loop it holds the last layer.
    H = extract_H_one_layer(model, df, layer=layer)
    v_en = fit_v_hat(H, df, language="en")
    v_zh = fit_v_hat(H, df, language="zh")

    # sanity check: both directions should have unit norm (or zero norm when
    # the two class means coincide)
    print("||v_en|| =", v_en.norm().item())
    print("||v_zh|| =", v_zh.norm().item())

    # Both vectors are normalized, so the dot product is their cosine similarity.
    print("cross-linguistic alignment", torch.dot(v_en, v_zh))
    plot_separability_hist(H, df, v_en, language="en", title=f"Layer {layer}: v_en separation for EN data", save_path=save_root / f"en/layer_{layer:02d}_en_hist.png")
    plot_separability_hist(H, df, v_zh, language="zh", title=f"Layer {layer}: v_zh separation for ZH data", save_path=save_root / f"zh/layer_{layer:02d}_zh_hist.png")

counts:
language  preference_mean
en        inverse             9
          surface            91
zh        inverse            48
          surface            52
dtype: int64
Extracting layer  0
||v_en|| = 0.0
||v_zh|| = 0.0
cross-linguistic alignment tensor(0.)
Extracting layer  1
||v_en|| = 1.0000001192092896
||v_zh|| = 0.9999999403953552
cross-linguistic alignment tensor(0.0150)
Extracting layer  2
||v_en|| = 1.0
||v_zh|| = 1.0
cross-linguistic alignment tensor(0.0393)
Extracting layer  3
||v_en|| = 0.9999999403953552
||v_zh|| = 1.0000001192092896
cross-linguistic alignment tensor(0.1365)
Extracting layer  4
||v_en|| = 1.0
||v_zh|| = 0.9999998807907104
cross-linguistic alignment tensor(0.1588)
Extracting layer  5
||v_en|| = 1.0
||v_zh|| = 1.0
cross-linguistic alignment tensor(0.1462)
Extracting layer  6
||v_en|| = 1.0000001192092896
||v_zh|| = 1.0
cross-linguistic alignment tensor(-0.0535)
Extracting layer  7
||v_en|| = 0.9999999403953552
||v_zh|| = 1.0000001192092896
cross-linguist

In [41]:
# Refit both directions from the current H. H is whatever the most recent
# extract_H_one_layer call assigned - the last layer of the sweep when the
# cells are run in order.
v_en, v_zh = (fit_v_hat(H, df, language=code) for code in ("en", "zh"))

In [42]:
# Class balance per language, followed by the norm of each fitted direction.
print("counts:")
label_counts = df.groupby(["language", "preference_mean"]).size()
print(label_counts)
for tag, direction in (("v_en", v_en), ("v_zh", v_zh)):
    print(f"||{tag}|| =", direction.norm().item())

counts:
language  preference_mean
en        inverse             9
          surface            91
zh        inverse            48
          surface            52
dtype: int64
||v_en|| = 0.9999999403953552
||v_zh|| = 0.9999999403953552


In [43]:
print("cross-linguistic alignment", torch.dot(v_en, v_zh))
# Project each language's rows onto that language's own direction.
for lang_code, direction, heading in (
    ("en", v_en, "EN: v_en on EN"),
    ("zh", v_zh, "ZH: v_zh on ZH"),
):
    plot_separability_hist(H, df, direction, language=lang_code, title=heading)

# Cross-language projections, left disabled:
# plot_separability_hist(H, df, v_zh, language="en", title="EN: v_zh on EN") # Should be no clear separation since EN vector already doesn't work
# plot_separability_hist(H, df, v_en, language="zh", title="ZH: v_en on ZH") # Dot product was close to 0 so shouldn't show clear separation either

cross-linguistic alignment tensor(0.0166)


## 

## 