# Model Probing - Identifying most potent layers

Here we use the linear probing as proposed by Ardit et al. and implemented by Kissane et al.  to identify the most promising layer for our further SAE training.

## Imports

In [10]:
import os
import re
import functools
from colorama import Fore, Style
import textwrap
from jaxtyping import Float
import einops

import numpy as np
import pandas as pd

import torch
import transformer_lens
from sae_lens import SAE
from transformers import GPTNeoXForCausalLM, AutoTokenizer, AutoModelForCausalLM

from huggingface_hub import whoami, login, notebook_login

import json
from tqdm import tqdm
from transformer_lens import HookedTransformer

import requests
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import io

from jaxtyping import Int
from torch import Tensor
from typing import List, Callable
from transformers import AutoTokenizer

In [11]:
from data_tools.instructions import get_harmful_instructions, get_harmless_instructions
from utils.templates import PYTHIA_TEMPLATE
from utils.generation import ( 
    format_instruction, tokenize_instructions
)
import steering.linear_probing as lp_steer
import refusal.linear_probing as lp_refuse
from evaluation.refusal import (
    get_refusal_scores, get_wildguard_refusal_score
)
from config import config

In [5]:
login(config.credentials.hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Settings

In [12]:
BASE_MODEL_NAME = "EleutherAI/pythia-410m-deduped"
INSTRUCT_MODEL_NAME = "SummerSigh/Pythia410m-V0-Instruct"

In [13]:
harmless_inst_train, harmless_inst_test = get_harmless_instructions()
harmful_inst_train, harmful_inst_test = get_harmful_instructions()

### Base Model

In [14]:
base_model = HookedTransformer.from_pretrained(
    BASE_MODEL_NAME,
    default_padding_side='left',

)
base_model.tokenizer.padding_side = 'left'
base_model.tokenizer.add_special_tokens({'pad_token': '<|padding|>'})

Loaded pretrained model EleutherAI/pythia-410m-deduped into HookedTransformer


0

In [4]:
# base_model_layer = 23

In [15]:
base_model_tokenize_instructions_fn = lambda instructions: tokenize_instructions(
    tokenizer=base_model.tokenizer,
    instructions=instructions,
    template=PYTHIA_TEMPLATE
)

### Instruct Model

In [16]:
instruct_model_hf = AutoModelForCausalLM.from_pretrained(INSTRUCT_MODEL_NAME)

instruct_model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-410m-deduped",
    hf_model=instruct_model_hf,
    default_padding_side='left',
  )

instruct_tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL_NAME)
instruct_tokenizer.padding_side = 'left'
instruct_tokenizer.pad_token = instruct_tokenizer.eos_token

# chat_model.tokenizer.add_special_tokens({'pad_token': '<|padding|>'})

Loaded pretrained model EleutherAI/pythia-410m-deduped into HookedTransformer


In [None]:
# instruct_model_layer = 23

In [17]:
instruct_model_tokenize_instructions_fn = lambda instructions: tokenize_instructions(
    tokenizer=instruct_tokenizer,
    instructions=instructions,
    template=PYTHIA_TEMPLATE
)

## Probing

In [25]:
candidate_layers = [2,4,8,12,14,16,18,20,22,23]

In [19]:
results = []  # will hold tuples (layer, Δℓ, hooked_refusal, wildguard_refusal)

In [22]:
# as our experiments in base_refusal.ipynb showed:
# (we use the wildguard score)
baseline_refusal_rate = 0.11538461538461539

In [26]:
# 3. For each layer ℓ, extract r̂ℓ, apply an addition‐hook, generate, and measure Δℓ
for l in candidate_layers:
    # a) Compute (or load) the normalized refusal direction at layer ℓ
    r_hat = lp_refuse.extract_refusal_direction(
        model=base_model,
        model_name=BASE_MODEL_NAME,
        harmless_inst_train=harmless_inst_train,
        harmful_inst_train=harmful_inst_train,
        n_inst_train=50,
        layer=l,
        pos=-1,  # final token
        pythia_template=PYTHIA_TEMPLATE,
        tokenize_instructions_fn=base_model_tokenize_instructions_fn,
        force=True,  # recompute even if the file exists
    )

    # b) Build the “addition” hook at layer ℓ with steering_coef=3.2 (arbitrary scale)
    hook_dict = lp_steer.get_refusal_direction_hooks(
        model=base_model,
        refusal_dir=r_hat,
        act_add_hook=lp_steer.act_add_hook,
        direction_ablation_hook=lp_steer.direction_ablation_hook,
        intervention_type="addition",
        steering_coef=2.0,
        layer=l,
    )

    # c) Generate on harmful_inst_test *with* the hook active
    hooked_gens = lp_steer.get_generations(
        base_model,
        instructions=harmful_inst_test,
        tokenize_instructions_fn=base_model_tokenize_instructions_fn,
        fwd_hooks=hook_dict,
        max_tokens_generated=100,
        batch_size=8,
    )

    # d) Compute “refusal‐rate” under intervention
    hooked_refusal_rate = get_refusal_scores(hooked_gens)

    # e) (Optionally) measure a secondary metric: how WildGuard scores these outputs
    wildguard_refusal_rate = get_wildguard_refusal_score(
        instructions=harmful_inst_test,
        generations=hooked_gens
    )

    # f) Δℓ = hooked_refusal – baseline_refusal
    delta = wildguard_refusal_rate - baseline_refusal_rate

    results.append((l, delta, hooked_refusal_rate, wildguard_refusal_rate))
    print(">>>>> ", l, delta)

using 50 pairs to compute refusal direction
saved refusal direction to refusal_dir_EleutherAI/pythia-410m-deduped_layer_2.pt


100%|██████████| 13/13 [01:10<00:00,  5.46s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/104 [00:00<?, ?it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: N/A
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: N/A
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response 

 83%|████████▎ | 86/104 [00:00<00:00, 855.21it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: N/A
Response refusal: no


100%|██████████| 104/104 [00:00<00:00, 872.84it/s]


>>>>>  2 -0.04807692307692309
using 50 pairs to compute refusal direction
saved refusal direction to refusal_dir_EleutherAI/pythia-410m-deduped_layer_4.pt


100%|██████████| 13/13 [01:12<00:00,  5.55s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/104 [00:00<?, ?it/s]

Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: yes
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Res

 81%|████████  | 84/104 [00:00<00:00, 838.18it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no


100%|██████████| 104/104 [00:00<00:00, 856.29it/s]


>>>>>  4 0.11538461538461539
using 50 pairs to compute refusal direction
saved refusal direction to refusal_dir_EleutherAI/pythia-410m-deduped_layer_8.pt


100%|██████████| 13/13 [01:11<00:00,  5.50s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/104 [00:00<?, ?it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: N/A
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Respon

 91%|█████████▏| 95/104 [00:00<00:00, 949.40it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: N/A
Response refusal: no


100%|██████████| 104/104 [00:00<00:00, 961.47it/s]


>>>>>  8 0.009615384615384609
using 50 pairs to compute refusal direction
saved refusal direction to refusal_dir_EleutherAI/pythia-410m-deduped_layer_12.pt


100%|██████████| 13/13 [01:11<00:00,  5.50s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/104 [00:00<?, ?it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: N/A
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: N/A
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes


 84%|████████▎ | 87/104 [00:00<00:00, 863.66it/s]

Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: no
Response refusal: yes
Response refusal: N/A
Response refusal: no


100%|██████████| 104/104 [00:00<00:00, 878.30it/s]


>>>>>  12 0.11538461538461539
using 50 pairs to compute refusal direction
saved refusal direction to refusal_dir_EleutherAI/pythia-410m-deduped_layer_14.pt


100%|██████████| 13/13 [01:11<00:00,  5.51s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.46 GiB. GPU 0 has a total capacity of 44.45 GiB of which 1.45 GiB is free. Including non-PyTorch memory, this process has 42.99 GiB memory in use. Of the allocated memory 40.67 GiB is allocated by PyTorch, and 2.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [27]:
# Sort and select the "most promising"layer by maximal Δℓ
results.sort(key=lambda x: x[1], reverse=True)
best_layer, best_delta, best_hooked, best_wg = results[0]

print(f"Layer scan results (layer, Δℓ, hooked_refusal, wildguard_refusal):")
for layer, delta, hr, wr in results:
    print(f"  ℓ={layer:2d} → Δ={delta:.3f}, hooked={hr:.3f}, wg={wr:.3f}")

print(f"\n=> Selected layer ℓ* = {best_layer} (Δℓ* = {best_delta:.3f})")

Layer scan results (layer, Δℓ, hooked_refusal, wildguard_refusal):
  ℓ= 4 → Δ=0.115, hooked=0.000, wg=0.231
  ℓ=12 → Δ=0.115, hooked=0.000, wg=0.231
  ℓ= 8 → Δ=0.010, hooked=0.000, wg=0.125
  ℓ= 2 → Δ=-0.048, hooked=0.000, wg=0.067
  ℓ= 4 → Δ=-0.115, hooked=0.000, wg=0.231
  ℓ= 8 → Δ=-0.115, hooked=0.000, wg=0.125
  ℓ=12 → Δ=-0.115, hooked=0.000, wg=0.231
  ℓ=16 → Δ=-0.115, hooked=0.000, wg=0.125
  ℓ=20 → Δ=-0.115, hooked=0.000, wg=0.125

=> Selected layer ℓ* = 4 (Δℓ* = 0.115)
