# Model Probing - Identifying most potent layers

Here we use linear probing, as proposed by Arditi et al. and implemented by Kissane et al., to identify the most promising layer for our subsequent SAE training.

## Imports

In [1]:
import os
import re
import functools
from colorama import Fore, Style
import textwrap
from jaxtyping import Float
import einops

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F

import transformer_lens
from sae_lens import SAE
from transformers import GPTNeoXForCausalLM, AutoTokenizer, AutoModelForCausalLM

from huggingface_hub import whoami, login, notebook_login

import json
from tqdm import tqdm
from transformer_lens import HookedTransformer

import requests
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import io

from jaxtyping import Int
from torch import Tensor
from typing import List, Callable
from transformers import AutoTokenizer

In [2]:
from data_tools.instructions import get_harmful_instructions, get_harmless_instructions
from utils.templates import PYTHIA_TEMPLATE
from utils.generation import ( 
    format_instruction, tokenize_instructions
)
import steering.linear_probing as lp_steer
import refusal.linear_probing as lp_refuse
from evaluation.refusal import (
    get_refusal_scores, get_wildguard_refusal_score
)
from config import config

In [3]:
# Authenticate against the Hugging Face Hub with the token stored in the project config.
# NOTE(review): the captured output of this cell reports that the `HF_TOKEN` environment
# variable is set and stays the active token, so this explicit login may be redundant — confirm.
login(config.credentials.hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Settings

In [3]:
# Fetch the train/test splits of both instruction sets used throughout the notebook.
harmless_inst_train, harmless_inst_test = get_harmless_instructions()
harmful_inst_train, harmful_inst_test = get_harmful_instructions()

# Sanity check: "<train size> <test size>", harmless set first, harmful set second.
print(f"{len(harmless_inst_train)} {len(harmless_inst_test)}")
print(f"{len(harmful_inst_train)} {len(harmful_inst_test)}")

25058 6265
416 104


In [4]:
def compute_f1(rr, orr):
    """Return the F1 score of "refusing" seen as a detector of harmful prompts.

    Args:
        rr: Refusal rate on harmful prompts, in [0, 1]. This is the recall
            (share of harmful prompts that were refused).
        orr: Over-refusal rate on harmless prompts, in [0, 1]. This is the
            false-positive rate, not the precision: a higher value is worse.

    Returns:
        The harmonic mean of precision and recall as a float, where precision
        is ``rr / (rr + orr)``. That expression assumes the harmful and the
        harmless evaluation sets contain the same number of prompts. Returns
        0.0 when nothing was refused at all instead of dividing by zero.
    """
    recall = rr
    refused_total = rr + orr
    if refused_total <= 0:
        # No refusals on either set: precision is undefined, score it as 0.
        return 0.0

    # Share of all refusals that were issued for harmful prompts.
    precision = rr / refused_total
    if precision + recall <= 0:
        # Refusals happened only on harmless prompts (rr == 0).
        return 0.0
    return 2 * precision * recall / (precision + recall)
    

def compute_nbgs(rr, orr, rr0, orr0):
    """Score a steered run against its unsteered baseline (NBGS).

    The score is the refusal gain, expressed as a fraction of the headroom left
    above the baseline (``1 - rr0``), minus the over-refusal change relative to
    the baseline over-refusal rate ``orr0``.

    Args:
        rr: Refusal rate of the run being scored.
        orr: Over-refusal rate of the run being scored.
        rr0: Baseline refusal rate; must be below 1.
        orr0: Baseline over-refusal rate; must be above 0.

    Returns:
        ``(rr - rr0) / (1 - rr0) - (orr - orr0) / orr0``.

    Raises:
        ValueError: If ``rr0 >= 1`` or ``orr0 <= 0``, since either would make
            one of the two normalisations divide by zero.
    """
    # Reject degenerate baselines before any division takes place.
    if rr0 >= 1.0:
        raise ValueError("rr0 must be < 1 to avoid division by zero")
    if orr0 <= 0.0:
        raise ValueError("orr0 must be > 0 to avoid division by zero")

    refusal_gain = (rr - rr0) / (1 - rr0)
    over_refusal_penalty = (orr - orr0) / orr0
    return refusal_gain - over_refusal_penalty

## Probing SmolLM-2

In [5]:
from steering.hf_linear_probing import ActivationSteering

In [6]:
# --- 1. Initialization ---
# Sweep settings: the model under test, the steering strength, and how many
# prompts are taken from the train and test splits respectively.
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
NUM_TRAIN_SAMPLES = 500
NUM_TEST_SAMPLES = 100
STEERING_COEF = 18.0

print("Initializing the steering handler...")
hf_steering_handler = ActivationSteering(MODEL_NAME)

# The layer count is read from the loaded model's config and bounds the layer sweep.
num_layers = hf_steering_handler.model.config.num_hidden_layers
print(f"Model '{MODEL_NAME}' has {num_layers} layers.")

Initializing the steering handler...
Using device: cuda
Model 'HuggingFaceTB/SmolLM2-135M' has 30 layers.


In [7]:
# --- calculate Baseline (Unsteered) Performance ---
# Generates responses with steering_coef=0.0 for the first NUM_TEST_SAMPLES prompts of
# each test split and scores them twice: with get_wildguard_refusal_score (kept as the
# RR0 / ORR0 baselines) and with get_refusal_scores (only printed, as RR2 / ORR2).
# NOTE(review): generate() is called here before any steering vector has been created;
# presumably steering_coef=0.0 makes that safe — confirm against ActivationSteering.
print(f"\nCalculating baseline performance (without steering) for {MODEL_NAME}...")

# Generate baseline responses for harmful instructions
baseline_harmful_responses = [
    hf_steering_handler.generate(prompt, steering_coef=0.0) 
    for prompt in harmful_inst_test[:NUM_TEST_SAMPLES]
]
# Baseline refusal rate on harmful prompts (RR0).
baseline_refusal_rate = get_wildguard_refusal_score(
    instructions=harmful_inst_test[:NUM_TEST_SAMPLES],
    generations=baseline_harmful_responses
)
# Second score over the same generations; it only sees the responses, not the prompts.
rr_2 = get_refusal_scores(baseline_harmful_responses)
print("RR2", rr_2)

# Generate baseline responses for harmless instructions
baseline_harmless_responses = [
    hf_steering_handler.generate(prompt, steering_coef=0.0) 
    for prompt in harmless_inst_test[:NUM_TEST_SAMPLES]
]
# Baseline over-refusal rate on harmless prompts (ORR0).
baseline_over_refusal_rate = get_wildguard_refusal_score(
    instructions=harmless_inst_test[:NUM_TEST_SAMPLES],
    generations=baseline_harmless_responses
)
# Second score over the same generations; it only sees the responses, not the prompts.
orr_2 = get_refusal_scores(baseline_harmless_responses)
print("ORR2", orr_2)

print(f"Baseline Refusal Rate (RR0): {baseline_refusal_rate:.4f}")
print(f"Baseline Over-Refusal Rate (ORR0): {baseline_over_refusal_rate:.4f}")


Calculating baseline performance (without steering) for HuggingFaceTB/SmolLM2-135M...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 100/100 [00:00<00:00, 20365.64it/s]


RR2 0.0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 100/100 [00:00<00:00, 20064.60it/s]


ORR2 0.0
Baseline Refusal Rate (RR0): 0.0000
Baseline Over-Refusal Rate (ORR0): 0.0700


In [None]:
# --- Iterate Over All Layers ---
# For every layer: build a steering vector, generate steered responses for the
# harmful and harmless test prompts, score them, and record (layer, F1, RR, ORR).
results = []
print(f"\n--- Starting layer sweep with steering_coef = {STEERING_COEF} ---")

# The evaluation prompts are identical for every layer, so slice them once.
harmful_eval_prompts = harmful_inst_test[:NUM_TEST_SAMPLES]
harmless_eval_prompts = harmless_inst_test[:NUM_TEST_SAMPLES]

for l in range(num_layers):
    print(f"\n----- Processing Layer {l} -----")

    # 1) Collect activations and create steering vector for the current layer
    hf_steering_handler.create_steering_vector(
        harmful_prompts=harmful_inst_train[:NUM_TRAIN_SAMPLES],
        harmless_prompts=harmless_inst_train[:NUM_TRAIN_SAMPLES],
        layer_idx=l
    )

    # 2) Generate steered responses for both test sets
    print("   Generating steered responses for harmful test set...")
    harmful_responses = [
        hf_steering_handler.generate(prompt, steering_coef=STEERING_COEF)
        for prompt in harmful_eval_prompts
    ]

    print("   Generating steered responses for harmless test set...")
    harmless_responses = [
        hf_steering_handler.generate(prompt, steering_coef=STEERING_COEF)
        for prompt in harmless_eval_prompts
    ]

    # 3) Evaluate responses
    print("   Evaluating steered responses...")
    rr = get_wildguard_refusal_score(
        instructions=harmful_eval_prompts,
        generations=harmful_responses
    )  # Refusal on harmful

    # Second score from get_refusal_scores, printed for comparison only.
    rr_2 = get_refusal_scores(harmful_responses)
    print(rr_2)

    orr = get_wildguard_refusal_score(
        instructions=harmless_eval_prompts,
        generations=harmless_responses
    )  # Over-refusal on harmless

    orr_2 = get_refusal_scores(harmless_responses)
    print(orr_2)

    f1 = compute_f1(
        rr=rr,
        orr=orr,
    )

    results.append((l, f1, rr, orr))
    # Report the F1 computed above; this line previously formatted `nbgs`,
    # which is not defined in this sweep and raised a NameError.
    print(f">>>>> Layer {l} | F1: {f1:.4f} | RR: {rr:.4f} | ORR: {orr:.4f}")


--- Starting layer sweep with steering_coef = 18.0 ---

----- Processing Layer 0 -----
Creating steering vector at layer 0...


Collecting activations: 100%|██████████| 500/500 [00:04<00:00, 110.58it/s]
Collecting activations: 100%|██████████| 416/416 [00:03<00:00, 110.77it/s]


Steering vector created and normalized.
   Generating steered responses for harmful test set...
   Generating steered responses for harmless test set...


In [None]:
# --- Final Analysis ---
# `results` holds (layer, F1, RR, ORR) tuples from the layer sweep.
if results:
    # Find the layer with the highest F1 (second tuple element)
    best_layer, best_f1, best_rr, best_orr = max(results, key=lambda item: item[1])

    print("\n\n========================================")
    print("         SWEEP COMPLETE")
    print("========================================")
    print(f"Best result found at Layer {best_layer}:")
    print(f"  - F1: {best_f1:.4f}")
    print(f"  - Refusal Rate (RR): {best_rr:.4f}")
    print(f"  - Over-Refusal Rate (ORR): {best_orr:.4f}")
else:
    print("No results were generated.")

## Pythia

In [2]:
# Hugging Face model ids of the two Pythia checkpoints used in this section:
# the base checkpoint and the instruct checkpoint.
BASE_MODEL_NAME = "EleutherAI/pythia-410m-deduped"
INSTRUCT_MODEL_NAME = "SummerSigh/Pythia410m-V0-Instruct"

In [5]:
# as our experiments in base_refusal.ipynb showed:
# (we use the wildguard score)
# These hard-coded values overwrite the baseline rates measured in the SmolLM section.
# NOTE(review): the same constants are re-assigned further down under a
# "TODO: RECOMPUTE!!!" marker, so treat these numbers as provisional.
baseline_refusal_rate = 0.125
baseline_over_refusal_rate = 0.25

### Pythia Base Model

In [None]:
# Load the base Pythia checkpoint as a HookedTransformer with left-side padding.
base_model = HookedTransformer.from_pretrained(
    BASE_MODEL_NAME, default_padding_side="left"
)

# Set left padding on the tokenizer itself as well, then register the pad token.
# The second call stays last so its return value remains the cell output.
base_model.tokenizer.padding_side = "left"
base_model.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})

In [None]:
# base_model_layer = 23

In [None]:
def base_model_tokenize_instructions_fn(instructions):
    """Tokenize `instructions` with the base model's tokenizer and PYTHIA_TEMPLATE.

    Thin wrapper around `tokenize_instructions`; the tokenizer and the template
    are looked up at call time, so later changes to them are picked up.
    """
    return tokenize_instructions(
        tokenizer=base_model.tokenizer,
        instructions=instructions,
        template=PYTHIA_TEMPLATE,
    )

In [None]:
# Show the base model's hook dictionary as the cell output (inspection only; nothing is assigned).
base_model.hook_dict

### Pythia Instruct Model

In [None]:
# Load the instruct checkpoint's weights through Hugging Face first ...
instruct_model_hf = AutoModelForCausalLM.from_pretrained(INSTRUCT_MODEL_NAME)

# ... then wrap them in a HookedTransformer. The architecture is addressed through
# BASE_MODEL_NAME (instead of repeating the literal model id) while the weights come
# from `hf_model`, so both models stay in sync if the base checkpoint is changed.
instruct_model = HookedTransformer.from_pretrained(
    BASE_MODEL_NAME,
    hf_model=instruct_model_hf,
    default_padding_side='left',
)

# The instruct tokenizer is loaded separately, left-padded, and reuses EOS as pad token.
# NOTE(review): the base model registers '<|padding|>' as pad token instead — confirm
# that the two different padding tokens are intended.
instruct_tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL_NAME)
instruct_tokenizer.padding_side = 'left'
instruct_tokenizer.pad_token = instruct_tokenizer.eos_token

In [None]:
# instruct_model_layer = 23

In [None]:
def instruct_model_tokenize_instructions_fn(instructions):
    """Tokenize `instructions` with `instruct_tokenizer` and PYTHIA_TEMPLATE.

    Thin wrapper around `tokenize_instructions`; the tokenizer and the template
    are looked up at call time, so later changes to them are picked up.
    """
    return tokenize_instructions(
        tokenizer=instruct_tokenizer,
        instructions=instructions,
        template=PYTHIA_TEMPLATE,
    )

### Probing

In [None]:
# Layers scanned by the probing loop: 1 through 23 inclusive (layer 0 is not included).
candidate_layers = list(range(1, 24))

In [None]:
# Reset the accumulator (the name is shared with the SmolLM sweep above).
results = []  # will hold tuples (layer, NBGS, RR, ORR), one per scanned layer

In [None]:
# as our experiments in base_refusal.ipynb showed:
# (we use the wildguard score)
# TODO: RECOMPUTE!!!
# Baselines (rr0 / orr0) passed to compute_nbgs in the layer scan below.
baseline_refusal_rate = 0.125
baseline_over_refusal_rate = 0.25

In [None]:
# 3. For each candidate layer: extract the refusal direction, build steering hooks,
#    generate on both test sets with the hooks active, and score the layer with NBGS.
#    Appends one (layer, NBGS, RR, ORR) tuple per layer to `results`.
# NOTE(review): both test sets are used in full here (the captured split sizes above
# show 104 harmful / 6265 harmless test prompts), unlike the SmolLM sweep, which caps
# them at NUM_TEST_SAMPLES — expect a long runtime.
for l in candidate_layers:
    # a) Compute (or load) the normalized refusal direction at layer l
    # NOTE(review): presumably n_inst_train is the number of prompts per class,
    # pos=-1 the last token position, and force=True a recompute switch — confirm
    # against refusal.linear_probing.
    r_hat = lp_refuse.extract_refusal_direction(
        model=base_model,
        model_name=BASE_MODEL_NAME,
        harmless_inst_train=harmless_inst_train,
        harmful_inst_train=harmful_inst_train,
        n_inst_train=50,
        layer=l,
        pos=-1,
        pythia_template=PYTHIA_TEMPLATE,
        tokenize_instructions_fn=base_model_tokenize_instructions_fn,
        force=True,
    )

    # b) Build the steering hooks at layer l with steering_coef=2.0
    # NOTE(review): this step was originally described as an "addition" hook, but
    # intervention_type is "multiply" — confirm which intervention is intended.
    hook_dict = lp_steer.get_refusal_direction_hooks(
        model=base_model,
        refusal_dir=r_hat,
        act_add_hook=lp_steer.act_add_hook,
        direction_ablation_hook=lp_steer.direction_ablation_hook,
        intervention_type="multiply",
        steering_coef=2.0,
        layer=l,
    )

    # c) Generate on harmful test prompts with hook active
    harmful_gens = lp_steer.get_generations(
        base_model,
        instructions=harmful_inst_test,
        tokenize_instructions_fn=base_model_tokenize_instructions_fn,
        fwd_hooks=hook_dict,
        max_tokens_generated=32,
        batch_size=8,
    )
    rr = get_wildguard_refusal_score(
        instructions=harmful_inst_test,
        generations=harmful_gens
    )  # refusal on harmful

    # d) Generate on harmless test prompts with hook active
    harmless_gens = lp_steer.get_generations(
        base_model,
        instructions=harmless_inst_test,
        tokenize_instructions_fn=base_model_tokenize_instructions_fn,
        fwd_hooks=hook_dict,
        max_tokens_generated=32,
        batch_size=8,
    )
    # The full slice `[:]` selects every harmless test prompt, matching the generations.
    orr = get_wildguard_refusal_score(
        instructions=harmless_inst_test[:],
        generations=harmless_gens
    )  # over-refusal on harmless

    # e) Compute NBGS relative to the hard-coded baseline rates
    nbgs = compute_nbgs(
        rr=rr,
        orr=orr,
        rr0=baseline_refusal_rate,
        orr0=baseline_over_refusal_rate,
    )

    results.append((l, nbgs, rr, orr))
    print(f">>>>> Layer {l} | NBGS: {nbgs:.4f} | RR: {rr:.4f} | ORR: {orr:.4f}")


In [None]:
# Sort and select the "most promising" layer by maximal NBGS.
# `results` holds (layer, NBGS, RR, ORR) tuples; it is sorted in place, best first.
if results:
    results.sort(key=lambda x: x[1], reverse=True)
    best_layer, best_nbgs, best_rr, best_orr = results[0]

    print("Layer scan results (layer, NBGS, RR, ORR):")
    for layer, nbgs, rr, orr in results:
        print(f"  ℓ={layer:2d} → NBGS={nbgs:.3f}, RR={rr:.3f}, ORR={orr:.3f}")

    print(f"\n=> Selected layer ℓ* = {best_layer} (NBGS* = {best_nbgs:.3f})")
else:
    # Nothing was scanned (e.g. the sweep cell was not run): report it instead
    # of failing with an IndexError on results[0].
    print("No results were generated.")