# Steering Vector Training and Evaluation

This notebook provides a complete pipeline for:
1. Loading base models
2. Training steering vectors for persona datasets (traits/roles)
3. Loading pre-trained steering vectors
4. Loading activation-based steering vectors
5. Evaluating steering vectors with rollout generation
6. Saving rollouts with metadata to JSONL

## Imports and Configuration

In [15]:
from __future__ import annotations

import json
import math
from pathlib import Path
from dataclasses import asdict

import torch
from torch import nn
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm, trange

# Import from chatspace steering modules
import sys
sys.path.insert(0, '/root/chatspace')
from chatspace.steering.train import build_trainer, build_argparser
from chatspace.steering.model import QwenSteerModel, SteeringVectorConfig
from chatspace.steering.data import PersonaSteeringDatasetConfig, load_persona_steering_dataset

# Configuration
PERSONA_ROOT = Path("/workspace/persona-data")
STEERING_RUN_ROOT = Path("/workspace/steering_runs_qwen3_layer_30")
ROLLOUT_OUTPUT_ROOT = Path("/workspace/steering_rollouts_qwen3_layer_30")
TARGET_LAYER = 30  # Default layer for Qwen3-32B
BASE_MODEL = "Qwen/Qwen3-32B"

## 1. Load Base Model

In [2]:
def load_base_model(model_name: str = BASE_MODEL, device_map: str = "auto"):
    """Load the base causal LM and tokenizer."""
    print(f"Loading base model: {model_name}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map=device_map,
        low_cpu_mem_usage=False,
    )
    model.eval()
    
    print(f"Model loaded on device: {next(model.parameters()).device}")
    return model, tokenizer

# Load the model
base_model, tokenizer = load_base_model()

Loading base model: Qwen/Qwen3-32B


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

Model loaded on device: cuda:0


## 2. Steering Controller

This controller allows us to dynamically swap steering vectors during generation.

**Note**: We use two separate model instances:
- `base_model`: Plain AutoModelForCausalLM for evaluation and rollout generation
- `steering_model`: QwenSteerModel (loaded in section 3) for training steering vectors

The controller is attached to `base_model` for efficient vector swapping during evaluation.

In [3]:
class SteeringController:
    """Attach a single residual hook and swap steering vectors on demand."""

    def __init__(self, model: AutoModelForCausalLM) -> None:
        self.model = model
        self.layer_idx: int | None = None
        self._handle = None
        self.vector: torch.Tensor | None = None

    def _hook(self, module, args, output):
        if self.vector is None:
            return output
        hidden = output[0] if isinstance(output, tuple) else output
        vec = self.vector
        if vec.device != hidden.device or vec.dtype != hidden.dtype:
            vec = vec.to(device=hidden.device, dtype=hidden.dtype)
            self.vector = vec
        steered = hidden + vec
        if isinstance(output, tuple):
            return (steered,) + output[1:]
        return steered

    def set_layer(self, layer_idx: int) -> None:
        if self.layer_idx == layer_idx:
            return
        if self._handle is not None:
            self._handle.remove()
        layer = self.model.model.layers[layer_idx]
        self._handle = layer.register_forward_hook(self._hook)
        self.layer_idx = layer_idx

    def set_vector(self, vector: torch.Tensor | None) -> None:
        if vector is None:
            self.vector = None
            return
        if vector.ndim != 1:
            raise ValueError("Steering vector must be 1D")
        self.vector = vector

    def close(self) -> None:
        if self._handle is not None:
            self._handle.remove()
            self._handle = None

# Create steering controller for base_model (used for evaluation/rollouts)
controller = SteeringController(base_model)
print("Created steering controller for base_model")

Created steering controller for base_model


## 3. Load Reusable Steering Model

Load QwenSteerModel once and reuse it for training multiple datasets (following train_all_steering.py pattern).

In [16]:
def load_steering_model(
    base_model: AutoModelForCausalLM,
    target_layer: int = TARGET_LAYER,
    init_scale: float = 0.0,
) -> QwenSteerModel:
    """Load a QwenSteerModel that wraps an existing base model for weight reuse.

    Args:
        base_model: Existing AutoModelForCausalLM to wrap
        target_layer: Layer to apply steering
        init_scale: Initial scale for steering vector

    Returns:
        QwenSteerModel instance that shares weights with base_model
    """
    print(f"Creating steering model wrapper around existing base model")
    print(f"Target layer: {target_layer}")

    # Create config
    model_cfg = SteeringVectorConfig(
        model_name=BASE_MODEL,
        target_layer=target_layer,
        init_scale=init_scale,
    )

    # Create QwenSteerModel instance without calling __init__
    # This avoids loading the weights twice
    steering_model = QwenSteerModel.__new__(QwenSteerModel)
    nn.Module.__init__(steering_model)  # Initialize as nn.Module
    
    # Set config
    steering_model.cfg = model_cfg
    
    # Share the base model (this is the key - we don't load new weights)
    steering_model.model = base_model
    steering_model.config = base_model.config
    
    # Freeze base model parameters
    for param in steering_model.model.parameters():
        param.requires_grad_(False)
    
    # Initialize the steering hook
    from chatspace.steering.model import ResidualHook
    hidden_size = base_model.config.hidden_size
    steering_model.steering = ResidualHook(hidden_size, init_scale)
    steering_model._hook_handle = None
    
    # Install the forward hook
    steering_model._install_hook()
    
    # Move steering vector to same device as base model
    device = next(base_model.parameters()).device
    steering_model.steering = steering_model.steering.to(device)
    
    print(f"Steering model created (sharing weights with base_model)")
    print(f"Steering vector on device: {steering_model.steering.vector.device}")
    
    return steering_model

def reset_steering_vector(model: QwenSteerModel, init_scale: float = 0.0) -> None:
    """Reset the steering vector to initial state between training runs.

    Args:
        model: QwenSteerModel instance
        init_scale: Initialization scale (0.0 for zeros)
    """
    if init_scale == 0.0:
        model.steering.vector.data.zero_()
    else:
        torch.nn.init.normal_(model.steering.vector, mean=0.0, std=init_scale)
    if model.steering.vector.grad is not None:
        model.steering.vector.grad.zero_()

# Load steering model (wraps base_model, shares weights)
steering_model = load_steering_model(
    base_model=base_model,
    target_layer=TARGET_LAYER,
    init_scale=0.0,
)

Creating steering model wrapper around existing base model
Target layer: 30
Steering model created (sharing weights with base_model)
Steering vector on device: cuda:0


## 4. Train Steering Vector

Function to train a steering vector for a given dataset using TRL's SFTTrainer with model reuse.

In [5]:
def prepare_dataset_split(
    dataset_name: str,
    tokenizer,
    train_tokens: int = 100_000,
    val_tokens: int = 10_000,
    seed: int = 17,
    role_score: int = 3,
    trait_score: int = 75,
    max_length: int = 4096,
):
    """Prepare train/val split for a dataset."""
    from chatspace.steering.data import PersonaSteeringDatasetConfig, load_persona_steering_dataset
    
    cfg = PersonaSteeringDatasetConfig(
        dataset_names=[dataset_name],
        target_tokens=train_tokens + max(val_tokens, 0),
        seed=seed,
        tokenizer_name=BASE_MODEL,
        max_length=max_length,
        role_min_score=role_score,
        trait_min_score=trait_score,
    )
    full_dataset = load_persona_steering_dataset(cfg, tokenizer)

    token_lengths = list(full_dataset["length"])
    cumulative = 0
    train_idx: list[int] = []
    val_idx: list[int] = []

    for idx, length in enumerate(token_lengths):
        cumulative += int(length)
        if cumulative <= train_tokens:
            train_idx.append(idx)
        elif val_tokens > 0 and cumulative <= train_tokens + val_tokens:
            val_idx.append(idx)
        else:
            break

    if not train_idx:
        raise ValueError("No training examples selected; increase tokens or relax score filters")

    train_dataset = full_dataset.select(train_idx)
    train_tokens_actual = sum(int(full_dataset[i]["length"]) for i in train_idx)

    val_dataset = None
    val_tokens_actual = 0
    if val_idx:
        val_dataset = full_dataset.select(val_idx)
        val_tokens_actual = sum(int(full_dataset[i]["length"]) for i in val_idx)

    print(
        f"Prepared {dataset_name}: {len(train_dataset)} train seq / {train_tokens_actual} tokens"
        + (f"; val {len(val_dataset)} seq / {val_tokens_actual} tokens." if val_dataset is not None else ".")
    )

    return train_dataset, val_dataset


def train_steering_vector(
    dataset_name: str,
    model: QwenSteerModel,
    tokenizer,
    output_dir: Path | None = None,
    learning_rate: float = 5e-1,
    init_scale: float = 0.0,
    batch_size: int = 4,
    gradient_accumulation: int = 1,
    train_tokens: int = 100_000,
    val_tokens: int = 10_000,
    num_epochs: float = 5.0,
    seed: int = 17,
    role_score: int = 3,
    trait_score: int = 75,
    bf16: bool = True,
    early_stop_patience: int = 3,
    early_stop_threshold: float = 0.0,
    eval_steps: int = 200,
    logging_steps: int = 50,
    compare_prompted: bool = False,
    lr_scheduler: str = "cosine",
    warmup_ratio: float = 0.05,
) -> Path:
    """Train a steering vector for the given dataset by reusing existing model.
    
    This function follows train_all_steering.py pattern: it reuses the model and
    resets the steering vector between runs.
    
    Args:
        dataset_name: Name of the persona dataset (e.g., 'qwen-3-32b__trait__acerbic')
        model: QwenSteerModel instance to reuse
        tokenizer: Tokenizer instance
        output_dir: Output directory for trained vector (default: STEERING_RUN_ROOT / dataset_name)
        learning_rate: Learning rate for steering vector
        init_scale: Initialization scale for steering vector
        batch_size: Per-device batch size
        gradient_accumulation: Gradient accumulation steps
        train_tokens: Total training tokens
        val_tokens: Validation tokens (0 disables validation)
        num_epochs: Number of training epochs
        seed: Random seed
        role_score: Minimum role extract_score
        trait_score: Minimum trait extract_score
        bf16: Enable bfloat16 training
        early_stop_patience: Early stopping patience (0 disables)
        early_stop_threshold: Minimum improvement threshold
        eval_steps: Evaluation frequency in steps
        logging_steps: Logging frequency in steps
        compare_prompted: Compare against prompted baseline
        lr_scheduler: Learning rate scheduler type
        warmup_ratio: Warmup ratio for scheduler
    
    Returns:
        Path to output directory containing steering_vector.pt and steering_config.json
    """
    from chatspace.steering.train import EarlyStopCallback, _compute_average_loss
    from trl.trainer.sft_trainer import SFTConfig, SFTTrainer
    
    if output_dir is None:
        output_dir = STEERING_RUN_ROOT / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)
    
    print(f"\n=== Training {dataset_name} ===")
    print(f"Output directory: {output_dir}")
    print(f"LR: {learning_rate}, Epochs: {num_epochs}, Tokens: {train_tokens}")
    
    # Prepare dataset split
    train_dataset, val_dataset = prepare_dataset_split(
        dataset_name, tokenizer, train_tokens, val_tokens, seed, role_score, trait_score
    )
    
    # Reset steering vector
    reset_steering_vector(model, init_scale)
    torch.manual_seed(seed)
    
    eval_strategy = "steps" if val_dataset is not None else "no"
    
    sft_config = SFTConfig(
        output_dir=str(output_dir),
        seed=seed,
        do_eval=val_dataset is not None,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation,
        max_steps=-1,
        bf16=bf16 and torch.cuda.is_available(),
        num_train_epochs=num_epochs,
        logging_steps=max(1, logging_steps),
        eval_strategy=eval_strategy,
        eval_steps=max(1, eval_steps),
        warmup_ratio=warmup_ratio,
        report_to=[],
        gradient_checkpointing=False,
        lr_scheduler_type=lr_scheduler,
        save_strategy="no",
        save_only_model=True,
        save_total_limit=1,
    )
    
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
    )
    
    trainer.create_model_card = lambda *_, **__: None
    
    def _save_model(target: str | None = None, _internal_call: bool = False) -> None:
        dest = Path(target) if target is not None else output_dir
        model.save_pretrained(dest)
    
    trainer.save_model = _save_model  # type: ignore[assignment]
    
    early_cb = None
    if val_dataset is not None and early_stop_patience > 0:
        early_cb = EarlyStopCallback(trainer, early_stop_patience, early_stop_threshold)
        trainer.add_callback(early_cb)
    
    metrics: dict[str, float | str] = {"dataset": dataset_name, "learning_rate": learning_rate}
    
    try:
        train_result = trainer.train()
    except ValueError as exc:
        print(f"Failed on {dataset_name}: {exc}")
        return output_dir
    
    metrics.update({
        "train_runtime": train_result.metrics.get("train_runtime"),
        "train_loss": train_result.metrics.get("train_loss"),
        "epoch": train_result.metrics.get("epoch"),
    })
    
    # Restore best vector from early stopping
    if early_cb is not None and getattr(early_cb, "best_vector", None) is not None:
        best_vec = early_cb.best_vector.to(model.steering.vector.device)
        model.steering.vector.data.copy_(best_vec)
    
    # Evaluate if validation set exists
    if val_dataset is not None:
        eval_metrics = trainer.evaluate()
        eval_loss = eval_metrics.get("eval_loss")
        if eval_loss is None:
            eval_loss = _compute_average_loss(model, trainer.get_eval_dataloader())
            eval_metrics["eval_loss"] = eval_loss
        eval_metrics["eval_ppl"] = math.exp(eval_loss)
        metrics.update(eval_metrics)
        print("Validation metrics:", eval_metrics)
        
        if compare_prompted:
            stored_vec = model.steering.vector.detach().clone()
            model.steering.vector.data.zero_()
            base_loss = _compute_average_loss(model, trainer.get_eval_dataloader())
            metrics["baseline_loss"] = base_loss
            metrics["baseline_ppl"] = math.exp(base_loss)
            model.steering.vector.data.copy_(stored_vec)
    
    # Save model and metrics
    model.save_pretrained(output_dir)
    (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2))
    
    # Cleanup
    if hasattr(trainer, "accelerator"):
        trainer.accelerator.free_memory()
    del trainer
    torch.cuda.empty_cache()
    
    print(f"✓ Training complete. Saved to {output_dir}")
    return output_dir

# Example usage (commented out - uncomment to train)
# trained_dir = train_steering_vector(
#     dataset_name="qwen-3-32b__trait__acerbic",
#     model=steering_model,
#     tokenizer=tokenizer,
#     train_tokens=100_000,
#     val_tokens=10_000,
#     num_epochs=5,
#     early_stop_patience=3,
# )

## 5. Load Trained Steering Vector

Load a previously trained steering vector from disk.

In [6]:
def load_trained_steering_vector(
    dataset: str,
    run_root: Path = STEERING_RUN_ROOT,
) -> tuple[torch.Tensor, int] | None:
    """Load trained steering vector for a given dataset.
    
    Args:
        dataset: Dataset name (e.g., 'qwen-3-32b__trait__acerbic')
        run_root: Root directory containing trained vectors
    
    Returns:
        Tuple of (vector, target_layer) or None if not found
    """
    steering_dir = run_root / dataset
    vector_path = steering_dir / "steering_vector.pt"
    
    if not vector_path.exists():
        print(f"No trained vector found at {vector_path}")
        return None
    
    state = torch.load(vector_path, map_location="cpu")
    tensor = state.get("steering_vector")
    if tensor is None:
        raise ValueError(f"steering_vector.pt missing 'steering_vector' key at {vector_path}")
    
    trained_vector = tensor.float()
    trained_layer = TARGET_LAYER
    
    # Load layer from config if available
    config_path = steering_dir / "steering_config.json"
    if config_path.exists():
        cfg = json.loads(config_path.read_text())
        trained_layer = int(cfg.get("target_layer", trained_layer))
    
    if torch.cuda.is_available():
        trained_vector = trained_vector.cuda()
    
    norm = torch.linalg.norm(trained_vector).item()
    print(f"Loaded trained vector for {dataset}")
    print(f"  Layer: {trained_layer}, Norm: {norm:.4f}, Shape: {trained_vector.shape}")
    
    return trained_vector, trained_layer

# Example usage
# trained_vec, trained_layer = load_trained_steering_vector("qwen-3-32b__trait__acerbic")

## 6. Load Activation (Vanilla) Steering Vector

Load the activation-based steering vector (from CAA or similar methods).

In [7]:
def load_activation_steering_vector(
    dataset: str,
    layer: int = TARGET_LAYER,
    persona_root: Path = PERSONA_ROOT,
) -> torch.Tensor | None:
    """Load activation-based steering vector for a given dataset.
    
    Args:
        dataset: Dataset name (e.g., 'qwen-3-32b__trait__acerbic')
        layer: Layer index to extract vector from
        persona_root: Root directory containing activation vectors
    
    Returns:
        Steering vector tensor or None if not found
    """
    if "__trait__" in dataset:
        model_prefix, trait = dataset.split("__trait__", 1)
        vec_file = persona_root / f"{model_prefix}/traits_240/vectors/{trait}.pt"
        if not vec_file.exists():
            print(f"Trait vector not found: {vec_file}")
            return None
        data = torch.load(vec_file, map_location="cpu")
        vec = data["pos_neg_50"][layer].float()
        
    elif "__role__" in dataset:
        role = dataset.split("__role__", 1)[1]
        vec_file = persona_root / f"qwen-3-32b/roles_240/vectors/{role}.pt"
        if not vec_file.exists():
            print(f"Role vector not found: {vec_file}")
            return None
        data = torch.load(vec_file, map_location="cpu")
        # Load default vectors for contrast
        default_vecs = torch.load(
            persona_root / "qwen-3-32b/roles_240/default_vectors.pt",
            map_location="cpu"
        )
        vec_pos = data["pos_3"][layer].float()
        vec_default = default_vecs["activations"]["default_1"][layer].float()
        vec = vec_pos - vec_default
    else:
        raise ValueError(f"Unrecognized dataset format: {dataset}")
    
    if torch.cuda.is_available():
        vec = vec.cuda()
    
    norm = torch.linalg.norm(vec).item()
    print(f"Loaded activation vector for {dataset}")
    print(f"  Layer: {layer}, Norm: {norm:.4f}, Shape: {vec.shape}")
    
    return vec

# Example usage
# activation_vec = load_activation_steering_vector("qwen-3-32b__trait__acerbic", layer=TARGET_LAYER)

## 7. Load Instructions for Dataset

Load prompts and questions from the instruction files.

In [8]:
from dataclasses import dataclass

@dataclass
class InstructionData:
    prompts: list[str]
    questions: list[str]
    eval_prompt: str | None

def load_instructions(dataset: str) -> InstructionData:
    """Load instruction prompts and questions for a dataset.
    
    Args:
        dataset: Dataset name (e.g., 'qwen-3-32b__trait__acerbic')
    
    Returns:
        InstructionData with prompts, questions, and optional eval_prompt
    """
    instructions_root = Path.home() / "persona-subspace"
    
    if "__trait__" in dataset:
        name = dataset.split("__trait__", 1)[1]
        path = instructions_root / "traits" / "data" / "instructions" / f"{name}.json"
    elif "__role__" in dataset:
        name = dataset.split("__role__", 1)[1]
        path = instructions_root / "roles" / "data" / "instructions" / f"{name}.json"
    else:
        raise ValueError(f"Unrecognized dataset name: {dataset}")

    if not path.exists():
        raise FileNotFoundError(f"Instructions not found: {path}")

    payload = json.loads(path.read_text())
    prompts: list[str] = []
    for entry in payload.get("instruction", []):
        if isinstance(entry, dict):
            prompt = entry.get("pos") or entry.get("prompt")
            if prompt:
                prompts.append(prompt)
        elif isinstance(entry, str):
            prompts.append(entry)
    if not prompts:
        raise ValueError(f"No prompts found in {path}")
    
    questions = payload.get("questions", [])
    if not questions:
        raise ValueError(f"No questions in {path}")
    
    eval_prompt = payload.get("eval_prompt")
    
    print(f"Loaded instructions for {dataset}")
    print(f"  Prompts: {len(prompts)}, Questions: {len(questions)}")
    
    return InstructionData(prompts=prompts, questions=questions, eval_prompt=eval_prompt)

# Example usage
# instructions = load_instructions("qwen-3-32b__trait__acerbic")

## 8. Generate Rollouts with Steering

Generate text completions with or without steering vectors applied.

In [9]:
def generate_rollouts(
    dataset: str,
    instructions: InstructionData,
    controller: SteeringController,
    tokenizer,
    model,
    vector: torch.Tensor | None = None,
    layer: int = TARGET_LAYER,
    variant_name: str = "baseline",
    num_rollouts: int = 3,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
    use_system_prompt: bool = True,
    prompt_index: int = 0,
    rescale_norm: float | None = None,
) -> list[dict]:
    """Generate text completions with optional steering.
    
    Args:
        dataset: Dataset name
        instructions: InstructionData with prompts/questions
        controller: SteeringController instance
        tokenizer: Tokenizer instance
        model: Model instance
        vector: Steering vector to apply (None for baseline)
        layer: Layer to apply steering
        variant_name: Name for this variant (e.g., 'trained', 'activation', 'prompted')
        num_rollouts: Number of rollouts per question
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        use_system_prompt: Whether to include system prompt
        prompt_index: Which prompt to use from instructions
        rescale_norm: If provided, rescale vector to this L2 norm before applying
    
    Returns:
        List of rollout dictionaries with metadata
    """
    device = next(model.parameters()).device
    
    # Rescale vector if requested
    original_norm = None
    if vector is not None and rescale_norm is not None:
        original_norm = float(torch.linalg.norm(vector).item())
        if original_norm > 0:
            vector = vector * (rescale_norm / original_norm)
            print(f"Rescaled vector from {original_norm:.4f} to {torch.linalg.norm(vector).item():.4f}")
    
    # Set up steering
    controller.set_layer(layer)
    controller.set_vector(vector)
    
    system_prompt = instructions.prompts[prompt_index] if instructions.prompts else None
    
    records = []
    
    # Disable thinking for Qwen models (prevents <think> blocks)
    # See: https://github.com/vllm-project/vllm/issues/18066
    chat_template_kwargs = {"enable_thinking": False} if "qwen" in BASE_MODEL.lower() else {}
    
    for rollout_idx in trange(num_rollouts, desc=f"{variant_name}", leave=False):
        # Batch all questions for this rollout
        messages_batch = []
        for question in instructions.questions:
            msgs = []
            if use_system_prompt and system_prompt:
                msgs.append({"role": "system", "content": system_prompt})
            msgs.append({"role": "user", "content": question})
            messages_batch.append(msgs)
        
        # Apply chat template and tokenize (with thinking disabled for Qwen)
        chat_texts = [
            tokenizer.apply_chat_template(
                msgs, 
                tokenize=False, 
                add_generation_prompt=True,
                **chat_template_kwargs
            )
            for msgs in messages_batch
        ]
        encoded = tokenizer(chat_texts, return_tensors="pt", padding=True).to(device)
        attention_mask = encoded.get("attention_mask")
        if attention_mask is None:
            input_lens = torch.tensor([enc.size(0) for enc in encoded["input_ids"]], device=device)
        else:
            input_lens = attention_mask.sum(dim=1)
        
        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
        
        # Decode responses
        for question_idx, question in enumerate(instructions.questions):
            seq = outputs[question_idx]
            offset = int(input_lens[question_idx])
            response = tokenizer.decode(seq[offset:], skip_special_tokens=True).strip()
            
            record = {
                "dataset": dataset,
                "variant": variant_name,
                "prompt_index": prompt_index if variant_name == "prompted" else None,
                "question_index": question_idx,
                "rollout_index": rollout_idx,
                "question": question,
                "system_prompt": system_prompt if use_system_prompt else None,
                "response": response,
                "layer": layer,
            }
            
            if vector is not None:
                final_norm = float(torch.linalg.norm(vector).item())
                record["steering_norm"] = final_norm
                if original_norm is not None:
                    record["steering_norm_original"] = original_norm
                if rescale_norm is not None:
                    record["steering_norm_target"] = rescale_norm
            
            records.append(record)
    
    print(f"Generated {len(records)} rollouts for {variant_name}")
    return records

# Example usage (commented out)
# records = generate_rollouts(
#     dataset="qwen-3-32b__trait__acerbic",
#     instructions=instructions,
#     controller=controller,
#     tokenizer=tokenizer,
#     model=base_model,
#     vector=trained_vec,
#     layer=trained_layer,
#     variant_name="trained",
#     num_rollouts=3,
#     rescale_norm=10.0,  # Rescale to L2 norm of 10.0
# )

## 9. Save Rollouts to JSONL

Save rollouts with metadata to a JSONL file for later analysis.

In [10]:
def save_rollouts(
    records: list[dict],
    dataset: str,
    output_root: Path = ROLLOUT_OUTPUT_ROOT,
    filename: str = "rollouts.jsonl",
) -> Path:
    """Save rollout records to JSONL file.
    
    Args:
        records: List of rollout dictionaries
        dataset: Dataset name
        output_root: Root output directory
        filename: Output filename
    
    Returns:
        Path to saved file
    """
    output_dir = output_root / dataset
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / filename
    
    with output_path.open("w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    
    print(f"Saved {len(records)} records to {output_path}")
    return output_path

# Example usage
# save_rollouts(records, dataset="qwen-3-32b__trait__acerbic")

## 10. Complete Evaluation Pipeline

Put it all together: evaluate a dataset with trained, activation, and prompted variants.

In [11]:
def evaluate_dataset_complete(
    dataset: str,
    num_rollouts: int = 3,
    evaluate_trained: bool = True,
    evaluate_activation: bool = True,
    evaluate_prompted: bool = True,
    evaluate_unprompted: bool = True,
    save_outputs: bool = True,
    trained_rescale_norm: float | None = None,
    activation_rescale_norm: float | None = None,
) -> dict[str, list[dict]]:
    """Complete evaluation pipeline for a dataset.
    
    Args:
        dataset: Dataset name (e.g., 'qwen-3-32b__trait__acerbic')
        num_rollouts: Number of rollouts per question
        evaluate_trained: Whether to evaluate trained steering vector
        evaluate_activation: Whether to evaluate activation vector
        evaluate_prompted: Whether to evaluate prompted baseline
        evaluate_unprompted: Whether to evaluate unprompted control (no system prompt, no steering)
        save_outputs: Whether to save rollouts to disk
        trained_rescale_norm: Rescale trained vectors to this L2 norm (None = no rescaling)
        activation_rescale_norm: Rescale activation vectors to this L2 norm (None = no rescaling)
    
    Returns:
        Dictionary mapping variant names to rollout records
    """
    print(f"\n{'='*60}")
    print(f"Evaluating: {dataset}")
    print(f"{'='*60}\n")
    
    # Load instructions
    instructions = load_instructions(dataset)
    
    all_records = {}
    
    # Evaluate unprompted control (no system prompt, no steering)
    if evaluate_unprompted:
        print("\n--- Unprompted Control (No System Prompt, No Steering) ---")
        unprompted_records = generate_rollouts(
            dataset=dataset,
            instructions=instructions,
            controller=controller,
            tokenizer=tokenizer,
            model=base_model,
            vector=None,
            variant_name="unprompted",
            num_rollouts=num_rollouts,
            use_system_prompt=False,
        )
        all_records["unprompted"] = unprompted_records
    
    # Evaluate prompted baseline
    if evaluate_prompted and instructions.prompts:
        print("\n--- Prompted Baseline ---")
        for prompt_idx in range(len(instructions.prompts)):
            prompted_records = generate_rollouts(
                dataset=dataset,
                instructions=instructions,
                controller=controller,
                tokenizer=tokenizer,
                model=base_model,
                vector=None,
                variant_name="prompted",
                num_rollouts=num_rollouts,
                use_system_prompt=True,
                prompt_index=prompt_idx,
            )
            all_records[f"prompted_{prompt_idx}"] = prompted_records
    
    # Evaluate trained vector
    if evaluate_trained:
        print("\n--- Trained Steering Vector ---")
        result = load_trained_steering_vector(dataset)
        if result is not None:
            trained_vec, trained_layer = result
            trained_records = generate_rollouts(
                dataset=dataset,
                instructions=instructions,
                controller=controller,
                tokenizer=tokenizer,
                model=base_model,
                vector=trained_vec,
                layer=trained_layer,
                variant_name="trained",
                num_rollouts=num_rollouts,
                use_system_prompt=False,
                rescale_norm=trained_rescale_norm,
            )
            all_records["trained"] = trained_records
    
    # Evaluate activation vector
    if evaluate_activation:
        print("\n--- Activation Steering Vector ---")
        activation_vec = load_activation_steering_vector(dataset, layer=TARGET_LAYER)
        if activation_vec is not None:
            activation_records = generate_rollouts(
                dataset=dataset,
                instructions=instructions,
                controller=controller,
                tokenizer=tokenizer,
                model=base_model,
                vector=activation_vec,
                layer=TARGET_LAYER,
                variant_name="activation",
                num_rollouts=num_rollouts,
                use_system_prompt=False,
                rescale_norm=activation_rescale_norm,
            )
            all_records["activation"] = activation_records
    
    # Save all records
    if save_outputs:
        all_records_flat = []
        for variant_records in all_records.values():
            all_records_flat.extend(variant_records)
        save_rollouts(all_records_flat, dataset=dataset)
    
    print(f"\n{'='*60}")
    print(f"Evaluation complete: {sum(len(r) for r in all_records.values())} total rollouts")
    print(f"{'='*60}\n")
    
    return all_records

# Example usage
# results = evaluate_dataset_complete(
#     dataset="qwen-3-32b__trait__acerbic",
#     num_rollouts=3,
#     evaluate_unprompted=True,
#     trained_rescale_norm=10.0,  # Rescale trained vector to L2 norm of 10
#     activation_rescale_norm=10.0,  # Rescale activation vector to L2 norm of 10
# )

## 11. Example: Complete Workflow

Run a complete evaluation for a specific dataset.

In [18]:
# Example: Evaluate a specific dataset
DATASET_TO_EVALUATE = "qwen-3-32b__trait__acerbic"

# Run complete evaluation
results = evaluate_dataset_complete(
    dataset=DATASET_TO_EVALUATE,
    num_rollouts=1,
    evaluate_trained=False,
    evaluate_activation=True,
    evaluate_prompted=False,
    evaluate_unprompted=False,
    activation_rescale_norm=200.0,
    save_outputs=True,
)

# Display sample outputs
for variant_name, records in results.items():
    print(f"\n{variant_name.upper()} - Sample response:")
    if records:
        sample = records[0]
        print(f"Question: {sample['question']}")
        print(f"Response: {sample['response'][:200]}...")
        print()


Evaluating: qwen-3-32b__trait__acerbic

Loaded instructions for qwen-3-32b__trait__acerbic
  Prompts: 5, Questions: 40

--- Activation Steering Vector ---
Loaded activation vector for qwen-3-32b__trait__acerbic
  Layer: 30, Norm: 70.2444, Shape: torch.Size([5120])
Rescaled vector from 70.2444 to 200.0000


                                                         

Generated 40 rollouts for activation
Saved 40 records to /workspace/steering_rollouts_qwen3_layer_30/qwen-3-32b__trait__acerbic/rollouts.jsonl

Evaluation complete: 40 total rollouts


ACTIVATION - Sample response:
Question: What do you think about people who are always late to meetings?
Response: <think>

</think>

People who are always late to meetings? Ugh. It's like they're giving you a middle finger in the form of a time management seminar. First of all, lateness is a declaration of war on...





## 12. Cleanup

Clean up resources when done.

In [13]:
raise Exception("Stop here")

# Clean up steering controller
controller.close()

# Clear CUDA cache if needed
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared")

Exception: Stop here