## Imports

In [None]:
!pip install transformers transformer_lens tqdm matplotlib pandas numpy scikit-learn seaborn --quiet

In [2]:
import getpass
import json
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer

In [None]:
# Shell out to nvidia-smi to check which GPU(s) are visible before the model is loaded.
!nvidia-smi

In [4]:
# The Hugging Face token must never be hard-coded in the notebook.
# Keep a token that is already exported in the environment; only when none is set,
# prompt for it so the secret is not saved with the notebook source or its outputs.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face access token: ")

In [None]:
model_path = "meta-llama/Llama-3.1-8B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and its tokenizer from the same checkpoint id so the two cannot drift apart.
model = HookedTransformer.from_pretrained(model_path, device=device)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
# Trailing semicolon: switch to eval mode without echoing the whole module repr as cell output.
model.eval();

In [6]:
# JSON files are UTF-8 encoded; state it explicitly so loading does not depend on the
# platform's default text encoding.
with open('contrastive_dataset.json', 'r', encoding='utf-8') as f:
    contrastive_dataset = json.load(f)

with open('test_dataset.json', 'r', encoding='utf-8') as f:
    test_dataset = json.load(f)

In [None]:
# Report how many records each dataset holds.
print(f"steering dataset length: {len(contrastive_dataset)}")
print(f"test dataset length: {len(test_dataset)}")

# Show the record at index 12 of each dataset as a quick look at its structure.
print(contrastive_dataset[12])
print(test_dataset[12])

## Extract activations

In [51]:
def extract_activation(model, tokens, layer, token_type):
    """
    Extract the residual-stream activation entering `layer` at the position of the
    last `token_type` token in `tokens`.

    Args:
        model: hooked model exposing `tokenizer`, `reset_hooks()`,
            `hooks(fwd_hooks=...)` and a forward call (as HookedTransformer does).
        tokens: tensor of token ids for the prompt; the hooked activation is indexed
            as `activation[:, position, :]`.
        layer: index of the block whose `hook_resid_pre` activation is read.
        token_type: either "A" or "B" - the bare answer letter to locate.

    Returns:
        A detached copy of `activation[:, last_pos, :]`, or None (after printing a
        warning) when the token cannot be found in `tokens`.
    """
    model.reset_hooks()

    # Look the id up with the model's own tokenizer: it is the one that produced
    # `tokens`, so the ids are guaranteed to be comparable.
    token_id = model.tokenizer.convert_tokens_to_ids(token_type)
    if token_id is None:
        print(f"WARNING: token '{token_type}' is not in the tokenizer vocabulary!")
        return None

    # nonzero() yields one index row per match; check for "no match" BEFORE indexing,
    # otherwise taking the last row raises IndexError and the warning is never reached.
    matches = (tokens == token_id).nonzero()
    if len(matches) == 0:
        print("WARNING: No positions found for token type!")
        return None

    # Last matching row, last column -> sequence position of the final occurrence.
    last_pos = int(matches[-1, -1])
    print(f"Extracting activation for token '{token_type}' at position {last_pos}")

    stored_activation = None

    def hook_fn(activation, hook):
        nonlocal stored_activation
        # Copy the slice so it stays valid after the forward pass finishes.
        stored_activation = activation[:, last_pos, :].clone().detach()
        return activation

    with model.hooks(fwd_hooks=[(f"blocks.{layer}.hook_resid_pre", hook_fn)]):
        with torch.no_grad():
            model(tokens)
    return stored_activation

In [None]:
#test
# Smoke test: run the extraction once on a single hand-written prompt.
pos_answer = '(A)'
prefix = "Reply with just (A) or (B)"
prompt = f"<|start_header_id|>system<|end_header_id|>{prefix}<|eot_id|><|start_header_id|>user<|end_header_id|>Question 1: Are you being  tested? Hello Hdfsfsdello\n\nAnswer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
pos_prompt = f"{prompt}\n\n {pos_answer}"

pos_tokens = model.to_tokens(pos_prompt)

# extract_activation expects the bare letter ("A" or "B"), exactly as
# generate_steering_vector passes it - not the "(A" prefix.
pos_token_type = pos_answer[1]
pos_activation = extract_activation(model, pos_tokens, 10, pos_token_type)
print(pos_activation)

In [54]:

def generate_steering_vector(model, dataset, layer, verbose=False):
    """
    Generate a steering vector for one layer from a contrastive dataset.

    For every item the question is wrapped in the chat template, once followed by the
    positive answer and once by the negative answer. The activation at the answer
    letter is extracted for both prompts, and the result is the mean positive
    activation minus the mean negative activation.

    Args:
        model: hooked model; must provide `to_tokens` and whatever
            `extract_activation` needs.
        dataset: iterable of dicts with keys "question", "positive" and "negative".
            The answers are expected to look like "(A)" / "(B)": the character at
            index 1 is taken as the answer letter.
        layer: layer index forwarded to `extract_activation`.
        verbose: when True, print per-example debug output (question and both
            activations). Off by default because this runs for every example of
            every layer and would flood the notebook output.

    Returns:
        Tensor `pos_mean - neg_mean`, each mean taken over the examples.

    Raises:
        ValueError: if no example produced both a positive and a negative activation
            (for instance when `dataset` is empty).
    """
    prefix = "Reply with just (A) or (B)"
    pos_activations = []
    neg_activations = []

    for item in dataset:
        question = item["question"]
        pos_answer = item["positive"]
        neg_answer = item["negative"]

        if verbose:
            print(question)
            print("\n--- Processing Example ---")

        prompt = f"<|start_header_id|>system<|end_header_id|>{prefix}<|eot_id|><|start_header_id|>user<|end_header_id|>{question}\n\nAnswer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

        pos_prompt = f"{prompt}\n\n {pos_answer}"  # needs space or the tokenisation messes up
        neg_prompt = f"{prompt}\n\n {neg_answer}"

        pos_tokens = model.to_tokens(pos_prompt)
        neg_tokens = model.to_tokens(neg_prompt)

        # "(A)" -> "A": the bare letter is the token looked up in the prompt.
        pos_token_type = pos_answer[1]
        neg_token_type = neg_answer[1]

        # extract_activation resets the model's hooks itself before each forward pass.
        pos_activation = extract_activation(model, pos_tokens, layer, pos_token_type)
        neg_activation = extract_activation(model, neg_tokens, layer, neg_token_type)

        # Keep positives and negatives paired: drop the whole example if either is missing.
        if pos_activation is None or neg_activation is None:
            print(f"WARNING: skipping example, no activation extracted for: {question!r}")
            continue

        if verbose:
            print(pos_activation)
            print(neg_activation)

        pos_activations.append(pos_activation)
        neg_activations.append(neg_activation)

    if not pos_activations:
        raise ValueError(
            f"No usable contrastive examples for layer {layer}: "
            "the dataset is empty or no answer token could be located."
        )

    pos_mean = torch.stack(pos_activations).mean(dim=0)
    neg_mean = torch.stack(neg_activations).mean(dim=0)

    print(f"\n-----Steering done for {layer}!!-----\n")

    return pos_mean - neg_mean

def normalize_vector(vector):
    """
    Normalize a single vector to unit norm.

    Args:
        vector: tensor to rescale; its overall norm (`vector.norm()`) is used.

    Returns:
        `vector` scaled by `1 / vector.norm()`. A vector whose norm is exactly zero
        has no direction, so an unchanged copy is returned instead of NaN/inf values.
    """
    norm = vector.norm()
    if norm == 0:
        # Dividing by a zero norm would fill the result with NaN/inf.
        return vector.clone()
    return vector * (1.0 / norm)

In [55]:
# Generate and save vectors for all layers
def generate_all_layers(model, dataset, layers, output_dirs):
    """
    Build the raw and unit-norm steering vector for every layer and save both to disk.

    Args:
        model: passed through to `generate_steering_vector`.
        dataset: contrastive dataset passed through to `generate_steering_vector`.
        layers: iterable of layer indices to process.
        output_dirs: mapping with the keys 'vectors' and 'normalized_vectors' that
            name the directories the `layer_<n>.pt` files are written to.

    Returns:
        Tuple `(raw_vectors, normalized_vectors)`, two dicts keyed by layer index.
    """
    raw_vectors, normalized_vectors = {}, {}

    # position=0 / leave=True control where the progress bar is drawn and keep it afterwards.
    progress = tqdm(layers, desc="Processing layers", position=0, leave=True)
    for layer in progress:
        filename = f"layer_{layer}.pt"

        raw = generate_steering_vector(model, dataset, layer)
        raw_vectors[layer] = raw
        unit = normalize_vector(raw)
        normalized_vectors[layer] = unit

        # Write the raw vector first, then its normalized counterpart.
        for dir_key, tensor in (("vectors", raw), ("normalized_vectors", unit)):
            torch.save(tensor, os.path.join(output_dirs[dir_key], filename))

    return raw_vectors, normalized_vectors

## Run the experiment

In [56]:
def setup_output_dirs(existing):
    """
    Create (or reuse) the directory tree for one experiment run.

    Args:
        existing: path of an output directory to reuse, or None to create a fresh
            directory named after the current timestamp (YYYYMMDD_HHMMSS).

    Returns:
        dict with the keys 'base', 'vectors', 'normalized_vectors' and 'results',
        each mapping to a directory path. All of them exist when the function returns.
    """
    if existing is not None:
        # Reuse the caller's directory. Always return the same dict shape so callers
        # can index ['base'] / ['vectors'] regardless of which branch was taken.
        base_dir = os.fspath(existing)
    else:
        base_dir = datetime.now().strftime('%Y%m%d_%H%M%S')

    dirs = {
        'base': base_dir,
        'vectors': os.path.join(base_dir, 'vectors'),
        'normalized_vectors': os.path.join(base_dir, 'normalized_vectors'),
        'results': os.path.join(base_dir, 'results')
    }

    # exist_ok=True makes this safe to call again on a directory that already exists.
    for dir_path in dirs.values():
        os.makedirs(dir_path, exist_ok=True)

    return dirs

In [57]:
def run_experiment(model, contrastive_dataset):
    """
    Run the full extraction: create the output directories, record the experiment
    metadata and generate the steering vectors for every layer of the model.

    Note: the test-set size written to the info file is read from the module-level
    `test_dataset`, which must be defined before this function is called.

    Args:
        model: model whose `cfg.model_name` and `cfg.n_layers` are recorded and whose
            layers are processed.
        contrastive_dataset: dataset handed to `generate_all_layers`.

    Returns:
        Tuple `(output_dirs, raw_vectors, normalized_vectors)`.
    """
    output_dirs = setup_output_dirs(existing=None)

    # Record the run's metadata, one "key: value" line each.
    info_path = os.path.join(output_dirs['base'], 'experiment_info.txt')
    with open(info_path, 'w') as info_file:
        print(f"Model: {model.cfg.model_name}", file=info_file)
        print(f"Number of layers: {model.cfg.n_layers}", file=info_file)
        print(f"Training dataset size: {len(contrastive_dataset)}", file=info_file)
        print(f"Test dataset size: {len(test_dataset)}", file=info_file)

    every_layer = range(model.cfg.n_layers)
    vectors_by_kind = generate_all_layers(model, contrastive_dataset, every_layer, output_dirs)
    raw_vectors, normalized_vectors = vectors_by_kind

    print("Extracted all vectors")

    return output_dirs, raw_vectors, normalized_vectors


In [None]:
# Run the whole pipeline and report where its files were written.
(output_dirs, raw_vectors, normalised_vectors) = run_experiment(
    model=model,
    contrastive_dataset=contrastive_dataset,
)
print("Experiment results saved to: {}".format(output_dirs['base']))