# **Conceptor Steering in the context of Function Vectors** 🧠🤖🛞

A novel technique called "Conceptor-Based Activation Engineering".
We are bravely attempting to steer the behavior of a GPT-J-6B model using Conceptors.
Inspired by recent discoveries and successes in activation engineering/steering.

by Joris Postmus & Steven Abreu (supervisor)

# **Imports & Libraries**

In [None]:
import torch
import os
import pandas as pd
import numpy as np
from transformer_lens import HookedTransformer
from datetime import datetime
from typing import List
from dataclasses import dataclass
from datetime import datetime
from collections import Counter
import random
import json

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# **Helper Functions**

In [None]:
def load_input_output_pairs(path):
    """
    Loads input-output pairs from a JSON file located at the specified path.
    These pairs could represent any type of relational data, such as antonyms, country-capital, or uncapitalized-capitalized pairs.

    Args:
        path (str): The path to the JSON file containing the input-output pairs.

    Returns:
        The object decoded from the JSON file. The other helpers in this file
        treat it as a list of dicts with "input" and "output" keys.
    """
    # Read as UTF-8 explicitly so non-ASCII pairs decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
def get_output(input_string, pairs):
    """
    Looks up the output paired with a given input string.

    Args:
        input_string (str): The input whose paired output is wanted.
        pairs (list): Input-output pairs, each a dict with "input" and "output" keys.

    Returns:
        str or None: The output of the first pair whose input equals
        `input_string`, or None when no pair matches.
    """
    # Lazy generator: scanning stops at the first matching pair.
    matching_outputs = (entry["output"] for entry in pairs if entry["input"] == input_string)
    return next(matching_outputs, None)

In [None]:
def get_input(output_string, pairs):
    """
    Looks up the input paired with a given output string.

    Args:
        output_string (str): The output whose paired input is wanted.
        pairs (list): Input-output pairs, each a dict with "input" and "output" keys.

    Returns:
        str or None: The input of the first pair whose output equals
        `output_string`, or None when no pair matches.
    """
    # Lazy generator: scanning stops at the first matching pair.
    matching_inputs = (entry["input"] for entry in pairs if entry["output"] == output_string)
    return next(matching_inputs, None)

In [None]:
def create_random_pairs_string(pairs, num_pairs):
    """
    Creates a string of randomly selected input-output pairs from a given list, with the last pair missing its output.

    The result looks like "in1:out1, in2:out2, ..., inN:". With `num_pairs == 1`
    it is just "in1:" (no leading separator).

    Args:
        pairs (list): The list of input-output pairs to sample from.
        num_pairs (int): The number of pairs to include in the string (at least 1).

    Returns:
        str: A string formatted with randomly selected pairs, where the last pair is missing its output.

    Raises:
        ValueError: If `num_pairs` is smaller than 1, or (raised by
            `random.sample`) larger than the number of available pairs.
    """
    if num_pairs < 1:
        raise ValueError("num_pairs must be at least 1.")

    sampled_pairs = random.sample(pairs, num_pairs)
    # Every sampled pair except the last is a full demonstration; the last one
    # is the query and is left without its output.
    *demonstrations, query = sampled_pairs
    parts = [f"{pair['input']}:{pair['output']}" for pair in demonstrations]
    parts.append(f"{query['input']}:")
    # Joining all parts at once avoids the stray leading ", " that appeared
    # when there were no demonstrations.
    return ', '.join(parts)

In [None]:
def get_unique_random_inputs_formatted(pairs, n):
    """
    Randomly draws N input strings from the pairs and appends ':' to each.

    Inputs are de-duplicated first. If at least N distinct inputs exist they
    are drawn without replacement; otherwise they are drawn with replacement,
    so the result may contain repeats.

    Args:
        pairs (list): The list of input-output pairs to sample from.
        n (int): The number of input strings to return.

    Returns:
        list: N input strings, each with a ':' appended.

    Raises:
        ValueError: If `pairs` contains no inputs at all.
    """
    distinct_inputs = list({entry["input"] for entry in pairs})
    if not distinct_inputs:
        raise ValueError("No unique inputs available to sample from.")

    # Too few distinct inputs: fall back to sampling with replacement.
    if n > len(distinct_inputs):
        drawn = random.choices(distinct_inputs, k=n)
    else:
        drawn = random.sample(distinct_inputs, n)

    return [text + ":" for text in drawn]

In [None]:
def create_layer_subsets(lst):
    """
    Generates nested subsets by repeatedly removing the first and last elements.

    The first subset is `lst` itself. Trimming continues while more than two
    elements remain, so the last subset is a pair for even lengths and the
    single middle element for odd lengths. Each subset appears exactly once.

    Args:
        lst (list): Incremental list created using range.

    Returns:
        list: List of subsets, from the widest to the narrowest.
    """
    result = [lst]
    while len(lst) > 2:
        lst = lst[1:-1]
        result.append(lst)
    # The loop already appended the final subset; appending again when two
    # elements remain would duplicate the last pair.
    return result

In [None]:
def extract_activations_last_token(model, steering_prompts, extraction_layers, device):
    """
    Extract activations for the last token of each steering prompt from specific layers of the model.

    Parameters:
    - model (HookedTransformer): The model used for generating text.
    - steering_prompts (list): List of steering prompts to extract activations for.
    - extraction_layers (list): The layers from which activations are extracted.
    - device (str): The computing device (e.g., 'cuda', 'cpu').

    Returns:
    - dict: A dictionary where each key is a layer number and each value is the
            float32 activations for the last token of each prompt.
            Shape: (n_prompts, n_activations), also when there is only one prompt.
    """
    hook_names = {layer: f"blocks.{layer}.hook_resid_pre" for layer in extraction_layers}
    wanted_names = set(hook_names.values())
    cache, caching_hooks, _ = model.get_caching_hooks(lambda n: n in wanted_names)

    # Left padding puts every prompt's final real token at position -1.
    model.tokenizer.padding_side = 'left'
    with model.hooks(fwd_hooks=caching_hooks):
        _ = model(steering_prompts)

    activations_dict = {}
    for layer, name in hook_names.items():
        # Indexing [:, -1, :] already yields a 2-D tensor. No squeeze here:
        # it would drop the batch axis when a single prompt is given.
        last_token_activations = cache[name].detach()[:, -1, :]
        # copy=True so the result never aliases (and keeps alive) the full cache entry.
        activations_dict[layer] = last_token_activations.to(device=device, dtype=torch.float, copy=True)

    return activations_dict

In [None]:
def steer(C, x, beta):
    """
    Steers the given vector x using the conceptor C.

    Computes `beta * (C @ x)`. The conceptor is converted to the dtype and
    device of `x` first, so the function works for any floating-point
    activation dtype and not only float16.

    Args:
        C (torch.Tensor): The conceptor matrix.
        x (torch.Tensor): The vector to be steered.
        beta (float): Multiplicative scale applied to the projected vector
            (0 returns a zero vector, 1 returns the plain projection C @ x).

    Returns:
        torch.Tensor: The steered vector, with the same dtype as `x`.
    """
    C = C.to(device=x.device, dtype=x.dtype)
    return beta * torch.matmul(C, x)

In [None]:
def generate_conceptor_hook(conceptor, beta):
    """
    Generates a hook function to apply a conceptor to the last token.

    The hook replaces the last-position activation x of every batch element
    with `beta * (C @ x)`, editing `resid_pre` in place.

    Parameters:
    - conceptor (torch.Tensor): Conceptor matrix of shape (n_features, n_features).
    - beta (float): Scaling factor.

    Returns:
    - function: Hook function for applying conceptor. It takes
      `(resid_pre, hook)`, modifies `resid_pre` in place and returns None.
    """
    def last_token_steering_hook(resid_pre, hook):
        last_token = resid_pre[:, -1, :]
        # Convert the conceptor once per call (not once per batch element) and
        # match the activation dtype/device instead of assuming float16.
        C = conceptor.to(device=last_token.device, dtype=last_token.dtype)
        # Row i of (X @ C.T) equals C @ x_i, so one matmul steers the whole batch.
        resid_pre[:, -1, :] = beta * torch.matmul(last_token, C.T)

    return last_token_steering_hook

In [None]:
def generate_conceptor_hook_mean_centered(conceptor, mean_train, beta):
    """
    Generates a hook function to apply a mean-centered conceptor to the last token.

    The hook replaces the last-position activation x of every batch element
    with `beta * (C @ (x - mean_train)) + mean_train`, editing `resid_pre` in place.

    Parameters:
    - conceptor (torch.Tensor): Conceptor matrix of shape (n_features, n_features).
    - mean_train (torch.Tensor): Mean training vector, subtracted before and
      added back after the conceptor is applied.
    - beta (float): Scaling factor.

    Returns:
    - function: Hook function for applying mean-centered conceptor. It takes
      `(resid_pre, hook)`, modifies `resid_pre` in place and returns None.
    """
    def last_token_steering_hook(resid_pre, hook):
        centered = resid_pre[:, -1, :] - mean_train
        # Convert the conceptor once per call (not once per batch element) and
        # match the dtype/device of the centered activations.
        C = conceptor.to(device=centered.device, dtype=centered.dtype)
        # Row i of (X @ C.T) equals C @ x_i, so one matmul steers the whole batch.
        resid_pre[:, -1, :] = beta * torch.matmul(centered, C.T) + mean_train

    return last_token_steering_hook

In [None]:
def generate_ave_hook_addition(steering_vector, beta):
    """
    Builds a hook that adds a scaled steering vector to the last token.

    Parameters:
    - steering_vector (torch.Tensor): Steering vector; singleton dimensions are squeezed away.
    - beta (float): Scaling factor applied to the steering vector.

    Returns:
    - function: Hook taking `(resid_pre, hook)` that adds
      `steering_vector * beta` in place to the last position of every batch element.
    """
    def last_token_steering_hook(resid_pre, hook):
        scaled_offset = steering_vector.squeeze() * beta
        # Broadcasting applies the same offset to each batch element's last position.
        resid_pre[:, -1, :] += scaled_offset

    return last_token_steering_hook

In [None]:
def generate_ave_hook_addition_mean_centered(steering_vector, mean_train, beta):
    """
    Builds a hook that adds a scaled, mean-centered steering vector to the last token.

    Parameters:
    - steering_vector (torch.Tensor): Steering vector; singleton dimensions are squeezed away.
    - mean_train (torch.Tensor): Mean training vector subtracted from the steering vector.
    - beta (float): Scaling factor applied to the centered steering vector.

    Returns:
    - function: Hook taking `(resid_pre, hook)` that adds
      `(steering_vector - mean_train) * beta` in place to the last position of
      every batch element.
    """
    def last_token_steering_hook(resid_pre, hook):
        centered_offset = (steering_vector.squeeze() - mean_train) * beta
        # Broadcasting applies the same offset to each batch element's last position.
        resid_pre[:, -1, :] += centered_offset

    return last_token_steering_hook

In [None]:
def compute_conceptor(X, aperture):
    """
    Computes the conceptor matrix for a given input matrix X.
    (PyTorch version)

    With R = X^T X / n_samples and its decomposition R = U diag(S) U^T, the
    conceptor is C = U diag(S / (S + aperture^-2)) U^T.

    Parameters:
    - X (torch.Tensor): Input matrix of shape (n_samples, n_features).
    - aperture (float): Conceptor aperture (non-zero); larger values push the
      scaled singular values closer to 1.

    Returns:
    - torch.Tensor: Conceptor matrix of shape (n_features, n_features).
    """
    R = torch.matmul(X.T, X) / X.shape[0]
    # torch.svd is deprecated; torch.linalg.svd is its replacement. Only U and
    # S are used, so the V / Vh difference between the two does not matter.
    U, S, _ = torch.linalg.svd(R)
    scaled_singular_values = S / (S + aperture ** (-2))
    # (U * s) scales column j of U by s_j, i.e. U @ diag(s).
    return (U * scaled_singular_values) @ U.T

In [None]:
def top_1_first_tokens(model, prompts: List[str], fwd_hooks=None):
    """
    Retrieves the top token predictions for the first tokens after the prompts.

    Parameters:
    - model: Language model with hooks and tokenizer.
    - prompts (List[str]): List of prompt strings.
    - fwd_hooks (list, optional): List of forward hooks to apply during the model run.
      Defaults to no hooks.

    Returns:
    - List[str]: List of top predicted tokens (decoded), one per prompt.
    """
    # None instead of a mutable [] default, which would be shared across calls.
    active_hooks = [] if fwd_hooks is None else fwd_hooks

    # Left padding puts every prompt's final real token at position -1.
    model.tokenizer.padding_side = 'left'
    with model.hooks(fwd_hooks=active_hooks):
        input_prompts_tokenized = model.to_tokens(prompts)
        # A plain forward pass is enough: only the logits are needed, so there
        # is no reason to cache (and then discard) every activation.
        logits = model(input_prompts_tokenized)

    # Softmax is monotonic, so the argmax of the raw logits is the top-1 token.
    top_indices = torch.argmax(logits[:, -1, :], dim=-1)
    return [model.tokenizer.decode([index.item()]) for index in top_indices]

# **Configurations**

**Config**

In [None]:
# Maps each task name to the JSON file holding its input-output pairs
STEERING_PROMPTS_PATHS = {
    'singular_plural': './Prompts/singular-plural.json',
    'antonyms': './Prompts/antonyms.json',
    'present-past': './Prompts/present-past.json',
    'english-french': './Prompts/english-french.json',
    'country-capital': './Prompts/country-capital.json',
    'capitalize': './Prompts/capitalize.json'
}

# Layers where the function is present (taken from the FunctionVectors paper).
# range(9, 17) covers layers 9..16 because the upper bound is exclusive.
# Per the original note, layers outside this window cannot be used because mean
# training activations were only computed for it.
ACTIVE_LAYERS = {
    'singular_plural': range(9,17),
    'antonyms': range(9,17),
    'present-past': range(9, 17),
    'english-french': range(9,17),
    'country-capital': range(9,17),
    'capitalize': range(9,17)
}

# CHANGE THIS TO CHANGE THE FUNCTION TASK (must be a key of the two dicts above)
TASK = "antonyms"

STEERING_PROMPTS_PATH = STEERING_PROMPTS_PATHS[TASK]
MEAN_ACTIVATIONS_PATH='./Results/activations_mean_train.pkl'
RESULTS_PATH='./Results/'
MODEL_NAME = 'EleutherAI/gpt-j-6b'
DTYPE = 'float16'
SAVE_RESULTS = True

# Experimental variables
NUM_STEERING_PROMPTS = 100              # Number of steering prompts generated per experiment (Np)
NUM_STEERING_EXAMPLES_PER_PROMPT = 10   # Number of input-output examples in each steering prompt (N)
NUM_INPUT_PROMPTS = 1000                # Number of input prompts to test the model on (Nt)
NUM_EXPERIMENTS = 5                     # Number of experiments to run

# Experimental configurations
# NOTE: "aperatures" is a misspelling of "apertures"; the names are kept because other cells use them.
list_extraction_layers = ACTIVE_LAYERS[TASK]                                        # Layers to steer
list_beta_averaging = [2.3]                                                         # Beta value(s) for additive (averaging) steering
list_beta_conceptor = [3.9]                                                         # Beta value(s) for conceptor steering
list_aperatures_normal =  [0.0125]                                                  # Aperture value(s) for normal conceptors
list_aperatures_mean_centered =  [0.05]                                             # Aperture value(s) for mean-centered conceptors

**Load the mean training activations**

In [None]:
import pickle

# NOTE: pickle can execute arbitrary code while loading; only load files you created or trust.
with open(MEAN_ACTIVATIONS_PATH, 'rb') as file:
    mean_train_activations = pickle.load(file)

# Use the same GPU-if-available fallback as the model-loading cell instead of
# assuming CUDA is present.
# NOTE(review): if the pickle itself stores CUDA tensors, unpickling will still
# need a GPU - confirm how the file was written.
_mean_activations_device = 'cuda' if torch.cuda.is_available() else 'cpu'
mean_train_activations = {
    key: tensor.to(_mean_activations_device)
    for key, tensor in mean_train_activations.items()
}

**Data classes for experiments**

In [None]:
@dataclass
class ExperimentConfigConceptor:
    """One conceptor-steering configuration: a layer, a beta and an aperture."""
    # Layer index; the experiment cells use it to look up the cached conceptor
    # and to build the hook name "blocks.<layer>.hook_resid_pre".
    extraction_layer: int
    # Scaling factor passed to the conceptor hook generators.
    beta: float
    # Aperture of the conceptor; part of the conceptor cache key.
    aperture: float

@dataclass
class ExperimentConfigAveraging:
    """One additive (averaging) steering configuration: a layer and a beta."""
    # Layer index; the experiment cell uses it to look up the cached averaged activations.
    extraction_layer: int
    # Scaling factor for the steering vector.
    beta: float

In [None]:
# SINGLE-LAYER
def _conceptor_grid(apertures):
    """Build one ExperimentConfigConceptor per (layer, beta, aperture) combination."""
    return [
        ExperimentConfigConceptor(
            extraction_layer=layer_index,
            beta=beta_value,
            aperture=aperture_value,
        )
        for layer_index in list_extraction_layers
        for beta_value in list_beta_conceptor
        for aperture_value in apertures
    ]


# Additive steering: every layer combined with every averaging beta.
configs_averaging = [
    ExperimentConfigAveraging(extraction_layer=layer_index, beta=beta_value)
    for layer_index in list_extraction_layers
    for beta_value in list_beta_averaging
]

# Conceptor steering: same grid shape, differing only in the aperture list.
configs_conceptoring = _conceptor_grid(list_aperatures_normal)

configs_conceptoring_mean_centered = _conceptor_grid(list_aperatures_mean_centered)

**Load model**

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

# The notebook only runs inference, so gradient tracking is switched off globally
torch.set_grad_enabled(False)

model = HookedTransformer.from_pretrained_no_processing(
    model_name=MODEL_NAME,
    device=DEVICE,
    dtype=DTYPE,
)
model.eval();

Using device: cuda




config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Loaded pretrained model EleutherAI/gpt-j-6b into HookedTransformer


# **Experiment Prep**

**Generate experimental data**

In [None]:
# Store separate prompts and steering examples for each experiment
experiment_data = {}

# The pair file is the same for every experiment, so it is read once here
# rather than once per loop iteration.
input_output_pairs = load_input_output_pairs(STEERING_PROMPTS_PATH)

for exp in range(NUM_EXPERIMENTS):
    # Few-shot steering prompts: each is a random set of pairs with the last output left open
    steering_examples = [
        create_random_pairs_string(input_output_pairs, NUM_STEERING_EXAMPLES_PER_PROMPT)
        for _ in range(NUM_STEERING_PROMPTS)
    ]
    # Zero-shot test prompts of the form "<input>:"
    prompts_to_steer = get_unique_random_inputs_formatted(input_output_pairs, NUM_INPUT_PROMPTS)

    # prompt[:-1] strips the trailing ':' to recover the raw input string
    correct_outputs_full = [get_output(prompt[:-1], input_output_pairs) for prompt in prompts_to_steer]
    correct_outputs_full_tokenized = [model.tokenizer.tokenize(output) for output in correct_outputs_full]
    # Only the first token of each expected output is scored.
    # NOTE(review): these are raw tokenizer tokens, while the predictions they are
    # compared with come from tokenizer.decode; the two may differ for tokens with
    # non-ASCII characters or leading spaces - confirm against the tokenizer used.
    correct_outputs_1st = [tokens[0] for tokens in correct_outputs_full_tokenized]

    experiment_data[exp] = {
        'prompts_to_steer': prompts_to_steer,
        'steering_examples': steering_examples,
        'correct_outputs_1st': correct_outputs_1st
    }

**Generate activations**

In [None]:
from tqdm import tqdm

# activations_cache[exp][layer] -> last-token activations of that experiment's steering prompts
activations_cache = {exp: {} for exp in range(NUM_EXPERIMENTS)}

# One forward pass per experiment captures every layer, so the bar counts experiments
total_computations = NUM_EXPERIMENTS

with tqdm(total=total_computations, desc="Precomputing activations") as pbar:
    for exp in range(NUM_EXPERIMENTS):
        steering_prompts = experiment_data[exp]['steering_examples']
        activations_dict = extract_activations_last_token(
            model, steering_prompts, list_extraction_layers, device=DEVICE
        )
        # Store a squeezed copy of each requested layer under this experiment
        activations_cache[exp].update(
            (layer, activations_dict[layer].squeeze()) for layer in list_extraction_layers
        )
        pbar.update(1)

Precomputing activations: 100%|██████████| 5/5 [00:14<00:00,  2.88s/it]


# **Baseline (no steering)**

In [None]:
import math

# Measures unsteered top-1 accuracy: for every experiment, run the test prompts
# through the model without hooks and count how often the predicted next token
# equals the first token of the expected output.
print("Calculating baseline accuracy...")
# Totals accumulated over all experiments
baseline_success_count = 0
total_prompts_for_baseline = 0

# Number of prompts per forward pass
batch_size = 50
# One result row per experiment, plus a final 'Overall' row
baseline_data = []

for exp in range(NUM_EXPERIMENTS):
    # Per-experiment counters
    success_count = 0
    total_prompts = 0

    # Retrieve experiment-specific prompts and correct outputs
    prompts_to_steer = experiment_data[exp]['prompts_to_steer']
    correct_outputs_1st = experiment_data[exp]['correct_outputs_1st']

    # Round up so a final, smaller batch is still processed
    num_batches = math.ceil(len(prompts_to_steer) / batch_size)

    for batch_idx in range(num_batches):
        # The same slice is taken from prompts and expected tokens to keep them aligned
        batch_prompts = prompts_to_steer[batch_idx * batch_size : (batch_idx + 1) * batch_size]
        batch_correct_outputs_1st = correct_outputs_1st[batch_idx * batch_size : (batch_idx + 1) * batch_size]

        # Generate baseline outputs from input prompts
        top_1_tokens = top_1_first_tokens(model, batch_prompts, fwd_hooks=[])

        # Increment success count if top-1 output matches the correct output
        for i, top_1_token in enumerate(top_1_tokens):
            if top_1_token == batch_correct_outputs_1st[i]:
                success_count += 1
            total_prompts += 1

    # Accuracy as a percentage of this experiment's prompts
    baseline_accuracy = (success_count / total_prompts) * 100
    print(f"Experiment {exp + 1} Baseline Accuracy: {baseline_accuracy:.2f}% ({success_count}/{total_prompts} samples)")

    # Store results for this experiment
    baseline_data.append({
        'experiment': exp + 1,
        'success_count': success_count,
        'total_prompts': total_prompts,
        'baseline_accuracy': baseline_accuracy
    })

    # Update overall baseline success count and total prompts
    baseline_success_count += success_count
    total_prompts_for_baseline += total_prompts

# Calculate and print the overall baseline accuracy
overall_baseline_accuracy = (baseline_success_count / total_prompts_for_baseline) * 100
print(f"Overall Baseline Unsteered Accuracy: {overall_baseline_accuracy:.2f}% ({baseline_success_count}/{total_prompts_for_baseline} samples)")

# Store the overall results
baseline_data.append({
    'experiment': 'Overall',
    'success_count': baseline_success_count,
    'total_prompts': total_prompts_for_baseline,
    'baseline_accuracy': overall_baseline_accuracy
})

Calculating baseline accuracy...
Experiment 1 Baseline Accuracy: 0.00% (0/1000 samples)
Experiment 2 Baseline Accuracy: 0.00% (0/1000 samples)
Experiment 3 Baseline Accuracy: 0.00% (0/1000 samples)
Experiment 4 Baseline Accuracy: 0.00% (0/1000 samples)
Experiment 5 Baseline Accuracy: 0.00% (0/1000 samples)
Overall Baseline Unsteered Accuracy: 0.00% (0/5000 samples)


In [None]:
if SAVE_RESULTS:
    # Write the per-experiment and overall baseline rows to a task-specific CSV
    baseline_csv_path = f'{RESULTS_PATH}/{TASK}_baseline_results.csv'
    baseline_df = pd.DataFrame(baseline_data)
    baseline_df.to_csv(baseline_csv_path, index=False)

# **Conceptor Steering**

A single conceptor, computed from the activations of one layer, is applied to that same layer.

**Pre-Compute all necessary Conceptors**

In [None]:
from tqdm import tqdm

# conceptors_cache[exp][(layer, aperture)] -> conceptor matrix
conceptors_cache = {exp: {} for exp in range(NUM_EXPERIMENTS)}

# The progress bar ticks once per (experiment, layer, aperture) combination
total_computations = NUM_EXPERIMENTS * len(list_extraction_layers) * len(list_aperatures_normal)

with tqdm(total=total_computations, desc="Precomputing conceptors") as pbar:
    for exp in range(NUM_EXPERIMENTS):
        experiment_conceptors = conceptors_cache[exp]
        for layer in list_extraction_layers:
            # Cached last-token activations of the steering examples at this layer;
            # the same tensor is reused for every aperture
            activations = activations_cache[exp][layer]
            for aperture in list_aperatures_normal:
                conceptor = compute_conceptor(activations, aperture)
                experiment_conceptors[(layer, aperture)] = conceptor
                pbar.update(1)

Precomputing conceptors: 100%|██████████| 5/5 [00:36<00:00,  7.26s/it]


**Run Experiment**

In [None]:
import math
import random
import numpy as np
import torch

# Conceptor steering: for every (layer, beta, aperture) configuration and every
# experiment, run the test prompts with the conceptor hook installed and record
# the top-1 accuracy.
results_conceptoring = {}

# Number of prompts per forward pass
batch_size = 50

for config in configs_conceptoring:
    # Extract current experimental configurations
    layer = config.extraction_layer
    beta = config.beta
    aperture = config.aperture
    config_key = f"Layer {layer}, Beta {beta}, Aperture {aperture}"
    print(f"-----------{config_key}-----------")

    # Initialize success counts and store final accuracies for each experiment
    if config_key not in results_conceptoring:
        results_conceptoring[config_key] = {'success_counts': [], 'total_prompts': [], 'final_accuracies': []}

    for exp in range(NUM_EXPERIMENTS):
        # Retrieve precomputed conceptor from the cache
        conceptor = conceptors_cache[exp][(layer, aperture)]

        # The hook depends only on (conceptor, beta, layer), so it is built once
        # per experiment instead of once per batch.
        conceptor_hook = generate_conceptor_hook(conceptor, beta)
        activation_modification = (f"blocks.{layer}.hook_resid_pre", conceptor_hook)
        editing_hooks = [activation_modification]

        success_count = 0
        total_prompts = 0

        # Retrieve experiment-specific prompts and correct outputs
        prompts_to_steer = experiment_data[exp]['prompts_to_steer']
        correct_outputs_1st = experiment_data[exp]['correct_outputs_1st']

        # Round up so a final, smaller batch is still processed
        num_batches = math.ceil(len(prompts_to_steer) / batch_size)

        for batch_idx in range(num_batches):
            batch_prompts = prompts_to_steer[batch_idx * batch_size : (batch_idx + 1) * batch_size]
            batch_correct_outputs_1st = correct_outputs_1st[batch_idx * batch_size : (batch_idx + 1) * batch_size]

            # Generate steered outputs from input prompts using the conceptor hooks
            top_1_tokens = top_1_first_tokens(model, batch_prompts, fwd_hooks=editing_hooks)

            # Increment success count if top-1 output matches the correct output
            for i, top_1_token in enumerate(top_1_tokens):
                if top_1_token == batch_correct_outputs_1st[i]:
                    success_count += 1
                total_prompts += 1

        final_accuracy = (success_count / total_prompts) * 100
        results_conceptoring[config_key]['success_counts'].append(success_count)
        results_conceptoring[config_key]['total_prompts'].append(total_prompts)
        results_conceptoring[config_key]['final_accuracies'].append(final_accuracy)
        model.reset_hooks()

        print(f"Experiment {exp+1}: Final Accuracy: {final_accuracy:.2f}% ({success_count}/{total_prompts} samples)")

    # Calculate average final accuracy across all experiments
    average_final_accuracy = sum(results_conceptoring[config_key]['final_accuracies']) / NUM_EXPERIMENTS
    results_conceptoring[config_key]['average_final_accuracy'] = average_final_accuracy
    print(f"Average Final Accuracy: {average_final_accuracy:.2f}% over {NUM_EXPERIMENTS} experiments")

-----------Layer 14, Beta 1.0, Aperture 0.0125-----------
Experiment 1: Final Accuracy: 2.00% (20/1000 samples)
Experiment 2: Final Accuracy: 4.30% (43/1000 samples)
Experiment 3: Final Accuracy: 3.10% (31/1000 samples)
Experiment 4: Final Accuracy: 2.10% (21/1000 samples)
Experiment 5: Final Accuracy: 1.90% (19/1000 samples)
Average Final Accuracy: 2.68% over 5 experiments
-----------Layer 14, Beta 2.0, Aperture 0.0125-----------
Experiment 1: Final Accuracy: 41.70% (417/1000 samples)
Experiment 2: Final Accuracy: 43.00% (430/1000 samples)
Experiment 3: Final Accuracy: 43.40% (434/1000 samples)
Experiment 4: Final Accuracy: 41.60% (416/1000 samples)
Experiment 5: Final Accuracy: 41.30% (413/1000 samples)
Average Final Accuracy: 42.20% over 5 experiments
-----------Layer 14, Beta 3.0, Aperture 0.0125-----------
Experiment 1: Final Accuracy: 44.40% (444/1000 samples)
Experiment 2: Final Accuracy: 44.70% (447/1000 samples)
Experiment 3: Final Accuracy: 46.40% (464/1000 samples)
Experimen

In [None]:
if SAVE_RESULTS:
    # Flatten the results into rows: one per experiment plus an 'Average' row per configuration
    data = []
    for config_key, result in results_conceptoring.items():
        per_experiment_rows = [
            {
                'config_key': config_key,
                'experiment': i+1,
                'success_count': result['success_counts'][i],
                'total_prompts': result['total_prompts'][i],
                'final_accuracy': result['final_accuracies'][i]
            }
            for i in range(NUM_EXPERIMENTS)
        ]
        data.extend(per_experiment_rows)

        # The summary row has no counts of its own, so those cells stay empty
        average_row = {
            'config_key': config_key,
            'experiment': 'Average',
            'success_count': '',
            'total_prompts': '',
            'final_accuracy': result['average_final_accuracy']
        }
        data.append(average_row)

    df = pd.DataFrame(data)
    df.to_csv(f'{RESULTS_PATH}/{TASK}_conceptoring_results.csv', index=False)

# **Conceptor (Mean-Centered) Steering**

A single conceptor, computed from the mean-centered activations of one layer, is applied to that same layer.

**Pre-Compute all necessary Conceptors**

In [None]:
from tqdm import tqdm

# conceptors_cache_mean_centered[exp][(layer, aperture)] -> conceptor matrix
conceptors_cache_mean_centered = {exp: {} for exp in range(NUM_EXPERIMENTS)}

# The progress bar ticks once per (experiment, layer, aperture) combination
total_computations = NUM_EXPERIMENTS * len(list_extraction_layers) * len(list_aperatures_mean_centered)

with tqdm(total=total_computations, desc="Precomputing conceptors") as pbar:
    for exp in range(NUM_EXPERIMENTS):
        experiment_conceptors = conceptors_cache_mean_centered[exp]
        for layer in list_extraction_layers:
            # Cached last-token activations of the steering examples at this layer,
            # centered on that layer's mean training activation; reused for every aperture
            activations = activations_cache[exp][layer]
            activations_mean_centered = activations - mean_train_activations[layer]
            for aperture in list_aperatures_mean_centered:
                conceptor = compute_conceptor(activations_mean_centered, aperture)
                experiment_conceptors[(layer, aperture)] = conceptor
                pbar.update(1)

Precomputing conceptors: 100%|██████████| 5/5 [01:01<00:00, 12.22s/it]


**Run Experiments**

In [None]:
import math
import random
import numpy as np
import torch

# Mean-centered conceptor steering: for every (layer, beta, aperture)
# configuration and every experiment, run the test prompts with the
# mean-centered conceptor hook installed and record the top-1 accuracy.
results_conceptoring_mean_centered = {}

# Number of prompts per forward pass
batch_size = 50

for config in configs_conceptoring_mean_centered:
    # Extract current experimental configurations
    layer = config.extraction_layer
    beta = config.beta
    aperture = config.aperture
    config_key = f"Layer {layer}, Beta {beta}, Aperture {aperture}"
    print(f"-----------{config_key}-----------")

    # Initialize success counts and store final accuracies for each experiment
    if config_key not in results_conceptoring_mean_centered:
        results_conceptoring_mean_centered[config_key] = {'success_counts': [], 'total_prompts': [], 'final_accuracies': []}

    for exp in range(NUM_EXPERIMENTS):
        # Retrieve precomputed conceptor from the cache
        conceptor = conceptors_cache_mean_centered[exp][(layer, aperture)]

        # The hook depends only on (conceptor, mean, beta, layer), so it is built
        # once per experiment instead of once per batch.
        conceptor_hook = generate_conceptor_hook_mean_centered(conceptor=conceptor, mean_train=mean_train_activations[layer], beta=beta)
        activation_modification = (f"blocks.{layer}.hook_resid_pre", conceptor_hook)
        editing_hooks = [activation_modification]

        success_count = 0
        total_prompts = 0

        # Retrieve experiment-specific prompts and correct outputs
        prompts_to_steer = experiment_data[exp]['prompts_to_steer']
        correct_outputs_1st = experiment_data[exp]['correct_outputs_1st']

        # Round up so a final, smaller batch is still processed
        num_batches = math.ceil(len(prompts_to_steer) / batch_size)

        for batch_idx in range(num_batches):
            batch_prompts = prompts_to_steer[batch_idx * batch_size : (batch_idx + 1) * batch_size]
            batch_correct_outputs_1st = correct_outputs_1st[batch_idx * batch_size : (batch_idx + 1) * batch_size]

            # Generate steered outputs from input prompts using the conceptor hooks
            top_1_tokens = top_1_first_tokens(model, batch_prompts, fwd_hooks=editing_hooks)

            # Increment success count if top-1 output matches the correct output
            for i, top_1_token in enumerate(top_1_tokens):
                if top_1_token == batch_correct_outputs_1st[i]:
                    success_count += 1
                total_prompts += 1

        final_accuracy = (success_count / total_prompts) * 100
        results_conceptoring_mean_centered[config_key]['success_counts'].append(success_count)
        results_conceptoring_mean_centered[config_key]['total_prompts'].append(total_prompts)
        results_conceptoring_mean_centered[config_key]['final_accuracies'].append(final_accuracy)
        model.reset_hooks()

        print(f"Experiment {exp+1}: Final Accuracy: {final_accuracy:.2f}% ({success_count}/{total_prompts} samples)")

    # Calculate average final accuracy across all experiments
    average_final_accuracy = sum(results_conceptoring_mean_centered[config_key]['final_accuracies']) / NUM_EXPERIMENTS
    results_conceptoring_mean_centered[config_key]['average_final_accuracy'] = average_final_accuracy
    print(f"Average Final Accuracy: {average_final_accuracy:.2f}% over {NUM_EXPERIMENTS} experiments")

-----------Layer 14, Beta 1.0, Aperture 0.05-----------
Experiment 1: Final Accuracy: 5.20% (52/1000 samples)
Experiment 2: Final Accuracy: 4.30% (43/1000 samples)
Experiment 3: Final Accuracy: 4.40% (44/1000 samples)
Experiment 4: Final Accuracy: 4.00% (40/1000 samples)
Experiment 5: Final Accuracy: 4.80% (48/1000 samples)
Average Final Accuracy: 4.54% over 5 experiments
-----------Layer 14, Beta 2.0, Aperture 0.05-----------
Experiment 1: Final Accuracy: 35.10% (351/1000 samples)
Experiment 2: Final Accuracy: 34.20% (342/1000 samples)
Experiment 3: Final Accuracy: 35.90% (359/1000 samples)
Experiment 4: Final Accuracy: 32.30% (323/1000 samples)
Experiment 5: Final Accuracy: 37.10% (371/1000 samples)
Average Final Accuracy: 34.92% over 5 experiments
-----------Layer 14, Beta 3.0, Aperture 0.05-----------
Experiment 1: Final Accuracy: 45.20% (452/1000 samples)
Experiment 2: Final Accuracy: 47.10% (471/1000 samples)
Experiment 3: Final Accuracy: 48.30% (483/1000 samples)
Experiment 4: F

In [None]:
if SAVE_RESULTS:
    # Flatten the results into rows: one per experiment plus an 'Average' row per configuration
    data = []
    for config_key, result in results_conceptoring_mean_centered.items():
        per_experiment_rows = [
            {
                'config_key': config_key,
                'experiment': i+1,
                'success_count': result['success_counts'][i],
                'total_prompts': result['total_prompts'][i],
                'final_accuracy': result['final_accuracies'][i]
            }
            for i in range(NUM_EXPERIMENTS)
        ]
        data.extend(per_experiment_rows)

        # The summary row has no counts of its own, so those cells stay empty
        average_row = {
            'config_key': config_key,
            'experiment': 'Average',
            'success_count': '',
            'total_prompts': '',
            'final_accuracy': result['average_final_accuracy']
        }
        data.append(average_row)

    df = pd.DataFrame(data)
    df.to_csv(f'{RESULTS_PATH}/{TASK}_conceptoring_mean_centered_results.csv', index=False)

# **Additive Steering**
A single average vector, computed from the activations of one layer, is added to that same layer.

**Pre-Compute all averages**

In [None]:
from tqdm import tqdm
import torch

# averaged_activations_cache[exp][layer] will hold the mean of the cached
# activations for that (experiment, layer) pair.
averaged_activations_cache = {exp: {} for exp in range(NUM_EXPERIMENTS)}

# The progress bar ticks once per (experiment, layer) pair.
total_computations = NUM_EXPERIMENTS * len(list_extraction_layers)

with tqdm(total=total_computations, desc="Precomputing averaged activations") as pbar:
    for exp, layer_averages in averaged_activations_cache.items():
        for layer in list_extraction_layers:
            # Average over dim 0 of the cached activations
            # (presumably the steering-example axis -- confirm against the
            # cell that fills activations_cache).
            layer_averages[layer] = torch.mean(activations_cache[exp][layer], dim=0)
            pbar.update(1)

Precomputing averaged activations: 100%|██████████| 5/5 [00:00<00:00, 5961.21it/s]


**Run experiment**

In [None]:
import math
import numpy as np
import torch

# Additive-steering sweep.
# For every configuration (layer, beta) and every experiment, the precomputed
# average activation vector is injected through a forward hook at
# `blocks.{layer}.hook_resid_pre`, the prompts are run in batches, and the
# top-1 token returned by `top_1_first_tokens` is compared against the
# expected first output token. Per-experiment counts/accuracies and the
# per-configuration average are collected in `results_averaging`.
results_averaging = {}

batch_size = 150

for config in configs_averaging:
    # Extract current experimental configurations
    layer = config.extraction_layer
    beta = config.beta
    config_key = f"Layer {layer}, Beta {beta}"
    print(f"-------------------------{config_key}-------------------------")

    # Initialize success counts and store final accuracies for each experiment
    if config_key not in results_averaging:
        results_averaging[config_key] = {'success_counts': [], 'total_prompts': [], 'final_accuracies': []}

    for exp in range(NUM_EXPERIMENTS):
        # Retrieve precomputed averaged activations from the cache
        avg_activations = averaged_activations_cache[exp][layer]

        # The hook depends only on (experiment, layer, beta), so it is built
        # once per experiment rather than once per batch.
        # NOTE(review): assumes the hook returned by generate_ave_hook_addition
        # keeps no per-call state and that top_1_first_tokens does not mutate
        # the hook list -- confirm against their definitions.
        ave_hook = generate_ave_hook_addition(steering_vector=avg_activations, beta=beta)
        activation_modification = (f"blocks.{layer}.hook_resid_pre", ave_hook)
        editing_hooks = [activation_modification]

        success_count = 0
        total_prompts = 0

        # Retrieve experiment-specific prompts and correct outputs
        prompts_to_steer = experiment_data[exp]['prompts_to_steer']
        correct_outputs_1st = experiment_data[exp]['correct_outputs_1st']

        num_batches = math.ceil(len(prompts_to_steer) / batch_size)

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = batch_start + batch_size
            batch_prompts = prompts_to_steer[batch_start:batch_end]
            batch_correct_outputs_1st = correct_outputs_1st[batch_start:batch_end]

            # Generate steered outputs from input prompts using the average steering hooks
            top_1_tokens = top_1_first_tokens(model, batch_prompts, fwd_hooks=editing_hooks)

            # Increment success count if top-1 output matches the correct output
            for i, top_1_token in enumerate(top_1_tokens):
                if top_1_token == batch_correct_outputs_1st[i]:
                    success_count += 1
                total_prompts += 1

        # An experiment with no prompts has no defined accuracy; record NaN
        # instead of raising ZeroDivisionError and aborting the whole sweep.
        if total_prompts:
            final_accuracy = (success_count / total_prompts) * 100
        else:
            final_accuracy = float('nan')
        results_averaging[config_key]['success_counts'].append(success_count)
        results_averaging[config_key]['total_prompts'].append(total_prompts)
        results_averaging[config_key]['final_accuracies'].append(final_accuracy)
        # Make sure no steering hook stays attached before the next experiment.
        model.reset_hooks()

        print(f"Experiment {exp+1}: Final Accuracy: {final_accuracy:.2f}% ({success_count}/{total_prompts} samples)")

    # Calculate average final accuracy across all experiments
    average_final_accuracy = sum(results_averaging[config_key]['final_accuracies']) / NUM_EXPERIMENTS
    results_averaging[config_key]['average_final_accuracy'] = average_final_accuracy
    print(f"Average Final Accuracy: {average_final_accuracy:.2f}% over {NUM_EXPERIMENTS} experiments")

-------------------------Layer 14, Beta 1.0-------------------------
Experiment 1: Final Accuracy: 5.40% (54/1000 samples)
Experiment 2: Final Accuracy: 6.40% (64/1000 samples)
Experiment 3: Final Accuracy: 6.50% (65/1000 samples)
Experiment 4: Final Accuracy: 5.90% (59/1000 samples)
Experiment 5: Final Accuracy: 6.70% (67/1000 samples)
Average Final Accuracy: 6.18% over 5 experiments
-------------------------Layer 14, Beta 2.0-------------------------
Experiment 1: Final Accuracy: 12.50% (125/1000 samples)
Experiment 2: Final Accuracy: 14.90% (149/1000 samples)
Experiment 3: Final Accuracy: 14.40% (144/1000 samples)
Experiment 4: Final Accuracy: 13.20% (132/1000 samples)
Experiment 5: Final Accuracy: 16.20% (162/1000 samples)
Average Final Accuracy: 14.24% over 5 experiments
-------------------------Layer 14, Beta 3.0-------------------------
Experiment 1: Final Accuracy: 12.50% (125/1000 samples)
Experiment 2: Final Accuracy: 16.10% (161/1000 samples)
Experiment 3: Final Accuracy: 15

In [None]:
if SAVE_RESULTS:
    # Flatten the nested results dict into CSV rows: one row per experiment
    # run, followed by a single 'Average' row for each configuration.
    data = []
    for config_key, result in results_averaging.items():
        data.extend(
            {
                'config_key': config_key,
                'experiment': run + 1,  # experiments are reported 1-based
                'success_count': result['success_counts'][run],
                'total_prompts': result['total_prompts'][run],
                'final_accuracy': result['final_accuracies'][run],
            }
            for run in range(NUM_EXPERIMENTS)
        )
        # Summary row: the count columns are deliberately left as empty strings.
        data.append(dict(
            config_key=config_key,
            experiment='Average',
            success_count='',
            total_prompts='',
            final_accuracy=result['average_final_accuracy'],
        ))

    # Write everything out without the DataFrame index column.
    df = pd.DataFrame(data)
    df.to_csv(f'{RESULTS_PATH}/{TASK}_averaging_addition_results.csv', index=False)

# **Additive (Mean-Centered) Steering**
A single average activation vector is computed from the activations of one layer and added back to that same layer, with mean-centering.

**Run experiment**

In [None]:
import math
import numpy as np
import torch

# Mean-centered additive-steering sweep.
# Same procedure as plain additive steering, except that the hook is created by
# `generate_ave_hook_addition_mean_centered`, which additionally receives the
# layer's entry from `mean_train_activations`. Per-experiment counts/accuracies
# and the per-configuration average are collected in
# `results_averaging_mean_centered`.
results_averaging_mean_centered = {}

batch_size = 50

for config in configs_averaging:
    # Extract current experimental configurations
    layer = config.extraction_layer
    beta = config.beta
    config_key = f"Layer {layer}, Beta {beta}"
    print(f"-------------------------{config_key}-------------------------")

    # Initialize success counts and store final accuracies for each experiment
    if config_key not in results_averaging_mean_centered:
        results_averaging_mean_centered[config_key] = {'success_counts': [], 'total_prompts': [], 'final_accuracies': []}

    # The training mean is indexed by layer only, so look it up once per
    # configuration instead of once per experiment.
    mean_train = mean_train_activations[layer]

    for exp in range(NUM_EXPERIMENTS):
        # Retrieve precomputed averaged activations from the cache
        avg_activations = averaged_activations_cache[exp][layer]

        # The hook depends only on (experiment, layer, beta), so it is built
        # once per experiment rather than once per batch.
        # NOTE(review): assumes the hook returned by
        # generate_ave_hook_addition_mean_centered keeps no per-call state and
        # that top_1_first_tokens does not mutate the hook list -- confirm
        # against their definitions.
        ave_hook = generate_ave_hook_addition_mean_centered(steering_vector=avg_activations, mean_train=mean_train, beta=beta)
        activation_modification = (f"blocks.{layer}.hook_resid_pre", ave_hook)
        editing_hooks = [activation_modification]

        success_count = 0
        total_prompts = 0

        # Retrieve experiment-specific prompts and correct outputs
        prompts_to_steer = experiment_data[exp]['prompts_to_steer']
        correct_outputs_1st = experiment_data[exp]['correct_outputs_1st']

        num_batches = math.ceil(len(prompts_to_steer) / batch_size)

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = batch_start + batch_size
            batch_prompts = prompts_to_steer[batch_start:batch_end]
            batch_correct_outputs_1st = correct_outputs_1st[batch_start:batch_end]

            # Generate steered outputs from input prompts using the average steering hooks
            top_1_tokens = top_1_first_tokens(model, batch_prompts, fwd_hooks=editing_hooks)

            # Increment success count if top-1 output matches the correct output
            for i, top_1_token in enumerate(top_1_tokens):
                if top_1_token == batch_correct_outputs_1st[i]:
                    success_count += 1
                total_prompts += 1

        # An experiment with no prompts has no defined accuracy; record NaN
        # instead of raising ZeroDivisionError and aborting the whole sweep.
        if total_prompts:
            final_accuracy = (success_count / total_prompts) * 100
        else:
            final_accuracy = float('nan')
        results_averaging_mean_centered[config_key]['success_counts'].append(success_count)
        results_averaging_mean_centered[config_key]['total_prompts'].append(total_prompts)
        results_averaging_mean_centered[config_key]['final_accuracies'].append(final_accuracy)
        # Make sure no steering hook stays attached before the next experiment.
        model.reset_hooks()

        print(f"Experiment {exp+1}: Final Accuracy: {final_accuracy:.2f}% ({success_count}/{total_prompts} samples)")

    # Calculate average final accuracy across all experiments
    average_final_accuracy = sum(results_averaging_mean_centered[config_key]['final_accuracies']) / NUM_EXPERIMENTS
    results_averaging_mean_centered[config_key]['average_final_accuracy'] = average_final_accuracy
    print(f"Average Final Accuracy: {average_final_accuracy:.2f}% over {NUM_EXPERIMENTS} experiments")

-------------------------Layer 14, Beta 1.0-------------------------
Experiment 1: Final Accuracy: 4.20% (42/1000 samples)
Experiment 2: Final Accuracy: 5.20% (52/1000 samples)
Experiment 3: Final Accuracy: 5.90% (59/1000 samples)
Experiment 4: Final Accuracy: 5.20% (52/1000 samples)
Experiment 5: Final Accuracy: 5.00% (50/1000 samples)
Average Final Accuracy: 5.10% over 5 experiments
-------------------------Layer 14, Beta 2.0-------------------------
Experiment 1: Final Accuracy: 19.80% (198/1000 samples)
Experiment 2: Final Accuracy: 22.70% (227/1000 samples)
Experiment 3: Final Accuracy: 21.30% (213/1000 samples)
Experiment 4: Final Accuracy: 20.60% (206/1000 samples)
Experiment 5: Final Accuracy: 23.10% (231/1000 samples)
Average Final Accuracy: 21.50% over 5 experiments
-------------------------Layer 14, Beta 3.0-------------------------
Experiment 1: Final Accuracy: 24.80% (248/1000 samples)
Experiment 2: Final Accuracy: 24.60% (246/1000 samples)
Experiment 3: Final Accuracy: 23

In [None]:
if SAVE_RESULTS:
    # Flatten the nested results dict into CSV rows: one row per experiment
    # run, followed by a single 'Average' row for each configuration.
    data = []
    for config_key, result in results_averaging_mean_centered.items():
        data.extend(
            {
                'config_key': config_key,
                'experiment': run + 1,  # experiments are reported 1-based
                'success_count': result['success_counts'][run],
                'total_prompts': result['total_prompts'][run],
                'final_accuracy': result['final_accuracies'][run],
            }
            for run in range(NUM_EXPERIMENTS)
        )
        # Summary row: the count columns are deliberately left as empty strings.
        data.append(dict(
            config_key=config_key,
            experiment='Average',
            success_count='',
            total_prompts='',
            final_accuracy=result['average_final_accuracy'],
        ))

    # Write everything out without the DataFrame index column.
    # NOTE(review): this file name spells 'centred', unlike the
    # '..._conceptoring_mean_centered_results.csv' file; kept as-is because
    # downstream readers may depend on the existing name.
    df = pd.DataFrame(data)
    df.to_csv(f'{RESULTS_PATH}/{TASK}_averaging_addition_mean_centred_results.csv', index=False)