# Imports

In [None]:
# Assumes transformer_lens is installed, %pip install transformer_lens
from transformer_lens import HookedTransformer
import torch
import pickle

Collecting transformer_lens
  Downloading transformer_lens-2.4.0-py3-none-any.whl.metadata (12 kB)
Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl.metadata (28 kB)
Collecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl.metadata (1.4 kB)
Collecting datasets>=2.7.1 (from transformer_lens)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fancy-einsum>=0.0.3 (from transformer_lens)
  Downloading fancy_einsum-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting jaxtyping>=0.2.11 (from transformer_lens)
  Downloading jaxtyping-0.2.33-py3-none-any.whl.metadata (6.4 kB)
Collecting wandb>=0.13.5 (from transformer_lens)
  Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.7.1->transformer_lens)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata 

# Helper Functions

In [None]:
def get_resid_pre(prompt: str, layers: list):
    """Run the model once on `prompt` and return the `hook_resid_pre` activations.

    Args:
        prompt: Input passed unchanged to the global `model`.
        layers: Block indices whose `blocks.{layer}.hook_resid_pre` activation
            should be captured.

    Returns:
        A dict mapping each entry of `layers` to the value cached at
        `blocks.{layer}.hook_resid_pre` during the forward pass.

    Raises:
        KeyError: If a requested hook name was not cached (for example because
            the model has no such hook point).
    """
    # Map every requested layer to the name of its hook point.
    hook_names = {layer: f"blocks.{layer}.hook_resid_pre" for layer in layers}

    # Ask for all hooks in a single call by passing the names as a list. This
    # yields one shared cache and avoids a per-layer lambda whose correctness
    # depended on the filter being evaluated before `name` was rebound.
    cache, caching_hooks, _ = model.get_caching_hooks(
        names_filter=list(hook_names.values())
    )

    # Single forward pass with every caching hook attached.
    with model.hooks(fwd_hooks=caching_hooks):
        _ = model(prompt)

    return {layer: cache[name] for layer, name in hook_names.items()}

# Load Strings

Load the initial strings: "Given the reconstruction of the training dataset provided by Gokaslan and Cohen (2019), all entries from the folders `urlsf_subset01-1_data` and `urlsf_subset01-182_data` are stored."

In [None]:
import os

# Directory containing the prompt text files
directory_path = "./Prompts_Data/"

# Contents of every .txt file in the directory, one string per file
file_contents = []

# os.listdir returns names in arbitrary, filesystem-dependent order. Sorting
# makes the order of file_contents reproducible across machines, which matters
# because the list is later truncated to a fixed number of entries.
for filename in sorted(os.listdir(directory_path)):
    # Only plain-text files are loaded
    if filename.endswith(".txt"):
        file_path = os.path.join(directory_path, filename)
        # Explicit encoding so decoding does not depend on the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            file_contents.append(content)

len(file_contents)

599

These are then filtered to keep only the entries which decompose into fewer than 500 tokens via the GPT-2 tokeniser, in order to prevent memory issues.

In [None]:
from transformers import GPT2Tokenizer
# GPT-2 tokenizer, used only to measure how many tokens each text encodes to
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Retain the texts whose encoding is strictly shorter than 500 tokens,
# preserving their order in file_contents
filtered_entries = list(
    filter(lambda text: len(tokenizer.encode(text)) < 500, file_contents)
)

len(filtered_entries)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (6339 > 1024). Running this sequence through the model will result in indexing errors


210

The resulting dataset has 210 entries. To make this the same size as the other datasets we removed the final 10 entries from the dataset.

In [None]:
# Number of prompts to keep, so this dataset has the same size as the others
NUM_PROMPTS = 200

# Fail loudly instead of silently producing a smaller dataset
if len(filtered_entries) < NUM_PROMPTS:
    raise ValueError(
        f"Expected at least {NUM_PROMPTS} filtered entries, found {len(filtered_entries)}"
    )

# Keep the first NUM_PROMPTS entries and drop the rest
result_entries = filtered_entries[:NUM_PROMPTS]
len(result_entries)

200

# Compute Activations

Load Model

In [None]:
# Model configuration
MODEL_NAME = 'EleutherAI/gpt-j-6b'
DTYPE = 'float16'
NUM_LAYERS = 28

# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Gradients are switched off globally before the model is loaded
torch.set_grad_enabled(False)
model = HookedTransformer.from_pretrained_no_processing(
    model_name=MODEL_NAME,
    device=DEVICE,
    dtype=DTYPE,
)
# Trailing semicolon suppresses the model repr in the cell output
model.eval();

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Loaded pretrained model EleutherAI/gpt-j-6b into HookedTransformer


Get Activations

In [None]:
import torch

# Number of layers whose activations are cached per forward pass
# (adjust this based on your GPU memory)
batch_size = 7

# Layers 1 .. NUM_LAYERS - 1 (layer 0 is not included)
layers = list(range(1, NUM_LAYERS))

# Consecutive groups of at most `batch_size` layers
layer_batches = [layers[i:i + batch_size] for i in range(0, len(layers), batch_size)]

# Final average activation vector for each layer
final_averages = {}

if not result_entries:
    raise ValueError("result_entries is empty; there is nothing to average")

for layer_batch in layer_batches:
    # Running float32 sum of token activations and running token count per
    # layer. Only one vector per layer is kept, so memory use no longer grows
    # with the number of prompts as it would when every activation is stored
    # and concatenated before averaging.
    activation_sums = {}
    token_counts = {layer: 0 for layer in layer_batch}
    output_dtype = None

    for prompt in result_entries:
        layer_activations = get_resid_pre(prompt, layer_batch)
        for layer in layer_batch:
            # Drop the leading dimension so that dim 0 indexes tokens
            # (assumes a single prompt per forward pass - TODO confirm)
            token_activations = layer_activations[layer].squeeze(0)
            prompt_sum = token_activations.sum(dim=0, dtype=torch.float32)
            if layer in activation_sums:
                activation_sums[layer] += prompt_sum
            else:
                activation_sums[layer] = prompt_sum
            token_counts[layer] += token_activations.shape[0]
            output_dtype = token_activations.dtype

    # Mean over all tokens of all prompts, cast back to the activation dtype so
    # the stored vectors keep the dtype of the model's activations
    for layer in layer_batch:
        final_averages[layer] = (activation_sums[layer] / token_counts[layer]).to(output_dtype)

    # Release the per-batch accumulators and any cached GPU blocks
    del activation_sums
    torch.cuda.empty_cache()

    # One progress line per batch instead of dumping the whole dictionary
    print(f"Computed mean activations for layers {layer_batch[0]}-{layer_batch[-1]}")

{1: tensor([ 0.0180,  0.0926, -0.0130,  ..., -0.0242, -0.0013,  0.0536],
       device='cuda:0', dtype=torch.float16), 2: tensor([-0.1153,  0.0599, -0.0612,  ...,  0.0184, -0.0898, -0.0422],
       device='cuda:0', dtype=torch.float16), 3: tensor([-0.0494,  0.0538, -0.0292,  ...,  0.0477, -0.0751,  0.0044],
       device='cuda:0', dtype=torch.float16), 4: tensor([-0.0648,  0.0293,  0.0658,  ..., -0.0341,  0.0367, -0.0508],
       device='cuda:0', dtype=torch.float16), 5: tensor([-0.0125,  0.1440,  0.0306,  ...,  0.0104, -0.0246, -0.0600],
       device='cuda:0', dtype=torch.float16), 6: tensor([-0.0753,  0.0931,  0.0082,  ..., -0.0431,  0.0422, -0.1111],
       device='cuda:0', dtype=torch.float16), 7: tensor([-0.0616, -0.0052,  0.0448,  ...,  0.0503,  0.1348, -0.1144],
       device='cuda:0', dtype=torch.float16)}
{1: tensor([ 0.0180,  0.0926, -0.0130,  ..., -0.0242, -0.0013,  0.0536],
       device='cuda:0', dtype=torch.float16), 2: tensor([-0.1153,  0.0599, -0.0612,  ...,  0.0184, -

In [None]:
# Build the file name from the last path component of MODEL_NAME
# ('gpt-j-6b' for 'EleutherAI/gpt-j-6b'), so it follows the configured model.
# NOTE(review): the tensors are pickled on whatever device they live on. If
# they are CUDA tensors, loading this file on a machine without CUDA will
# likely fail; confirm what downstream loaders expect before moving them to CPU.
output_path = f"./mean_activations_{MODEL_NAME.split('/')[-1]}.pkl"
with open(output_path, 'wb') as file:
    pickle.dump(final_averages, file)