In [6]:
from huggingface_hub import hf_hub_download
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "Elriggs/pythia-70m-deduped"

# The layer around which we want to interpret by looking at the last layer
layer_focus = 1
assert layer_focus > 0

In [3]:
def load_autoencoder(layer: int):
	ae_download_location_main = hf_hub_download(repo_id=model_id, filename=f"tied_residual_l{layer_focus}_r6/_63/learned_dicts.pt")
	all_autoencoders = torch.load(ae_download_location_main)
	all_l1s = [hyperparams["l1_alpha"] for autoencoder, hyperparams in all_autoencoders]
	# TODO: choose best one???
	print(all_l1s)
	auto_num = 5
	autoencoder, hyperparams = all_autoencoders[auto_num]
	# You want a hyperparam around 1e-3. Higher is less features/datapoint (at the cost of reconstruction error); lower is more features/datapoint (at the cost of polysemanticity)
	autoencoder.to_device(device)
	print(hyperparams)
	return autoencoder, hyperparams

autoencoder_main, hyperparams_main = load_autoencoder(layer_focus)
autoencoder_prior, hyperparams_prior = load_autoencoder(layer_focus - 1)

[0.0, 9.999999747378752e-05, 0.00019306977628730237, 0.000372759357560426, 0.0007196856895461679, 0.0013894954463467002, 0.0026826958637684584, 0.005179474595934153, 0.009999999776482582]
{'dict_size': 3072, 'l1_alpha': 0.0013894954463467002}
[0.0, 9.999999747378752e-05, 0.00019306977628730237, 0.000372759357560426, 0.0007196856895461679, 0.0013894954463467002, 0.0026826958637684584, 0.005179474595934153, 0.009999999776482582]
{'dict_size': 3072, 'l1_alpha': 0.0013894954463467002}


In [4]:
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
import numpy as np

In [5]:

# autoencoder_larger, larger_hyperparams = all_autoencoders[auto_num+1]

# #Dictionary Comparison
# autoencoder_features = hyperparams["dict_size"]
# autoencoder_larger_features = larger_hyperparams["dict_size"]
# autoencoder_larger.to_device(device)

# # Hungary algorithm
# # Calculate all cosine similarities and store in a 2D array
# cos_sims = np.zeros((autoencoder_features, autoencoder_larger_features))
# for idx, vector in enumerate(autoencoder.get_learned_dict()):
#     cos_sims[idx] = torch.nn.functional.cosine_similarity(vector.to(device), autoencoder_larger.get_learned_dict(), dim=1).cpu().numpy()
# # Convert to a minimization problem
# cos_sims = 1 - cos_sims
# # Use the Hungarian algorithm to solve the assignment problem
# row_ind, col_ind = linear_sum_assignment(cos_sims)
# # Retrieve the max cosine similarities and corresponding indices
# max_cosine_similarities = 1 - cos_sims[row_ind, col_ind]

# # Get the indices of the max cosine similarities in descending order
# max_indices = np.argsort(max_cosine_similarities)[::-1]
# max_cosine_similarities[max_indices][:20]
# print(("# of features above 0.9:", (max_cosine_similarities > .9).sum()))
# # Plot histogram of max_cosine_similarities
# plt.hist(max_cosine_similarities, bins=20)
# plt.xlabel("Cosine Similarity")
# plt.title(f"Max Cos Sim between {hyperparams['l1_alpha']:.2E} & {larger_hyperparams['l1_alpha']:.2E}")
# plt.show()

In [7]:
from transformer_lens import HookedTransformer
setting = "residual"
model_name = "EleutherAI/pythia-70m-deduped"

model = HookedTransformer.from_pretrained(model_name, device=device)

def get_cache_name_neurons(layer: int):
    if setting == "residual":
        cache_name = f"blocks.{layer}.hook_resid_post"
        neurons = model.cfg.d_model
    elif setting == "mlp":
        cache_name = f"blocks.{layer}.mlp.hook_post"
        neurons = model.cfg.d_mlp
    elif setting == "attention":
        cache_name = f"blocks.{layer}.hook_attn_out"
        neurons = model.cfg.d_model
    elif setting == "mlp_out":
        cache_name = f"blocks.{layer}.hook_mlp_out"
        neurons = model.cfg.d_model
    else:
        raise NotImplementedError
    return cache_name, neurons

cache_name, neurons  = get_cache_name_neurons(layer_focus)
cache_name_prior, _  = get_cache_name_neurons(layer_focus - 1)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer


In [8]:
# Downnload dataset
from datasets import Dataset, load_dataset
dataset_name = "NeelNanda/pile-10k"
token_amount= 40
#TODO: change train[:1000] to train if you want whole dataset
dataset = load_dataset(dataset_name, split="train[:1000]").map(
    lambda x: model.tokenizer(x['text']),
    batched=True,
).filter(
    lambda x: len(x['input_ids']) > token_amount
).map(
    lambda x: {'input_ids': x['input_ids'][:token_amount]}
)

Found cached dataset parquet (/home/lev/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/lev/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-75627fbeda83a2c4.arrow
Loading cached processed dataset at /home/lev/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3182e4e6d9ffacd3.arrow
Loading cached processed dataset at /home/lev/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-54571ecbcc6289aa.arrow


# Get Dictionary Activations

In [10]:
# Now we can use the model to get the activations
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from einops import rearrange

def get_activations(autoencoder):
    num_features, d_model = autoencoder.encoder.shape
    datapoints = dataset.num_rows
    batch_size = 32
    neuron_activations = torch.zeros((datapoints*token_amount, d_model))
    dictionary_activations = torch.zeros((datapoints*token_amount, num_features))
    smaller_auto_encoder = autoencoder
    smaller_auto_encoder.to_device(device)

    with torch.no_grad(), dataset.formatted_as("pt"):
        dl = DataLoader(dataset["input_ids"], batch_size=batch_size)
        for i, batch in enumerate(tqdm(dl)):
            _, cache = model.run_with_cache(batch.to(device))
            batched_neuron_activations = rearrange(cache[cache_name], "b s n -> (b s) n" )
            neuron_activations[i*batch_size*token_amount:(i+1)*batch_size*token_amount,:] = batched_neuron_activations.cpu()
            batched_dictionary_activations = smaller_auto_encoder.encode(batched_neuron_activations)
            dictionary_activations[i*batch_size*token_amount:(i+1)*batch_size*token_amount,:] = batched_dictionary_activations.cpu()
    return dictionary_activations

dict_activations_main = get_activations(autoencoder_main)
dict_activations_prior = get_activations(autoencoder_prior)


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

# Feature Interp
Investigate the example sentences the activate this feature.

Max: show max activating (tokens,contexts)

Uniform: Show range of activations from each bin (e.g. sample an example from 1-2, 2-3, etc). 
[Note: if a feature is monosemantic, then the full range of activations should be that feature, not just max-activating ones]

Full_text: shows the full text example

Text_list: shows up to the most activating example (try w/ max activating on a couple of examples to see)

ablate_text: remove the context one token at a time, and show the decrease/increase in activation of that feature

ablate_feature_direction: removes feature direction from model's activation mid-inference, showing the logit diff in the output for every token.

logit_lens: show the logit lens for that feature. If matches ablate_feature_direction, then the computation path is through the residual stream, else, it's through future layers

In [14]:
dict_activations_prior.shape

torch.Size([39320, 3072])

In [31]:
from circuitsvis.activations import text_neuron_activations
import torch
import numpy as np
from einops import rearrange

# Get the activations for the best dict features
def get_feature_datapoints_with_idx(feature_index, dictionary_activations, tokenizer, token_amount, dataset, k=10, setting="max"):
    best_feature_activations = dictionary_activations[:, feature_index]
    # Sort the features by activation, get the indices
    if setting=="max":
        found_indices = torch.argsort(best_feature_activations, descending=True)[:k]
    elif setting=="uniform":
        # min_value = torch.min(best_feature_activations)
        min_value = torch.min(best_feature_activations)
        max_value = torch.max(best_feature_activations)

        # Define the number of bins
        num_bins = k

        # Calculate the bin boundaries as linear interpolation between min and max
        bin_boundaries = torch.linspace(min_value, max_value, num_bins + 1)

        # Assign each activation to its respective bin
        bins = torch.bucketize(best_feature_activations, bin_boundaries)

        # Initialize a list to store the sampled indices
        sampled_indices = []

        # Sample from each bin
        for bin_idx in torch.unique(bins):
            if(bin_idx==0): # Skip the first one. This is below the median
                continue
            # Get the indices corresponding to the current bin
            bin_indices = torch.nonzero(bins == bin_idx, as_tuple=False).squeeze(dim=1)
            
            # Randomly sample from the current bin
            sampled_indices.extend(np.random.choice(bin_indices, size=1, replace=False))

        # Convert the sampled indices to a PyTorch tensor & reverse order
        found_indices = torch.tensor(sampled_indices).long().flip(dims=[0])
    else: # random
        # get nonzero indices
        nonzero_indices = torch.nonzero(best_feature_activations)[:, 0]
        # shuffle
        shuffled_indices = nonzero_indices[torch.randperm(nonzero_indices.shape[0])]
        found_indices = shuffled_indices[:k]
    num_datapoints = int(dictionary_activations.shape[0]/token_amount)
    datapoint_indices =[np.unravel_index(i, (num_datapoints, token_amount)) for i in found_indices]
    text_list = []
    full_text = []
    token_list = []
    full_token_list = []
    for md, s_ind in datapoint_indices:
        md = int(md)
        s_ind = int(s_ind)
        full_tok = torch.tensor(dataset[md]["input_ids"])
        full_text.append(tokenizer.decode(full_tok))
        tok = dataset[md]["input_ids"][:s_ind+1]
        text = tokenizer.decode(tok)
        text_list.append(text)
        token_list.append(tok)
        full_token_list.append(full_tok)
    return text_list, full_text, token_list, full_token_list, found_indices

In [32]:
from interp_utils import *
# Can sort by MMCS w/ the larger dictionary
# indexed_feature = 13
# best_feature = int(max_indices[indexed_feature])
# Or just random index 
feature_idx = 13

text_list, full_text, token_list, full_token_list, sampled_indices = get_feature_datapoints_with_idx(feature_idx, dict_activations_prior, model.tokenizer, token_amount, dataset, setting="max")
# text_list, full_text, token_list, full_token_list = get_feature_datapoints(feature_idx, dict_activations_prior, model.tokenizer, token_amount, dataset, setting="max")
# text_list, full_text, token_list, full_token_list = get_feature_datapoints(best_feature, dictionary_activations, dataset, setting="max")
# visualize_text(full_text, best_feature, model, autoencoder, layer)
print("Sampled Index", sampled_indices)
visualize_text(text_list, feature_idx, model, autoencoder_prior, layer_focus - 1)

Sampled Index tensor([ 6738, 38178, 14329, 31726, 31759, 15716, 18798, 25789, 25622, 35054])


## Find the **downstream** feature in the next layer that is most activated by random subset of this feature

In [33]:
prior_feature_idx = 13

text_list, full_text, token_list, full_token_list, sampled_indices = get_feature_datapoints_with_idx(feature_idx, dict_activations_prior, model.tokenizer, token_amount, dataset, setting="max")

print("Sampled Index", sampled_indices)
visualize_text(text_list, feature_idx, model, autoencoder_prior, layer_focus - 1)

Sampled Index tensor([ 6738, 38178, 14329, 31726, 31759, 15716, 18798, 25789, 25622, 35054])


In [42]:
# dict_activations_prior.shape, dict_activations_main.shape
TOP_FEATURES = 10
# We do not need absolute value as I think that everything is positive b/c ReLU
summed = dict_activations_main[sampled_indices].sum(dim=0)
summed_abs = summed.abs()
print(summed.argmax())
main_features = summed.argsort(descending=True)[:TOP_FEATURES]
main_features, summed_abs.argsort(descending=True)[:TOP_FEATURES]

tensor(1287)


(tensor([1287,   13, 1515, 1926, 2919, 2274,  621, 2221,  715, 1261]),
 tensor([1287,   13, 1515, 1926, 2919, 2274,  621, 2221,  715, 1261]))

In [None]:
feature_idx = main_features[0]
 
text_list, full_text, token_list, full_token_list, sampled_indices = get_feature_datapoints_with_idx(feature_idx, dict_activations_main, model.tokenizer, token_amount, dataset, setting="max")
visualize_text(text_list, feature_idx, model, autoencoder_prior, layer_focus)

In [17]:
ablate_text(text_list, feature_idx, model, autoencoder, layer)

In [18]:
ablate_feature_direction_display(full_text, autoencoder, model, layer, features=feature_idx)

In [19]:
logit_lens(model, feature_idx, autoencoder.get_learned_dict())

['";', ' respectively', ' {¶', '()"', 'wise', '<>();', 'woke', 'essential', 'ients', 'uple', 'rost', '\n\t\n', 'Delegate', 'free', '.";', 'tetra', 'append', ' AFFIRMED', 'rocy', '>";']
tensor([1.4627, 1.3649, 1.3501, 1.3298, 1.2833, 1.2640, 1.2484, 1.2394, 1.2345,
        1.2338, 1.2326, 1.2159, 1.2127, 1.2017, 1.1831, 1.1810, 1.1771, 1.1766,
        1.1545, 1.1515])
