In [9]:
from special_neurons import get_most_negative_sets
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Checkpoint identifier reused by the tokenizer and TransformerLens cells further down.
model_name = 'EleutherAI/pythia-70m'
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hugging Face causal-LM for `model_name`, moved onto `device`.
# NOTE: a later cell rebinds `model` to a HookedTransformer, so this cell must run first.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# Project helper from `special_neurons`; the result is indexed and iterated as a
# collection of records exposing `prev_layer_name` / `linear_layer_name`.
most_neg = get_most_negative_sets(model)
# Display the module names referenced by the first record.
most_neg[0].prev_layer_name, most_neg[0].linear_layer_name

('gpt_neox.layers.0.mlp', 'gpt_neox.layers.0.mlp.dense_h_to_4h')

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode a prompt that ends in a nonsense string and move the ids to the model's device.
input_text = "The boy with the SFdfdrgfeDSSD"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Inference only: disable autograd so the forward pass does not build a graph.
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)
hidden_states = outputs.hidden_states  # Hidden states of all layers

# To read a single unit's value:
#   hidden_states[LAYER_INDEX][BATCH_INDEX, TOKEN_INDEX, NEURON_INDEX].item()

# Sanity-check the shape of one entry.
print(hidden_states[2].shape)

torch.Size([1, 12, 512])


In [11]:
import transformer_lens

# Load the same checkpoint (`model_name`) as a TransformerLens HookedTransformer.
# NOTE: this rebinds `model`, replacing the Hugging Face model assigned in an earlier cell.
model = transformer_lens.HookedTransformer.from_pretrained(model_name)
# Run the model on a short prompt and keep both the logits and the activation cache.
logits, activations = model.run_with_cache("Hello World")

Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [12]:
# Display the cache summary to see which hook names are available.
activations

ActivationCache with keys ['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'bloc

In [15]:
# Number of highest-activating neurons inspected per token position.
N_MAX_LOOK = 100

# Scan each MLP layer for neurons listed in `most_negatives` that also rank among a token's top activations.


def find_logits_on_mlps(layer_pairs=None, cache=None, n_max_look=None):
    """Report flagged MLP neurons that rank among each token's top activations.

    For every record whose ``prev_layer_name`` starts with
    ``gpt_neox.layers.<i>.mlp``, the tensor stored under
    ``blocks.<i>.mlp.hook_post`` is read from ``cache``. For each token
    position of batch element 0, the ``n_max_look`` largest entries are
    looked up in the record's ``most_negatives`` indices; every hit is
    printed together with the product of that activation and the matching
    entry of ``most_negatives_vals``.

    Args:
        layer_pairs: Iterable of records exposing ``prev_layer_name``,
            ``most_negatives`` (must support ``.tolist()``) and
            ``most_negatives_vals`` (indexable by position). Defaults to the
            notebook-level ``most_neg``.
        cache: Mapping from hook name to an activation tensor indexed as
            ``[batch, token, neuron]``. Defaults to the notebook-level
            ``activations``.
        n_max_look: How many top-activated neurons to inspect per token.
            Defaults to the notebook-level ``N_MAX_LOOK``.

    Returns:
        None. All results are written to stdout.
    """
    # Fall back to notebook globals so the zero-argument call keeps working.
    if layer_pairs is None:
        layer_pairs = most_neg
    if cache is None:
        cache = activations
    if n_max_look is None:
        n_max_look = N_MAX_LOOK

    for layer_pair in layer_pairs:
        name = layer_pair.prev_layer_name
        match = re.match(r"gpt_neox\.layers\.(\d+)\.mlp", name)
        if not match:
            continue

        print(name)
        layer_number = int(match.group(1))
        hooked_name = f"blocks.{layer_number}.mlp.hook_post"
        acts = cache[hooked_name]  # looked up once instead of on every access
        print(acts.shape)

        # Map neuron index -> positions inside `most_negatives`, so each top-k
        # neuron is a dict lookup on plain ints rather than a tensor comparison
        # against every flagged index.
        positions = {}
        for j, neuron in enumerate(layer_pair.most_negatives.tolist()):
            positions.setdefault(neuron, []).append(j)

        n_tokens = acts.shape[1]
        print("N Tokens", n_tokens)
        for tok_idx in range(n_tokens):
            # Neuron indices for this token, strongest activation first.
            top_neurons = acts[0, tok_idx].argsort(descending=True)[:n_max_look].tolist()
            for m in top_neurons:
                for j in positions.get(m, ()):
                    neg_val = layer_pair.most_negatives_vals[j]
                    print(f"Found a match: {m}")
                    # TODO: print token
                    print(
                        f"Activations: {acts[:, tok_idx, m]} on token {tok_idx} with negative value {neg_val}")
                    # Use batch element 0, the same element the ranking above
                    # is based on, so `.item()` is valid for any batch size.
                    effective_change = neg_val * acts[0, tok_idx, m]
                    print("Effective change", effective_change.item())
                    print()


# Run the scan against the notebook-level `most_neg` and `activations`.
find_logits_on_mlps()

gpt_neox.layers.0.mlp
torch.Size([1, 3, 2048])
N Tokens 3
Found a match: 227
Activations: tensor([0.4309], device='cuda:0') on token 1 with negative value -13.506959915161133
Effective change tensor([-5.8199], device='cuda:0', grad_fn=<MulBackward0>)

Found a match: 249
Activations: tensor([0.3862], device='cuda:0') on token 1 with negative value -74.46073150634766
Effective change tensor([-28.7561], device='cuda:0', grad_fn=<MulBackward0>)

gpt_neox.layers.1.mlp
torch.Size([1, 3, 2048])
N Tokens 3
Found a match: 47
Activations: tensor([1.0227], device='cuda:0') on token 1 with negative value -17.482093811035156
Effective change tensor([-17.8796], device='cuda:0', grad_fn=<MulBackward0>)

Found a match: 157
Activations: tensor([0.5185], device='cuda:0') on token 2 with negative value -21.773366928100586
Effective change tensor([-11.2893], device='cuda:0', grad_fn=<MulBackward0>)

gpt_neox.layers.2.mlp
torch.Size([1, 3, 2048])
N Tokens 3
Found a match: 111
Activations: tensor([0.8778], 