# Steer Explore

In [1]:
%pip install transformers torch pandas numpy scikit-learn matplotlib seaborn tqdm sae-lens --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from sae_lens import SAE
from transformers import GPTNeoXForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# The weights and the tokenizer come from the same Pythia checkpoint,
# so the shared arguments are written down once and reused for both loads.
pythia_checkpoint = "EleutherAI/pythia-70m-deduped"
checkpoint_kwargs = {
    "revision": "step3000",
    "cache_dir": "./pythia-70m-deduped/step3000",
}

model = GPTNeoXForCausalLM.from_pretrained(pythia_checkpoint, **checkpoint_kwargs)
tokenizer = AutoTokenizer.from_pretrained(pythia_checkpoint, **checkpoint_kwargs)

In [5]:
# Baseline (unsteered) generation, kept for comparison with the steered output.
# This must run before the steering hook is registered on the model; otherwise
# the "base" answer would be steered as well.
inputs = tokenizer("The homework is difficult. I need help.", return_tensors="pt")
tokens = model.generate(**inputs)
# Decode with skip_special_tokens=True, the same setting used for the steered
# answer, so the two printed strings are directly comparable.
base_answer = tokenizer.decode(tokens[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#### Load SAE

In [6]:
from sae_lens import SAE

# Identify the pretrained SAE: the release name and the hook point it was trained on.
sae_release = "pythia-70m-deduped-att-sm"
sae_hook_id = "blocks.1.hook_attn_out"

# from_pretrained is unpacked into the SAE itself, its config dict and a sparsity tensor.
sae, cfg_dict, sparsity = SAE.from_pretrained(release=sae_release, sae_id=sae_hook_id)

In [7]:
# Inspect the config dict returned by SAE.from_pretrained (hook name, d_in, d_sae, dtype, ...).
cfg_dict

{'architecture': 'standard',
 'd_in': 512,
 'd_sae': 32768,
 'dtype': 'torch.float32',
 'device': 'cpu',
 'model_name': 'pythia-70m-deduped',
 'hook_name': 'blocks.1.hook_attn_out',
 'hook_layer': 1,
 'hook_head_index': None,
 'activation_fn_str': 'relu',
 'activation_fn_kwargs': {},
 'apply_b_dec_to_input': True,
 'finetuning_scaling_factor': False,
 'sae_lens_training_version': None,
 'prepend_bos': False,
 'dataset_path': 'EleutherAI/the_pile_deduplicated',
 'dataset_trust_remote_code': True,
 'context_size': 128,
 'normalize_activations': 'none',
 'neuronpedia_id': 'pythia-70m-deduped/1-att-sm'}

In [8]:
# Show the SAE module's repr: its activation function and hook points.
sae

SAE(
  (activation_fn): ReLU()
  (hook_sae_input): HookPoint()
  (hook_sae_acts_pre): HookPoint()
  (hook_sae_acts_post): HookPoint()
  (hook_sae_output): HookPoint()
  (hook_sae_recons): HookPoint()
  (hook_sae_error): HookPoint()
)

In [9]:
# Third value returned by SAE.from_pretrained; presumably per-feature log-sparsity -- TODO confirm.
sparsity

tensor([-10., -10., -10.,  ..., -10., -10., -10.])

In [10]:
# Print the model's module tree to see which submodules are available as hook targets.
print(model)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

## Steering

Taken from: https://www.neuronpedia.org/gemma-2-9b-it/steer

There are many ways to steer a model. Here, we do the following for each feature being steered:

1. Multiply the steering strength by the strength multiple to get the steering coefficient.
2. Get the steering vector from the SAE, which is the feature's decoder weights.
3. Add the steering coefficient * steering vector to the activations.

In code, it looks like this:

    steering_coefficient = strength_multiple * steering_strength
    steering_vector = sae.W_dec[feature_index]
    activations += steering_coefficient * steering_vector
    
In the method we use, a strength_multiple of 0 means no steering will occur.
Another variation of steering multiplies the steering vector by the top known activation value as well. That's a totally valid method, but here's why this version doesn't do this (but we may add it in the future):

- We may be missing activations for features that are very sparse (or we may not have run enough test prompts during dashboard generation), and we'd still want to allow steering for those features.
- We want results to be consistent regardless of which activations are known: e.g. if someone else steers the same feature with the same method (but has different top activations), we want the results to be the same, or very close.

In [11]:
# Feature to steer with (looked up on Neuronpedia).
feature_index = 15698

# Steering scale: the base strength times its multiplier gives the coefficient
# that scales the steering vector.
steering_strength, strength_multiple = 5.0, 1.2
steering_coefficient = steering_strength * strength_multiple

# The steering direction is this feature's row of the SAE decoder weights.
steering_vector = sae.W_dec[feature_index]

# Define a hook to modify the activations
def steer_activations(module, inputs, output):
    """Forward hook that adds the scaled steering vector to a module's output.

    Reads the notebook-level ``steering_coefficient`` and ``steering_vector``.

    Args:
        module: The hooked module (unused).
        inputs: The positional inputs the module was called with (unused).
        output: The module's output: either a tensor, or a tuple whose first
            element is the activation tensor.

    Returns:
        The output with ``steering_coefficient * steering_vector`` added to the
        activations. For tuple outputs only the first element is modified; the
        remaining elements are passed through unchanged.
    """
    # Some modules return a tuple; only its first element holds the activations.
    is_tuple = isinstance(output, tuple)
    activations = output[0] if is_tuple else output

    # The steering vector is a slice of an SAE parameter. Detach it so the
    # model's output is never tied to the SAE's autograd graph, and match the
    # activations' device and dtype so a model on another device or in reduced
    # precision is neither broken nor silently promoted to float32.
    direction = steering_vector.detach().to(
        device=activations.device, dtype=activations.dtype
    )
    steered = activations + steering_coefficient * direction

    if is_tuple:
        return (steered,) + output[1:]
    return steered


In [12]:
# Sanity check: the steering vector should be 1-D with the SAE's d_in (512) entries.
steering_vector.shape

torch.Size([512])

In [13]:
# Collect every submodule path so the hook target can be looked up by name.
named_submodules = dict(model.named_modules())
submodule_names = named_submodules.keys()
print(submodule_names)

dict_keys(['', 'gpt_neox', 'gpt_neox.embed_in', 'gpt_neox.emb_dropout', 'gpt_neox.layers', 'gpt_neox.layers.0', 'gpt_neox.layers.0.input_layernorm', 'gpt_neox.layers.0.post_attention_layernorm', 'gpt_neox.layers.0.post_attention_dropout', 'gpt_neox.layers.0.post_mlp_dropout', 'gpt_neox.layers.0.attention', 'gpt_neox.layers.0.attention.rotary_emb', 'gpt_neox.layers.0.attention.query_key_value', 'gpt_neox.layers.0.attention.dense', 'gpt_neox.layers.0.attention.attention_dropout', 'gpt_neox.layers.0.mlp', 'gpt_neox.layers.0.mlp.dense_h_to_4h', 'gpt_neox.layers.0.mlp.dense_4h_to_h', 'gpt_neox.layers.0.mlp.act', 'gpt_neox.layers.1', 'gpt_neox.layers.1.input_layernorm', 'gpt_neox.layers.1.post_attention_layernorm', 'gpt_neox.layers.1.post_attention_dropout', 'gpt_neox.layers.1.post_mlp_dropout', 'gpt_neox.layers.1.attention', 'gpt_neox.layers.1.attention.rotary_emb', 'gpt_neox.layers.1.attention.query_key_value', 'gpt_neox.layers.1.attention.dense', 'gpt_neox.layers.1.attention.attention_dro

In [14]:
# Register the steering hook.
# --> the steering function will be called every time the module "gpt_neox.layers.1.attention" is called
# SAE hook "blocks.1.hook_attn_out"  => model module "gpt_neox.layers.1.attention"
hook_name = "gpt_neox.layers.1.attention"
hook = model.get_submodule(hook_name)

# Keep the handle so the hook can be removed later with `steering_handle.remove()`,
# and drop any hook left by a previous run of this cell so the steering vector is
# never added more than once per forward pass.
if "steering_handle" in globals():
    steering_handle.remove()
steering_handle = hook.register_forward_hook(steer_activations)
steering_handle

<torch.utils.hooks.RemovableHandle at 0x12f769390>

In [15]:
# Run inference with steering applied: the forward hook registered on
# "gpt_neox.layers.1.attention" modifies that module's output on every forward pass.
prompt = "The homework is difficult. I need help."
inputs = tokenizer(prompt, return_tensors="pt")
# No generation arguments are passed, so the model's default generation settings apply.
outputs = model.generate(**inputs)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [16]:
# Turn the steered token ids back into text, then show both answers one per line.
steered_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
comparison_lines = (
    f" Base answer: {base_answer}",
    f"Steered answer: {steered_answer}",
)
print(*comparison_lines, sep="\n")

 Base answer: The homework is difficult. I need help.

I have a problem with the problem. I
Steered answer: The homework is difficult. I need help. I need to get the best of my homework.

