# Looking at the FSRL steering vector

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs (handy while iterating on `fsrl`).
%load_ext autoreload
%autoreload 2

from fsrl.utils import SAEfeatureAnalyzer
from fsrl import SAEAdapter, HookedModel
from dotenv import load_dotenv
import torch
from transformer_lens import HookedTransformer

# Read environment variables from a .env file; the `True` in the recorded
# output is this call's return value.
# NOTE(review): presumably this supplies credentials/settings needed by later
# cells (e.g. fetching feature explanations) — confirm which variables are required.
load_dotenv()

True

In [2]:
# Run on the GPU when torch can see one; fall back to CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Identifiers of the pretrained SAE to wrap (release name + hook point).
release = "gpt2-small-res-jb"
sae_id = "blocks.7.hook_resid_pre"

# Extra keyword arguments forwarded to SAEAdapter.from_pretrained.
adapter_kwargs = dict(
    use_lora_adapter=True,
    lora_rank=64,
    lora_alpha=32,
    fusion_mode="additive",
)

# Load the adapter-wrapped SAE first, then the base language model, and
# finally combine both into a single hooked model.
sae, cfg_dict, sparsity = SAEAdapter.from_pretrained(
    release,
    sae_id,
    device=device,
    **adapter_kwargs,
)
# NOTE(review): the warning recorded in this cell's output recommends loading
# the model with `from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)`;
# confirm whether the default-processed weights used here are intended.
model = HookedTransformer.from_pretrained("gpt2-small", device=device)
sae_model = HookedModel(model, sae)

This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
# Build the feature analyzer around the hooked model. Per the recorded output,
# construction fetches the feature explanations for gpt2-small/7-res-jb
# (24570 loaded in the saved run).
sae_analyzer = SAEfeatureAnalyzer(sae_model)

Fetching all explanations for gpt2-small/7-res-jb...
Successfully loaded 24570 feature explanations.


In [8]:
# Example only: a random placeholder steering vector with one entry per SAE
# feature (length `d_sae`), used to exercise `get_steered_features_info`.
# It is drawn from a dedicated, explicitly seeded generator so that
# Restart Kernel -> Run All reproduces the same tables every time, without
# touching torch's global RNG state.
steering_generator = torch.Generator().manual_seed(0)
steering_vec = torch.randn(
    sae_analyzer.sae.cfg.d_sae,
    generator=steering_generator,
)

# Look up the features selected by the steering vector, using threshold=0.01.
# NOTE(review): the exact semantics of `threshold` live in SAEfeatureAnalyzer —
# presumably a cutoff on the steering value's magnitude; confirm.
df = sae_analyzer.get_steered_features_info(steering_vec, threshold=0.01)

# Show only the first rows; the full frame can have thousands of entries.
display(df.head())

Unnamed: 0,feature_idx,steering_value,description,modelId,layer,index,explanationModelName,typeName
0,15895,-3.847262,terms related to various disciplines and areas...,gpt2-small,7-res-jb,15895,gpt-3.5-turbo,oai_token-act-pair
1,23399,-3.844694,words related to pulses,gpt2-small,7-res-jb,23399,gpt-3.5-turbo,oai_token-act-pair
2,5530,-3.705429,links and references to related content or art...,gpt2-small,7-res-jb,5530,gpt-3.5-turbo,oai_token-act-pair
3,6324,3.686178,code snippets containing error handling logic,gpt2-small,7-res-jb,6324,gpt-3.5-turbo,oai_token-act-pair
4,22374,-3.627533,words related to locations or companies with t...,gpt2-small,7-res-jb,22374,gpt-3.5-turbo,oai_token-act-pair


In [7]:
# Repeat the steered-feature lookup, this time capped to five rows via `top_k`.
# In the recorded output the rows appear ordered by absolute steering value.
# NOTE(review): the saved execution counts show this cell (In [7]) ran before
# the cell above it (In [8]); re-run top to bottom before trusting the table.
df_top_5 = sae_analyzer.get_steered_features_info(
    steering_vec,
    threshold=0.01,
    top_k=5,
)
display(df_top_5)

Unnamed: 0,feature_idx,steering_value,description,modelId,layer,index,explanationModelName,typeName
0,10753,4.481459,words related to the city of St. Louis,gpt2-small,7-res-jb,10753,gpt-3.5-turbo,oai_token-act-pair
1,24437,-3.991761,phrases or sentences ending with a question mark,gpt2-small,7-res-jb,24437,gpt-3.5-turbo,oai_token-act-pair
2,21031,3.906598,words related to simplicity or basic concepts,gpt2-small,7-res-jb,21031,gpt-3.5-turbo,oai_token-act-pair
3,14150,3.844881,"proper nouns related to the name ""Mankato.""",gpt2-small,7-res-jb,14150,gpt-3.5-turbo,oai_token-act-pair
4,18389,-3.825272,phrases related to comparing different entitie...,gpt2-small,7-res-jb,18389,gpt-3.5-turbo,oai_token-act-pair
