In [1]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from huggingface_hub import hf_hub_download, notebook_login
import numpy as np
import torch
import os



Code adapted from the following sources (modified to run without CUDA):

https://colab.research.google.com/drive/17dQFYUYnuKnP6OwQPH9v_GSYUW5aj-Rp

https://huggingface.co/google/gemma-scope

In [2]:
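# Uncomment to log in to the Hugging Face Hub (only needed if the downloads below require authentication).
# notebook_login()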

In [None]:
# This notebook only runs inference, so gradient tracking is switched off globally.
torch.set_grad_enabled(False)

# Load the Gemma 2 2B checkpoint; device_map="auto" leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b", device_map="auto")



In [None]:
# Tokenizer that matches the "google/gemma-2-2b" checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")

In [None]:

prompt = "Would you be able to travel through time using a wormhole?"

# Token ids as a (1, seq_len) tensor. add_special_tokens=True prepends <bos>
# (id 2 in the printed tensor). The ids are deliberately left where the
# tokenizer created them (no .to("cuda")) because this notebook runs without CUDA.
inputs = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")
print(inputs)

# Sanity check: let the model continue the prompt for up to 50 new tokens.
outputs = model.generate(input_ids=inputs, max_new_tokens=50)
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

tensor([[     2,  18925,    692,    614,   3326,    577,   5056,   1593,   1069,
           2177,    476,  47420,  18216, 235336]])
<bos>Would you be able to travel through time using a wormhole?

[Answer 1]

Yes, you can travel through time using a wormhole.

A wormhole is a theoretical object that connects two points in space-time. It is a tunnel through space-time that allows objects to travel from


In [None]:
# Local path to the Gemma Scope SAE weights (layer 20, 16k-wide dictionary).
# force_download=False is passed explicitly, so a forced re-download is not requested.
path_to_params = hf_hub_download(
    filename="layer_20/width_16k/average_l0_71/params.npz",
    repo_id="google/gemma-scope-2b-pt-res",
    force_download=False,
)


In [14]:
# Read the SAE weights from the .npz archive and wrap each array as a torch
# tensor. The tensors stay on the CPU: there is intentionally no .cuda() call
# so the notebook runs on machines without a GPU.
params = np.load(path_to_params)
pt_params = {name: torch.from_numpy(array) for name, array in params.items()}


In [18]:
# Shape of every SAE tensor, keyed by parameter name.
{name: tensor.shape for name, tensor in pt_params.items()}

{'W_dec': torch.Size([16384, 2304]),
 'W_enc': torch.Size([2304, 16384]),
 'b_dec': torch.Size([2304]),
 'b_enc': torch.Size([16384]),
 'threshold': torch.Size([16384])}

In [15]:
# Norm of each encoder column: reducing over dim 0 leaves one value per SAE feature.
encoder_weights = pt_params["W_enc"]
encoder_weights.norm(dim=0)

tensor([1.2101, 1.1695, 0.9836,  ..., 1.0630, 0.9997, 1.1070])

In [16]:
import torch.nn as nn
class JumpReLUSAE(nn.Module):
  def __init__(self, d_model, d_sae):
    # Note that we initialise these to zeros because we're loading in pre-trained weights.
    # If you want to train your own SAEs then we recommend using blah
    super().__init__()
    self.W_enc = nn.Parameter(torch.zeros(d_model, d_sae))
    self.W_dec = nn.Parameter(torch.zeros(d_sae, d_model))
    self.threshold = nn.Parameter(torch.zeros(d_sae))
    self.b_enc = nn.Parameter(torch.zeros(d_sae))
    self.b_dec = nn.Parameter(torch.zeros(d_model))

  def encode(self, input_acts):
    pre_acts = input_acts @ self.W_enc + self.b_enc
    mask = (pre_acts > self.threshold)
    acts = mask * torch.nn.functional.relu(pre_acts)
    return acts

  def decode(self, acts):
    return acts @ self.W_dec + self.b_dec

  def forward(self, acts):
    acts = self.encode(acts)
    recon = self.decode(acts)
    return recon


In [17]:
# W_enc is stored as (d_model, d_sae), so its shape supplies both constructor arguments.
sae = JumpReLUSAE(*params['W_enc'].shape)
# Replace the zero placeholders with the pre-trained weights; the returned
# key-matching report is displayed as the cell output.
sae.load_state_dict(pt_params)

<All keys matched successfully>

In [48]:
# Optional (disabled for the CPU-only run): move the SAE to the GPU when one is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# sae = sae.to(device)


In [19]:

def gather_residual_activations(model, target_layer, inputs):
  """Run one forward pass and return the output captured at one decoder layer.

  A temporary forward hook is attached to ``model.model.layers[target_layer]``.
  The hook records that layer's output while ``model`` processes ``inputs`` and
  is always detached again, even if the forward pass raises.

  Args:
    model: Module exposing its decoder blocks as ``model.model.layers`` and
      callable on ``inputs``.
    target_layer: Index into ``model.model.layers`` of the layer to observe.
    inputs: Passed unchanged as the only positional argument of the forward
      pass (this notebook passes a tensor of token ids).

  Returns:
    The first element of the hooked layer's output when that output is a
    tuple or list, otherwise the output itself. ``None`` if the hooked layer
    was never executed.
  """
  target_act = None

  def gather_target_act_hook(mod, hook_inputs, outputs):
    nonlocal target_act  # write to the variable of the enclosing function
    # Layers may return either a tuple whose first item is the hidden states or
    # the hidden-states tensor itself; indexing a bare tensor with [0] would
    # silently drop the batch dimension, so only unpack real sequences.
    target_act = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
    return outputs

  handle = model.model.layers[target_layer].register_forward_hook(gather_target_act_hook)
  try:
    # Call the module rather than .forward() so PyTorch's normal call path runs.
    _ = model(inputs)
  finally:
    # Never leave the hook behind: a leaked hook would keep firing on later calls.
    handle.remove()
  return target_act

In [20]:

# Capture the layer-20 output for the wormhole prompt; layer 20 matches the
# "layer_20" SAE weights downloaded earlier.
target_act = gather_residual_activations(model, target_layer=20, inputs=inputs)

In [51]:
# Optional (disabled for the CPU-only run): move the SAE to the GPU.
# sae.cuda()

In [21]:
# Put the captured activations on the same device as the SAE weights and cast
# them to float32 before encoding. With device_map='auto' the model (and hence
# target_act) is not guaranteed to share a device with the SAE; on a CPU-only
# run this is a no-op apart from the dtype cast.
sae_device = next(sae.parameters()).device
target_act = target_act.to(device=sae_device, dtype=torch.float32)

# Feature activations per token, then the SAE's reconstruction of target_act.
sae_acts = sae.encode(target_act)
recon = sae.decode(sae_acts)


In [22]:
# Fraction of variance explained by the reconstruction: 1 - MSE / variance.
# Position 0 (the <bos> token) is excluded from both terms via the [:, 1:] slice.
target_tail = target_act[:, 1:].to(torch.float32)
squared_error = (recon[:, 1:] - target_tail) ** 2
1 - torch.mean(squared_error) / target_tail.var()

tensor(0.8887)

In [23]:
# For every token, count the SAE features whose activation exceeds 1
# (the reduction runs over the last axis, i.e. the feature axis).
# NOTE(review): the cut-off here is 1, whereas the comparison in the last cell
# counts activations > 0 — confirm which threshold is intended.
(sae_acts > 1).sum(dim=-1)

tensor([[7017,   47,   65,   70,   55,   72,   65,   75,   80,   72,   68,   93,
           86,   89]])

In [24]:
# Strongest feature per token: `values` holds the peak activation and `inds`
# the index of the feature that produced it (reduction over the feature axis).
values, inds = torch.max(sae_acts, dim=-1)

inds

tensor([[ 6631,  5482, 10376,  1670, 11023,  7562,  9407,  8399, 12935, 10004,
         10004, 10004, 12935,  3442]])

In [25]:
# Two group names dropped into the same sentence so that only one word differs.
keyword_1 = "inuit"
keyword_2 = "americans"

template = "The {} have a long history and culture."
prompt_1, prompt_2 = (template.format(word) for word in (keyword_1, keyword_2))


In [26]:
# Tokenize both prompts as PyTorch tensors; they are left un-moved (no
# .to("cuda")) because this notebook runs without CUDA.
inputs_1, inputs_2 = (
    tokenizer(text, return_tensors="pt") for text in (prompt_1, prompt_2)
)


In [27]:
# Layer-20 activations for each prompt, captured from its token ids.
act_1, act_2 = (
    gather_residual_activations(model, 20, batch["input_ids"])
    for batch in (inputs_1, inputs_2)
)


In [None]:
# Encode both activation tensors on the SAE's own device in float32, so the
# cell does not depend on the model and the SAE sharing a device.
sae_device = next(sae.parameters()).device
sae_acts_1 = sae.encode(act_1.to(device=sae_device, dtype=torch.float32))
sae_acts_2 = sae.encode(act_2.to(device=sae_device, dtype=torch.float32))

# Total number of (token, feature) pairs with a non-zero activation, summed
# over every position except position 0 (presumably the <bos> token, as in the
# first prompt). This is a total for the whole prompt, not a per-token count.
# NOTE(review): the two prompts may tokenize to different lengths, in which
# case the totals are not directly comparable — consider normalising by the
# number of tokens.
active_feats_1 = (sae_acts_1[:, 1:] > 0).sum().item()
active_feats_2 = (sae_acts_2[:, 1:] > 0).sum().item()

print(f"{keyword_1} → {active_feats_1} active features")
print(f"{keyword_2} → {active_feats_2} active features")
print(f"Difference → {abs(active_feats_1 - active_feats_2)}")


inuit → 698 active features
americans → 685 active features
Difference → 13
