# Setup

In [6]:
import dotenv
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch as t
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from dataclasses import dataclass
import transformer_lens

from transformer_lens import HookedTransformer
from transformer_lens.hook_points import HookPoint
from transformer_lens.utils import get_act_name
from transformer_lens.FactoredMatrix import FactoredMatrix
import circuitsvis as cv

import gc
from contextlib import contextmanager
from typing import List, Dict, Optional, Callable
import einops
from tqdm import tqdm

from utils.data import (
    a_an_something_questions,
    a_habit_questions,
    a_rabbit_questions,
    an_ape_questions,
    lines_that_rhyme_with_pain,
    lines_that_rhyme_with_quick,
    example_based_rhymed_prompts_quick,
    example_based_unrhymed_prompts_quick,
    rhymed_couplets,
    unrhymed_couplets,
    lines_that_rhyme_with_rabbit,
    lines_that_rhyme_with_habit,
    lines_that_rhyme_with_rabbit_general,
    lines_that_rhyme_with_rabbit_oneshot,
    lines_that_rhyme_with_habit_general,
    lines_that_rhyme_with_habit_oneshot,
)

# Environment report: library versions first, then GPU details when CUDA is usable.
cuda_ok = t.cuda.is_available()
print(f"PyTorch version: {t.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Look up the active device index once and reuse it for the name query.
    device_index = t.cuda.current_device()
    print(f"CUDA version: {t.version.cuda}")
    print(f"Current device: {device_index}")
    print(f"Device name: {t.cuda.get_device_name(device_index)}")
# %%
# Pull HUGGINGFACE_TOKEN (and anything else defined there) from the local hf.env file
# into the process environment.
dotenv.load_dotenv("hf.env")
# @title 1.5. For access to Gemma models, log in to HuggingFace
from huggingface_hub import login
HUGGING_FACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGING_FACE_TOKEN:
    # Guard the missing-token case explicitly: login(token=None) does not use a token
    # at all but falls back to an interactive prompt, so reporting
    # "successful (using provided token)" afterwards would be misleading.
    print(
        "Hugging Face login skipped: HUGGINGFACE_TOKEN is not set "
        "(checked hf.env and the environment)."
    )
else:
    try:
        login(token=HUGGING_FACE_TOKEN)
        print("Hugging Face login successful (using provided token).")
    except Exception as e:
        # Best-effort on purpose: report the failure and let the notebook continue
        # rather than aborting the setup cell.
        print(f"Hugging Face login failed. Error: {e}")
# %%
# --- Generation Parameters ---
DO_SAMPLE = True
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 150

# --- Steering ---
# Scale factor applied to the steering vector; worth tuning in roughly the 0.5-5.0 range.
STEERING_MULTIPLIER = 1.5

# --- Model ---
# Instruction-tuned checkpoint; switch to "google/gemma-2-9b" for the base model.
# NOTE(review): a low-VRAM (< 24GB) toggle requiring bitsandbytes used to be described
# here, but no such flag is defined in this cell.
MODEL_ID = "google/gemma-2-9b-it"

PyTorch version: 2.6.0+cu124
Transformers version: 4.51.3
CUDA available: True
CUDA version: 12.4
Current device: 0
Device name: NVIDIA L40
Hugging Face login successful (using provided token).


In [3]:
# Wrap the pretrained checkpoint named by MODEL_ID in a TransformerLens
# HookedTransformer and place it on the GPU.
model = HookedTransformer.from_pretrained(MODEL_ID, device="cuda")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Loaded pretrained model google/gemma-2-9b-it into HookedTransformer


In [7]:
# Load the steering vector from disk directly onto the GPU.
# - map_location="cuda" remaps the saved storage during deserialisation, so a file
#   written on another device (CPU, or a GPU index that does not exist here) still
#   loads; a plain load followed by .to("cuda") fails before the move happens.
# - weights_only=True limits unpickling to tensors and plain containers, which matters
#   on PyTorch versions where that is not already the default.
path = "steering_vectors/steering_vector_quick_to_pain_layer_27.pt"
steering_vector = t.load(path, map_location="cuda", weights_only=True)

In [8]:
# Sanity check, one value per line: embedding-matrix device, steering-vector device,
# the full model config, and the steering-vector shape.
print(
    model.W_E.device,
    steering_vector.device,
    model.cfg,
    steering_vector.shape,
    sep="\n",
)

cuda:0
cuda:0
HookedTransformerConfig:
{'NTK_by_parts_factor': 8.0,
 'NTK_by_parts_high_freq_factor': 4.0,
 'NTK_by_parts_low_freq_factor': 1.0,
 'act_fn': 'gelu_pytorch_tanh',
 'attention_dir': 'causal',
 'attn_only': False,
 'attn_scale': 16.0,
 'attn_scores_soft_cap': 50.0,
 'attn_types': ['global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
                'global',
                'local',
        

In [None]:
# Layer index used as the starting point below.
# NOTE(review): the steering-vector file name ends in "layer_27" — presumably these
# must stay in sync; confirm.
layer = 27
n_layers, n_heads = model.cfg.n_layers, model.cfg.n_heads
# Zero-filled buffer with one row per layer from `layer` through the last layer and
# one column per attention head.
reading_from_steering_vector = t.zeros(n_layers - layer, n_heads)