In [28]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils.create_bible_data import load_targeted_steering_vectors
from utils.steering import generate_with_steering
from classes.hook_manager import HookManager
from utils.probe_confidence_intervals import model_setup
from transformers import GPT2LMHeadModel, GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Block
from typing import Optional, Tuple, Union


from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Attention, GPT2MLP
import torch
import torch.nn as nn

class GPT2BlockWithSteering(GPT2Block):
    def __init__(self, config, steering_vector: torch.Tensor, scalar: float = 1.0, layer_idx: int = None):
        super().__init__(config, layer_idx=layer_idx)
        # Prepare steering vector for broadcast: (1, 1, hidden_size)
        self.register_buffer('steering_vector', steering_vector.view(1, 1, -1))
        self.scalar = scalar

    def forward(self, hidden_states: torch.FloatTensor, **kwargs) -> tuple:
        # Extract common arguments (compat with huggingface deprecation)
        use_cache = kwargs.get('use_cache', False)
        output_attn = kwargs.get('output_attentions', False)

        # 1. Self-Attention
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_output, self_attn_weights = self.attn(
            hidden_states
        )
        hidden_states = attn_output + residual
        

        # 3. Feed-Forward with Steering Injection
        residual = hidden_states
        

        hidden_states = self.ln_2(hidden_states)
        
        hidden_states = hidden_states + self.steering_vector * self.scalar

        ff_hidden = self.mlp(hidden_states)
        hidden_states = residual + ff_hidden

        # 4. Assemble outputs
        outputs = (hidden_states,)
        if use_cache:
            outputs += (self_attn_weights,)
        if output_attn:
            outputs += (self_attn_weights,)

        return outputs


#model, tokenizer, device = model_setup("downloaded_models/gpt_gptsw3_en_da_is_no_356m_gbs1024")
model, tokenizer, device = model_setup("AI-Sweden-Models/gpt-sw3-356m")


target, complement, combined = load_targeted_steering_vectors("steering_vectors/DA/",device)

layer_index = 13

# Replace layer 5 with your custom block
model.transformer.h[layer_index] = GPT2BlockWithSteering(model.config, combined[layer_index],scalar = 5)

tokenized = tokenizer("The cat (Felis catus), also referred",return_tensors= "pt")
out = model.generate(tokenized.input_ids, pad_token_id=tokenizer.eos_token_id,max_new_tokens = 40)
print(tokenizer.decode(out[0]))

found device: cpu


  complement[int(layer)] = torch.load(str(steering_vector_path + vector),map_location=torch.device(device))
  target[int(layer)] = torch.load(str(steering_vector_path +vector),map_location=torch.device(device))
  combined[int(layer)] = torch.load(str(steering_vector_path +vector),map_location=torch.device(device))
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The cat (Felis catus), also referred till är.
. The rest of the world, and the last one, is the rest of the world, and the one
F. The last one, and the one, is the last


In [32]:
from utils.probe_confidence_intervals import model_setup
from utils.create_bible_data import load_targeted_steering_vectors
from classes.hook_manager import HookManager


model, tokenizer, device = model_setup("AI-Sweden-Models/gpt-sw3-356m")
target, complement, combined = load_targeted_steering_vectors("steering_vectors/DA/",device)

found device: cpu


  complement[int(layer)] = torch.load(str(steering_vector_path + vector),map_location=torch.device(device))
  target[int(layer)] = torch.load(str(steering_vector_path +vector),map_location=torch.device(device))
  combined[int(layer)] = torch.load(str(steering_vector_path +vector),map_location=torch.device(device))


In [None]:
model.state_dict()

OrderedDict([('transformer.wte.weight',
              tensor([[ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
                      [ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
                      [ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
                      ...,
                      [ 0.0067,  0.0131,  0.0027,  ...,  0.0106,  0.0176, -0.0132],
                      [ 0.0095,  0.0139, -0.0015,  ...,  0.0175,  0.0117, -0.0058],
                      [ 0.0008,  0.0132, -0.0052,  ...,  0.0107,  0.0055, -0.0124]])),
             ('transformer.wpe.weight',
              tensor([[-0.0085, -0.0114, -0.0040,  ..., -0.0029, -0.0089,  0.0006],
                      [-0.0071, -0.0002, -0.0021,  ...,  0.0017, -0.0081, -0.0047],
                      [-0.0060, -0.0152, -0.0008,  ..., -0.0008, -0.0086, -0.0059],
                      ...,
                      [-0.0102, -0.0449, -0.0052,  ..., -0.0046, -0.0131,  0.0057],
                      [

In [36]:
model.state_dict()["transformer.wte.weight"]

tensor([[ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
        [ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
        [ 0.0276,  0.0280,  0.0277,  ...,  0.0184,  0.0378,  0.0152],
        ...,
        [ 0.0067,  0.0131,  0.0027,  ...,  0.0106,  0.0176, -0.0132],
        [ 0.0095,  0.0139, -0.0015,  ...,  0.0175,  0.0117, -0.0058],
        [ 0.0008,  0.0132, -0.0052,  ...,  0.0107,  0.0055, -0.0124]])

In [11]:
layer = 13
steering_lambda = 15
steering_vector = combined[layer]

with HookManager(model) as hook_manager:

    hook_manager.attach_residual_stream_activation_based_steering_vector(
        layer=layer,
        steering_vector=steering_vector.to(device),
        plus=True,
        scalar=steering_lambda,
        pre_mlp=False,
        pythia=False
    )
    tokenized = tokenizer("Jesus is a little",return_tensors= "pt")
    out = model.generate(tokenized.input_ids, pad_token_id=tokenizer.eos_token_id,max_new_tokens = 40)
    print(tokenizer.decode(out[0]))


Jesus is a little for ungodelig, men det er jo ikke noget, der er så meget ved det.

Det er jo ikke noget, der er så meget ved det, men det er jo ikke noget


In [17]:
combined[15]

tensor([-0.1049,  0.4062, -0.0869,  ...,  0.2007, -0.3405, -0.0782])

In [None]:
from transformers import PretrainedConfig
from torch import Tensor
class SteeringConfig(PretrainedConfig):
    model_type = "steering-gptsw3"

    def __init__(
        self,
        #steering_vector= None,
        layer:   int   = 13,
        plus:    bool  = True,
        scalar:  float = 1.0,
        pre_mlp: bool  = False,
        pythia:  bool  = False,
        **kwargs,
    ):
        
        self.layer   = layer
        self.plus    = plus
        self.scalar  = scalar
        self.pre_mlp = pre_mlp
        self.pythia  = pythia
        self.steering_vector = steering_vector
        super().__init__(**kwargs)

In [27]:
SteeringConfig_config = SteeringConfig( layer = 13, plus= True, scalar = 1000, pre_mlp = False, pythia = False)
SteeringConfig_config.save_pretrained("steering-gptsw3")

In [31]:
from transformers import PreTrainedModel

class AwesomeGPTsw3(PreTrainedModel):
    config_class = SteeringConfig
    def __init__(self, config_class: SteeringConfig, pre_trained_model, steering_vector):
        super().__init__(config_class)
        self.model = pre_trained_model
        self.steering_vector = steering_vector
    
    def forward(self, **kwargs):
        with HookManager(model) as hook_manager:
            hook_manager.attach_residual_stream_activation_based_steering_vector(
                layer=self.config_class.layer,
                steering_vector=self.steering_vector,
                plus=True,
                scalar=steering_lambda,
                pre_mlp=False,
                pythia=False
            )
            idk = self.model.forward(**kwargs)
        return idk

In [None]:
resnet50d = AwesomeGPTsw3(SteeringConfig)


In [None]:

        
import torch
from transformers import GPT2LMHeadModel, AutoModelForCausalLM

class SteeringGPT2LMHeadModel(GPT2LMHeadModel):
    config_class = SteeringConfig
    base_model_prefix = "transformer"  # same prefix GPT2LMHeadModel uses

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        steering_vector: torch.Tensor = None,
        layer:   int   = None,
        plus:    bool  = None,
        scalar:  float = None,
        pre_mlp: bool  = None,
        pythia:  bool  = None,
        **kwargs,
    ):
        # 1) Load config (merge any overrides)
        config = SteeringConfig.from_pretrained(
            pretrained_model_name_or_path,
            layer=layer,
            plus=plus,
            scalar=scalar,
            pre_mlp=pre_mlp,
            pythia=pythia,
            **kwargs,
        )

        # 2) Load GPT-2 weights into our subclass
        model: SteeringGPT2LMHeadModel = super().from_pretrained(
            pretrained_model_name_or_path,
            config=config,
            **kwargs,
        )

        # 3) Attach your HookManager if a vector is provided
        if steering_vector is not None:
            model.hook_manager = HookManager(model)
            model.hook_manager.attach_residual_stream_activation_based_steering_vector(
                layer=config.layer,
                steering_vector=steering_vector.to(model.device),
                plus=config.plus,
                scalar=config.scalar,
                pre_mlp=config.pre_mlp,
                pythia=config.pythia,
            )
        return model

    # (no need to override forward; inherited from GPT2LMHeadModel)
# so AutoModelForCausalLM.from_pretrained picks up your class
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("steering-gpt2", SteeringConfig)
AutoModelForCausalLM.register(SteeringConfig, SteeringGPT2LMHeadModel)

In [20]:
model.save_pretrained("my-steering-model")

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "AI-Sweden-Models/gpt-sw3-356m",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "my-steering-model",
    steering_vector=combined[15],       # your torch.Tensor
    # (optionally) override layer/plus/scalar/etc here
    trust_remote_code=True,
)


TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'steering_vector'