In [25]:
from pathlib import Path

import torch
from torch.utils.data import DataLoader

from utils.preprocessing import load_txt_data
from utils.probe_confidence_intervals import model_setup, get_activations

class HookManager2():
    """Context manager that attaches PyTorch hooks to submodules of a model.

    Every ``attach_*`` method registers one hook and stores its handle in
    ``self.hooks``. Leaving the ``with`` block removes all registered hooks,
    so the model is left unmodified afterwards.

    Hook locations are addressed by dotted submodule path:
    ``transformer.h.{layer}.attn`` / ``.mlp`` by default, or
    ``gpt_neox.layers.{layer}.attention`` / ``.mlp`` when ``pythia=True``.
    """

    def __init__(self, model):
        """Store the model whose submodules will be hooked.

        Args:
            model: a ``torch.nn.Module`` exposing ``get_submodule``.
        """
        self.model = model
        self.hooks = []

    @staticmethod
    def _resolve_hookpoint(layer, pre_mlp=False, pythia=False):
        """Return the dotted submodule path for the requested hook location.

        Args:
            layer: index of the transformer block.
            pre_mlp: if True select the block's MLP submodule, otherwise its
                attention submodule.
            pythia: if True use the ``gpt_neox`` naming scheme, otherwise the
                ``transformer.h`` naming scheme.
        """
        if pythia:
            block = f'gpt_neox.layers.{layer}'
            return f'{block}.mlp' if pre_mlp else f'{block}.attention'
        block = f'transformer.h.{layer}'
        return f'{block}.mlp' if pre_mlp else f'{block}.attn'

    def attach_residstream_hook(self, layer, pre_mlp=False, pythia=False):
        """Record the first positional input of the selected submodule.

        Args:
            layer: index of the transformer block.
            pre_mlp: hook the MLP submodule instead of the attention submodule.
            pythia: use the ``gpt_neox`` module naming scheme.

        Returns:
            A list that receives one detached tensor per forward pass (with a
            leading dimension of size 1 squeezed away). The list is filled
            lazily, i.e. it is empty until the model is run.
        """
        hookpoint = self._resolve_hookpoint(layer, pre_mlp=pre_mlp, pythia=pythia)

        extracted_output = []

        def residstream_hook(module, input, output):
            extracted_output.append(input[0].squeeze(0).detach())

        self.hooks.append(
            self.model.get_submodule(hookpoint).register_forward_hook(residstream_hook)
        )

        return extracted_output

    def attach_resid_stream_steer_hook(self, layer, steering_vector, scalar, pre_mlp=False, pythia=False):
        """Scale the component of the input along ``steering_vector``, keeping its norm.

        The first positional input of the selected submodule is replaced by
        ``activation + scalar * proj(activation)``, where ``proj`` is the
        projection onto the normalised steering vector. The result is rescaled
        so that every vector along the last dimension keeps the norm it had
        before steering.

        Args:
            layer: index of the transformer block.
            steering_vector: 1-D tensor whose length equals the size of the
                activation's last dimension.
            scalar: multiplier applied to the projection before it is added.
            pre_mlp: hook the MLP submodule instead of the attention submodule.
            pythia: use the ``gpt_neox`` module naming scheme.
        """
        hookpoint = self._resolve_hookpoint(layer, pre_mlp=pre_mlp, pythia=pythia)

        def steering_hook(module, input):
            activation = input[0]

            # Match the activation's device/dtype so the arithmetic below does
            # not fail (or silently up-cast) when the vector lives elsewhere.
            direction = steering_vector.to(device=activation.device, dtype=activation.dtype)
            direction = direction / torch.norm(direction)

            # (..., dim) @ (dim,) -> (...,); restore the last axis for broadcasting.
            projection_magnitudes = (activation @ direction).unsqueeze(-1)
            projections = projection_magnitudes * direction

            modified = activation + scalar * projections

            act_norm = torch.norm(activation, dim=-1, keepdim=True)
            # The norm must be taken from the *steered* tensor; using the
            # original activation here makes the rescale a no-op.
            modified_norm = torch.norm(modified, dim=-1, keepdim=True)
            # Guard against division by zero when steering cancels a vector.
            eps = torch.finfo(modified_norm.dtype).eps
            modified = modified * (act_norm / torch.clamp(modified_norm, min=eps))

            # Keep any further positional inputs untouched.
            return (modified,) + tuple(input[1:])

        self.hooks.append(
            self.model.get_submodule(hookpoint).register_forward_pre_hook(steering_hook)
        )

    def attach_residual_stream_steering_vector(self, layer, steering_vector, plus, pre_mlp=False, pythia=False):
        """Add or subtract ``steering_vector`` from the selected submodule's input.

        Original author's note: this is Martin's variant.

        Args:
            layer: index of the transformer block.
            steering_vector: tensor broadcastable against the activation.
            plus: if True the vector is added, otherwise it is subtracted.
            pre_mlp: hook the MLP submodule instead of the attention submodule.
            pythia: use the ``gpt_neox`` module naming scheme.
        """
        hookpoint = self._resolve_hookpoint(layer, pre_mlp=pre_mlp, pythia=pythia)

        def steering_hook(module, input):
            activation = input[0]
            vector = steering_vector.to(device=activation.device, dtype=activation.dtype)
            steered = activation + vector if plus else activation - vector
            # Return a tuple: a bare tensor would be wrapped into a 1-tuple by
            # PyTorch and any additional positional inputs would be lost.
            return (steered,) + tuple(input[1:])

        self.hooks.append(
            self.model.get_submodule(hookpoint).register_forward_pre_hook(steering_hook)
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always detach, even when the body raised; exceptions are not suppressed.
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()



In [26]:
# model_setup hands back the (model, tokenizer, device) triple used by every later cell.
print("Load model")
model_name = "AI-Sweden-Models/gpt-sw3-356m"
model, tokenizer, device = model_setup(model_name)

# One plain-text file per language code lives in the raw-data folder.
raw_data_folder = Path('data/antibiotic/')
print("Load data")
ds = load_txt_data(
    file_paths={
        lang_code: raw_data_folder / f'{lang_code}.txt'
        for lang_code in ('da', 'en', 'sv', 'nb', 'is')
    },
    file_extension='txt',
)

loader = DataLoader(ds, batch_size=32, shuffle=True)


Load model
Load data


In [27]:
# Model dimensions read from the loaded model's config.
meta_data = {"hidden_layers": model.config.num_hidden_layers}

# Some configs expose the hidden width as `n_embd`; fall back to `hidden_size`
# when that attribute is absent.
meta_data["hidden_size"] = (
    model.config.n_embd
    if hasattr(model.config, "n_embd")
    else model.config.hidden_size
)
    



In [48]:


def get_steering_vectors(ds, 
                         languages:list,
                         meta_data,
                         tokenizer,
                         device,
                         model):
    """Collect per-layer activation predictors for each requested language.

    For every language the dataset is filtered, wrapped in a shuffled
    ``DataLoader`` (batch size 32) and passed to ``get_activations``.

    Args:
        ds: dataset exposing ``filter_by_language(lang)``; assumed to return
            the filtered dataset without modifying ``ds`` — TODO confirm.
        languages: language codes to process.
        meta_data, tokenizer, device, model: forwarded unchanged to
            ``get_activations``.

    Returns:
        dict mapping each language code to a list with one entry per element
        returned by ``get_activations`` (that element's ``.predictors``).
    """
    predictors_by_language = {}
    for lang in languages:
        # Always filter from the full dataset. Rebinding `ds` here would make
        # every later language filter the previous language's subset.
        lang_ds = ds.filter_by_language(lang)
        loader = DataLoader(lang_ds, batch_size=32, shuffle=True)
        activation_ds_by_layer = get_activations(meta_data, loader, tokenizer, device, model)
        predictors_by_language[lang] = [layer.predictors for layer in activation_ds_by_layer]
    return predictors_by_language
languages = ["da","en"]
# Keep the result: the computation is expensive, and leaving the bare call as the
# cell's last expression would only dump the nested activation lists as output.
predictors_by_language = get_steering_vectors(ds, languages, meta_data, tokenizer, device, model)
    

 15%|█▌        | 4/26 [00:28<02:25,  6.60s/it]

  0%|          | 0/130 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  5%|▍         | 6/130 [00:40<13:52,  6.71s/it]


In [33]:
# NOTE(review): in the cells visible here `activation_ds_by_layer` is only ever
# assigned as a local inside `get_steering_vectors`, never at notebook scope, so this
# line depends on leftover kernel state and would raise NameError on a fresh kernel.
# Index 0 takes the first element; presumably that is the first layer's activations
# for the English subset — TODO confirm, and derive it from the value returned by
# `get_steering_vectors` instead.
english = activation_ds_by_layer[0].predictors

In [34]:
# Average the stacked activations over the new leading axis to get one steering vector.
english_steering = torch.mean(torch.stack(english), dim=0)


In [39]:
# Why can we subtract a feature but not amplify a feature?
# Talk about the other steering method: https://arxiv.org/html/2402.01618v1#S3.SS1
#

In [46]:

text = 'This is a short story about '
tokenized = tokenizer(text, return_tensors='pt').to(device)

# The steering hook is only active inside the `with` body; leaving it detaches the hook.
with HookManager2(model) as hook_manager:
    hook_manager.attach_residual_stream_steering_vector(
        layer=6,
        steering_vector=english_steering,
        plus=False,
        pre_mlp=False,
        pythia=False,
    )

    # Draw ten sampled continuations of the same prompt while steering is applied.
    output_nb_steering = [
        model.generate(
            tokenized.input_ids,
            max_length=70,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
        for _ in range(10)
    ]

# Print each continuation on one line, followed by a blank separator line.
for output in output_nb_steering:
    print(tokenizer.decode(output[0]).replace('\n', ' '), end='\n\n')

This is a short story about 1898, when the young man who was to become a successful entrepreneur was at the height of his powers. In the novel, the protagonist is a young man who is trying to make his fortune. In this novel, the main character is a young man who is trying to make his fortune.

This is a short story about 2 people who live in a small town and one day they get a letter from a school teacher saying that he has been the most evil person ever. He had been a bad person before but now he has become a great person. He is the most evil person in the world and he is so sad. He is

This is a short story about 3 teenagers. The story is about a boy and his dog.    The boy's dog is a golden retriever.    The boy is going to school.    The boy's dog is going to school.    The boy is going to

This is a short story about 4 brothers, 1 sister and 1 friend.  I just finished this book and I loved it.  It's a bit dark and a little sad but it's very well written and it's a good read.  It's