In [1]:
from utils.probe_confidence_intervals import model_setup, get_activations
from utils.preprocessing import load_txt_data
from pathlib import Path
from classes.datahandling import TextClassificationDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from collections import defaultdict
from classes.hook_manager import HookManager
import torch

In [2]:
# Load the model, its tokenizer and the device through the project helper.
print("Load model")
model_name = "AI-Sweden-Models/gpt-sw3-356m"
model, tokenizer, device = model_setup(model_name)

# The raw data folder holds one plain-text file per language code.
raw_data_folder = Path('data/antibiotic/')
print("Load data")
ds = load_txt_data(
    file_paths={
        code: raw_data_folder / f'{code}.txt'
        for code in ('da', 'en', 'sv', 'nb', 'is')
    },
    file_extension='txt',
)

Load model
Load data


In [3]:
# Model dimensions handed to the activation-extraction helper later on.
meta_data = {"hidden_layers": model.config.num_hidden_layers}

# Some configs expose the hidden width as `n_embd`, others as `hidden_size`;
# prefer `n_embd` when the attribute exists and fall back otherwise.
if hasattr(model.config, "n_embd"):
    meta_data["hidden_size"] = model.config.n_embd
else:
    meta_data["hidden_size"] = model.config.hidden_size
    



In [4]:


def compute_all_steering_vectors(
    ds: TextClassificationDataset,
    languages: list,
    meta_data: dict,
    tokenizer: AutoTokenizer,
    device: str,
    model: AutoModelForCausalLM,
) -> dict:
    """Compute a mean-activation ("steering") vector per layer for each language.

    For every language code in ``languages`` the dataset is filtered to that
    language, wrapped in a shuffled ``DataLoader`` (batch size 32) and passed
    to ``get_activations``. For each value of the returned mapping, the
    entries of its ``predictors`` attribute are stacked and averaged over
    dim 0.

    Args:
        ds: Dataset exposing ``filter_by_language``.
        languages: Language codes to process.
        meta_data: Model metadata forwarded unchanged to ``get_activations``.
        tokenizer: Tokenizer forwarded unchanged to ``get_activations``.
        device: Device forwarded unchanged to ``get_activations``.
        model: Model forwarded unchanged to ``get_activations``.

    Returns:
        Dict mapping each language code to a list of averaged tensors, in the
        iteration order of the mapping returned by ``get_activations``
        (presumably one entry per layer -- confirm against that helper).
        E.g. ``result['en'][2]`` is the English vector at position 2.
    """
    steering_by_language = {}
    for language in languages:
        language_subset = ds.filter_by_language(language)
        batches = DataLoader(language_subset, batch_size=32, shuffle=True)
        layer_activations = get_activations(meta_data, batches, tokenizer, device, model)

        # Average the collected activations of every layer into one vector.
        per_layer_means = []
        for layer_ds in layer_activations.values():
            stacked = torch.stack(layer_ds.predictors)
            per_layer_means.append(stacked.mean(dim=0))
        steering_by_language[language] = per_layer_means
    return steering_by_language

# Language codes to compute per-layer steering vectors for.
languages = ["da", "en", "sv", "nb", "is"]
all_steering_vectos = compute_all_steering_vectors(
    ds=ds,
    languages=languages,
    meta_data=meta_data,
    tokenizer=tokenizer,
    device=device,
    model=model,
)



  0%|          | 0/26 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
 23%|██▎       | 6/26 [00:35<01:57,  5.86s/it]
 20%|██        | 6/30 [00:26<01:46,  4.42s/it]
 20%|██        | 6/30 [00:36<02:27,  6.16s/it]
 32%|███▏      | 6/19 [00:33<01:12,  5.59s/it]
 23%|██▎       | 6/26 [00:54<03:02,  9.14s/it]


In [5]:
def get_steering_vectors(all_steering_vectos: dict, target_language: str, complement_languages: list) -> tuple:
    """Split per-language steering vectors into target and averaged complement.

    Args:
        all_steering_vectos: Maps a language code to a list of tensors, one
            per layer (list index = layer index).
        target_language: Language code whose vectors are returned as the
            target. Previously this argument was ignored and ``"da"`` was
            always used.
        complement_languages: Language codes whose vectors are averaged,
            layer by layer, into the complement.

    Returns:
        ``(target_steering_vectors, complement_steering_vectors)``. Both are
        dicts keyed by layer index so it is explicit that each entry belongs
        to one layer. The complement dict is empty when
        ``complement_languages`` is empty.

    Raises:
        KeyError: If ``target_language`` or any complement language is not a
            key of ``all_steering_vectos``.
    """
    # Group the complement languages' vectors by layer index.
    vectors_by_layer = defaultdict(list)
    for lang in complement_languages:
        for layer, vector in enumerate(all_steering_vectos[lang]):
            vectors_by_layer[layer].append(vector)

    # Average across languages within each layer.
    complement_steering_vectors = {
        layer: torch.stack(vectors).mean(dim=0)
        for layer, vectors in vectors_by_layer.items()
    }
    # Use the requested target language rather than a hard-coded one.
    target_steering_vectors = dict(enumerate(all_steering_vectos[target_language]))
    return target_steering_vectors, complement_steering_vectors
    
# "da" is the target language; the other four form the complement set.
target_steering_vectors, complement_steering_vectors = get_steering_vectors(
    all_steering_vectos,
    target_language="da",
    complement_languages=["en", "sv", "nb", "is"],
)

In [7]:
# Per-layer steering direction: target vector minus complement vector.
# Layers are numbered by position while walking both dicts in parallel.
combined_vector_dict = {
    layer: target_vec - complement_vec
    for layer, (target_vec, complement_vec) in enumerate(
        zip(target_steering_vectors.values(), complement_steering_vectors.values())
    )
}

In [8]:
# Open question: why can we subtract a feature but not amplify a feature?
# TODO: discuss other steering methods, see https://arxiv.org/html/2402.01618v1#S3.SS1
#

In [11]:

# Prompt that every sampled continuation starts from.
text = 'This is a short story about '
tokenized = tokenizer(text, return_tensors='pt').to(device)

with HookManager(model) as hook_manager:
    # Steer at a single layer using that layer's combined vector.
    layer = 14
    hook_manager.attach_residual_stream_activation_based_steering_vector(
        layer,
        combined_vector_dict[layer],
        plus=True,   # presumably adds (rather than subtracts) the vector -- confirm in HookManager
        scalar=5,    # presumably the scaling factor applied to the vector -- confirm in HookManager
        pre_mlp=False,
        pythia=False,
    )

    # Draw 10 independent samples while the hook manager context is active.
    generate_kwargs = dict(max_length=70, temperature=0.7, top_p=0.9, do_sample=True)
    output_nb_steering = [
        model.generate(tokenized.input_ids, **generate_kwargs)
        for _ in range(10)
    ]

# Print each sample on one line (newlines flattened), followed by a blank line.
for output in output_nb_steering:
    decoded = tokenizer.decode(output[0])
    print(decoded.replace('\n', ' '), end='\n\n')

 This is a short story about 2 teenage piger. Denise: Denise:<|endoftext|>

 This is a short story about 2 friends, 2 dage efter en skudepisode, hvor en af dem blev skudt i hovedet, og det er også en kort historie om en mand, som har slået en anden mand ihjel.  Jeg håber, I kan lide den!  Mvh.  Thea<|endoftext|>

This is a short story about 25 år, der bor i et stort hus sammen med deres mor, far og tre søskende. Det er en hverdag, hvor man skal være sammen med sine forældre, og det er svært. Det er en hverdag, hvor man skal passe på, hvad man siger og gør. Det er en hverdag, hvor

This is a short story about 20-årige Frederik, der har en særlig plads i hjertet hos sine forældre. En dag bliver Frederik kidnappet af en flok teenagere, der er ude på at kidnappe ham. Frederik vil ikke være alene, og han vil helst være sammen med sin bedste ven, Sebastian. Sebastian er nemlig forelsket i en

This is a short story about 12-year-old Dalton, who has to live in a house with his mother.  The sto