In [28]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Initialize Variables
model_name = "AI-Sweden-Models/gpt-sw3-126m"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
prompt = "Träd är fina för att"

# Initialize Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(64000, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): GELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=64000, bias=False)
)

In [29]:
model.transformer.h[0].mlp.c_fc

Conv1D(nf=3072, nx=768)

In [30]:
prompt = "Stålproduktionen i USA er stærkt politiseret. National sikkerhed nævnes ofte som en grund til at beskytte industrien mod udenlandsk konkurrence. USA,"
generator = pipeline('text-generation', tokenizer=tokenizer, model=model, device=device)
generated = generator(prompt, max_new_tokens=250, do_sample=True, temperature=0.6, top_p=1)[0]["generated_text"]


In [31]:
generated

'Stålproduktionen i USA er stærkt politiseret. National sikkerhed nævnes ofte som en grund til at beskytte industrien mod udenlandsk konkurrence. USA, der har været en af de mest magtfulde industrilande i verden, er med til at beskytte industrien i USA.\nUSA har været et af de mest magtfulde industrilande i verden i de næste 40 år. De har i de senere år udviklet en række produkter, som blandt andet er blevet udstillet på verdens største udstilling i Paris. I 2016 blev USA kåret til "Vigtigst for USA" af "Videnskab.dk" og blev tildelt den prestigefyldte amerikanske "International Business Journal" - verdens bedste business-journalist.\nUSA har en tradition for at være en foregangsland inden for teknologisk innovation, når det gælder forskning og udvikling. Det er en af årsagerne til, at USA har været blandt de mest innovative og dynamiske industrilande i verden. USA har i dag et af verdens største forskningsmiljøer og en ledende position inden for teknologisk innovation.\nUSA er et af d

In [32]:
from classification_probes import HookManager, TextClassificationDataset, ProbeTrainer


import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from torch import nn
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset


from classification_probes import TextClassificationDataset, ProbeTrainer, HookManager, ClassificationProbe

try:
    hidden_size = model.config.n_embd
except AttributeError:
    hidden_size = model.config.hidden_size

In [33]:
model.config

GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "AI-Sweden-Models/gpt-sw3-126m",
  "activation_function": "gelu",
  "apply_query_key_layer_scaling": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 2,
  "embd_pdrop": 0.1,
  "eos_token_id": 3,
  "initializer_range": 0.023,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 12,
  "n_positions": 2048,
  "normalize_attention_scores": true,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tokenizer_class": "GPTSw3Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "use_cache": false,
  "vocab_size": 64000
}

In [34]:
lab_map = {
    'da': 0,
    'en': 1,
    'is': 2,
    'nb': 3,
    'sv': 4
}

data_loc = 'data/antibiotic/'
ds = TextClassificationDataset.from_txt(data_loc + 'da.txt', lab_map['da'])
ds.add_from_txt(data_loc + 'en.txt', lab_map['en'])
#ds.add_from_txt(data_loc + 'is.txt', lab_map['is'])
#ds.add_from_txt(data_loc + 'nb.txt', lab_map['nb'])
#ds.add_from_txt(data_loc + 'sv.txt', lab_map['sv'])

loader = DataLoader(ds, batch_size=32, shuffle=True)



In [35]:
activation_ds = ActivationDataset()

for text, label in loader:

    tokenized = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with HookManager(model) as hook_manager:
        res_stream_act = hook_manager.attach_residstream_hook(
            layer=2,
            pre_mlp=False,
            pythia=False
        )

        model(**tokenized)

    # flattening [batch, pad_size, ...] to [tokens, ...]
    attn_mask = tokenized.attention_mask.flatten() # [tokens]
    label = label.unsqueeze(-1).expand(-1, tokenized.attention_mask.shape[1]).flatten() # [tokens]
    res_stream_act = res_stream_act[0].view(-1, hidden_size) # [tokens, hidden_size]

    activation_ds.add_with_mask(res_stream_act, label, attn_mask)



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [36]:
len(activation_ds.acts)

45316

In [37]:
activation_ds.acts[0]

tensor([ 3.8405e-02, -6.8613e-03, -3.9146e-02,  5.6455e-03, -1.9041e-02,
         3.8255e-02,  8.9936e-03,  3.7906e-02,  1.4006e-02,  4.4132e-03,
         1.6004e-02,  1.5326e-02,  5.3193e-02,  4.2222e-02,  1.2184e-02,
         1.3802e-02,  5.6083e-03,  4.1842e-03,  1.8618e-02,  5.6741e-02,
        -3.6182e-02,  2.4139e-02, -9.3439e-03,  3.1805e-02, -2.8023e-02,
        -3.1447e-02,  3.4404e-02,  1.4710e-02,  1.1558e-02, -9.1331e-02,
        -4.7984e-03,  1.2365e-03,  6.7118e-02,  3.3524e-03,  1.1171e-01,
         7.5385e-02, -1.5727e-02,  3.8353e-02, -1.1505e-02,  2.2298e-02,
        -3.5171e-03,  6.8540e-02, -1.1583e-02,  3.7033e-02, -3.8669e-03,
        -3.0202e-02,  6.5236e-02, -1.6729e-02,  2.2713e-02,  2.2648e-02,
        -3.9186e-02,  2.7850e-02,  5.6906e-02,  5.4721e-02,  4.0120e-02,
        -6.5567e-03,  4.4506e-02,  7.4212e-02,  1.5662e-03,  2.1036e-02,
        -2.3505e-02,  1.4239e-02,  1.9211e-02,  1.0068e-01,  7.6835e-02,
         5.1052e-02,  5.2480e-02,  1.6218e-03,  4.2

In [38]:
activation_ds.labels[-1]

tensor(1)

In [39]:
#why can we subtract a feature but not aplify a feature
#talk about other steering method https://arxiv.org/html/2402.01618v1#S3.SS1
#

In [46]:
import numpy as np
danish = ([tensor for tensor, label in activation_ds if label == activation_ds.labels[lab_map["da"]]])
english = ([tensor for tensor, label in activation_ds if label == activation_ds.labels[lab_map["en"]]])

In [47]:
danish_steering = torch.stack(danish).mean(dim=0)
english_steering = torch.stack(english).mean(dim=0)


In [51]:

text = 'This is a short story about '
tokenized = tokenizer(text, return_tensors='pt').to(device)

with HookManager(model) as hook_manager:
    # hook_manager.attach_resid_stream_steer_hook(
    #     6,
    #     danish_steering,
    #     2,
    #     pre_mlp=False,
    #     pythia=False
    # )
    hook_manager.attach_resid_stream_steer_hook(
        6,
        english_steering,
        -2,
        pre_mlp=False,
        pythia=False
    )


    output_nb_steering = [
        model.generate(tokenized.input_ids, max_length=70, temperature=0.7, top_p=0.9, do_sample=True)
        for _ in range(10)
    ]

for output in output_nb_steering:
    print(tokenizer.decode(output[0]).replace('\n', ' '))
    print()

This is a short story about 500000000000000000000000000000000000000000000000000000000000000

This is a short story about 6 stories, that's 14 stories. 4   5   6                                          

This is a short story about 5 min, so 5 min er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er er

This is a short story about 100000000000000000000000000000000000000000000000000000000000000

This is a short story about 200000000000000000000000000000000000000000000000000000000000000

This is a short story about 100000000000000000000000000000000000000000000000000000000000000

This is a short story about 30 stories and 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories, 40 stories

This is a short story about 100000000000000000000000000000000000000000000000000000000000000

This is a short story ab