In [1]:
import os

try:
    has_changed_dir
except:
    has_changed_dir = False

try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
except:
    IN_COLAB = False

if IN_COLAB:
    %pip install datasets
    %pip install translate-toolkit
    #%pip install bitsandbytes

    !git clone https://github.com/MartinKirkegaardDK/KDS_MI.git

    if not has_changed_dir:
        os.chdir('KDS_MI')
        has_changed_dir = True
else:
    if not has_changed_dir:
        os.chdir('.')
        has_changed_dir = True

In [8]:
%load_ext autoreload
%autoreload 2

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from torch import nn
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm


from classification_probes import TextClassificationDataset, ProbeTrainer, HookManager, ClassificationProbe

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
model_name = "roneneldan/TinyStories-1M"
model_name = "AI-Sweden-Models/gpt-sw3-356m"
model_name = "EleutherAI/pythia-14m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

try:
    hidden_size = model.config.n_embd
except AttributeError:
    hidden_size = model.config.hidden_size


hidden_layers = model.config.num_hidden_layers

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


In [4]:
lab_map = {
    'da': 0,
    'en': 1,
    'is': 2,
    'nb': 3,
    'sv': 4
}

data_loc = 'data/antibiotic/'
ds = TextClassificationDataset.from_txt(data_loc + 'da.txt', lab_map['da'])
ds.add_from_txt(data_loc + 'en.txt', lab_map['en'])
ds.add_from_txt(data_loc + 'is.txt', lab_map['is'])
ds.add_from_txt(data_loc + 'nb.txt', lab_map['nb'])
ds.add_from_txt(data_loc + 'sv.txt', lab_map['sv'])

loader = DataLoader(ds, batch_size=32, shuffle=True)

In [5]:
# making a dataset that can hold the activations and labels.

class ActivationDataset(Dataset):

    def __init__(self):
        self.acts = []
        self.labels = []

    def add_with_mask(self, acts, labels, masks):
        for act, label, mask in zip(acts, labels, masks):
            if mask:
                self.acts.append(act)
                self.labels.append(label)

    def __getitem__(self, index) -> tuple:
        return (self.acts[index], self.labels[index])

    def __len__(self) -> int:
        return len(self.acts)


In [None]:
res_stream_act_by_layer = dict()
activation_ds_by_layer = {
    layer: ActivationDataset()
    for layer in range(hidden_layers)
}

for text, label in tdqm(loader):

    tokenized = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with HookManager(model) as hook_manager:
        for layer in range(hidden_layers):
            res_stream_act_by_layer[layer] = hook_manager.attach_residstream_hook(
                layer=layer,
                pre_mlp=False,
                pythia=True
            )

        model(**tokenized)

    # flattening [batch, pad_size, ...] to [tokens, ...]
    attn_mask = tokenized.attention_mask.flatten() # [tokens]
    label = label.unsqueeze(-1).expand(-1, tokenized.attention_mask.shape[1]).flatten() # [tokens]

    for layer in range(hidden_layers):
        res_stream_act_by_layer[layer] = res_stream_act_by_layer[layer][0].view(-1, hidden_size) # [tokens, hidden_size]
        activation_ds_by_layer[layer].add_with_mask(res_stream_act_by_layer[layer], label, attn_mask)



KeyboardInterrupt: 

In [None]:
# to do!!! make this work with multiple layers

In [9]:
act_loader_by_layer = {
    layer: DataLoader(activation_ds_by_layer[layer], batch_size=32, shuffle=True)
    for layer in range(hidden_layers)
}

In [10]:
input_size = hidden_size
learning_rate = 0.001
reg_lambda = 0.1
num_labs = len(lab_map)


probe_by_layer = {
    layer: ClassificationProbe(in_dim=input_size, num_labs=num_labs, device=device)
    for layer in range(hidden_layers)
}

for layer, probe in probe_by_layer.items():
    act_loader = act_loader_by_layer[layer]
    optimizer = torch.optim.Adam(probe.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(1):
        for act, label in act_loader:

            label = label.to(device)

            outputs = probe(act)
            loss = loss_fn(outputs, label.to(device))
            loss += reg_lambda * sum(torch.norm(param, 2) for param in probe.parameters())

            accuracy = ((torch.argmax(outputs.detach(), dim=1) == label.to(device)).sum() / 32).item()
            print('acc: ', accuracy, end='\t\t')

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            print(loss)

acc:  0.1875		tensor(1.7706, grad_fn=<AddBackward0>)
acc:  0.21875		tensor(1.7553, grad_fn=<AddBackward0>)
acc:  0.25		tensor(1.7519, grad_fn=<AddBackward0>)
acc:  0.28125		tensor(1.7253, grad_fn=<AddBackward0>)
acc:  0.21875		tensor(1.7603, grad_fn=<AddBackward0>)
acc:  0.1875		tensor(1.7784, grad_fn=<AddBackward0>)
acc:  0.28125		tensor(1.7531, grad_fn=<AddBackward0>)
acc:  0.28125		tensor(1.7225, grad_fn=<AddBackward0>)
acc:  0.1875		tensor(1.7252, grad_fn=<AddBackward0>)
acc:  0.15625		tensor(1.7796, grad_fn=<AddBackward0>)
acc:  0.0625		tensor(1.7915, grad_fn=<AddBackward0>)
acc:  0.1875		tensor(1.7190, grad_fn=<AddBackward0>)
acc:  0.09375		tensor(1.7900, grad_fn=<AddBackward0>)
acc:  0.09375		tensor(1.7648, grad_fn=<AddBackward0>)
acc:  0.25		tensor(1.6987, grad_fn=<AddBackward0>)
acc:  0.28125		tensor(1.7037, grad_fn=<AddBackward0>)
acc:  0.15625		tensor(1.7612, grad_fn=<AddBackward0>)
acc:  0.34375		tensor(1.6882, grad_fn=<AddBackward0>)
acc:  0.1875		tensor(1.7477, grad_fn=<A

In [None]:
from torch.utils.data import random_split

from sklearn.metrics import classification_report
from collections import defaultdict

In [67]:
results_by_label = defaultdict(list)

for layer in range(hidden_layers):
    test, _ = random_split(activation_ds_by_layer[layer], (0.01, 0.99))

    test_x, test_y = zip(*test)
    test_x = torch.stack(test_x)
    test_y = torch.stack(test_y)

    probs = probe_by_layer[layer](test_x)
    preds = torch.argmax(probs, dim=1)

    accuracy = torch.sum(preds == test_y) / len(preds)

    class_report = classification_report(test_y, preds, output_dict=True, zero_division=1)

    for label in lab_map.values():
        results_by_label[label].append(class_report[str(label)]['f1-score'])

In [13]:
steer_towards = 'da'

steering_vector = probe.linear._parameters['weight'][lab_map[steer_towards]]

neg_steering_vectors = [
    probe.linear._parameters['weight'][lab_map[lang]]
    for lang in lab_map
        if lang != steer_towards
]

text = 'There once was a cat named George, who'
tokenized = tokenizer(text, return_tensors='pt').to(device)

with HookManager(model) as hook_manager:
    # hook_manager.attach_resid_stream_steer_hook(
    #     15,
    #     probe.linear._parameters['weight'][lab_map['is']],
    #     20,
    #     pre_mlp=False,
    #     pythia=False
    # )

    # for neg_steering_vector in neg_steering_vectors:
    #     hook_manager.attach_resid_stream_steer_hook(
    #         15,
    #         neg_steering_vector,
    #         -10,
    #         pre_mlp=False,
    #         pythia=False
    #     )

    hook_manager.attach_resid_stream_steer_hook(
        15,
        probe.linear._parameters['weight'][lab_map['da']],
        -10,
        pre_mlp=False,
        pythia=False
    )

    hook_manager.attach_resid_stream_steer_hook(
        15,
        probe.linear._parameters['weight'][lab_map['sv']],
        10,
        pre_mlp=False,
        pythia=False
    )

    hook_manager.attach_resid_stream_steer_hook(
        15,
        probe.linear._parameters['weight'][lab_map['en']],
        -10,
        pre_mlp=False,
        pythia=False
    )

    output_da_steering = [
        model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=True)
        for _ in range(10)
    ]

for output in output_da_steering:
    print(tokenizer.decode(output[0]).replace('\n', ' '))
    print()

There once was a cat named George, who variously variously kaldte sin familie for "Nisse", "Mister", "Felix", "Duck", "Prince", "Bug", "Ghost" eller "Fairy" og som i dag er et vanlig syn i Danmark.  Både den 30-åringen and the 24-åringen hadde tidligere vært med på aussie-races i London, og hadde en lignende start på

There once was a cat named George, who variously blev tildret, men altid var glad og letten. Han var en sort and, og det var han ofte. Han havde altid været en smuk and, og det var han altid. Han var en rigtig god and. Og han var altid lige glad og letten. Han var altid glad og letten. Og det var George også. Og han var altid lige glad og letten. Og det var George også. Og han var

There once was a cat named George, who variously blevabed og blevabede og blevabede. Han var en god og en dårlig kat, og han blev altid varmet op. Da George was still var den bedste og the one og the one og the one. Det var en gang, da the one var på en mission med the one, og the one var i the

In [None]:
steering_vector = probe.linear._parameters['weight'][3]

text = 'Dette er'
tokenized = tokenizer(text, return_tensors='pt').to(device)

with HookManager(model) as hook_manager:
    hook_manager.attach_resid_stream_steer_hook(
        17,
        steering_vector,
        50,
        pre_mlp=False,
        pythia=False
    )

    output_nb_steering = [
        model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=True)
        for _ in range(10)
    ]

for output in output_nb_steering:
    print(tokenizer.decode(output[0]).replace('\n', ' '))
    print()

In [None]:
output = model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=False)

tokenizer.decode(output[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Arranged in six parts, the film depicts a strike in 1903 by the workers of a factory in pre-revolutionary Russia, and their subsequent suppression. It is best known for a sequence towards the climax, in which the violent and violent of the Soviet Union was the first to be seen.\n\nThe film is a film that is a film that is a film that is a film that is a film that is a film that is a film that is a film that is a film that'

In [None]:
output

tensor([[ 7047,  3767, 30912,    75,   413,  2827,   546,  1448,  6706,   546,
          2098, 14293,    85,  1151,   466,   891,   726,    87, 33578,  1129,
          8225,  5025,   346,    49,  2768,  1670,    75,   771,   354,  1059,
         33325,  1507,  1073,  5751,  4953,     3,  9040,   945,  1162,   269,
          5507,   620,   265,  1775,   274,  1257,    75,   615,   278,  4415,
            78,    15,   187,   187,    34,    27,   187,   187,   510,   806,
          3213,   310,   281,  1918,   368,   247,  2372,   273,   247, 12662,
           285,   923,   752,   253,  3662,    27,   187,   187,    14, 50275,
           187,   187,    14, 50275,   187,   187,    14, 50275,   187,   187,
            14, 50275,   187,   187,    14, 50275,   187,   187,    14, 50275]])

'Filmen Strejke er en del af en planlagt serie i syv dele med titlen "På vej mod proletariatets diktatur" og var et fælles samarbejde mellem.\n\nA:\n\nThe first step is to give you a bit of a hint and see what the answer:\n\n-   \n\n-   \n\n-   \n\n-   \n\n-   \n\n-   '

In [None]:
projection_magnitudes = (act.unsqueeze(0) @ steering_vector).unsqueeze(-1)

In [None]:
steering_vector_ = steering_vector.view(1, 1, -1)

In [None]:
projections

tensor([[[ 1.0383],
         [ 3.2407],
         [ 4.7199],
         [-0.4780],
         [ 4.3807],
         [-3.7402],
         [ 1.0187],
         [ 8.1848],
         [-1.3870],
         [ 1.7934],
         [ 0.7859],
         [-3.5573],
         [-0.1791],
         [ 2.3834]]], grad_fn=<UnsqueezeBackward0>)

In [None]:
projections = (projection_magnitudes * steering_vector_)

tensor([4, 0, 3, 1, 4, 1, 2, 2, 4, 0, 2, 2, 1, 4, 4, 1, 4, 0, 4, 4, 1, 4, 0, 1,
        1, 4, 2, 2, 0, 3, 2, 1])

In [None]:
label.unsqueeze(-1).expand(-1, tokenized.attention_mask.shape[1]).flatten()

tensor([2, 2, 2,  ..., 4, 4, 4])

In [None]:
tokenized.attention_mask.flatten().shape

torch.Size([4928])

In [None]:
res_stream_act[0].view(-1, hidden_size)[0]

tensor([-0.5304,  0.9247, -0.0046, -2.0886,  0.9416, -0.6766,  1.6014,  0.4292,
        -0.3467,  0.6638, -0.3799, -3.7725,  0.0168, -0.6235,  1.7516,  1.1082,
         0.2715,  0.7788,  0.1381,  0.3071,  1.7841,  0.8833, -0.9052,  0.7601,
        -1.4813,  0.8397, -2.0588, -0.9006, -0.7124,  0.6576, -1.1364,  1.0030,
         0.6364,  1.6032,  0.4388, -1.4383,  0.8074,  3.0726,  0.0916, -0.9084,
         0.0775,  1.0104, -0.6060, -0.0836, -2.3467, -4.3569,  2.1628,  1.2648,
        -0.7746, -0.0214,  1.1413, -0.5092, -2.0130,  0.4083, -2.4431, -1.1179,
         0.5212,  0.7194, -0.6159, -0.8679,  0.7901,  1.3831,  0.0591,  0.8055])

In [None]:
res_stream_act[0].shape

torch.Size([32, 154, 64])

In [None]:

trainer = ProbeTrainer(hidden_size, 5, 0.001, 0.1, 'cpu')

for text_batch, labels in loader:
    print(text)
    with HookManager(model) as hook_manager:
        res_stream_act = hook_manager.attach_residstream_hook(
            layer=4,
            pre_mlp=False
        )

        tokenized = [
            tokenizer(text, return_tensors='pt')
            for text in text_batch
        ]
        for text in tokenized:
            model.forward(**text)

    loss = trainer.train_step(torch.Tensor(res_stream_act), torch.Tensor(labels))
    print(loss)

In [None]:
model.config.n_embed

AttributeError: 'GPTNeoConfig' object has no attribute 'n_embed'

In [None]:
model.get_submodule('transformer.h.')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(64000, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): GELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=64000, bias=False)
)

In [None]:
input_ = "dette er en"

tokenized = tokenizer(input_, return_tensors='pt')

In [None]:
tokenized

{'input_ids': tensor([[1122,  358,  315]]), 'attention_mask': tensor([[1, 1, 1]])}

In [None]:
output_ = model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=True)