In [None]:
import os

try:
    has_changed_dir
except:
    has_changed_dir = False

try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
except:
    IN_COLAB = False

if IN_COLAB:
    %pip install datasets
    %pip install translate-toolkit
    %pip install bitsandbytes

    !git clone https://github.com/MartinKirkegaard/KDS_MI.git

    if not has_changed_dir:
        os.chdir('KDS_MI')
        has_changed_dir = True
else:
    if not has_changed_dir:
        os.chdir('.')
        has_changed_dir = True

In [1]:
%load_ext autoreload
%autoreload 2 

import torch
from torch import nn
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset


from classification_probes import TextClassificationDataset, ProbeTrainer, HookManager, ClassificationProbe

In [2]:
model_name = "roneneldan/TinyStories-1M"
model_name = "EleutherAI/pythia-14m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

try:
    hidden_size = model.config.n_embd
except AttributeError:
    hidden_size = model.config.hidden_size

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


In [3]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 128)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=128, out_features=384, bias=True)
          (dense): Linear(in_features=128, out_features=128, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=128, out_features=512, bias=True)
          (dense_4h_to_h): Linear(in_features=512, out_features=128, bias=True)
          (

In [4]:
lab_map = {
    'da': 0,
    'en': 1,
    'is': 2,
    'nb': 3,
    'sv': 4
}

data_loc = 'data/antibiotic/'
ds = TextClassificationDataset.from_txt(data_loc + 'da.txt', lab_map['da'])
ds.add_from_txt(data_loc + 'en.txt', lab_map['en'])
ds.add_from_txt(data_loc + 'is.txt', lab_map['is'])
ds.add_from_txt(data_loc + 'nb.txt', lab_map['nb'])
ds.add_from_txt(data_loc + 'sv.txt', lab_map['sv'])

loader = DataLoader(ds, batch_size=32, shuffle=True)

In [5]:
# making a dataset that can hold the activations and labels.

class ActivationDataset(Dataset):

    def __init__(self):
        self.acts = []
        self.labels = []

    def add_with_mask(self, acts, labels, masks):
        for act, label, mask in zip(acts, labels, masks):
            if mask:
                self.acts.append(act)
                self.labels.append(label)

    def __getitem__(self, index) -> tuple:
        return (self.acts[index], self.labels[index]) 
    
    def __len__(self) -> int:
        return len(self.acts)
    

In [6]:
activation_ds = ActivationDataset()

for text, label in loader:

    tokenized = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

    with HookManager(model) as hook_manager:
        res_stream_act = hook_manager.attach_residstream_hook(
            layer=4,
            pre_mlp=False,
            pythia=True
        )

        model(**tokenized)
    
    # flattening [batch, pad_size, ...] to [tokens, ...]
    attn_mask = tokenized.attention_mask.flatten() # [tokens]
    label = label.unsqueeze(-1).expand(-1, tokenized.attention_mask.shape[1]).flatten() # [tokens]
    res_stream_act = res_stream_act[0].view(-1, hidden_size) # [tokens, hidden_size]

    activation_ds.add_with_mask(res_stream_act, label, attn_mask)



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [7]:
labs = [int(act) for act in activation_ds.labels]

In [8]:
act_loader = DataLoader(activation_ds, batch_size=32, shuffle=True)

In [9]:
input_size = hidden_size
learning_rate = 0.001
reg_lambda = 0.1
num_labs = len(lab_map)

probe = ClassificationProbe(in_dim=input_size, num_labs=num_labs, device='cpu')
optimizer = torch.optim.Adam(probe.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(1):
    for act, label in act_loader:

        outputs = probe(act)
        loss = loss_fn(outputs, label)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(loss)

tensor(1.6479, grad_fn=<NllLossBackward0>)
tensor(1.6244, grad_fn=<NllLossBackward0>)
tensor(1.8046, grad_fn=<NllLossBackward0>)
tensor(1.8279, grad_fn=<NllLossBackward0>)
tensor(1.6259, grad_fn=<NllLossBackward0>)
tensor(1.5879, grad_fn=<NllLossBackward0>)
tensor(1.6371, grad_fn=<NllLossBackward0>)
tensor(1.7329, grad_fn=<NllLossBackward0>)
tensor(1.6834, grad_fn=<NllLossBackward0>)
tensor(1.6484, grad_fn=<NllLossBackward0>)
tensor(1.7188, grad_fn=<NllLossBackward0>)
tensor(1.6795, grad_fn=<NllLossBackward0>)
tensor(1.7670, grad_fn=<NllLossBackward0>)
tensor(1.6518, grad_fn=<NllLossBackward0>)
tensor(1.6193, grad_fn=<NllLossBackward0>)
tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.6613, grad_fn=<NllLossBackward0>)
tensor(1.6607, grad_fn=<NllLossBackward0>)
tensor(1.6322, grad_fn=<NllLossBackward0>)
tensor(1.6898, grad_fn=<NllLossBackward0>)
tensor(1.7480, grad_fn=<NllLossBackward0>)
tensor(1.6710, grad_fn=<NllLossBackward0>)
tensor(1.7232, grad_fn=<NllLossBackward0>)
tensor(1.67

In [102]:
steering_vector = probe.linear._parameters['weight'][3]

text = 'Arranged in six parts, the film depicts a strike in 1903 by the workers of a factory in pre-revolutionary Russia, and their subsequent suppression. It is best known for a sequence towards the climax, in which the violent'
tokenized = tokenizer(text, return_tensors='pt')

with HookManager(model) as hook_manager:
    hook_manager.attach_resid_stream_steer_hook(
        4,
        steering_vector,
        3,
        pre_mlp=False,
        pythia=True
    )

    output = model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=False)

tokenizer.decode(output[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Arranged in six parts, the film depicts a strike in 1903 by the workers of a factory in pre-revolutionary Russia, and their subsequent suppression. It is best known for a sequence towards the climax, in which the violent and violent events are made.\n\nThe film is a film, a film, a film, a film, a film, a film, a film, a film, a film, a film, a film, a film, a film, a film,'

In [96]:
output = model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=False)

tokenizer.decode(output[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Arranged in six parts, the film depicts a strike in 1903 by the workers of a factory in pre-revolutionary Russia, and their subsequent suppression. It is best known for a sequence towards the climax, in which the violent and violent of the Soviet Union was the first to be seen.\n\nThe film is a film that is a film that is a film that is a film that is a film that is a film that is a film that is a film that is a film that'

In [80]:
output

tensor([[ 7047,  3767, 30912,    75,   413,  2827,   546,  1448,  6706,   546,
          2098, 14293,    85,  1151,   466,   891,   726,    87, 33578,  1129,
          8225,  5025,   346,    49,  2768,  1670,    75,   771,   354,  1059,
         33325,  1507,  1073,  5751,  4953,     3,  9040,   945,  1162,   269,
          5507,   620,   265,  1775,   274,  1257,    75,   615,   278,  4415,
            78,    15,   187,   187,    34,    27,   187,   187,   510,   806,
          3213,   310,   281,  1918,   368,   247,  2372,   273,   247, 12662,
           285,   923,   752,   253,  3662,    27,   187,   187,    14, 50275,
           187,   187,    14, 50275,   187,   187,    14, 50275,   187,   187,
            14, 50275,   187,   187,    14, 50275,   187,   187,    14, 50275]])

'Filmen Strejke er en del af en planlagt serie i syv dele med titlen "På vej mod proletariatets diktatur" og var et fælles samarbejde mellem.\n\nA:\n\nThe first step is to give you a bit of a hint and see what the answer:\n\n-   \n\n-   \n\n-   \n\n-   \n\n-   \n\n-   '

In [63]:
projection_magnitudes = (act.unsqueeze(0) @ steering_vector).unsqueeze(-1)

In [64]:
steering_vector_ = steering_vector.view(1, 1, -1)

In [60]:
projections

tensor([[[ 1.0383],
         [ 3.2407],
         [ 4.7199],
         [-0.4780],
         [ 4.3807],
         [-3.7402],
         [ 1.0187],
         [ 8.1848],
         [-1.3870],
         [ 1.7934],
         [ 0.7859],
         [-3.5573],
         [-0.1791],
         [ 2.3834]]], grad_fn=<UnsqueezeBackward0>)

In [65]:
projections = (projection_magnitudes * steering_vector_)

tensor([4, 0, 3, 1, 4, 1, 2, 2, 4, 0, 2, 2, 1, 4, 4, 1, 4, 0, 4, 4, 1, 4, 0, 1,
        1, 4, 2, 2, 0, 3, 2, 1])

In [78]:
label.unsqueeze(-1).expand(-1, tokenized.attention_mask.shape[1]).flatten()

tensor([2, 2, 2,  ..., 4, 4, 4])

In [69]:
tokenized.attention_mask.flatten().shape

torch.Size([4928])

In [61]:
res_stream_act[0].view(-1, hidden_size)[0]

tensor([-0.5304,  0.9247, -0.0046, -2.0886,  0.9416, -0.6766,  1.6014,  0.4292,
        -0.3467,  0.6638, -0.3799, -3.7725,  0.0168, -0.6235,  1.7516,  1.1082,
         0.2715,  0.7788,  0.1381,  0.3071,  1.7841,  0.8833, -0.9052,  0.7601,
        -1.4813,  0.8397, -2.0588, -0.9006, -0.7124,  0.6576, -1.1364,  1.0030,
         0.6364,  1.6032,  0.4388, -1.4383,  0.8074,  3.0726,  0.0916, -0.9084,
         0.0775,  1.0104, -0.6060, -0.0836, -2.3467, -4.3569,  2.1628,  1.2648,
        -0.7746, -0.0214,  1.1413, -0.5092, -2.0130,  0.4083, -2.4431, -1.1179,
         0.5212,  0.7194, -0.6159, -0.8679,  0.7901,  1.3831,  0.0591,  0.8055])

In [52]:
res_stream_act[0].shape

torch.Size([32, 154, 64])

In [None]:

trainer = ProbeTrainer(hidden_size, 5, 0.001, 0.1, 'cpu')

for text_batch, labels in loader:
    print(text)
    with HookManager(model) as hook_manager:
        res_stream_act = hook_manager.attach_residstream_hook(
            layer=4,
            pre_mlp=False
        )

        tokenized = [
            tokenizer(text, return_tensors='pt')
            for text in text_batch
        ]
        for text in tokenized:
            model.forward(**text)

    loss = trainer.train_step(torch.Tensor(res_stream_act), torch.Tensor(labels))
    print(loss)

In [6]:
model.config.n_embed

AttributeError: 'GPTNeoConfig' object has no attribute 'n_embed'

In [None]:
model.get_submodule('transformer.h.')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(64000, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): GELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=64000, bias=False)
)

In [22]:
input_ = "dette er en"

tokenized = tokenizer(input_, return_tensors='pt')

In [23]:
tokenized

{'input_ids': tensor([[1122,  358,  315]]), 'attention_mask': tensor([[1, 1, 1]])}

In [24]:
output_ = model.generate(tokenized.input_ids, max_length=100, temperature=0.7, top_p=0.9, do_sample=True)