# In [1]:

from torch.utils.data import DataLoader
from pathlib import Path
from matplotlib import pyplot as plt
from math import sqrt

from utils.probe_confidence_intervals import model_setup, get_activations
from utils.plotting import plot_activations_PCA
from utils.preprocessing import load_txt_data

# In [None]:
# Run configuration.
# NOTE(review): saved_path is not read in the visible part of the notebook;
# presumably later cells write PCA results there -- confirm.
saved_path = "results/PCA_activations/"
model_name = "AI-Sweden-Models/gpt-sw3-356m"

# Load the model named above together with its tokenizer and the device
# handle that model_setup returns alongside it.
model, tokenizer, device = model_setup(model_name)

# Keep the layer count from the model config at hand for later use.
hidden_layers = model.config.num_hidden_layers


# Load the text data: one .txt file per language code under raw_data_folder.
# NOTE(review): the file mapping keeps its original key order
# (da, en, sv, nb, is), which differs from `languages`; confirm whether
# load_txt_data depends on key order before unifying the two.
languages = ['en', 'da', 'sv', 'nb', 'is']
raw_data_folder = Path('data/antibiotic/')
print("Load data")
ds = load_txt_data(
    file_paths={
        code: raw_data_folder / f'{code}.txt'
        for code in ('da', 'en', 'sv', 'nb', 'is')
    },
    file_extension='txt',
)

# Serve the dataset in shuffled batches of 32 examples.
loader = DataLoader(ds, batch_size=32, shuffle=True)


# Extract activations from forward passes of the model over the loaded data.
# The layer count and hidden size are forwarded from the model config as
# metadata for get_activations.
act_ds = get_activations(
    meta_data=dict(
        hidden_layers=hidden_layers,
        hidden_size=model.config.hidden_size,
    ),
    loader=loader,
    tokenizer=tokenizer,
    model=model,
    device=device,
)
