In [1]:
import torch
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from transformers import AutoTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM, BitsAndBytesConfig
from dictionary_learning import AutoEncoder
from nnsight import LanguageModel
from dictionary_learning.utils import read_csv
from IPython.display import display, HTML

In [2]:
import scipy.sparse
from scipy.sparse import csr_matrix
from dictionary_learning.sparse_feature_writer import SparseFeatureWriter

import gc
import h5py
from collections import defaultdict

from tqdm import tqdm

In [3]:
# Start from a clean slate before the heavy cells run:
# first release unused cached GPU memory, then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
torch.cuda.empty_cache()
gc.collect()

100

In [4]:
# Single device string shared by the autoencoder and the language model, so both are
# guaranteed to live on the same GPU.
device = "cuda:0"  # GPU

# Load the pretrained sparse autoencoder; .to() is required to move it onto the GPU.
ae = AutoEncoder.from_pretrained(
    "/gpfs/helios/home/jpauklin/dictionary_learning/saes/estMedSaeX16layer5/trainer_0/ae_1105.pt"
).to(device)

# Load the fine-tuned GPT-2 model through nnsight so its internals can be traced.
model_name = "/gpfs/space/projects/stacc_health/gpt2_model/estMed-gpt2_fine_tuned4/estMed-gpt2_fine_tuned4"
model = LanguageModel(
    model_name,
    device_map=device,
)

# Width of the first block's LayerNorm (ln_1). The original note gives this as 768 and
# uses it as the output dimension of the MLP.
activation_dim = model.transformer.h[0].ln_1.normalized_shape[0]

  state_dict = t.load(path)


In [5]:
# read_csv(csv_path, nr_of_text_batches_to_read) yields the texts lazily.
data = read_csv(
    "/gpfs/space/projects/stacc_health/data-synthetic/100k_synthetic_texts.csv",
    100_000,
)
# Materialise the iterator and keep only the entries from index 50_000 onwards.
text = [*data][50_000:]

In [6]:
# Build the fast tokenizer straight from the tokenizer.json shipped with the model.
tokenizer_filepath = "/gpfs/space/projects/stacc_health/gpt2_model/estMed-gpt2_fine_tuned4/estMed-gpt2_fine_tuned4/tokenizer.json"
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_filepath)
tokenizer.pad_token = "<pad>"

# Pad (and truncate to at most 128 tokens) so every text becomes a row of the same length
# in one batched PyTorch tensor.
tokenize_options = dict(
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)
tokens = tokenizer(text, **tokenize_options)


In [7]:
# Disabled snippet: the code below is wrapped in a bare triple-quoted string, so it is
# never executed; the cell only echoes the string itself as its output.
# Caveats if it is ever re-enabled:
#   * it divides by nr_of_batches (100000) even when tokens['input_ids'] holds fewer
#     rows, because the slice [:nr_of_batches] silently yields at most the rows available;
#   * padding is detected via the hard-coded token id 50257 (presumably the id of
#     "<pad>" for this tokenizer — confirm before relying on it).
"""
# Find the average length of input in tokens
summa = 0
nr_of_batches = 100000
for batch in tokens['input_ids'][:nr_of_batches]:
    paddings = 0
    for token in batch:
        if token == 50257:
            paddings += 1
    summa += len(batch) - paddings 

print(summa/nr_of_batches)
"""

"\n# Find the average length of input in tokens\nsumma = 0\nnr_of_batches = 100000\nfor batch in tokens['input_ids'][:nr_of_batches]:\n    paddings = 0\n    for token in batch:\n        if token == 50257:\n            paddings += 1\n    summa += len(batch) - paddings \n\nprint(summa/nr_of_batches)\n"

In [8]:
# Report how many texts were tokenized and how long each padded row is.
n_rows, n_cols = tokens['input_ids'].shape
print(n_rows)
print(f"Input ids size {n_cols}")

50000
Input ids size 128


In [9]:
# Destination HDF5 file for the extracted sparse features.
h5file_path = "features/featuresX16_L5_50000.h5"
# Every row of input_ids is padded to the same length, so the second dimension of the
# tensor is the number of tokens per input.
batch_length = tokens['input_ids'].shape[1]
writer = SparseFeatureWriter(h5file_path, batch_length)

In [10]:
#writer.remove_from_index(10_000)

In [11]:
# tqdm adds a progress bar to the process
for input_ids, attention_mask in tqdm(zip(tokens['input_ids'], tokens.get('attention_mask')), total=len(tokens['input_ids']), desc="Extracting features"):    
    # Using nnsight, we hook into one of the MLP's layer and inspect them during a forward pass.
    # This gets us the layer's activations (output of c_proj)
    with model.trace(input_ids, attention_mask) as tracer:
    
        # Selecting a specific layer to capture
        mlp_output = model.transformer.h[5].mlp.c_proj.output.save()
        # The model is actually run upon exiting the tracing context. (https://nnsight.net/notebooks/tutorials/walkthrough/)
    
    activations = mlp_output.value
    features = ae.encode(activations)[0] # for each token a list of feature activations
    
    csr_batch = csr_matrix(features.detach().cpu().numpy()) # Compressed Sparse Row matrix from features (tensor -> numpy array)
    writer.append(csr_batch, input_ids)

    # Free up space
    torch.cuda.empty_cache() 
    gc.collect()


Extracting features:   0%|          | 0/50000 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Extracting features: 100%|██████████| 50000/50000 [3:22:34<00:00,  4.11it/s]  
