In [9]:
from dictionary_learning.evaluation import evaluate

import pandas as pd
import torch as t
from nnsight import LanguageModel
from dictionary_learning import ActivationBuffer, AutoEncoder
from dictionary_learning.trainers import StandardTrainer
from dictionary_learning.training import trainSAE
from dictionary_learning.utils import read_csv
import gc

In [15]:
# Free memory before any further action.
# Run the Python garbage collector first so that tensors which are no longer
# referenced are actually released; only then can PyTorch hand the cached CUDA
# blocks they occupied back to the driver.
gc.collect()
t.cuda.empty_cache()

0

In [3]:
# Load the three autoencoder checkpoints. Each `.to("cuda")` call is required
# to place the loaded autoencoder on the GPU.
ae_X16L11 = (
    AutoEncoder
    .from_pretrained("/gpfs/helios/home/jpauklin/dictionary_learning/saes/estMedSaeX16/trainer_0/ae_1105.pt")
    .to("cuda")
)
ae_X16L5 = (
    AutoEncoder
    .from_pretrained("/gpfs/helios/home/jpauklin/dictionary_learning/saes/estMedSaeX16layer5/trainer_0/ae_1105.pt")
    .to("cuda")
)
ae_X64_11 = (
    AutoEncoder
    .from_pretrained("/gpfs/helios/home/jpauklin/dictionary_learning/saes/estMedSae170425/trainer_0/ae_0505.pt")
    .to("cuda")
)

  state_dict = t.load(path)


In [4]:
# Location of the language-model checkpoint that is wrapped with nnsight below.
model_name = "/gpfs/space/projects/stacc_health/gpt2_model/estMed-gpt2_fine_tuned4/estMed-gpt2_fine_tuned4"
device = "cuda:0"  # GPU

# Load the model onto `device`.
model = LanguageModel(model_name, device_map=device)

# Width of the first transformer block's ln_1 layer; used further down as the
# activation dimension of the MLP output (768 according to the original note).
activation_dim = model.transformer.h[0].ln_1.normalized_shape[0]


In [5]:
# Evaluate `ae_X16L11` on activations taken from the MLP of transformer block 11.
submodule = model.transformer.h[11].mlp

# Texts that feed the activation buffer.
# NOTE(review): the second argument (100_000) presumably limits how many rows
# are read — confirm against `read_csv`.
data = read_csv("/gpfs/space/projects/stacc_health/data-synthetic/100k_synthetic_texts.csv", 100_000)

# The buffer will yield batches of tensors of dimension = submodule's output dimension.
buffer = ActivationBuffer(
    data=data,
    model=model,
    submodule=submodule,
    d_submodule=activation_dim,  # output dimension of the model component
    # Number of contexts kept in the buffer (not the length of a context).
    # It is a count, so pass an int rather than the float 3e4. Set it higher or
    # lower depending on your available memory.
    n_ctxs=30_000,
    device=device,
    out_batch_size=2048,  # reduce batch size to limit memory usage
)

out_ae_X16L11 = evaluate(
    dictionary=ae_X16L11,
    activations=buffer,
    device=device,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
# Evaluate `ae_X16L5` on activations taken from the MLP of transformer block 5.
submodule = model.transformer.h[5].mlp

# Texts that feed the activation buffer.
# NOTE(review): the second argument (100_000) presumably limits how many rows
# are read — confirm against `read_csv`.
data = read_csv("/gpfs/space/projects/stacc_health/data-synthetic/100k_synthetic_texts.csv", 100_000)

# The buffer will yield batches of tensors of dimension = submodule's output dimension.
buffer = ActivationBuffer(
    data=data,
    model=model,
    submodule=submodule,
    d_submodule=activation_dim,  # output dimension of the model component
    # Number of contexts kept in the buffer (not the length of a context).
    # It is a count, so pass an int rather than the float 3e4. Set it higher or
    # lower depending on your available memory.
    n_ctxs=30_000,
    device=device,
    out_batch_size=2048,  # reduce batch size to limit memory usage
)

out_ae_X16L5 = evaluate(
    dictionary=ae_X16L5,
    activations=buffer,
    device=device,
)

In [16]:
# Evaluate `ae_X64_11` on activations taken from the MLP of transformer block 11.
submodule = model.transformer.h[11].mlp

# Texts that feed the activation buffer.
# NOTE(review): the second argument (100_000) presumably limits how many rows
# are read — confirm against `read_csv`.
data = read_csv("/gpfs/space/projects/stacc_health/data-synthetic/100k_synthetic_texts.csv", 100_000)

# The buffer will yield batches of tensors of dimension = submodule's output dimension.
buffer = ActivationBuffer(
    data=data,
    model=model,
    submodule=submodule,
    d_submodule=activation_dim,  # output dimension of the model component
    # Number of contexts kept in the buffer (not the length of a context).
    # It is a count, so pass an int rather than the float 3e4. Set it higher or
    # lower depending on your available memory.
    n_ctxs=30_000,
    device=device,
    out_batch_size=2048,  # reduce batch size to limit memory usage
)

out_ae_X64L11 = evaluate(
    dictionary=ae_X64_11,
    activations=buffer,
    device=device,
)

In [None]:
"""
    MSE loss: average squared L2 distance between an activation and the autoencoder's reconstruction of it
    L1 loss: a measure of the autoencoder's sparsity
    L0: average number of features active on a random token
    Percentage of neurons alive: fraction of the dictionary features which are active on at least one token out of dictionary.dict_size random tokens
    
    CE diff: difference between the usual cross-entropy loss of the model for next token prediction and the cross entropy when replacing activations with our dictionary's reconstruction
    Percentage of CE loss recovered: when replacing the activation with the dictionary's reconstruction, the percentage of the model's cross-entropy loss on next token prediction that is recovered (relative to the baseline of zero ablating the activation)
"""

In [8]:
# Show the metrics computed by evaluate() for ae_X16L11 (MLP of block 11).
# NOTE(review): in the recorded output, loss_zero (4.488) is lower than
# loss_original (4.567), i.e. the zero-ablation baseline scores better than the
# unmodified model. The recorded frac_recovered (1.17) equals
# (loss_reconstructed - loss_zero) / (loss_original - loss_zero), so with this
# inverted baseline a value above 1 does not mean "better than perfect" —
# interpret it with care.
out_ae_X16L11

{'l2_loss': 5.667590141296387,
 'l1_loss': 19.95207977294922,
 'l0': 43.54638671875,
 'frac_variance_explained': 0.7986932992935181,
 'cossim': 0.8762193322181702,
 'l2_ratio': 0.7574641704559326,
 'relative_reconstruction_bias': 0.9113766551017761,
 'loss_original': 4.567016124725342,
 'loss_reconstructed': 4.580355644226074,
 'loss_zero': 4.4883832931518555,
 'frac_recovered': 1.1696431636810303,
 'frac_alive': 0.6610514521598816}

In [14]:
# Show the metrics computed by evaluate() for ae_X16L5 (MLP of block 5).
# NOTE(review): in the recorded output, loss_reconstructed (4.535) is lower
# than loss_original (4.567), which is why frac_recovered
# = (loss_reconstructed - loss_zero) / (loss_original - loss_zero) comes out
# above 1 (1.43). Worth checking whether this is noise from the evaluation
# sample before reading it as a real effect.
out_ae_X16L5

{'l2_loss': 2.1785030364990234,
 'l1_loss': 8.521507263183594,
 'l0': 29.810546875,
 'frac_variance_explained': 0.7897974252700806,
 'cossim': 0.8987048864364624,
 'l2_ratio': 0.7879399657249451,
 'relative_reconstruction_bias': 0.8931994438171387,
 'loss_original': 4.567016124725342,
 'loss_reconstructed': 4.534842491149902,
 'loss_zero': 4.641909122467041,
 'frac_recovered': 1.42959463596344,
 'frac_alive': 0.5171712636947632}

In [17]:
# Show the metrics computed by evaluate() for ae_X64_11 (MLP of block 11).
# NOTE(review): in the recorded output, loss_zero (4.488) is lower than
# loss_original (4.567), i.e. the zero-ablation baseline scores better than the
# unmodified model, so frac_recovered (0.91) is measured against an inverted
# baseline — interpret it with care. frac_alive is also low (0.29) compared
# with the other two runs.
out_ae_X64L11

{'l2_loss': 5.583471298217773,
 'l1_loss': 19.707857131958008,
 'l0': 41.681640625,
 'frac_variance_explained': 0.8173326849937439,
 'cossim': 0.8841896057128906,
 'l2_ratio': 0.7713171243667603,
 'relative_reconstruction_bias': 0.9197777509689331,
 'loss_original': 4.567016124725342,
 'loss_reconstructed': 4.560064315795898,
 'loss_zero': 4.4883832931518555,
 'frac_recovered': 0.9115915298461914,
 'frac_alive': 0.2857869565486908}