# SAE-Metrics

## Imports

In [1]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sparsify import Sae
from datasets import load_dataset
from pathlib import Path

from tqdm import tqdm

In [2]:
from evaluation.after_train_eval import post_train_eval

## Settings

In [3]:
!export HF_TOKEN=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
!export HF_HOME=/share/tilman.kerl/huggingface

In [4]:
os.environ["HF_HOME"] = "/share/tilman.kerl/huggingface"

In [5]:
LAYER_IDX = 18
K_TOP = 64
# SAE_CKPT_DIR = "./train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18"
# EVAL_DATASET = "datablations/c4-filter-small"

In [6]:
def flatten_gair_convs(conv_list):
    return "".join(conv_list)

## Eval

We first read all our checkpoints for LMSYS & PILE:

#### LMSYS checkpoints

In [7]:
checkpoint_dir_lmsys = Path('./train/LMSYS/checkpoints')

lmsys_sae_variant_checkpoints = []

if checkpoint_dir_lmsys.exists() and checkpoint_dir_lmsys.is_dir():
    lmsys_sae_variant_checkpoints = [
        f"{item.name}/layers.18" for item in checkpoint_dir_lmsys.iterdir() if item.is_dir()
    ]
else:
    print(f"Directory not found: {checkpoint_dir_lmsys}")

print(lmsys_sae_variant_checkpoints)

['smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18']


#### PILE checkpoints

In [8]:
checkpoint_dir_pile = Path('./train/PILE/checkpoints')

pile_sae_variant_checkpoints = []

if checkpoint_dir_pile.exists() and checkpoint_dir_pile.is_dir():
    pile_sae_variant_checkpoints = [
        f"{item.name}/layers.18" for item in checkpoint_dir_pile.iterdir() if item.is_dir()
    ]
else:
    print(f"Directory not found: {checkpoint_dir_pile}")

print(pile_sae_variant_checkpoints)

['smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18', 'smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18']


#### MIX checkpoints

In [9]:
checkpoint_dir_mix = Path('./train/MIX/checkpoints')

mix_sae_variant_checkpoints = []

if checkpoint_dir_mix.exists() and checkpoint_dir_mix.is_dir():
    mix_sae_variant_checkpoints = [
        f"{item.name}/layers.18" for item in checkpoint_dir_mix.iterdir() if item.is_dir()
    ]
else:
    print(f"Directory not found: {checkpoint_dir_mix}")

print(mix_sae_variant_checkpoints)

Directory not found: train/MIX/checkpoints
[]


#### All checkpoints

In [10]:
all_checkpoints = [
    lmsys_sae_variant_checkpoints,
    pile_sae_variant_checkpoints,
    mix_sae_variant_checkpoints
]

### LMSYS (INS-trained)

In [11]:
for checkpoint_dir_group in all_checkpoints:
    for checkpoint_dir in checkpoint_dir_group:    
        # INS-EVAL    
        print(f">> INS EVAL <<")
        post_train_eval(
            sae_checkpoint_dir=checkpoint_dir,    
            model_name="HuggingFaceTB/SmolLM2-135M",    
            eval_dataset="GAIR/lima", 
            text_field_name="conversations",
            text_field_fn=flatten_gair_convs,
            layer_idx=18,
            max_samples=1000,
            max_len=256,
        )
        
        # PRE-EVAL
        print(f">> PRE EVAL <<")
        post_train_eval(
            sae_checkpoint_dir=checkpoint_dir,    
            model_name="HuggingFaceTB/SmolLM2-135M",    
            eval_dataset="datablations/c4-filter-small",     
            layer_idx=18,
            max_samples=1000,
            max_len=256,
        )

>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing)...


100%|██████████| 1000/1000 [00:11<00:00, 84.36it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9869
Fraction of Var Unexpl.  : 0.0131
Mean Squared Error (MSE) : 3.77e+01
Cosine Similarity        : 0.9288
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0038%
Dead Latent Features     : 12.24% (1128/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 1

100%|██████████| 1000/1000 [00:10<00:00, 91.22it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9875
Fraction of Var Unexpl.  : 0.0125
Mean Squared Error (MSE) : 4.25e+01
Cosine Similarity        : 0.9280
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0038%
Dead Latent Features     : 13.90% (1281/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processi

100%|██████████| 1000/1000 [00:11<00:00, 89.94it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9928
Fraction of Var Unexpl.  : 0.0072
Mean Squared Error (MSE) : 2.09e+01
Cosine Similarity        : 0.9018
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0055%
Dead Latent Features     : 11.79% (1087/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 1

100%|██████████| 1000/1000 [00:11<00:00, 90.70it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9934
Fraction of Var Unexpl.  : 0.0066
Mean Squared Error (MSE) : 2.24e+01
Cosine Similarity        : 0.9014
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0055%
Dead Latent Features     : 13.64% (1257/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-249M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing

100%|██████████| 1000/1000 [00:11<00:00, 90.01it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9787
Fraction of Var Unexpl.  : 0.0213
Mean Squared Error (MSE) : 6.14e+01
Cosine Similarity        : 0.7735
Activation Sparsity (L0) : 1.3887%
Weight Sparsity          : 0.0176%
Dead Latent Features     : 10.57% (487/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (c

100%|██████████| 1000/1000 [00:11<00:00, 90.09it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9804
Fraction of Var Unexpl.  : 0.0196
Mean Squared Error (MSE) : 6.68e+01
Cosine Similarity        : 0.7807
Activation Sparsity (L0) : 1.3887%
Weight Sparsity          : 0.0176%
Dead Latent Features     : 10.89% (502/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing

100%|██████████| 1000/1000 [00:11<00:00, 87.49it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9919
Fraction of Var Unexpl.  : 0.0081
Mean Squared Error (MSE) : 2.34e+01
Cosine Similarity        : 0.9370
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0042%
Dead Latent Features     : 15.23% (2808/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 

100%|██████████| 1000/1000 [00:11<00:00, 89.26it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9924
Fraction of Var Unexpl.  : 0.0076
Mean Squared Error (MSE) : 2.58e+01
Cosine Similarity        : 0.9363
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0042%
Dead Latent Features     : 18.13% (3342/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-249M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing)

100%|██████████| 1000/1000 [00:11<00:00, 90.91it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9858
Fraction of Var Unexpl.  : 0.0142
Mean Squared Error (MSE) : 4.10e+01
Cosine Similarity        : 0.9446
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0022%
Dead Latent Features     : 22.02% (2029/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chu

100%|██████████| 1000/1000 [00:10<00:00, 92.58it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9861
Fraction of Var Unexpl.  : 0.0139
Mean Squared Error (MSE) : 4.74e+01
Cosine Similarity        : 0.9426
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0022%
Dead Latent Features     : 22.49% (2073/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-2M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing).

100%|██████████| 1000/1000 [00:10<00:00, 93.31it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9927
Fraction of Var Unexpl.  : 0.0073
Mean Squared Error (MSE) : 2.10e+01
Cosine Similarity        : 0.9137
Activation Sparsity (L0) : 1.3889%
Weight Sparsity          : 0.0039%
Dead Latent Features     : 10.39% (479/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (c

100%|██████████| 1000/1000 [00:10<00:00, 93.12it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9933
Fraction of Var Unexpl.  : 0.0067
Mean Squared Error (MSE) : 2.29e+01
Cosine Similarity        : 0.9134
Activation Sparsity (L0) : 1.3889%
Weight Sparsity          : 0.0039%
Dead Latent Features     : 10.94% (504/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing

100%|██████████| 1000/1000 [00:11<00:00, 90.24it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9951
Fraction of Var Unexpl.  : 0.0049
Mean Squared Error (MSE) : 1.40e+01
Cosine Similarity        : 0.9171
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0060%
Dead Latent Features     : 14.12% (2603/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 

100%|██████████| 1000/1000 [00:10<00:00, 91.84it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9957
Fraction of Var Unexpl.  : 0.0043
Mean Squared Error (MSE) : 1.45e+01
Cosine Similarity        : 0.9171
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0060%
Dead Latent Features     : 16.89% (3114/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing)

100%|██████████| 1000/1000 [00:10<00:00, 92.04it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9165
Fraction of Var Unexpl.  : 0.0835
Mean Squared Error (MSE) : 2.41e+02
Cosine Similarity        : 0.7236
Activation Sparsity (L0) : 1.3884%
Weight Sparsity          : 0.0143%
Dead Latent Features     : 10.26% (473/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chun

100%|██████████| 1000/1000 [00:10<00:00, 93.10it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9167
Fraction of Var Unexpl.  : 0.0833
Mean Squared Error (MSE) : 2.83e+02
Cosine Similarity        : 0.7333
Activation Sparsity (L0) : 1.3886%
Weight Sparsity          : 0.0143%
Dead Latent Features     : 11.18% (515/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-50M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing)

100%|██████████| 1000/1000 [00:10<00:00, 90.98it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9963
Fraction of Var Unexpl.  : 0.0037
Mean Squared Error (MSE) : 1.06e+01
Cosine Similarity        : 0.9450
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0034%
Dead Latent Features     : 14.36% (2646/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 

100%|██████████| 1000/1000 [00:10<00:00, 91.19it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9967
Fraction of Var Unexpl.  : 0.0033
Mean Squared Error (MSE) : 1.13e+01
Cosine Similarity        : 0.9439
Activation Sparsity (L0) : 0.3472%
Weight Sparsity          : 0.0034%
Dead Latent Features     : 17.32% (3193/18432)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-124M-token-18-layers-32-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked process

100%|██████████| 1000/1000 [00:10<00:00, 92.03it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9908
Fraction of Var Unexpl.  : 0.0092
Mean Squared Error (MSE) : 2.66e+01
Cosine Similarity        : 0.8601
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0179%
Dead Latent Features     : 11.59% (1068/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 1

100%|██████████| 1000/1000 [00:10<00:00, 93.05it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9917
Fraction of Var Unexpl.  : 0.0083
Mean Squared Error (MSE) : 2.81e+01
Cosine Similarity        : 0.8608
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0179%
Dead Latent Features     : 13.38% (1233/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing

100%|██████████| 1000/1000 [00:10<00:00, 92.16it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9787
Fraction of Var Unexpl.  : 0.0213
Mean Squared Error (MSE) : 6.14e+01
Cosine Similarity        : 0.7735
Activation Sparsity (L0) : 1.3887%
Weight Sparsity          : 0.0176%
Dead Latent Features     : 10.57% (487/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (c

100%|██████████| 1000/1000 [00:10<00:00, 93.17it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9804
Fraction of Var Unexpl.  : 0.0196
Mean Squared Error (MSE) : 6.68e+01
Cosine Similarity        : 0.7807
Activation Sparsity (L0) : 1.3887%
Weight Sparsity          : 0.0176%
Dead Latent Features     : 10.89% (502/4608)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-8-expansion-64-k.json

=== Evaluation Complete ===
>> INS EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: GAIR/lima (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 18 (chunked processing

100%|██████████| 1000/1000 [00:10<00:00, 91.26it/s]



--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9908
Fraction of Var Unexpl.  : 0.0092
Mean Squared Error (MSE) : 2.66e+01
Cosine Similarity        : 0.8601
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0179%
Dead Latent Features     : 11.59% (1068/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===
>> PRE EVAL <<
=== Starting SAE Evaluation ===
SAE Checkpoint Dir: smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18
Base Model: HuggingFaceTB/SmolLM2-135M
Evaluation Dataset: datablations/c4-filter-small (split: train)
Layer Index: 18, Max Samples: 1000, Max Length: 256

Loading model 'HuggingFaceTB/SmolLM2-135M' on device 'cuda'...
Loading SAE from '/home/tilman.kerl/mech-interp/src/train/LMSYS/checkpoints/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k/layers.18'...
Loading dataset...
Collecting activations from layer 1

100%|██████████| 1000/1000 [00:10<00:00, 93.18it/s]


--- SAE Metrics (Chunked Processing) ---
Explained Variance       : 0.9917
Fraction of Var Unexpl.  : 0.0083
Mean Squared Error (MSE) : 2.81e+01
Cosine Similarity        : 0.8608
Activation Sparsity (L0) : 0.6944%
Weight Sparsity          : 0.0179%
Dead Latent Features     : 13.38% (1233/9216)

Metrics successfully saved to results/saes/smollm2-sparsify-lmsys-419M-token-18-layers-16-expansion-64-k.json

=== Evaluation Complete ===





## Visualisations

In [15]:
from evaluation.visualise_sae_metrics import load_sae_results, visualize_sae_results_grouped

RESULTS_DIRECTORY = "results/saes/"

results_df = load_sae_results(RESULTS_DIRECTORY)
if not results_df.empty:
    visualize_sae_results_grouped(results_df)

Found 10 result files. Loading...
Data loaded and processed successfully.

Found 1 evaluation datasets. Generating a dashboard for each...
  [1/1] Generating dashboard for eval_dataset: 'datablations/c4-filter-small'
    -> Plot 1 saved to 'results/visualizations/dashboard_1_EV_eval_on_datablations_c4-filter-small.png'
    -> Plot 2 saved to 'results/visualizations/dashboard_2_DeadFeatures_eval_on_datablations_c4-filter-small.png'
    -> Plot 3 saved to 'results/visualizations/dashboard_3_Tradeoff_eval_on_datablations_c4-filter-small.png'
