# Autocircuit Test

In [65]:
%load_ext autoreload
%autoreload 2

import os, sys
# Add the directory containing the utils folder to the system path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import torch as t

from auto_circuit.data import load_datasets_from_json
from auto_circuit.experiment_utils import load_tl_model
from auto_circuit.visualize import draw_seq_graph

from utils.circuit_discovery import acdc_discovery
from utils.testing import access_wandb_runs

from utils.testing import load_our_model, prepare_model_acdc, download_models, update_run

from os.path import abspath
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
device = t.device("cuda" if t.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda
Loaded pretrained model tiny-stories-1M into HookedTransformer


In [61]:
# Get the no-ablation model
all_wandb_runs = access_wandb_runs(filters=None)

baseline_run = None
for run in all_wandb_runs: # Linear search on small list
    if run.name == "no-ablation-baseline":
        baseline_run = run
        break

download_models([baseline_run], "../model_weights")

Downloaded best_model_20241014.pt to ../model_weights/no-ablation-baseline


In [62]:
baseline_model = load_our_model("../model_weights/no-ablation-baseline/best_model_20241014.pt", use_layer_by_layer_ablation_mask=False, use_overall_ablation_mask=False, device=device)
patched_baseline_model = prepare_model_acdc(baseline_model, device)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and w

In [None]:
path = Path("../datasets/autocircuit_ioi_customized_prompts.json")
train_loader, test_loader = load_datasets_from_json(
    model=patched_baseline_model, # Used to get tokenizer
    path=path,
    device=device,
    prepend_bos=True,
    batch_size=64, # ACDC only uses the first batch? Higher batch sizes reduce the variance of the edges discovered
    train_test_size=(128, 128),
)


## Testing autocircuit's ACDC using regular transformer

In [74]:
def test_model(patched_model, data_loader, save_path=None, score_threshold=0.03):
    total_edges, remaining_edges, pruned_scores = acdc_discovery(patched_model, data_loader, tau_bases=[score_threshold * 100])
    print(f"Total edges: {total_edges}")
    print(f"Remaining edges: {remaining_edges}")
    
    # Pruned neurons are given a value of inf
    # Interactive Image may cause performance issues
    if save_path:
        save_path = Path(save_path).resolve()

    _ = draw_seq_graph(patched_model, pruned_scores, score_threshold=score_threshold, orientation='v', layer_spacing='True', file_path=save_path)
    
    return remaining_edges, pruned_scores

In [75]:
regular_remaining_edges, regular_pruned_scores = test_model(patched_baseline_model, train_loader, save_path="../plots/our-baseline.svg")

VBox(children=(          | 0/1 [00:00<?, ?it/s],))

Total edges: 23981
Remaining edges: 79


In [85]:
results_key = 'acdc_ioi_edges'
results = {results_key: regular_remaining_edges}
update_run(baseline_run, results)

## Testing ACDC using ablated transformer

The loop currently only supports overall ablation models

In [76]:
wandb_ablated_runs = access_wandb_runs()
download_models(wandb_ablated_runs, "../model_weights") # Will skip downloading if the model already exists

Downloaded best_model_20241105.pt to ../model_weights/fanciful-fog-78
Downloaded best_model_20241106.pt to ../model_weights/earnest-moon-79
Downloaded best_model_20241105.pt to ../model_weights/super-violet-80
Downloaded best_model_20241108.pt to ../model_weights/cosmic-leaf-81
Downloaded best_model_20241109.pt to ../model_weights/upbeat-glitter-83
Downloaded best_model_20241109.pt to ../model_weights/comfy-cherry-84
Downloaded best_model_20241111.pt to ../model_weights/cosmic-leaf-81-part2
Downloaded best_model_20241113.pt to ../model_weights/major-planet-86
Downloaded best_model_20241113.pt to ../model_weights/major-planet-86-part2
Downloaded best_model_20241114.pt to ../model_weights/major-planet-86-part3
Downloaded best_model_20241116.pt to ../model_weights/light-morning-92
Downloaded best_model_20241119.pt to ../model_weights/deft-dew-98


In [90]:
for run in wandb_ablated_runs:
    
    if "acdc_ioi_edges" in run.summaryMetrics.keys():
        print(f"Skipping run {run.name} as it has already been processed")
        continue
    
    print(f"Processing run {run.name}")
    
    model_dir = f"../model_weights/{run.name}"
    model_name = [f for f in os.listdir(model_dir) if f.endswith(".pt")][0]
    path = os.path.join(model_dir, model_name)
    
    ablated_trained_model = load_our_model(path, use_layer_by_layer_ablation_mask=False, use_overall_ablation_mask=True, device=device)
    patched_ablated_trained_model = prepare_model_acdc(ablated_trained_model, device)
    
    remaining_edges, pruned_scores = test_model(patched_ablated_trained_model, train_loader, save_path=f"../plots/{run.name}.svg")
    
    results = {results_key: remaining_edges}
    update_run(run, results)
    break

Skipping run fanciful-fog-78 as it has already been processed
Processing run earnest-moon-79



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



RuntimeError: Error(s) in loading state_dict for GPTNeoWithSelfAblation:
	Missing key(s) in state_dict: "attention_ablations_head.weight", "attention_ablations_head.bias", "neuron_ablations_head.weight", "neuron_ablations_head.bias". 
	Unexpected key(s) in state_dict: "transformer.h.0.attention_ablation_head.weight", "transformer.h.0.attention_ablation_head.bias", "transformer.h.0.neuron_ablation_head.weight", "transformer.h.0.neuron_ablation_head.bias", "transformer.h.1.attention_ablation_head.weight", "transformer.h.1.attention_ablation_head.bias", "transformer.h.1.neuron_ablation_head.weight", "transformer.h.1.neuron_ablation_head.bias", "transformer.h.2.attention_ablation_head.weight", "transformer.h.2.attention_ablation_head.bias", "transformer.h.2.neuron_ablation_head.weight", "transformer.h.2.neuron_ablation_head.bias", "transformer.h.3.attention_ablation_head.weight", "transformer.h.3.attention_ablation_head.bias", "transformer.h.3.neuron_ablation_head.weight", "transformer.h.3.neuron_ablation_head.bias", "transformer.h.4.attention_ablation_head.weight", "transformer.h.4.attention_ablation_head.bias", "transformer.h.4.neuron_ablation_head.weight", "transformer.h.4.neuron_ablation_head.bias", "transformer.h.5.attention_ablation_head.weight", "transformer.h.5.attention_ablation_head.bias", "transformer.h.5.neuron_ablation_head.weight", "transformer.h.5.neuron_ablation_head.bias", "transformer.h.6.attention_ablation_head.weight", "transformer.h.6.attention_ablation_head.bias", "transformer.h.6.neuron_ablation_head.weight", "transformer.h.6.neuron_ablation_head.bias", "transformer.h.7.attention_ablation_head.weight", "transformer.h.7.attention_ablation_head.bias", "transformer.h.7.neuron_ablation_head.weight", "transformer.h.7.neuron_ablation_head.bias". 

## Circuit Comparison

Contains code for making a comparison between the baseline and a model of our choice

In [87]:
data = {
    'Base Model': {
        'edges': regular_remaining_edges,
        'path': '../plots/our-baseline.svg',
    },
    'Ablated Model': {
        'edges': remaining_edges, # Becareful with this once we have multiple runs
        'path': '../plots/fanciful-fog-78.svg',
    }
}

In [89]:
import matplotlib.pyplot as plt
from PIL import Image
import os
import cairosvg
import io

def save_comparison_plot(data, output_path, dpi=600):
    def load_image(path):
        # Check if file is SVG
        if path.lower().endswith('.svg'):
            # Convert SVG to PNG in memory
            png_data = cairosvg.svg2png(url=path)
            return Image.open(io.BytesIO(png_data))
        else:
            return Image.open(path)

    # Load images with SVG support
    base_img = load_image(data['Base Model']['path'])
    ablated_img = load_image(data['Ablated Model']['path'])

    # Resize with high-quality interpolation
    target_size = (1000, 1000)
    base_img_resized = base_img.resize(target_size, Image.Resampling.LANCZOS)
    ablated_img_resized = ablated_img.resize(target_size, Image.Resampling.LANCZOS)

    # Create larger figure with higher DPI
    plt.rcParams['figure.dpi'] = 300
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # Plot images with captions
    ax1.imshow(base_img_resized)
    ax1.set_xlabel(f'Base Model\n({data["Base Model"]["edges"]} edges)')
    ax1.set_xticks([])
    ax1.set_yticks([])

    ax2.imshow(ablated_img_resized)
    ax2.set_xlabel(f'Model trained with Self-Ablation\n({data["Ablated Model"]["edges"]} edges)')
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Set overall title
    fig.suptitle('IOI Circuit Comparison Across Tiny Stories Models (τ=0.03)', fontsize=14, y=1)

    # Save based on file extension
    plt.tight_layout()
    ext = os.path.splitext(output_path)[1].lower()
    
    if ext == '.svg':
        plt.savefig(output_path, format='svg', bbox_inches='tight')
    else:
        plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    
    plt.close()
    
save_comparison_plot(data, '../plots/ioi-circuit-comparison.png')