# Compute Graph Replacement Score & Visualize

### Config

In [1]:
# Hugging Face Hub repo id of the cross-layer transcoder checkpoint to score.
huggingface_path = "georglange/crosslayer-transcoder-topk-16k"

### Load model from HuggingFace 

In [2]:
from crosslayer_transcoder.model.serializable_module import SerializableModule

# Fetch the checkpoint named by `huggingface_path` and rebuild the module from it.
clt_model = SerializableModule.from_pretrained(huggingface_path)

# Stop here unless the checkpoint reports itself as folded.
# NOTE(review): `_is_folded` is a private flag; presumably the circuit-tracer
# conversion further down relies on it -- confirm.
assert clt_model._is_folded

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from crosslayer_transcoder.model.clt import CrossLayerTranscoder

# Check that the loaded object really is a CrossLayerTranscoder before converting it.
assert isinstance(clt_model, CrossLayerTranscoder)

In [4]:
from crosslayer_transcoder.utils.model_converters.circuit_tracer import (
    CircuitTracerConverter,
)

# Target directory for the converted checkpoint, plus the hook names that are
# handed to the converter here and to the circuit-tracer loader afterwards.
save_dir = "circuit_tracing_replacement_score"
feature_input_hook = "hook_resid_mid"
feature_output_hook = "hook_mlp_out"

converter = CircuitTracerConverter(
    save_dir=save_dir,
    feature_input_hook=feature_input_hook,
    feature_output_hook=feature_output_hook,
)
converter.convert_and_save(clt_model)

### Load model from local converted checkpoint

In the future, this could be loaded directly from Hugging Face via `ReplacementModel.from_pretrained`.

In [5]:
from circuit_tracer.transcoder.cross_layer_transcoder import load_clt

# NOTE(review): this id is forwarded as `scan`; what it refers to is not
# visible from this notebook -- confirm it is the intended value.
scan_id = "dallas-austin"

# Read the converted checkpoint back from `save_dir`, with lazy loading
# switched off for both the encoder and the decoder.
circuit_tracer_clt = load_clt(
    clt_path=save_dir,
    feature_input_hook=feature_input_hook,
    feature_output_hook=feature_output_hook,
    scan=scan_id,
    lazy_encoder=False,
    lazy_decoder=False,
)


In [6]:
from circuit_tracer import ReplacementModel

# Build the replacement model from GPT-2 and the converted cross-layer transcoder.
rm = ReplacementModel.from_pretrained_and_transcoders("gpt2", circuit_tracer_clt)

`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model gpt2 into HookedTransformer


### Attribution

In [7]:
# Text to build the attribution graph for.
# NOTE(review): "Jon" may be a typo for "John". It is left untouched because the
# recorded outputs were produced with this exact prompt and the later PLT run
# has to use the same text to stay comparable.
prompt = "Mary and John went to the store, Jon gave a drink to"

# Upper bound on the number of logits to attribute from. The actual count is
# min(max_n_logits, n_logits_to_reach_desired_log_prob); see desired_logit_prob.
max_n_logits = 10

# Attribute from the smallest set of logits whose probability mass reaches this
# value (or from max_n_logits logits, whichever is lower).
desired_logit_prob = 0.95

# Cap on the number of feature nodes to attribute from. Lower is faster but
# drops more of the graph; None means no limit.
max_feature_nodes = 8192

# Batch size used while attributing.
batch_size = 256

# Show a tqdm progress bar and a timing report.
verbose = True


In [8]:
from pathlib import Path

import torch
from circuit_tracer import attribute
from circuit_tracer.utils import create_graph_files

# Release unused cached CUDA memory before starting the attribution run.
torch.cuda.empty_cache()

# Run attribution for `prompt` on the replacement model, using the settings
# chosen in the configuration cell.
graph = attribute(
    model=rm,
    prompt=prompt,
    desired_logit_prob=desired_logit_prob,
    max_n_logits=max_n_logits,
    max_feature_nodes=max_feature_nodes,
    batch_size=batch_size,
    offload=None,
    verbose=verbose,
)

Phase 0: Precomputing activations and vectors


Precomputation completed in 0.44s
Found 2496 active features
Phase 1: Running forward pass
Forward pass completed in 0.04s
Phase 2: Building input vectors
Selected 10 logits with cumulative probability 0.7510
Will include 2496 of 2496 feature nodes
Input vectors built in 0.02s
Phase 3: Computing logit attributions
Logit attributions completed in 0.10s
Phase 4: Computing feature attributions
Feature influence computation: 100%|██████████| 2496/2496 [00:01<00:00, 2354.52it/s]
Feature attributions completed in 1.06s
Attribution completed in 2.07s


### Replacement Score

Now that we have a graph for the prompt above, we can compute metrics that tell us how much of the Replacement Model's computation is attributable to feature nodes versus error nodes.

[Full explanation](https://transformer-circuits.pub/2025/attribution-graphs/methods.html#evaluating-graphs-comparing)


In [9]:
from circuit_tracer.graph import compute_graph_scores

# Print a header naming the two values, then let the cell display the result
# of the call as its output.
print("replacement score, completeness score")
compute_graph_scores(graph)

replacement score, completeness score


(0.7058860659599304, 0.9123725891113281)

### [Optional] Visualize the graph

In [10]:
# Save the attribution graph to graphs/example_graph.pt.
graph_name = 'example_graph.pt'
graph_dir = Path('graphs')
graph_dir.mkdir(exist_ok=True)  # tolerate reruns: an existing directory is fine
graph_path = graph_dir / graph_name

graph.to_pt(graph_path)

In [11]:
# Name that you assign to the graph.
slug = "mary-john-store"

# Where to write the graph files. No need to create this directory beforehand;
# create_graph_files does that for you.
graph_file_dir = "./graph_files"

# Keep only the minimum number of nodes whose cumulative influence is >= 0.8.
node_threshold = 0.8

# Keep only the minimum number of edges whose cumulative influence is >= 0.98.
edge_threshold = 0.98

create_graph_files(
    graph_or_path=graph,  # the graph to create files for
    slug=slug,
    output_path=graph_file_dir,
    node_threshold=node_threshold,
    edge_threshold=edge_threshold,
)


In [12]:
from circuit_tracer.frontend.local_server import serve
from IPython.display import IFrame  # NOTE(review): imported but not used in this cell

# Start the local frontend server over the directory holding the graph files.
port = 8048
server = serve(data_dir="./graph_files/", port=port)

# The message used to embed a second f'...' literal inside the string, so the
# printed link came out wrapped in a stray f' and ' and could not be used as-is.
print(f"Open your graph here: http://localhost:{port}/index.html")


Open your graph here: f'http://localhost:8048/index.html'


## Repeat with a PLT (per-layer transcoder)

In [1]:
# Hugging Face Hub repo id of the PLT checkpoint scored in this second pass.
huggingface_plt_path = "georglange/crosslayer-transcoder-topk-plt-16k"

In [2]:
from crosslayer_transcoder.model.clt import CrossLayerTranscoder, Decoder
from crosslayer_transcoder.model.serializable_module import SerializableModule

# Fetch the checkpoint named by `huggingface_plt_path` and rebuild the module.
plt_model = SerializableModule.from_pretrained(huggingface_plt_path)

# Sanity checks on the loaded object before it is converted: it must report
# itself as folded, be a CrossLayerTranscoder, and carry a `Decoder` decoder.
assert plt_model._is_folded
assert isinstance(plt_model, CrossLayerTranscoder)
assert isinstance(plt_model.decoder, Decoder)



Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from crosslayer_transcoder.utils.model_converters.circuit_tracer import (
    CircuitTracerConverter,
)

# Separate output directory for the PLT conversion; the hook names are the
# same ones used for the CLT run.
save_dir = "circuit_tracing_plt_replacement_score"
feature_input_hook = "hook_resid_mid"
feature_output_hook = "hook_mlp_out"

converter = CircuitTracerConverter(
    feature_input_hook=feature_input_hook,
    feature_output_hook=feature_output_hook,
    save_dir=save_dir,
)
converter.convert_and_save(plt_model)


In [5]:
from circuit_tracer.transcoder.single_layer_transcoder import load_transcoder_set

# NOTE(review): this id is forwarded as `scan`; what it refers to is not
# visible from this notebook -- confirm it is the intended value.
scan_id = "dallas-austin"
n_layers = plt_model.decoder.n_layers

# Layer index -> path of that layer's converted checkpoint file. The mapping
# has its own name so that `transcoder_set` only ever refers to the loaded
# transcoders, never to a dict of path strings.
transcoder_paths = {
    i: f"{save_dir}/layer_{i}.safetensors"
    for i in range(n_layers)
}

# Load every layer's transcoder with lazy loading switched off for both the
# encoder and the decoder.
transcoder_set = load_transcoder_set(
    transcoder_paths=transcoder_paths,
    lazy_decoder=False,
    lazy_encoder=False,
    feature_input_hook=feature_input_hook,
    feature_output_hook=feature_output_hook,
    scan=scan_id,
)

In [6]:
from circuit_tracer import ReplacementModel

# Build the replacement model again, this time from GPT-2 and the loaded transcoder set.
rm = ReplacementModel.from_pretrained_and_transcoders("gpt2", transcoder_set)


`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model gpt2 into HookedTransformer


In [7]:
# Text to build the attribution graph for.
# NOTE(review): "Jon" may be a typo for "John". It is left untouched because the
# recorded outputs were produced with this exact prompt and it has to match the
# prompt of the CLT run to keep the two scores comparable.
prompt = "Mary and John went to the store, Jon gave a drink to"

# Upper bound on the number of logits to attribute from. The actual count is
# min(max_n_logits, n_logits_to_reach_desired_log_prob); see desired_logit_prob.
max_n_logits = 10

# Attribute from the smallest set of logits whose probability mass reaches this
# value (or from max_n_logits logits, whichever is lower).
desired_logit_prob = 0.95

# Cap on the number of feature nodes to attribute from. Lower is faster but
# drops more of the graph; None means no limit.
max_feature_nodes = 8192

# Batch size used while attributing.
batch_size = 256

# Show a tqdm progress bar and a timing report.
verbose = True


In [8]:
from pathlib import Path

import torch
from circuit_tracer import attribute
from circuit_tracer.utils import create_graph_files

# Release unused cached CUDA memory before starting the attribution run.
torch.cuda.empty_cache()

# Run attribution for `prompt` on the current replacement model `rm`, using
# the settings chosen in the configuration cell.
graph = attribute(
    model=rm,
    prompt=prompt,
    desired_logit_prob=desired_logit_prob,
    max_n_logits=max_n_logits,
    max_feature_nodes=max_feature_nodes,
    batch_size=batch_size,
    offload=None,
    verbose=verbose,
)

Phase 0: Precomputing activations and vectors


Precomputation completed in 0.36s
Found 2496 active features
Phase 1: Running forward pass
Forward pass completed in 0.05s
Phase 2: Building input vectors
Selected 10 logits with cumulative probability 0.7510
Will include 2496 of 2496 feature nodes
Input vectors built in 0.02s
Phase 3: Computing logit attributions
Logit attributions completed in 0.08s
Phase 4: Computing feature attributions
Feature influence computation: 100%|██████████| 2496/2496 [00:01<00:00, 2000.07it/s]
Feature attributions completed in 1.25s
Attribution completed in 2.07s


In [9]:
from circuit_tracer.graph import compute_graph_scores

# Print a header naming the two values, then let the cell display the result
# of the call as its output.
print("replacement score, completeness score")
compute_graph_scores(graph)

replacement score, completeness score


(0.7082498073577881, 0.916822075843811)