# EdgePatch / SEAM: Causal Receiver Masking

This notebook runs the EdgePatch experiments on a Colab A100 GPU.

**Key Features:**
- Loads the `uzaymacar/math-rollouts` dataset
- Uses dataset's chunk boundaries (no re-splitting)
- Computes per-chunk causal importance via ANSWER→CHUNK attention-edge masking
- **CRITICAL**: Verifies that `edge_layers` and `edge_heads` actually change outputs

## Quick Start
1. Run Cell 1 (Setup)
2. Run Cell 2 (Smoke Test)
3. Run Cell 3 (Layer Toggle Test) - **MUST PASS**
4. Run Cell 4 (Head Toggle Test) - **MUST PASS**
5. Run Cell 5 (Confirm Run) - Only if toggles pass
6. Run Cell 6 (Summary) - Prints the pass/fail status of each test

In [None]:
# Cell 1: Setup
# Clone the repo and install dependencies

import os

# Check if already cloned
if not os.path.exists('SEAM'):
    !git clone https://github.com/MechInterpreter/SEAM.git
    
%cd SEAM
!pip install -e . --quiet

print("\n" + "="*60)
print("‚úì SETUP COMPLETE")
print("="*60)

In [None]:
# Cell 2: Smoke Test
# Quick validation with 1 example

import json
import subprocess
import sys
from pathlib import Path

# Start pessimistic so that an exception part-way through this cell cannot
# leave a True from an earlier execution in place.
SMOKE_PASSED = False

# Run smoke test with the interpreter that is running this cell, so the
# subprocess sees the same installed packages.
result = subprocess.run(
    [sys.executable, "scripts/run_edgepatch.py", "smoke", "--output-dir", "runs/smoke_test"],
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Check if artifacts exist
smoke_dir = Path("runs/smoke_test")
run_dirs = sorted(smoke_dir.glob("edgepatch_*")) if smoke_dir.exists() else []

print("\n" + "="*60)
if result.returncode != 0:
    # Artifacts left behind by an earlier run must not mask a run that just failed.
    print(f"✗ SMOKE FAIL - run_edgepatch.py exited with code {result.returncode}")
elif not run_dirs:
    print("✗ SMOKE FAIL - No run directory created")
else:
    # Lexicographically last edgepatch_* directory is treated as the latest run.
    latest = run_dirs[-1]
    metrics_path = latest / "eval_metrics.json"
    if metrics_path.exists():
        print("✓ SMOKE PASS - Artifacts created")
        print(f"  Run dir: {latest}")
        SMOKE_PASSED = True
    else:
        print("✗ SMOKE FAIL - No eval_metrics.json")
print("="*60)

In [None]:
# Cell 3: Layer Toggle Test (CRITICAL)
# Verify that different edge_layers produce different results
# This MUST pass - if it fails, masking is broken!

import subprocess
import json
from pathlib import Path
import numpy as np

def run_with_layers(layers: list, output_name: str):
    """Run the EdgePatch smoke mode with specific edge layers and return scores.

    Invokes ``scripts/run_edgepatch.py smoke`` (relative to the current working
    directory) with ``--output-dir runs/<output_name>`` and ``--edge-layers``
    followed by each entry of ``layers``. It then reads ``all_results.json``
    from the lexicographically last ``edgepatch_*`` directory under that
    output directory.

    Args:
        layers: Layer indices passed to ``--edge-layers``.
        output_name: Sub-directory of ``runs/`` that receives this run's output.

    Returns:
        A NumPy array of every ``delta_logp`` value found in each example's
        ``scores`` list, in file order. Returns ``None`` (after printing an
        error) when the script exits with a non-zero status, no run directory
        exists, or ``all_results.json`` is missing.
    """
    # Local import: the function stays self-contained without touching the
    # cell's import list.
    import sys

    cmd = [
        # Use the interpreter running this notebook rather than whatever
        # "python" happens to resolve to on PATH.
        sys.executable, "scripts/run_edgepatch.py", "smoke",
        "--output-dir", f"runs/{output_name}",
        "--edge-layers", *(str(layer) for layer in layers),
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        # A failed run can leave older edgepatch_* directories in place;
        # reading them would compare stale scores and could let the toggle
        # test pass on data that this run never produced.
        print(f"ERROR: Run for {output_name} exited with code {result.returncode}")
        print(result.stderr)
        return None

    # Find the run directory and load results
    run_dir = Path(f"runs/{output_name}")
    run_dirs = sorted(run_dir.glob("edgepatch_*")) if run_dir.exists() else []

    if not run_dirs:
        print(f"ERROR: No run dir for {output_name}")
        print(result.stderr)
        return None

    results_path = run_dirs[-1] / "all_results.json"

    if not results_path.exists():
        print(f"ERROR: No results for {output_name}")
        return None

    with open(results_path, encoding="utf-8") as f:
        results = json.load(f)

    # Flatten the per-example score lists into one vector.
    return np.array([s["delta_logp"] for ex in results for s in ex["scores"]])

# Pessimistic default: if anything below raises (including the assertion),
# the flag must not keep a True left over from an earlier execution, because
# the confirm-run cell gates on it.
LAYER_TOGGLE_PASSED = False

# Test A: Mask only layer 0
print("Running with edge_layers=[0]...")
scores_A = run_with_layers([0], "layer_test_A")

# Test B: Mask layers 24-31 (late layers)
print("Running with edge_layers=[24,25,26,27,28,29,30,31]...")
scores_B = run_with_layers([24, 25, 26, 27, 28, 29, 30, 31], "layer_test_B")

# Compare
if scores_A is None or scores_B is None:
    print("\n" + "="*60)
    print("✗ LAYER TOGGLE FAIL - Could not get scores")
    print("="*60)
elif scores_A.size == 0 or scores_A.shape != scores_B.shape:
    # np.max on an empty array and subtraction of mismatched shapes both
    # raise; report them as a test failure instead of an opaque traceback.
    print("\n" + "="*60)
    print(f"✗ LAYER TOGGLE FAIL - Scores not comparable (shapes {scores_A.shape} vs {scores_B.shape})")
    print("="*60)
else:
    abs_diff = np.abs(scores_A - scores_B)
    max_diff = np.max(abs_diff)
    mean_diff = np.mean(abs_diff)

    print(f"\nScores A (layer 0): {scores_A[:5]}...")
    print(f"Scores B (layers 24-31): {scores_B[:5]}...")
    print(f"Max difference: {max_diff:.6f}")
    print(f"Mean difference: {mean_diff:.6f}")

    # CRITICAL ASSERTION
    # Record the verdict before asserting so the flag is correct even when
    # the assertion aborts the cell.
    LAYER_TOGGLE_PASSED = bool(max_diff > 1e-6)
    assert LAYER_TOGGLE_PASSED, f"LAYER TOGGLE FAILED! max_diff={max_diff} <= 1e-6"

    print("\n" + "="*60)
    print(f"✓ LAYER TOGGLE PASS: max_diff={max_diff:.6f} > 1e-6")
    print("="*60)

In [None]:
# Cell 4: Head Toggle Test (CRITICAL)
# Verify that different edge_heads produce different results
# This MUST pass - if it fails, head masking is broken!

import subprocess
import json
from pathlib import Path
import numpy as np

def run_with_heads(heads: list, output_name: str):
    """Run the EdgePatch smoke mode with specific edge heads and return scores.

    Invokes ``scripts/run_edgepatch.py smoke`` (relative to the current working
    directory) with ``--output-dir runs/<output_name>``, a fixed
    ``--edge-layers 0 1 2 3``, and ``--edge-heads`` followed by each entry of
    ``heads``. It then reads ``all_results.json`` from the lexicographically
    last ``edgepatch_*`` directory under that output directory.

    Args:
        heads: Head indices passed to ``--edge-heads``.
        output_name: Sub-directory of ``runs/`` that receives this run's output.

    Returns:
        A NumPy array of every ``delta_logp`` value found in each example's
        ``scores`` list, in file order. Returns ``None`` (after printing an
        error) when the script exits with a non-zero status, no run directory
        exists, or ``all_results.json`` is missing.
    """
    # Local import: the function stays self-contained without touching the
    # cell's import list.
    import sys

    cmd = [
        # Use the interpreter running this notebook rather than whatever
        # "python" happens to resolve to on PATH.
        sys.executable, "scripts/run_edgepatch.py", "smoke",
        "--output-dir", f"runs/{output_name}",
        "--edge-layers", "0", "1", "2", "3",  # Fix layers for comparison
        "--edge-heads", *(str(head) for head in heads),
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        # A failed run can leave older edgepatch_* directories in place;
        # reading them would compare stale scores and could let the toggle
        # test pass on data that this run never produced.
        print(f"ERROR: Run for {output_name} exited with code {result.returncode}")
        print(result.stderr)
        return None

    # Find the run directory and load results
    run_dir = Path(f"runs/{output_name}")
    run_dirs = sorted(run_dir.glob("edgepatch_*")) if run_dir.exists() else []

    if not run_dirs:
        print(f"ERROR: No run dir for {output_name}")
        # Surface the subprocess's stderr so the cause is visible.
        print(result.stderr)
        return None

    results_path = run_dirs[-1] / "all_results.json"

    if not results_path.exists():
        print(f"ERROR: No results for {output_name}")
        return None

    with open(results_path, encoding="utf-8") as f:
        results = json.load(f)

    # Flatten the per-example score lists into one vector.
    return np.array([s["delta_logp"] for ex in results for s in ex["scores"]])

# Pessimistic default: if anything below raises (including the assertion),
# the flag must not keep a True left over from an earlier execution, because
# the confirm-run cell gates on it.
HEAD_TOGGLE_PASSED = False

# Test A: Mask only head 0
print("Running with edge_heads=[0]...")
scores_A = run_with_heads([0], "head_test_A")

# Test B: Mask head 1
print("Running with edge_heads=[1]...")
scores_B = run_with_heads([1], "head_test_B")

# Compare
if scores_A is None or scores_B is None:
    print("\n" + "="*60)
    print("✗ HEAD TOGGLE FAIL - Could not get scores")
    print("="*60)
elif scores_A.size == 0 or scores_A.shape != scores_B.shape:
    # np.max on an empty array and subtraction of mismatched shapes both
    # raise; report them as a test failure instead of an opaque traceback.
    print("\n" + "="*60)
    print(f"✗ HEAD TOGGLE FAIL - Scores not comparable (shapes {scores_A.shape} vs {scores_B.shape})")
    print("="*60)
else:
    abs_diff = np.abs(scores_A - scores_B)
    max_diff = np.max(abs_diff)
    mean_diff = np.mean(abs_diff)

    print(f"\nScores A (head 0): {scores_A[:5]}...")
    print(f"Scores B (head 1): {scores_B[:5]}...")
    print(f"Max difference: {max_diff:.6f}")
    print(f"Mean difference: {mean_diff:.6f}")

    # CRITICAL ASSERTION
    # Record the verdict before asserting so the flag is correct even when
    # the assertion aborts the cell.
    HEAD_TOGGLE_PASSED = bool(max_diff > 1e-6)
    assert HEAD_TOGGLE_PASSED, f"HEAD TOGGLE FAILED! max_diff={max_diff} <= 1e-6"

    print("\n" + "="*60)
    print(f"✓ HEAD TOGGLE PASS: max_diff={max_diff:.6f} > 1e-6")
    print("="*60)

In [None]:
# Cell 5: Confirm Run
# Run with 3 examples and full metrics
# ONLY RUN THIS IF TOGGLE TESTS PASSED!

import json
import subprocess
import sys
from pathlib import Path


def _fmt_metric(value):
    """Return ``value`` formatted to 4 decimals, or "N/A" if it is not a number.

    Applying the ``.4f`` format spec directly to a missing metric's "N/A"
    placeholder raises ValueError, so the fallback is chosen before formatting.
    """
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return "N/A"


# Check prerequisites
try:
    toggles_ok = LAYER_TOGGLE_PASSED and HEAD_TOGGLE_PASSED
except NameError:
    # At least one flag was never assigned, i.e. its cell has not been run.
    print("="*60)
    print("⚠️  Run toggle tests first (Cells 3 and 4)")
    print("="*60)
    raise SystemExit() from None

if not toggles_ok:
    print("="*60)
    print("⚠️  SKIPPING CONFIRM RUN - Toggle tests did not pass!")
    print("="*60)
    raise SystemExit()

print("Running confirm with max_examples=3...")
print("This may take a few minutes...\n")

# Use the interpreter that is running this cell so the subprocess sees the
# same installed packages.
result = subprocess.run(
    [sys.executable, "scripts/run_edgepatch.py", "confirm", "--output-dir", "runs/confirm"],
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr[-2000:])  # Last 2000 chars

# Check artifacts
confirm_dir = Path("runs/confirm")
run_dirs = sorted(confirm_dir.glob("edgepatch_*")) if confirm_dir.exists() else []

if result.returncode != 0:
    # Metrics left behind by an earlier run must not be reported as this run's.
    print("\n" + "="*60)
    print(f"✗ CONFIRM FAIL - run_edgepatch.py exited with code {result.returncode}")
    print("="*60)
elif not run_dirs:
    print("\n" + "="*60)
    print("✗ CONFIRM FAIL - No run directory")
    print("="*60)
else:
    # Lexicographically last edgepatch_* directory is treated as the latest run.
    latest = run_dirs[-1]
    metrics_path = latest / "eval_metrics.json"

    if metrics_path.exists():
        with open(metrics_path, encoding="utf-8") as f:
            metrics = json.load(f)

        print("\n" + "="*60)
        print("EVALUATION METRICS")
        print("="*60)
        print(f"Spearman ρ:     {_fmt_metric(metrics.get('spearman_rho'))}")
        print(f"Top-1 overlap:  {_fmt_metric(metrics.get('top_1_overlap'))}")
        print(f"Top-3 overlap:  {_fmt_metric(metrics.get('top_3_overlap'))}")
        print(f"PR-AUC@10%:     {_fmt_metric(metrics.get('pr_auc_10'))}")
        print(f"Shuffled ρ:     {_fmt_metric(metrics.get('shuffled_rho'))}")
        print("="*60)
        print("✓ CONFIRM PASS - All artifacts created")
        print(f"  Run dir: {latest}")
        print("="*60)
    else:
        print("\n" + "="*60)
        print("✗ CONFIRM FAIL - No eval_metrics.json")
        print("="*60)

In [None]:
# Cell 6: Summary
# Print final summary of all tests

print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)

# Each flag is assigned by its own cell; a flag that is absent from the
# namespace means that cell has not been run in this session.
_UNSET = object()
for label, flag_name in (
    ("Smoke Test:", "SMOKE_PASSED"),
    ("Layer Toggle:", "LAYER_TOGGLE_PASSED"),
    ("Head Toggle:", "HEAD_TOGGLE_PASSED"),
):
    flag = globals().get(flag_name, _UNSET)
    if flag is _UNSET:
        status = "Not run"
    elif flag:
        status = "✓ PASS"
    else:
        status = "✗ FAIL"
    # Pad the label to 17 columns so the statuses line up.
    print(f"{label:<17}{status}")

print("="*60)

# Check if all critical tests passed
try:
    if LAYER_TOGGLE_PASSED and HEAD_TOGGLE_PASSED:
        print("\n🎉 All critical tests passed!")
        print("   Edge masking is working correctly.")
    else:
        print("\n⚠️  Some critical tests failed!")
        print("   Check the masking implementation.")
except NameError:
    print("\n⚠️  Not all tests have been run.")