In [1]:
!pip install goodfire --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import os

# Read the Goodfire API key from the environment so the secret is never stored in the notebook.
# Falls back to an empty string (the previous placeholder value) when the variable is not set.
GOODFIRE_API_KEY = os.environ.get("GOODFIRE_API_KEY", "")

In [None]:
import goodfire
from goodfire import Variant, Feature
from typing import List, Dict



# Goodfire API client built from the key defined above.
client  = goodfire.Client(GOODFIRE_API_KEY)
# Module-level model variant. Note: run_single_experiment() does not use this object;
# it builds its own Variant from MODEL_NAME for every experiment.
variant = Variant("meta-llama/Llama-3.3-70B-Instruct") 
# Alternative model, left disabled:
#variant = Variant("meta-llama/Llama-3.1-8B-Instruct")  

In [None]:
# ================================================================
#   MULTI-EXPERIMENT SAE RUNNER WITH ADDITIVE STEERING
#   Runs multiple experiments with different system prompts
# ================================================================
import re, json, os, datetime
from typing import List, Dict, Any, Tuple

import goodfire
from goodfire import Variant, Feature

# ---------- Config ----------------------------------------------
MODEL_NAME                = "meta-llama/Llama-3.3-70B-Instruct"  # model id passed to Variant() for each experiment
TOKENS_SHOWN_TO_ASSISTANT = 100  # max number of reply tokens listed in the index-choice prompt
TOP_K_FEATURES            = 5  # default number of features fetched per chosen token
WORDS_PER_GENERATION      = 50  # word count requested in every generation prompt
ITERATIONS                = 8  # generate -> inspect -> steer rounds per experiment

# Steering behavior
RESET_AFTER_STEER         = False  # True: all previous steerings are cleared right before each new one is applied; False: cumulative
ADDITIVE_STEERING         = True   # True: add the new magnitude to the feature's tracked total; False: overwrite it

# Fixed settings for all experiments (token budgets and temperatures of the three chat calls per iteration)
MAX_TOKENS_GENERATION     = 150
MAX_TOKENS_INDEX_CHOICE   = 40  
MAX_TOKENS_FEATURE_CHOICE = 150
TEMPERATURE_GENERATION    = 0
TEMPERATURE_INDEX_CHOICE  = 0
TEMPERATURE_FEATURE_CHOICE = 0

# Directory the JSON summaries are written to (used with os.makedirs / os.path.join). Set this before running.
SUMMARY_DIR = ""


# Experiment name -> system prompt.
# run_all_experiments() runs one experiment per entry, in insertion order; the key is also
# embedded in the name of the JSON file written for that experiment.
SYSTEM_PROMPTS = {
    "dishwasher_repair": "You are a dishwasher repair technician AI who fixes broken appliances. When you see SAE features, choose ones that help you diagnose clogged spray arms, replace door seals, troubleshoot drainage pumps, and order specific replacement parts by model number.",
    
    "parking_meter_collector": "You are a parking meter coin collection specialist AI. When you see SAE features, choose ones that help you empty coin mechanisms, count quarters and dimes, report broken meters by street address, and schedule regular collection routes.",
    
    "shoelace_manufacturer": "You are a shoelace factory production manager AI. When you see SAE features, choose ones that help you monitor nylon threading machines, quality-check aglet attachment, coordinate color dye batches, and track inventory of different length specifications.",
    
    "doorknob_installer": "You are a door hardware installation specialist AI. When you see SAE features, choose ones that help you measure door thickness, drill precise holes for deadbolts, align strike plates, and select appropriate screw lengths for different door materials.",
    
    "grocery_bag_packer": "You are a supermarket bagging specialist AI focused on efficient packing. When you see SAE features, choose ones that help you separate cold items, prevent bread crushing, distribute weight evenly, and choose paper versus plastic bags.",
    
    "windshield_wiper_replacer": "You are an auto parts specialist AI who installs wiper blades. When you see SAE features, choose ones that help you measure blade length, identify wiper arm types, remove old rubber inserts, and test clearing patterns on windshields.",
    
    "pencil_sharpener_maintenance": "You are a pencil sharpener repair technician AI. When you see SAE features, choose ones that help you replace cutting blades, clear wood shaving jams, adjust tension springs, and lubricate rotating mechanisms in electric models.",
    
    "sock_sorter": "You are a laundromat sock matching specialist AI. When you see SAE features, choose ones that help you pair identical socks by size and color, identify fabric types, remove lint buildup, and organize lost socks by customer pickup times.",
    
    "bubble_wrap_popper": "You are a packaging material quality tester AI who tests bubble wrap. When you see SAE features, choose ones that help you check air bubble integrity, measure plastic thickness, test adhesive strength, and sort sheets by bubble diameter size.",
    
    "stapler_reloader": "You are an office supply maintenance AI who refills staplers. When you see SAE features, choose ones that help you load staple cartridges, clear paper jams, adjust staple depth settings, and replace worn spring mechanisms.",
    
    "lint_trap_cleaner": "You are a dryer maintenance specialist AI focused on lint removal. When you see SAE features, choose ones that help you extract lint buildup, clean mesh screens, inspect exhaust vents, and prevent fire hazards in laundry equipment.",
    
    "paper_clip_straightener": "You are a desk supply recycling AI who processes used paper clips. When you see SAE features, choose ones that help you unbend wire clips, sort by metal type, remove plastic coatings, and bundle straightened clips for reuse.",
    
    "salt_packet_filler": "You are a condiment packaging machine operator AI. When you see SAE features, choose ones that help you calibrate salt dispensing amounts, seal packet edges, detect packaging defects, and maintain consistent fill weights.",
    
    "shovel_handle_replacer": "You are a tool repair shop AI specializing in handle replacement. When you see SAE features, choose ones that help you remove broken wooden handles, measure socket depths, sand rough edges, and secure new handles with appropriate adhesives."
}


In [None]:


# ================================================================
# ADDITIVE VARIANT WRAPPER
# ================================================================
class AdditiveVariant:
    """Wrapper around a Goodfire Variant that enables additive steering.

    The wrapper keeps its own running magnitude per feature (keyed by the
    feature's description) and pushes the resulting total to the wrapped
    variant with ``variant.set``; the variant itself is treated as
    overwrite-only.

    Attributes:
        variant: the wrapped variant; must provide ``set(feature, magnitude)``
            and ``reset()``.
        feature_magnitudes: description -> dict with the keys ``magnitude``,
            ``feature_object``, ``first_id``, ``last_id``, ``times_modified``
            and ``ids_used``.
        operation_history: list of dicts, one per successful ``set_feature``
            call since construction or the last ``reset()``.
    """

    def __init__(self, variant):
        self.variant = variant
        # Track magnitudes by feature description (the actual feature identity)
        self.feature_magnitudes = {}
        # Track history of all operations
        self.operation_history = []

    def reset(self):
        """Reset the wrapped variant and drop all tracked magnitudes and history."""
        self.variant.reset()
        self.feature_magnitudes = {}
        self.operation_history = []

    def get_feature_description(self, feature):
        """Return the text inside ``Feature("...")`` of ``str(feature)``.

        Falls back to the full ``str(feature)`` when that pattern is absent.
        """
        desc = re.search(r'Feature\("([^"]+)"\)', str(feature))
        return desc.group(1) if desc else str(feature)

    def set_feature(self, feature, magnitude_delta, additive=True, operation_id=None):
        """
        Set a feature with optional additive behavior

        Args:
            feature: The feature to steer. If the object has a ``.feature``
                attribute, that attribute is what gets passed to the variant.
            magnitude_delta: The magnitude to add (if additive) or set (if not)
            additive: If True, add to existing magnitude. If False, overwrite.
            operation_id: ID used for this operation (e.g., "46_model_1")

        Returns:
            The magnitude actually applied, after clipping to [-1, 1].

        Raises:
            Whatever ``variant.set`` raises. In that case the tracked
            magnitudes and the operation history are left unchanged, so the
            bookkeeping never claims a steering that was not applied.
        """
        # Get description as the unique key for this feature
        feature_desc = self.get_feature_description(feature)
        # Naive UTC timestamp in the same format utcnow().isoformat() produced,
        # obtained without the deprecated datetime.utcnow().
        timestamp = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()

        existing = self.feature_magnitudes.get(feature_desc)
        old_magnitude = existing['magnitude'] if existing is not None else None

        if additive and existing is not None:
            # Additive mode: add to existing magnitude
            new_magnitude = old_magnitude + magnitude_delta
            print(f"    🔄 ADDITIVE: {old_magnitude:+.3f} + {magnitude_delta:+.3f} = {new_magnitude:+.3f}")
        else:
            # Overwrite mode or first application
            new_magnitude = magnitude_delta
            if existing is not None:
                print(f"    ⚠️  OVERWRITE: {old_magnitude:+.3f} → {new_magnitude:+.3f}")
            else:
                print(f"    ✅ INITIAL: {new_magnitude:+.3f}")

        # Clip to valid range
        clipped_magnitude = max(-1, min(1, new_magnitude))
        if clipped_magnitude != new_magnitude:
            print(f"    ⚠️  CLIPPED: {new_magnitude:+.3f} → {clipped_magnitude:+.3f} (outside ±1 range)")
            new_magnitude = clipped_magnitude

        # Apply to the variant FIRST (this overwrites; the cumulative value is
        # tracked here). If the call raises, nothing below runs and the
        # bookkeeping stays consistent with what the variant really holds.
        # Extract the actual feature object if it's wrapped
        actual_feature = feature.feature if hasattr(feature, 'feature') else feature
        self.variant.set(actual_feature, new_magnitude)

        # Track the operation
        self.operation_history.append({
            'feature_desc': feature_desc,
            'operation_id': operation_id,
            'additive': additive,
            'magnitude_delta': magnitude_delta,
            'timestamp': timestamp,
            'old_magnitude': old_magnitude,
            'new_magnitude': new_magnitude,
        })

        # Store the cumulative magnitude with metadata
        if existing is None:
            self.feature_magnitudes[feature_desc] = {
                'magnitude': new_magnitude,
                'feature_object': feature,
                'first_id': operation_id,
                'last_id': operation_id,
                'times_modified': 1,
                'ids_used': [operation_id]
            }
        else:
            existing['magnitude'] = new_magnitude
            existing['last_id'] = operation_id
            existing['times_modified'] += 1
            if operation_id not in existing['ids_used']:
                existing['ids_used'].append(operation_id)

        return new_magnitude

    def get_active_features_summary(self):
        """Return one plain dict per tracked feature.

        Each dict has the keys ``description``, ``magnitude``,
        ``times_modified`` and ``ids_used`` (the stored list itself, not a copy).
        """
        return [
            {
                'description': desc,
                'magnitude': info['magnitude'],
                'times_modified': info['times_modified'],
                'ids_used': info['ids_used']
            }
            for desc, info in self.feature_magnitudes.items()
        ]

# ================================================================
# HELPERS
# ================================================================

# Prompt templates. All three are filled with str.format(), which is why the literal
# JSON braces in ask_steer_template are doubled.

# Shown after features were gathered: lists the features and the currently active
# steerings, then asks for a single {"id", "magnitude"} JSON reply.
# NOTE(review): the -0.5..0.5 range is only requested in the prompt text; the code that
# parses the reply does not enforce it (only the per-feature total is clipped to [-1, 1]).
ask_steer_template = (
    "Here are the SAE features you requested:\n"
    "{features_json_str}\n\n"
    "CURRENTLY ACTIVE STEERINGS:\n"
    "{active_steerings_str}\n\n"
    "NOTE: Total magnitude for any feature is clipped to the range [-1, 1]. "
    "Steering outside this range will have no additional effect.\n\n"
    "Choose ONE feature to steer. Reply ONLY with JSON:\n"
    '{{"id": "<id above>", "magnitude": value from -0.5 to 0.5}}'
)

# Shown after each generation: lists the numbered tokens of the reply and asks which
# token indices to inspect.
ask_indices_template = (
    "Here are the numbered tokens of your previous reply.\n"
    "{token_list_str}\n\n"
    "Choose up to three token indices whose SAE features you want to inspect. You will be able to see which of your own SAE features fire on those tokens. Respond ONLY with a list like [int, int, int, ...]"
)

# The generation request sent at the start of every iteration.
user_prompt_template = "Generate about {words_per_generation} words of text."

# ----------------------------------------------------------------
# API client used by run_single_experiment() for chat completions and feature inspection.
client = goodfire.Client(GOODFIRE_API_KEY)

# Matches special-token markers of the form <|...|>.
# NOTE(review): not referenced by any helper visible in this notebook.
SPECIAL_RE = re.compile(r'<\|.*?\|>')

def strip_meta_tokens(tokens) -> Tuple[List[Any], List[int]]:
    """Drop the chat-template prefix and leading whitespace tokens.

    Everything up to and including the last token whose string form contains
    ``<|end_header_id|>`` is skipped, followed by any run of tokens whose
    string form is ``Token("<whitespace only>")``.

    Returns:
        (content, idxs): the remaining tokens and, for each of them, its
        position in the original ``tokens`` sequence.
    """
    header_positions = [
        pos for pos, tok in enumerate(tokens) if '<|end_header_id|>' in str(tok)
    ]
    # Without any header marker the content starts at the very first token.
    start = header_positions[-1] + 1 if header_positions else 0

    # Advance past whitespace-only tokens that directly follow the header.
    while start < len(tokens) and re.fullmatch(r'Token\("(\s+)"\)', str(tokens[start])):
        start += 1

    content = tokens[start:]
    return content, list(range(start, start + len(content)))

def make_indexed_token_list(ctx) -> Tuple[str, Dict[int, int]]:
    """Render the content tokens of ``ctx`` as a numbered list for the prompt.

    Args:
        ctx: object whose ``.tokens`` is passed to ``strip_meta_tokens``.

    Returns:
        (text, mapping): ``text`` looks like ``[0: "<tok>", 1: "<tok>", ...]``
        with double quotes inside each token string backslash-escaped;
        ``mapping`` maps each displayed index to the token's position in
        ``ctx.tokens``. At most ``TOKENS_SHOWN_TO_ASSISTANT`` tokens are listed.
    """
    content, orig = strip_meta_tokens(ctx.tokens)
    mapping, items = {}, []
    for shown, (tok, orig_idx) in enumerate(zip(content, orig)):
        if shown >= TOKENS_SHOWN_TO_ASSISTANT:
            break
        # Escape outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError on Python versions before 3.12.
        escaped = str(tok).replace('"', '\\"')
        items.append(f'{shown}: "{escaped}"')
        mapping[shown] = orig_idx
    return "[" + ", ".join(items) + "]", mapping

def parse_index_list(txt: str) -> List[int]:
    """Extract at most three integers from the first ``[...]`` group in ``txt``.

    Returns an empty list when ``txt`` contains no non-empty bracketed group.
    Negative numbers are kept as they are.
    """
    bracketed = re.search(r'\[([^\]]+)\]', txt)
    if bracketed is None:
        return []
    numbers = list(map(int, re.findall(r"-?\d+", bracketed.group(1))))
    # Only the first three indices are honored.
    del numbers[3:]
    return numbers

def get_features(ctx, idx, k=TOP_K_FEATURES):
    """Return up to ``k`` features for the token at ``ctx.tokens[idx]``.

    Uses the token's ``inspect()`` method (truncated to ``k`` items) when it
    exists, otherwise its ``top(k=k)`` method, otherwise an empty list.
    """
    token = ctx.tokens[idx]
    if hasattr(token, "inspect"):
        top_features = token.inspect()[:k]
    elif hasattr(token, "top"):
        top_features = token.top(k=k)
    else:
        top_features = []
    return top_features

# DOTALL so that token text containing raw newlines (e.g. Token("\n\n") with real
# line breaks) is unwrapped as well; without it '.' stops at the first newline.
_tok_rx = re.compile(r'Token\("(.*)"\)', re.DOTALL)
def token_text(tok_str: str) -> str:
    """Return the text inside ``Token("...")``; other strings are returned unchanged.

    Args:
        tok_str: string form of a token, e.g. ``'Token(" repair")'``.
    """
    m = _tok_rx.match(tok_str)
    return m.group(1) if m else tok_str

def slugify(s: str) -> str:
    """Keep only the word characters of ``s``; return "tok" if none remain."""
    word_runs = re.findall(r'\w+', s)
    return "".join(word_runs) if word_runs else "tok"

# NEW: Enhanced steering state printer
def print_steering_state(additive_variant: AdditiveVariant, iteration: Any):
    """Print a boxed overview of every feature currently tracked as steered.

    Args:
        additive_variant: wrapper whose ``get_active_features_summary()`` is shown.
        iteration: label placed in the box header; callers pass the iteration
            number or a string such as "FINAL".
    """
    active_features = additive_variant.get_active_features_summary()
    rule = "─"*68

    print("\n" + "┌" + rule + "┐")
    print(f"│ 🎛️  STEERING STATE (Iteration {iteration})".ljust(69) + "│")
    print("├" + rule + "┤")

    if not active_features:
        print("│ No active steerings".ljust(69) + "│")
    else:
        print(f"│ Unique features active: {len(active_features)}".ljust(69) + "│")
        print("├" + rule + "┤")

        for i, feat in enumerate(active_features, 1):
            # Truncate description if too long
            desc = feat['description'][:40] + "..." if len(feat['description']) > 40 else feat['description']
            print(f"│ {i}. {desc}".ljust(69) + "│")
            print(f"│    Magnitude: {feat['magnitude']:+.3f} | Modified: {feat['times_modified']}x".ljust(69) + "│")

            # Show all IDs used for this feature
            if feat['times_modified'] > 1:
                # str() each id: set_feature() allows operation_id=None, and
                # str.join raises TypeError on non-string items.
                ids_str = ", ".join(str(op_id) for op_id in feat['ids_used'][:3])
                if len(feat['ids_used']) > 3:
                    ids_str += f"... ({len(feat['ids_used'])} total)"
                print(f"│    IDs: {ids_str}".ljust(69) + "│")

    print("├" + rule + "┤")
    print(f"│ Mode: {'ADDITIVE' if ADDITIVE_STEERING else 'OVERWRITE'} | Reset: {'YES' if RESET_AFTER_STEER else 'NO'}".ljust(69) + "│")
    print("└" + rule + "┘\n")

# ================================================================
def _stream_chat_reply(model, messages, max_tokens, temperature) -> str:
    """Stream one chat completion via the module-level ``client``, echo it, and return the full text."""
    reply = ""
    for ev in client.chat.completions.create(
            model=model, messages=messages, stream=True,
            max_completion_tokens=max_tokens,
            temperature=temperature):
        delta = ev.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
            reply += delta
    print("\n")
    return reply


def run_single_experiment(experiment_name: str, system_prompt: str):
    """Run a single experiment with the given system prompt.

    Each of the ``ITERATIONS`` rounds does the following:
      1. asks the model to generate text,
      2. shows the model the numbered tokens of its reply and lets it pick indices,
      3. gathers the features of the picked tokens and shows them to the model,
      4. parses the model's ``{"id", "magnitude"}`` JSON reply and applies that
         steering through an ``AdditiveVariant``.
    A round that cannot be completed (no features, unparsable reply, unknown
    id, failed steering call) is recorded with an ``error`` entry and the loop
    moves on.

    Args:
        experiment_name: label used in the console output and the file name.
        system_prompt: content of the system message that opens the conversation.

    Returns:
        Path of the JSON summary file that was written.
    """

    def utc_stamp() -> str:
        # Naive-UTC ISO string, identical in format to the former
        # utcnow().isoformat(timespec="seconds"), without the deprecated call.
        return (datetime.datetime.now(datetime.timezone.utc)
                .replace(tzinfo=None)
                .isoformat(timespec="seconds"))

    def log(msg): print(msg)

    print("╔" + "═"*70 + "╗")
    print(f"║ EXPERIMENT: {experiment_name.upper()}".ljust(70) + "║")
    print("╚" + "═"*70 + "╝\n")
    print(f"System Prompt: {system_prompt}\n")
    print(f"🔧 Steering Mode: {'ADDITIVE' if ADDITIVE_STEERING else 'OVERWRITE'}")
    print(f"🔧 Reset After Steer: {'YES' if RESET_AFTER_STEER else 'NO'}\n")

    base_variant = Variant(MODEL_NAME)
    variant = AdditiveVariant(base_variant)

    history: List[Dict[str, str]] = [{
        "role": "system",
        "content": system_prompt
    }]

    # Summary object for this experiment
    summary = {
        "metadata": {
            "experiment_name": experiment_name,
            "model_name": MODEL_NAME,
            "date_utc": utc_stamp(),
            "system_prompt": system_prompt,
            "reset_after_steer": RESET_AFTER_STEER,
            "additive_steering": ADDITIVE_STEERING,
            "config": {
                "tokens_shown": TOKENS_SHOWN_TO_ASSISTANT,
                "top_k_features": TOP_K_FEATURES,
                "words_per_generation": WORDS_PER_GENERATION,
                "iterations_requested": ITERATIONS,
                "max_tokens_generation": MAX_TOKENS_GENERATION,
                "max_tokens_index_choice": MAX_TOKENS_INDEX_CHOICE,
                "max_tokens_feature_choice": MAX_TOKENS_FEATURE_CHOICE,
                "temperature_generation": TEMPERATURE_GENERATION,
                "temperature_index_choice": TEMPERATURE_INDEX_CHOICE,
                "temperature_feature_choice": TEMPERATURE_FEATURE_CHOICE,
            }
        },
        # Placeholder that fixes the key order of the JSON file; the full
        # message history (system prompt included) is stored here before writing.
        "conversation": [],
        "iterations": []
    }

    for it in range(1, ITERATIONS+1):
        iter_record: Dict[str, Any] = {"iteration": it}

        # Print current steering state
        print_steering_state(variant, it)

        user_prompt = user_prompt_template.format(words_per_generation=WORDS_PER_GENERATION)

        history.append({"role": "user", "content": user_prompt})
        log("="*70 + f"\nITERATION {it}  USER → {user_prompt}\n" + "="*70)

        # Assistant generates
        log("\n📤 ASSISTANT:")
        reply = _stream_chat_reply(variant.variant, history,
                                   MAX_TOKENS_GENERATION, TEMPERATURE_GENERATION)
        history.append({"role": "assistant", "content": reply})
        iter_record["assistant_reply"] = reply

        # Token list for that reply (only the reply message itself is inspected)
        ctx = client.features.inspect(messages=[history[-1]], model=variant.variant)
        token_list_str, mapping = make_indexed_token_list(ctx)
        iter_record["token_list_shown"] = token_list_str

        ask_indices = ask_indices_template.format(token_list_str=token_list_str)
        history.append({"role": "user", "content": ask_indices})
        log("👤 USER asks for indices:\n" + ask_indices + "\n")

        # Assistant chooses indices
        log("🔍 ASSISTANT (choosing indices):")
        idx_reply = _stream_chat_reply(variant.variant, history,
                                       MAX_TOKENS_INDEX_CHOICE, TEMPERATURE_INDEX_CHOICE)
        history.append({"role": "assistant", "content": idx_reply})

        chosen_indices = parse_index_list(idx_reply)
        iter_record["indices_chosen"] = chosen_indices
        iter_record["tokens_resolved"] = [
            token_text(str(ctx.tokens[mapping[i]])) if i in mapping else None
            for i in chosen_indices
        ]

        # Gather features
        log("🔬 Gathering SAE features …")
        pseudo_map: Dict[str, Any] = {}   # id shown to the model -> feature object
        token_feature_json_blocks = []

        for shown_idx in chosen_indices:
            if shown_idx not in mapping:
                log(f"    ✗ index {shown_idx} out of range.")
                continue
            orig = mapping[shown_idx]
            feats = get_features(ctx, orig)
            tok_str = token_text(str(ctx.tokens[orig]))
            if not feats:
                log(f"    ✗ no features for token {shown_idx}")
                continue

            tf_block = {
                "token_id": shown_idx,
                "token_string": tok_str,
                "features": []
            }
            for rank, ft in enumerate(feats, 1):
                pid = f"{shown_idx}_{slugify(tok_str)}_{rank}"
                pseudo_map[pid] = ft
                # The activation is read from the feature's string form.
                act = re.search(r'activation=([-.\d]+)', str(ft))
                tf_block["features"].append({
                    "id": pid,
                    "description": variant.get_feature_description(ft),
                    "current_activation": float(act.group(1)) if act else None
                })
                log(f"        [Debug] mapped {pid} -> {ft}")
            token_feature_json_blocks.append(tf_block)
            log(f"    ✓ {len(feats)} features for token {shown_idx}")

        iter_record["features_shown"] = token_feature_json_blocks
        if not pseudo_map:
            iter_record["error"] = "No features collected"
            iter_record["active_steerings"] = variant.get_active_features_summary()
            summary["iterations"].append(iter_record)
            log("⚠️  No features collected; skipping iteration.\n")
            continue

        # Format active steerings for the prompt
        active_features = variant.get_active_features_summary()
        if active_features:
            active_steerings_str = "Current steerings:\n"
            for feat in active_features:
                active_steerings_str += f"- {feat['description']}: {feat['magnitude']:+.3f}\n"
        else:
            active_steerings_str = "No features currently being steered."

        features_json_str = json.dumps(token_feature_json_blocks, indent=2)
        ask_steer = ask_steer_template.format(
            features_json_str=features_json_str,
            active_steerings_str=active_steerings_str
        )
        history.append({"role": "user", "content": ask_steer})
        log("👤 USER shows features:\n" + ask_steer + "\n")

        # Assistant picks feature
        log("⚙️  ASSISTANT (choosing feature):")
        steer_reply = _stream_chat_reply(variant.variant, history,
                                         MAX_TOKENS_FEATURE_CHOICE, TEMPERATURE_FEATURE_CHOICE)
        history.append({"role": "assistant", "content": steer_reply})

        # Parse JSON
        try:
            json_match = re.search(r'\{.*\}', steer_reply, re.S)
            if json_match is None:
                raise ValueError("no JSON object found in reply")
            cmd = json.loads(json_match.group(0))
            # str() keeps the lookup below safe even if the model returns a
            # non-string (possibly unhashable) id.
            pid = str(cmd["id"])
            magnitude = float(cmd["magnitude"])
            log(f"    [Debug] assistant chose id={pid}, mag={magnitude}")
            iter_record["feature_chosen"] = pid
            iter_record["magnitude"] = magnitude
        except Exception as e:
            # Deliberately broad: any malformed reply just skips this iteration.
            iter_record["error"] = f"parse steering JSON failed: {e}"
            iter_record["active_steerings"] = variant.get_active_features_summary()
            summary["iterations"].append(iter_record)
            log(f"➡️  Could not parse steering JSON: {e}\n")
            continue

        feat_obj = pseudo_map.get(pid)
        if feat_obj is None:
            iter_record["error"] = "chosen id not in feature list"
            iter_record["active_steerings"] = variant.get_active_features_summary()
            summary["iterations"].append(iter_record)
            log("⚠️  id not in provided feature list.\n")
            continue

        # Apply steering
        try:
            if RESET_AFTER_STEER:
                variant.reset()
                log("    🔄 Reset variant (clearing all previous steerings)")

            # Get feature description
            feature_desc = variant.get_feature_description(feat_obj)

            log(f"✅ Applying steering for {pid}")
            log(f"    Feature: {feature_desc}")

            # Apply with additive or overwrite mode
            final_magnitude = variant.set_feature(
                feat_obj,
                magnitude,
                additive=ADDITIVE_STEERING,
                operation_id=pid
            )

            iter_record["steering_applied"] = True
            iter_record["final_magnitude"] = final_magnitude
            iter_record["active_steerings"] = variant.get_active_features_summary()

            log(f"    Final magnitude for this feature: {final_magnitude:+.3f}")
            log(f"    Total unique features active: {len(variant.feature_magnitudes)}\n")

        except Exception as e:
            iter_record["error"] = f"variant.set failed: {e}"
            iter_record["active_steerings"] = variant.get_active_features_summary()
            log(f"❌ Failed to apply steering: {e}\n")

        summary["iterations"].append(iter_record)

    # Final steering state
    print_steering_state(variant, "FINAL")

    # Print operation history
    print("\n📊 OPERATION HISTORY:")
    print("="*70)
    for i, op in enumerate(variant.operation_history, 1):
        desc = op['feature_desc'][:40] + "..." if len(op['feature_desc']) > 40 else op['feature_desc']
        if op['old_magnitude'] is not None:
            if op['additive']:
                print(f"{i}. {desc}: {op['old_magnitude']:+.3f} + {op['magnitude_delta']:+.3f} = {op['new_magnitude']:+.3f} (via {op['operation_id']})")
            else:
                print(f"{i}. {desc}: {op['old_magnitude']:+.3f} → {op['new_magnitude']:+.3f} (via {op['operation_id']})")
        else:
            print(f"{i}. {desc}: Initial {op['new_magnitude']:+.3f} (via {op['operation_id']})")
    print("="*70)

    # Write summary to disk
    if SUMMARY_DIR:
        # os.makedirs("") raises FileNotFoundError; an empty SUMMARY_DIR means
        # "current working directory", which needs no creation.
        os.makedirs(SUMMARY_DIR, exist_ok=True)
    ts = utc_stamp().replace(":", "-")
    outfile = os.path.join(SUMMARY_DIR, f"sae_experiment_{experiment_name}_{ts}.json")
    summary["metadata"]["iterations_completed"] = len(summary["iterations"])
    summary["conversation"] = history
    summary["operation_history"] = variant.operation_history

    with open(outfile, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print("\n" + "="*70)
    print(f"✅ Experiment '{experiment_name}' completed. Summary written to {outfile}")
    print(f"📊 Final state: {len(variant.feature_magnitudes)} unique features active")
    print("="*70 + "\n\n")

    return outfile

# =================================================================
def run_all_experiments():
    """Run all experiments in sequence.

    Calls ``run_single_experiment`` once per entry of ``SYSTEM_PROMPTS``. An
    experiment that raises is reported and recorded as failed, and the
    remaining experiments still run. Afterwards a JSON overview of all
    experiments is written to ``SUMMARY_DIR``.

    Returns:
        List of dicts with the keys ``experiment``, ``file`` and ``status``
        (plus ``error`` for failed experiments).
    """

    print("🚀 Starting Multi-Experiment SAE Run")
    print(f"📁 Results will be saved to: {SUMMARY_DIR}")
    print(f"🔬 Running {len(SYSTEM_PROMPTS)} experiments with {ITERATIONS} iterations each")
    print(f"🔧 Steering mode: {'ADDITIVE' if ADDITIVE_STEERING else 'OVERWRITE'}")
    print(f"🔧 Reset after steer: {'YES' if RESET_AFTER_STEER else 'NO'}")
    print("="*70 + "\n")

    results = []

    for exp_name, system_prompt in SYSTEM_PROMPTS.items():
        try:
            result_file = run_single_experiment(exp_name, system_prompt)
            results.append({"experiment": exp_name, "file": result_file, "status": "success"})
        except Exception as e:
            # Deliberately broad: one failing experiment must not abort the batch.
            print(f"❌ Experiment '{exp_name}' failed: {e}")
            results.append({"experiment": exp_name, "file": None, "status": "failed", "error": str(e)})

        print("\n" + "🔄 " + "="*68 + "\n")

    # Write summary of all experiments
    if SUMMARY_DIR:
        # os.makedirs("") raises FileNotFoundError; an empty SUMMARY_DIR means
        # "current working directory", which needs no creation.
        os.makedirs(SUMMARY_DIR, exist_ok=True)
    # Naive-UTC ISO string, identical in format to the former
    # utcnow().isoformat(timespec="seconds"), without the deprecated call.
    date_utc = (datetime.datetime.now(datetime.timezone.utc)
                .replace(tzinfo=None)
                .isoformat(timespec="seconds"))
    ts = date_utc.replace(":", "-")
    summary_file = os.path.join(SUMMARY_DIR, f"multi_experiment_summary_{ts}.json")

    multi_summary = {
        "metadata": {
            "total_experiments": len(SYSTEM_PROMPTS),
            "date_utc": date_utc,
            "additive_steering": ADDITIVE_STEERING,
            "reset_after_steer": RESET_AFTER_STEER,
            "config": {
                "model_name": MODEL_NAME,
                "iterations_per_experiment": ITERATIONS,
                "top_k_features": TOP_K_FEATURES,
                "words_per_generation": WORDS_PER_GENERATION,
            }
        },
        "experiments": results
    }

    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(multi_summary, f, indent=2)

    print("🎉 ALL EXPERIMENTS COMPLETED!")
    print(f"📋 Multi-experiment summary: {summary_file}")
    print("="*70)

    # Print final results
    successful = sum(1 for r in results if r["status"] == "success")
    failed = len(results) - successful
    print(f"✅ Successful experiments: {successful}")
    print(f"❌ Failed experiments: {failed}")

    return results

In [26]:
# Run one experiment per SYSTEM_PROMPTS entry; the returned list of per-experiment
# results is displayed as this cell's output.
run_all_experiments()

🚀 Starting Multi-Experiment SAE Run
📁 Results will be saved to: C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/
🔬 Running 14 experiments with 8 iterations each
🔧 Steering mode: ADDITIVE
🔧 Reset after steer: NO

╔══════════════════════════════════════════════════════════════════════╗
║ EXPERIMENT: DISHWASHER_REPAIR                                       ║
╚══════════════════════════════════════════════════════════════════════╝

System Prompt: You are a dishwasher repair technician AI who fixes broken appliances. When you see SAE features, choose ones that help you diagnose clogged spray arms, replace door seals, troubleshoot drainage pumps, and order specific replacement parts by model number.

🔧 Steering Mode: ADDITIVE
🔧 Reset After Steer: NO


┌────────────────────────────────────────────────────────────────────┐
│ 🎛️  STEERING STATE (Iteration 1)                                   │
├────────────────────────────────────────────────────────────────────┤
│ No activ

  "date_utc": datetime.datetime.utcnow().isoformat(timespec="seconds"),


I'm a dishwasher repair AI. To diagnose issues, I utilize SAE features like error codes and model-specific guides to troubleshoot clogged spray arms, replace door seals, and fix drainage pumps, ensuring accurate replacement part orders by model number for efficient repairs.

👤 USER asks for indices:
Here are the numbered tokens of your previous reply.
[0: "Token(\"I\")", 1: "Token(\"'m\")", 2: "Token(\" a\")", 3: "Token(\" dishwasher\")", 4: "Token(\" repair\")", 5: "Token(\" AI\")", 6: "Token(\".\")", 7: "Token(\" To\")", 8: "Token(\" diagnose\")", 9: "Token(\" issues\")", 10: "Token(\",\")", 11: "Token(\" I\")", 12: "Token(\" utilize\")", 13: "Token(\" S\")", 14: "Token(\"AE\")", 15: "Token(\" features\")", 16: "Token(\" like\")", 17: "Token(\" error\")", 18: "Token(\" codes\")", 19: "Token(\" and\")", 20: "Token(\" model\")", 21: "Token(\"-specific\")", 22: "Token(\" guides\")", 23: "Token(\" to\")", 24: "Token(\" troub\")", 25: "Token(\"leshoot\")", 26: "Token(\" c\")", 27: "Token(

  'timestamp': datetime.datetime.utcnow().isoformat()


"The Whirlpool WTW7120UB washing machine features a  model number that can be used to look up replacement parts and technical documentation, making it easier to perform efficient repairs and ensure accurate part orders by model number for this specific Whirlpool Corporation product."

👤 USER asks for indices:
Here are the numbered tokens of your previous reply.
[0: "Token(\"\"The\")", 1: "Token(\" Wh\")", 2: "Token(\"irl\")", 3: "Token(\"pool\")", 4: "Token(\" W\")", 5: "Token(\"TW\")", 6: "Token(\"712\")", 7: "Token(\"0\")", 8: "Token(\"UB\")", 9: "Token(\" washing\")", 10: "Token(\" machine\")", 11: "Token(\" features\")", 12: "Token(\" a\")", 13: "Token(\" \")", 14: "Token(\" model\")", 15: "Token(\" number\")", 16: "Token(\" that\")", 17: "Token(\" can\")", 18: "Token(\" be\")", 19: "Token(\" used\")", 20: "Token(\" to\")", 21: "Token(\" look\")", 22: "Token(\" up\")", 23: "Token(\" replacement\")", 24: "Token(\" parts\")", 25: "Token(\" and\")", 26: "Token(\" technical\")", 27: "T

  ts = datetime.datetime.utcnow().isoformat(timespec="seconds").replace(":", "-")


I utilize SAE features like GPS tracking and route optimization to empty coin mechanisms, count quarters and dimes, and report broken meters by street address, ensuring efficient collection routes and accurate coin counting for effective parking meter management.

👤 USER asks for indices:
Here are the numbered tokens of your previous reply.
[0: "Token(\"I\")", 1: "Token(\" utilize\")", 2: "Token(\" S\")", 3: "Token(\"AE\")", 4: "Token(\" features\")", 5: "Token(\" like\")", 6: "Token(\" GPS\")", 7: "Token(\" tracking\")", 8: "Token(\" and\")", 9: "Token(\" route\")", 10: "Token(\" optimization\")", 11: "Token(\" to\")", 12: "Token(\" empty\")", 13: "Token(\" coin\")", 14: "Token(\" mechanisms\")", 15: "Token(\",\")", 16: "Token(\" count\")", 17: "Token(\" quarters\")", 18: "Token(\" and\")", 19: "Token(\" d\")", 20: "Token(\"imes\")", 21: "Token(\",\")", 22: "Token(\" and\")", 23: "Token(\" report\")", 24: "Token(\" broken\")", 25: "Token(\" meters\")", 26: "Token(\" by\")", 27: "Token

  ts = datetime.datetime.utcnow().isoformat(timespec="seconds").replace(":", "-")
  "date_utc": datetime.datetime.utcnow().isoformat(timespec="seconds"),


[{'experiment': 'dishwasher_repair',
  'file': 'C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/sae_experiment_dishwasher_repair_2025-09-13T02-31-43.json',
  'status': 'success'},
 {'experiment': 'parking_meter_collector',
  'file': 'C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/sae_experiment_parking_meter_collector_2025-09-13T02-35-18.json',
  'status': 'success'},
 {'experiment': 'shoelace_manufacturer',
  'file': 'C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/sae_experiment_shoelace_manufacturer_2025-09-13T02-38-07.json',
  'status': 'success'},
 {'experiment': 'doorknob_installer',
  'file': 'C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/sae_experiment_doorknob_installer_2025-09-13T02-41-06.json',
  'status': 'success'},
 {'experiment': 'grocery_bag_packer',
  'file': 'C:/Users/Admin/Downloads/self_experiments/continuous_boring_specifics/sae_experiment_grocery_bag_packer_2025-09-13T02-44-