# W&B Results Analysis

Pull experiment metrics from Weights & Biases, build comparison tables, and export LaTeX for the paper.

In [1]:
import os
from pathlib import Path

import wandb
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
try:
    import seaborn as sns
except ImportError:
    sns = None
from dotenv import load_dotenv
import warnings

# Load .env from the project root: two levels above this file when a __file__
# is available, otherwise the parent of the working directory (notebooks/).
if "__file__" in dir():
    _project_root = Path(__file__).resolve().parent.parent
else:
    _project_root = Path.cwd().parent
load_dotenv(_project_root / ".env")

warnings.filterwarnings("ignore", category=FutureWarning)

# Wide display settings so the many config/metric columns stay readable.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_colwidth", 40),
    ("display.width", 200),
):
    pd.set_option(_option, _value)

# Shared W&B API client used by the data-fetching cell.
api = wandb.Api()

[34m[1mwandb[0m: [wandb.Api()] Loaded credentials for https://api.wandb.ai from /home/vlad.smirnov/.netrc.


In [15]:
# ── Configuration ──────────────────────────────────────────────────────────────
#
# Organized by dataset -> strategy. Each group entry is a dict with:
#   "group_url":  link to the wandb group workspace
#   "runs":       list of {"seed": <int>, "run_url": "<url>"}
#
# Single-group strategies (baseline, extended_thinking, self_consistency):
#   use a single dict wrapped in a list: [{"group_url": ..., "runs": [...]}]
#
# Multi-group strategies (offline_bon, beam_search, adaptive_scaling/MUR — usually one group
# per scorer/aggregation/window; a single multi-scorer group is also valid):
#   use a list of dicts: [{"group_url": ..., "runs": [...]}, {"group_url": ..., "runs": [...]}, ...]
#
# Fill in the URLs. Entries with empty group_url or runs are skipped.

# ── AIME 2024 ────────────────────────────────────────────────────────────────

# Single-group strategies: one placeholder entry each until URLs are filled in.
AIME_24_BASELINE_RUNS = [
    dict(
        group_url="",
        runs=[
            # Template for the per-seed entries:
            # {"seed": 42, "run_url": ""},
            # {"seed": 43, "run_url": ""},
            # {"seed": 44, "run_url": ""},
        ],
    ),
]

AIME_24_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

AIME_24_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# One entry per scorer, e.g.:
# {"group_url": ".../groups/offline_bon_..._entropy", "runs": [...]},
# {"group_url": ".../groups/offline_bon_..._prm", "runs": [...]},
# {"group_url": ".../groups/offline_bon_..._perplexity", "runs": [...]},
# {"group_url": ".../groups/offline_bon_..._sequence_prob", "runs": [...]},
AIME_24_OFFLINE_BON_RUNS = []

# One entry per scorer x aggregation x window, e.g.:
# {"group_url": ".../groups/beam_search_..._entropy_window_5_mean", "runs": [...]},
# {"group_url": ".../groups/beam_search_..._entropy_window_5_max", "runs": [...]},
# {"group_url": ".../groups/beam_search_..._prm_window_all_mean", "runs": [...]},
AIME_24_BEAM_SEARCH_RUNS = []

# Flat list of every AIME 2024 group entry, in strategy order.
AIME_24_RUNS = (
    AIME_24_BASELINE_RUNS
    + AIME_24_EXTENDED_THINKING_RUNS
    + AIME_24_SELF_CONSISTENCY_RUNS
    + AIME_24_OFFLINE_BON_RUNS
    + AIME_24_BEAM_SEARCH_RUNS
)

# ── AIME 2025 ────────────────────────────────────────────────────────────────

# Single-group strategies: placeholder entries (skipped while group_url/runs are empty).
AIME_25_BASELINE_RUNS = [dict(group_url="", runs=[])]

AIME_25_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

AIME_25_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# Multi-group strategies: no groups configured.
AIME_25_OFFLINE_BON_RUNS = []

AIME_25_BEAM_SEARCH_RUNS = []

# Flat list of every AIME 2025 group entry, in strategy order.
AIME_25_RUNS = (
    AIME_25_BASELINE_RUNS
    + AIME_25_EXTENDED_THINKING_RUNS
    + AIME_25_SELF_CONSISTENCY_RUNS
    + AIME_25_OFFLINE_BON_RUNS
    + AIME_25_BEAM_SEARCH_RUNS
)

# ── MATH 500 ─────────────────────────────────────────────────────────────────

# Every MATH 500 group configured below lives under this W&B project.
_MATH500_GROUPS_URL = "https://wandb.ai/nlpresearch.group/llm-tts-eval-math500/groups"


def _math500_group(group: str, run_ids_by_seed: dict) -> dict:
    """Build one group entry (group_url plus per-seed run_url) for a MATH 500 group.

    Args:
        group: W&B group name (last path segment of the group URL).
        run_ids_by_seed: mapping seed -> W&B run id, in the order the runs
            should appear in the entry.

    Returns:
        {"group_url": ..., "runs": [{"seed": ..., "run_url": ...}, ...]}
    """
    group_url = f"{_MATH500_GROUPS_URL}/{group}"
    return {
        "group_url": group_url,
        "runs": [
            {"seed": seed, "run_url": f"{group_url}/runs/{run_id}"}
            for seed, run_id in run_ids_by_seed.items()
        ],
    }


MATH500_BASELINE_RUNS = [
    _math500_group(
        "baseline_qwen25_math_7b_instruct_math500",
        {42: "qz0418nv", 43: "3bqhwvgp", 44: "bjzqimk4"},
    ),
]

MATH500_SELF_CONSISTENCY_RUNS = [
    _math500_group(
        "self_consistency_qwen25_math_7b_math500",
        {42: "ky44b84m", 43: "gtia4gii", 44: "v87vmndj"},
    ),
]

MATH500_OFFLINE_BON_RUNS = [
    _math500_group(
        "offline_bon_qwen25_math_7b_instruct_math500_multi_scorer",
        {42: "c35z6knc", 43: "d7jh7cbj", 44: "cz45vmb2"},
    ),
]

MATH500_BEAM_SEARCH_RUNS = []

# Adaptive-scaling groups: one group per scorer.
MATH500_MUR_RUNS = [
    _math500_group(
        "adaptive_scaling_qwen25_math_7b_instruct_math500_prm",
        {42: "vd5vmy7u", 43: "1a781e39", 44: "rj4rt3i2"},
    ),
    _math500_group(
        "adaptive_scaling_qwen25_math_7b_instruct_math500_entropy",
        {42: "tfztdzjl", 43: "ggqllnmy", 44: "aw88mzyl"},
    ),
    _math500_group(
        "adaptive_scaling_qwen25_math_7b_instruct_math500_perplexity",
        {42: "r6oumz5s", 43: "4ds5ewag", 44: "5xe2x66l"},
    ),
    _math500_group(
        "adaptive_scaling_qwen25_math_7b_instruct_math500_sequence_prob",
        {42: "j9a1j7mx", 43: "zcdc7nni", 44: "otrelz7z"},
    ),
]

# Flat list of every MATH 500 group entry, in strategy order.
MATH500_RUNS = (
    MATH500_BASELINE_RUNS
    + MATH500_SELF_CONSISTENCY_RUNS
    + MATH500_OFFLINE_BON_RUNS
    + MATH500_BEAM_SEARCH_RUNS
    + MATH500_MUR_RUNS
)

# ── Minerva Math ─────────────────────────────────────────────────────────────

# Single-group strategies: placeholder entries (skipped while group_url/runs are empty).
MINERVA_BASELINE_RUNS = [dict(group_url="", runs=[])]

MINERVA_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

MINERVA_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# Multi-group strategies: no groups configured.
MINERVA_OFFLINE_BON_RUNS = []

MINERVA_BEAM_SEARCH_RUNS = []

# Flat list of every Minerva Math group entry, in strategy order.
MINERVA_RUNS = (
    MINERVA_BASELINE_RUNS
    + MINERVA_EXTENDED_THINKING_RUNS
    + MINERVA_SELF_CONSISTENCY_RUNS
    + MINERVA_OFFLINE_BON_RUNS
    + MINERVA_BEAM_SEARCH_RUNS
)

# ── GPQA Diamond ─────────────────────────────────────────────────────────────

# Single-group strategies: placeholder entries (skipped while group_url/runs are empty).
GPQA_BASELINE_RUNS = [dict(group_url="", runs=[])]

GPQA_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

GPQA_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# Multi-group strategies: no groups configured.
GPQA_OFFLINE_BON_RUNS = []

GPQA_BEAM_SEARCH_RUNS = []

# Flat list of every GPQA Diamond group entry, in strategy order.
GPQA_RUNS = (
    GPQA_BASELINE_RUNS
    + GPQA_EXTENDED_THINKING_RUNS
    + GPQA_SELF_CONSISTENCY_RUNS
    + GPQA_OFFLINE_BON_RUNS
    + GPQA_BEAM_SEARCH_RUNS
)

# ── Gaokao 2023 EN ──────────────────────────────────────────────────────────

# Single-group strategies: placeholder entries (skipped while group_url/runs are empty).
GAOKAO_BASELINE_RUNS = [dict(group_url="", runs=[])]

GAOKAO_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

GAOKAO_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# Multi-group strategies: no groups configured.
GAOKAO_OFFLINE_BON_RUNS = []

GAOKAO_BEAM_SEARCH_RUNS = []

# Flat list of every Gaokao 2023 EN group entry, in strategy order.
GAOKAO_RUNS = (
    GAOKAO_BASELINE_RUNS
    + GAOKAO_EXTENDED_THINKING_RUNS
    + GAOKAO_SELF_CONSISTENCY_RUNS
    + GAOKAO_OFFLINE_BON_RUNS
    + GAOKAO_BEAM_SEARCH_RUNS
)

# ── OlympiadBench ────────────────────────────────────────────────────────────

# Single-group strategies: placeholder entries (skipped while group_url/runs are empty).
OLYMPIAD_BASELINE_RUNS = [dict(group_url="", runs=[])]

OLYMPIAD_EXTENDED_THINKING_RUNS = [dict(group_url="", runs=[])]

OLYMPIAD_SELF_CONSISTENCY_RUNS = [dict(group_url="", runs=[])]

# Multi-group strategies: no groups configured.
OLYMPIAD_OFFLINE_BON_RUNS = []

OLYMPIAD_BEAM_SEARCH_RUNS = []

# Flat list of every OlympiadBench group entry, in strategy order.
OLYMPIAD_RUNS = (
    OLYMPIAD_BASELINE_RUNS
    + OLYMPIAD_EXTENDED_THINKING_RUNS
    + OLYMPIAD_SELF_CONSISTENCY_RUNS
    + OLYMPIAD_OFFLINE_BON_RUNS
    + OLYMPIAD_BEAM_SEARCH_RUNS
)

# ── All experiments ──────────────────────────────────────────────────────────

# Every group entry across all datasets; the fetch cell iterates over this list
# and skips entries whose group_url or runs are empty.
EXPERIMENT_RUNS = [
    *AIME_24_RUNS,
    *AIME_25_RUNS,
    *MATH500_RUNS,
    *MINERVA_RUNS,
    *GPQA_RUNS,
    *GAOKAO_RUNS,
    *OLYMPIAD_RUNS,
]

# Which evaluator metric to use as the primary accuracy column
# NOTE(review): no cell visible in this notebook reads this constant yet.
PRIMARY_EVALUATOR = "exact_match"  # or "llm_judge"

# Optional substrings matched against W&B group names in the cleaning cell:
# a run is kept when its group name contains at least one of them.
# An empty list disables the filter.
GROUP_FILTERS = []

In [16]:
# ── Data Fetching ─────────────────────────────────────────────────────────────
import re
from urllib.parse import urlparse


def parse_group_url(url: str) -> dict:
    """Split a wandb group URL into its entity, project, and group name.

    The URL path must start with ``<entity>/<project>/groups/<group>``; any
    further path segments and the query string are ignored.

    Raises:
        ValueError: if the path does not have that shape.
    """
    segments = urlparse(url).path.strip("/").split("/")
    well_formed = (
        len(segments) >= 4
        and segments[2] == "groups"
        and all(segments[i] for i in (0, 1, 3))
    )
    if well_formed:
        return {"entity": segments[0], "project": segments[1], "group": segments[3]}
    raise ValueError(f"Cannot parse group URL: {url}")


def parse_run_url(url: str) -> dict:
    """Split a wandb run URL into its entity, project, and run_id.

    Two path layouts are accepted (trailing segments are ignored):
      <entity>/<project>/runs/RUN_ID
      <entity>/<project>/groups/GROUP/runs/RUN_ID

    Raises:
        ValueError: if the path matches neither layout.
    """
    segments = urlparse(url).path.strip("/").split("/")

    # Skip over an optional "groups/<group>" pair that sits before "runs".
    has_group_prefix = len(segments) >= 6 and segments[2] == "groups" and bool(segments[3])
    tail = segments[4:] if has_group_prefix else segments[2:]

    well_formed = (
        len(segments) >= 4
        and bool(segments[0])
        and bool(segments[1])
        and len(tail) >= 2
        and tail[0] == "runs"
        and bool(tail[1])
    )
    if not well_formed:
        raise ValueError(f"Cannot parse run URL: {url}")
    return {"entity": segments[0], "project": segments[1], "run_id": tail[1]}


def fetch_run(entity: str, project: str, run_id: str, group_name: str, seed: int) -> dict:
    """Look up one W&B run and flatten it into a single record.

    The record combines the identifiers passed in (``group_name``, ``seed``,
    ``project``, ``entity``), selected fields from the run config, and selected
    keys from the run summary. Fields absent from the config/summary are None.
    """
    run = api.run(f"{entity}/{project}/{run_id}")
    config = run.config
    summary = run.summary._json_dict

    strategy_section = config.get("strategy", {})
    scorer_section = config.get("scorer", {})
    model_section = config.get("model", {})
    dataset_section = config.get("dataset", {})

    # (record column, summary key) pairs, in output column order.
    summary_fields = (
        ("exact_match", "exact_match/accuracy"),
        ("llm_judge_accuracy", "llm_judge/accuracy"),
        ("avg_reasoning_steps", "avg_reasoning_steps_per_trajectory"),
        ("total_tokens", "compute/total_tokens"),
        ("total_input_tokens", "compute/total_input_tokens"),
        ("total_output_tokens", "compute/total_output_tokens"),
        ("total_tflops", "compute/total_tflops"),
        ("avg_tokens_per_sample", "compute/avg_tokens_per_sample"),
        ("avg_output_tokens_per_sample", "compute/avg_output_tokens_per_sample"),
        ("avg_tflops_per_sample", "compute/avg_tflops_per_sample"),
        ("total_generations", "compute/total_generations"),
        ("prm_tflops", "compute/prm_tflops"),
        ("total_samples", "total_samples"),
        ("completed", "completed"),
    )

    # identifiers
    record = {
        "run_id": run.id,
        "run_name": run.name,
        "group": group_name,
        "state": run.state,
        "project": project,
        "entity": entity,
        "seed": seed,
    }

    # config fields
    record["strategy"] = strategy_section.get("type")
    record["scorer"] = scorer_section.get("type")
    for key in ("aggregation", "scoring_window", "scoring_window_label"):
        record[key] = strategy_section.get(key)
    # Prefer the short model name; fall back to the full one.
    record["model"] = model_section.get("model_short_name") or model_section.get("model_name")
    record["dataset"] = dataset_section.get("data_name")
    for key in ("beam_size", "candidates_per_beam", "num_paths", "num_candidates", "max_steps"):
        record[key] = strategy_section.get(key)

    # summary metrics
    for column, summary_key in summary_fields:
        record[column] = summary.get(summary_key)

    return record


# Fetch all specified runs (skip entries with empty group_url or runs).
# Fetching is best-effort per run: a run that cannot be parsed or fetched is
# reported and recorded in `failed_runs`, and the remaining runs still load.
records = []
failed_runs = []  # one {"group", "seed", "run_url", "error"} dict per failed run
for entry in EXPERIMENT_RUNS:
    if not entry.get("group_url") or not entry.get("runs"):
        continue

    group_info = parse_group_url(entry["group_url"])
    group_name = group_info["group"]
    print(f"Group: {group_name}")

    for run_entry in entry["runs"]:
        seed = run_entry.get("seed")
        try:
            # URL parsing lives inside the try so that a malformed run URL is
            # handled like any other per-run failure instead of aborting the loop.
            run_info = parse_run_url(run_entry["run_url"])
            record = fetch_run(run_info["entity"], run_info["project"],
                               run_info["run_id"], group_name, seed)
        except Exception as e:
            failed_runs.append({
                "group": group_name,
                "seed": seed,
                "run_url": run_entry.get("run_url"),
                "error": str(e),
            })
            print(f"  ERROR fetching seed={seed}: {e}")
            continue

        records.append(record)
        print(f"  seed={seed}  {record['state']}  "
              f"exact_match={record.get('exact_match')}")

raw_df = pd.DataFrame(records)
print(f"\nTotal runs fetched: {len(raw_df)}")
if failed_runs:
    # Make partial data impossible to miss before it flows into the tables.
    print(f"WARNING: {len(failed_runs)} run(s) could not be fetched; see `failed_runs`.")
raw_df.head()

Group: baseline_qwen25_math_7b_instruct_math500
  seed=42  finished  exact_match=0.832
  seed=43  finished  exact_match=0.832
  seed=44  finished  exact_match=0.832
Group: self_consistency_qwen25_math_7b_math500
  seed=42  finished  exact_match=0.862
  seed=43  finished  exact_match=0.87
  seed=44  finished  exact_match=0.86
Group: offline_bon_qwen25_math_7b_instruct_math500_multi_scorer
  seed=42  finished  exact_match=0.85
  seed=43  finished  exact_match=0.854
  seed=44  finished  exact_match=0.834
Group: adaptive_scaling_qwen25_math_7b_instruct_math500_prm
  seed=42  finished  exact_match=0.836
  seed=43  finished  exact_match=0.838
  seed=44  finished  exact_match=0.846
Group: adaptive_scaling_qwen25_math_7b_instruct_math500_entropy
  seed=42  finished  exact_match=0.842
  seed=43  finished  exact_match=0.834
  seed=44  finished  exact_match=0.844
Group: adaptive_scaling_qwen25_math_7b_instruct_math500_perplexity
  seed=42  finished  exact_match=0.846
  seed=43  finished  exact_ma

Unnamed: 0,run_id,run_name,group,state,project,entity,seed,strategy,scorer,aggregation,scoring_window,scoring_window_label,model,dataset,beam_size,candidates_per_beam,num_paths,num_candidates,max_steps,exact_match,llm_judge_accuracy,avg_reasoning_steps,total_tokens,total_input_tokens,total_output_tokens,total_tflops,avg_tokens_per_sample,avg_output_tokens_per_sample,avg_tflops_per_sample,total_generations,prm_tflops,total_samples,completed
0,qz0418nv,2026-02-16_baseline_qwen25_math_offi...,baseline_qwen25_math_7b_instruct_mat...,finished,llm-tts-eval-math500,nlpresearch.group,42,baseline,entropy,,,,qwen25_math_7b_instruct,math,,,,,,0.832,,1.0,366654,49470,317184,5133.156,733.308,634.368,10.266312,500,,500,500
1,3bqhwvgp,2026-02-16_baseline_qwen25_math_offi...,baseline_qwen25_math_7b_instruct_mat...,finished,llm-tts-eval-math500,nlpresearch.group,43,baseline,entropy,,,,qwen25_math_7b_instruct,math,,,,,,0.832,,1.0,366654,49470,317184,5133.156,733.308,634.368,10.266312,500,,500,500
2,bjzqimk4,2026-02-16_baseline_qwen25_math_offi...,baseline_qwen25_math_7b_instruct_mat...,finished,llm-tts-eval-math500,nlpresearch.group,44,baseline,entropy,,,,qwen25_math_7b_instruct,math,,,,,,0.832,,1.0,366654,49470,317184,5133.156,733.308,634.368,10.266312,500,,500,500
3,ky44b84m,2026-02-16_self_consistency_vllm_qwe...,self_consistency_qwen25_math_7b_math500,finished,llm-tts-eval-math500,nlpresearch.group,42,self_consistency,entropy,,,,qwen25_math_7b_instruct,math,,,8.0,,,0.862,,1.0,2545296,49470,2495826,35634.144,5090.592,4991.652,71.268288,4000,,500,500
4,gtia4gii,2026-02-16_self_consistency_vllm_qwe...,self_consistency_qwen25_math_7b_math500,finished,llm-tts-eval-math500,nlpresearch.group,43,self_consistency,entropy,,,,qwen25_math_7b_instruct,math,,,8.0,,,0.87,,1.0,2554304,49470,2504834,35760.256,5108.608,5009.668,71.520512,4000,,500,500


In [14]:
# ── Data Cleaning & Parsing ───────────────────────────────────────────────────

def parse_group_name(group: str | None) -> dict:
    """Best-effort extraction of structured fields from the group name.

    Expected patterns:
      {strategy}_{model}_{dataset}
      {strategy}_{model}_{dataset}_{scorer}
      {strategy}_{model}_{dataset}_{scorer}_{window}_{aggregation}
    """
    result = {"_group_strategy": None, "_group_model": None,
              "_group_dataset": None, "_group_scorer": None,
              "_group_window": None, "_group_aggregation": None}
    if not group:
        return result

    known_strategies = {
        "baseline", "chain_of_thought", "self_consistency",
        "online_bon", "offline_bon", "beam_search",
        "uncertainty_cot", "extended_thinking",
        "adaptive_scaling", "deepconf",
    }
    known_scorers = {
        "prm", "entropy", "perplexity", "sequence_prob",
        "uncertainty", "uncertainty_pd", "uncertainty_uhead",
    }
    known_aggregations = {"mean", "min", "max", "sum", "product", "median"}
    known_datasets = {
        "minerva_math", "math500", "aime2024", "aime2025",
        "gaokao2023en", "human_eval_plus", "olympiadbench",
    }

    parts = group.split("_")

    # Greedy match strategy prefix (try longest first)
    strategy = None
    for length in range(min(3, len(parts)), 0, -1):
        candidate = "_".join(parts[:length])
        if candidate in known_strategies:
            strategy = candidate
            parts = parts[length:]
            break
    result["_group_strategy"] = strategy

    # Scan remaining parts for known tokens
    remaining = "_".join(parts)
    for ds in sorted(known_datasets, key=len, reverse=True):
        if ds in remaining:
            result["_group_dataset"] = ds
            remaining = remaining.replace(ds, "", 1)
            break
    for sc in sorted(known_scorers, key=len, reverse=True):
        if f"_{sc}" in f"_{remaining}":
            result["_group_scorer"] = sc
            remaining = remaining.replace(sc, "", 1)
            break
    for ag in known_aggregations:
        if f"_{ag}" in f"_{remaining}":
            result["_group_aggregation"] = ag
            break
    # window: look for a bare integer or "all"
    for p in remaining.split("_"):
        if p.isdigit():
            result["_group_window"] = p
            break
        if p == "all":
            result["_group_window"] = "all"
            break

    # model: whatever remains after removing known tokens is likely the model
    for tok in [result["_group_dataset"], result["_group_scorer"],
                result["_group_aggregation"], result["_group_window"]]:
        if tok:
            remaining = remaining.replace(tok, "", 1)
    model_str = "_".join(p for p in remaining.split("_") if p)
    result["_group_model"] = model_str or None

    return result


df = raw_df.copy()

# Parse group names into helper columns (one row per run, aligned on df's index).
# The column list comes from an empty parse so the frame is well-formed even
# when there are no rows.
parsed = pd.DataFrame(
    df["group"].map(parse_group_name).tolist(),
    index=df.index,
    columns=list(parse_group_name(None)),
)

# Fill missing config from parsed group name (values from the run config win).
for col, gcol in [("strategy", "_group_strategy"), ("scorer", "_group_scorer"),
                   ("aggregation", "_group_aggregation"),
                   ("scoring_window", "_group_window"),
                   ("dataset", "_group_dataset"), ("model", "_group_model")]:
    df[col] = df[col].fillna(parsed[gcol])

# Dataset label used by the aggregation, table, LaTeX and plotting cells.
# NOTE(review): those cells read "project_label" but nothing visible in this
# notebook creates it; here it is derived from the W&B project name with the
# "llm-tts-eval-" prefix removed — confirm this is the intended labelling.
df["project_label"] = df["project"].str.replace(r"^llm-tts-eval-", "", regex=True)

# Filter to finished runs only
n_before = len(df)
df = df[df["state"] == "finished"].copy()
print(f"Kept {len(df)}/{n_before} finished runs")

# Optional group filter: GROUP_FILTERS is an optional configuration constant
# holding substrings of group names to keep. When the notebook does not define
# it (or it is empty), no filtering is applied.
group_filters = globals().get("GROUP_FILTERS") or []
if group_filters:
    mask = df["group"].apply(lambda g: any(f in (g or "") for f in group_filters))
    df = df[mask].copy()
    print(f"After group filter: {len(df)} runs")

# Normalize accuracy to percentage
for col in ["exact_match", "llm_judge_accuracy"]:
    if col in df.columns:
        # If values look like fractions (0-1), convert to pct
        mask = df[col].notna() & (df[col] <= 1.0)
        df.loc[mask, col] = df.loc[mask, col] * 100

print(f"\nStrategies: {sorted(df['strategy'].dropna().unique())}")
print(f"Scorers:    {sorted(df['scorer'].dropna().unique())}")
print(f"Datasets:   {sorted(df['dataset'].dropna().unique())}")
df[["strategy", "scorer", "aggregation", "scoring_window", "dataset", "model",
    "exact_match", "total_tflops"]].head(10)

Kept 9/9 finished runs

Strategies: ['baseline', 'offline_best_of_n', 'self_consistency']
Scorers:    ['entropy']
Datasets:   ['math']


Unnamed: 0,strategy,scorer,aggregation,scoring_window,dataset,model,exact_match,total_tflops
0,baseline,entropy,,,math,qwen25_math_7b_instruct,83.2,5133.156
1,baseline,entropy,,,math,qwen25_math_7b_instruct,83.2,5133.156
2,baseline,entropy,,,math,qwen25_math_7b_instruct,83.2,5133.156
3,self_consistency,entropy,,,math,qwen25_math_7b_instruct,86.2,35634.144
4,self_consistency,entropy,,,math,qwen25_math_7b_instruct,87.0,35760.256
5,self_consistency,entropy,,,math,qwen25_math_7b_instruct,86.0,35646.268
6,offline_best_of_n,entropy,,,math,qwen25_math_7b_instruct,85.0,37234.708
7,offline_best_of_n,entropy,,,math,qwen25_math_7b_instruct,85.4,37780.848
8,offline_best_of_n,entropy,,,math,qwen25_math_7b_instruct,83.4,37954.98


In [None]:
# ── Seed Averaging ────────────────────────────────────────────────────────────

# Columns that identify one experimental configuration (the groupby key).
CONFIG_COLS = ["strategy", "scorer", "aggregation", "scoring_window",
               "model", "dataset", "project_label",
               "beam_size", "candidates_per_beam", "num_paths", "num_candidates"]

# Per-run metrics that are averaged over seeds.
METRIC_COLS = ["exact_match", "llm_judge_accuracy", "avg_reasoning_steps",
               "total_tokens", "total_tflops", "avg_tokens_per_sample",
               "avg_output_tokens_per_sample", "avg_tflops_per_sample"]


def _format_mean_std(mean, std, count) -> str:
    """Format one cell as "mean +/- std", plain "mean", or "" when mean is missing."""
    if pd.isna(mean):
        return ""
    if pd.notna(std) and count > 1:
        return f"{mean:.1f} +/- {std:.1f}"
    return f"{mean:.1f}"


def aggregate_seeds(df: pd.DataFrame) -> pd.DataFrame:
    """Group by config columns and compute mean/std over seeds.

    Only the CONFIG_COLS / METRIC_COLS actually present in ``df`` are used.
    Rows whose config values are missing are kept as their own groups.

    Args:
        df: one row per run; not modified.

    Returns:
        One row per configuration with, for every metric ``m``, the columns
        ``m_mean``, ``m_std``, ``m_count`` (non-null runs) and ``m_fmt``
        (a display string, see ``_format_mean_std``).
    """
    present_cfg = [c for c in CONFIG_COLS if c in df.columns]
    present_met = [c for c in METRIC_COLS if c in df.columns]

    # Metrics that were never logged arrive as object columns of None; coerce
    # everything to numeric so mean/std are computed on floats.
    numeric = df.assign(
        **{m: pd.to_numeric(df[m], errors="coerce") for m in present_met}
    )

    agg = (
        numeric.groupby(present_cfg, dropna=False)[present_met]
        .agg(["mean", "std", "count"])
        .reset_index()
    )

    # Flatten multi-level columns: ("exact_match", "mean") -> "exact_match_mean",
    # ("strategy", "") -> "strategy".
    agg.columns = [
        (f"{col[0]}_{col[1]}" if col[1] else col[0]) if isinstance(col, tuple) else col
        for col in agg.columns
    ]

    # Add a formatted "mean +/- std" column for every metric.
    for m in present_met:
        mean_col, std_col, count_col = f"{m}_mean", f"{m}_std", f"{m}_count"
        if mean_col in agg.columns:
            agg[f"{m}_fmt"] = [
                _format_mean_std(mean, std, count)
                for mean, std, count in zip(agg[mean_col], agg[std_col], agg[count_col])
            ]
    return agg


# Collapse the per-seed rows of `df` into one row per configuration.
agg_df = aggregate_seeds(df)
n_configs = len(agg_df)
print(f"Aggregated configs: {n_configs}")
agg_df.head()

In [None]:
# ── Pivot Table Helper ────────────────────────────────────────────────────────

def make_comparison_table(
    df: pd.DataFrame,
    row_field: str,
    col_field: str,
    value_field: str = "exact_match_fmt",
    filter_dict: dict | None = None,
    title: str | None = None,
) -> pd.DataFrame:
    """Build a pivot table from the aggregated DataFrame."""
    sub = df.copy()
    if filter_dict:
        for k, v in filter_dict.items():
            if isinstance(v, list):
                sub = sub[sub[k].isin(v)]
            else:
                sub = sub[sub[k] == v]

    if sub.empty:
        print("No data after filtering.")
        return pd.DataFrame()

    pivot = sub.pivot_table(
        index=row_field,
        columns=col_field,
        values=value_field,
        aggfunc="first",
    )
    if title:
        print(f"\n{'=' * len(title)}")
        print(title)
        print(f"{'=' * len(title)}")
    return pivot

In [None]:
# ── Table 1: Strategy x Scorer Grid ───────────────────────────────────────────

# One scorer-by-strategy accuracy grid per dataset label.
dataset_labels = sorted(agg_df["project_label"].dropna().unique())
for dataset_label in dataset_labels:
    heading = f"Exact Match (%) — {dataset_label}"
    tbl = make_comparison_table(
        agg_df,
        "scorer",
        "strategy",
        value_field="exact_match_fmt",
        filter_dict={"project_label": dataset_label},
        title=heading,
    )
    if tbl.empty:
        continue
    display(tbl)

In [None]:
# ── Table 2: Aggregation x Scoring Window (beam search only) ──────────────────

# Beam-search configurations only; later cells reuse `beam_df`.
is_beam_search = agg_df["strategy"] == "beam_search"
beam_df = agg_df[is_beam_search].copy()

if beam_df.empty:
    print("No beam search runs found.")
else:
    beam_scorers = sorted(beam_df["scorer"].dropna().unique())
    beam_datasets = sorted(beam_df["project_label"].dropna().unique())
    # One aggregation-by-window grid per (scorer, dataset) pair.
    for scorer in beam_scorers:
        for dataset_label in beam_datasets:
            tbl = make_comparison_table(
                beam_df,
                "aggregation",
                "scoring_window",
                value_field="exact_match_fmt",
                filter_dict={"scorer": scorer, "project_label": dataset_label},
                title=f"Beam Search — scorer={scorer}, dataset={dataset_label}",
            )
            if tbl.empty:
                continue
            display(tbl)

In [None]:
# ── Table 3: Compute Efficiency ───────────────────────────────────────────────

# Columns shown in the efficiency overview; any that are missing are skipped.
eff_cols = ["strategy", "scorer", "aggregation", "scoring_window",
            "project_label", "model",
            "exact_match_mean", "total_tflops_mean",
            "avg_tokens_per_sample_mean", "avg_reasoning_steps_mean"]
present = [c for c in eff_cols if c in agg_df.columns]

# Rename for readability
rename_map = {
    "exact_match_mean": "Accuracy (%)",
    "total_tflops_mean": "Total TFLOPS",
    "avg_tokens_per_sample_mean": "Tokens/Problem",
    "avg_reasoning_steps_mean": "Reasoning Steps",
}

# Select, relabel, and rank by accuracy (best first) in one chain.
eff_df = (
    agg_df[present]
    .rename(columns=rename_map)
    .sort_values("Accuracy (%)", ascending=False)
)

print("Compute Efficiency Overview")
print("=" * 40)
display(eff_df.reset_index(drop=True))

In [None]:
# ── LaTeX Export ──────────────────────────────────────────────────────────────

def to_latex(df: pd.DataFrame, caption: str, label: str) -> str:
    """Render a DataFrame as a floating LaTeX table string.

    The index is included, cell text is escaped, missing values print as
    "--", and the float is placed with "htbp". Horizontal rules are whatever
    ``DataFrame.to_latex`` emits; nothing is post-processed here.
    """
    render_options = dict(index=True, escape=True, na_rep="--", position="htbp")
    return df.to_latex(caption=caption, label=label, **render_options)


# Re-generate tables and export as LaTeX
latex_outputs = []  # (title, latex source) pairs, printed at the end


def _queue_latex(title: str, table: pd.DataFrame, caption: str, label: str) -> None:
    """Render `table` with to_latex and append it to latex_outputs under `title`."""
    latex_outputs.append((title, to_latex(table, caption=caption, label=label)))


# Strategy x Scorer tables
for dataset_label in sorted(agg_df["project_label"].dropna().unique()):
    tbl = make_comparison_table(
        agg_df,
        "scorer",
        "strategy",
        value_field="exact_match_fmt",
        filter_dict={"project_label": dataset_label},
    )
    if tbl.empty:
        continue
    _queue_latex(
        f"Strategy x Scorer — {dataset_label}",
        tbl,
        caption=f"Exact match accuracy (\\%) by strategy and scorer on {dataset_label}.",
        label=f"tab:strategy_scorer_{dataset_label}",
    )

# Beam search aggregation x window tables
if not beam_df.empty:
    beam_scorers = sorted(beam_df["scorer"].dropna().unique())
    beam_datasets = sorted(beam_df["project_label"].dropna().unique())
    for scorer in beam_scorers:
        for dataset_label in beam_datasets:
            tbl = make_comparison_table(
                beam_df,
                "aggregation",
                "scoring_window",
                value_field="exact_match_fmt",
                filter_dict={"scorer": scorer, "project_label": dataset_label},
            )
            if tbl.empty:
                continue
            _queue_latex(
                f"Beam {scorer} — {dataset_label}",
                tbl,
                caption=f"Beam search accuracy (\\%) — scorer={scorer}, dataset={dataset_label}.",
                label=f"tab:beam_{scorer}_{dataset_label}",
            )

# Efficiency table
if not eff_df.empty:
    _queue_latex(
        "Compute Efficiency",
        eff_df.reset_index(drop=True),
        caption="Compute efficiency comparison across strategies.",
        label="tab:compute_efficiency",
    )

# Print all LaTeX, each table preceded by a commented separator line.
for title, ltx in latex_outputs:
    separator = f"% ── {title} " + "─" * (60 - len(title))
    print(separator)
    print(ltx)
    print()

In [None]:
# ── Visualization ─────────────────────────────────────────────────────────────

# Bar chart: accuracy by strategy (per dataset)
# Configurations that have an accuracy value to plot.
plot_df = agg_df.dropna(subset=["exact_match_mean"]).copy()

# Compute the panel list once, without missing labels, so the number of axes
# always matches the number of panels drawn (a missing label would otherwise be
# counted by the loop but not by the subplot grid, and cannot be sorted).
dataset_labels = sorted(plot_df["project_label"].dropna().unique())

if dataset_labels:
    n_panels = len(dataset_labels)
    fig, axes = plt.subplots(
        1, n_panels,
        figsize=(6 * n_panels, 5),
        squeeze=False,
    )
    for ax, dataset_label in zip(axes[0], dataset_labels):
        sub = plot_df[plot_df["project_label"] == dataset_label]
        # Average across scorers/configs per strategy
        bars = sub.groupby("strategy")["exact_match_mean"].mean().sort_values()
        bars.plot.barh(ax=ax, color="steelblue")
        ax.set_xlabel("Exact Match (%)")
        ax.set_title(dataset_label)
    plt.tight_layout()
    plt.show()
else:
    print("No data for bar chart.")

In [None]:
# Heatmap: beam search scorer x aggregation x window

if sns is None:
    print("Install seaborn for heatmap visualization: pip install seaborn")
elif beam_df.empty:
    print("No beam search data for heatmap.")
else:
    heat_df = beam_df.dropna(subset=["exact_match_mean"]).copy()
    # Row label combining aggregation and scoring window, e.g. "mean / w=5".
    aggregation_text = heat_df["aggregation"].astype(str)
    window_text = heat_df["scoring_window"].astype(str)
    heat_df["config"] = aggregation_text + " / w=" + window_text

    # One heatmap (config rows x scorer columns) per dataset label.
    for dataset_label in sorted(heat_df["project_label"].dropna().unique()):
        sub = heat_df[heat_df["project_label"] == dataset_label]
        if sub.empty:
            continue
        pivot = sub.pivot_table(
            index="config", columns="scorer",
            values="exact_match_mean", aggfunc="first",
        )
        if pivot.empty:
            continue

        # Scale the figure with the grid size, with a readable minimum.
        n_rows, n_cols = pivot.shape
        fig_size = (max(6, n_cols * 2), max(4, n_rows * 0.6))
        fig, ax = plt.subplots(figsize=fig_size)
        sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
        ax.set_title(f"Beam Search Accuracy — {dataset_label}")
        plt.tight_layout()
        plt.show()