# W&B Results Analysis

Pull experiment metrics from Weights & Biases, build comparison tables, and export LaTeX for the paper.

In [None]:
import wandb
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
try:
    import seaborn as sns
except ImportError:
    sns = None
import warnings

# Hide FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Notebook display settings: no column limit, cells cut at 40 characters,
# 200-character wide output.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_colwidth", 40),
    ("display.width", 200),
):
    pd.set_option(_option, _value)

# Shared W&B public-API client used by the data-fetching cell.
api = wandb.Api()

In [None]:
# ── Configuration ──────────────────────────────────────────────────────────────
# W&B entity (team or user) that owns every project listed below.
WANDB_ENTITY = "nlpresearch.group"

# Short benchmark label -> W&B project name. Iteration order is the fetch order.
PROJECTS = dict(
    aime2024="llm-tts-eval-aime24",
    aime2025="llm-tts-eval-aime25",
    minerva_math="llm-tts-eval-minerva-math",
    math500="llm-tts-eval-math500",
    gaokao2023en="llm-tts-eval-gaokao2023en",
    olympiadbench="llm-tts-eval-olympiadbench",
    gpqa_diamond="llm-tts-eval-gpqa-diamond",
)

# Evaluator whose accuracy is meant to be the primary column
# ("exact_match" or "llm_judge").
PRIMARY_EVALUATOR = "exact_match"

# Optional allow-list of substrings: a run is kept when its group contains at
# least one of them, e.g. ["beam_search", "offline_bon"]. None keeps every run.
GROUP_FILTERS = None

In [None]:
# ── Data Fetching ─────────────────────────────────────────────────────────────

def fetch_runs(project: str, filters: dict | None = None) -> pd.DataFrame:
    """Pull all runs from a wandb project and return a flat DataFrame.

    Args:
        project: W&B project name; it is combined with ``WANDB_ENTITY`` to
            form the ``entity/project`` path passed to ``api.runs``.
        filters: Optional filter dict forwarded to ``api.runs``. ``None`` (or
            an empty dict) applies no filter.

    Returns:
        A DataFrame with one row per run holding identifiers, selected config
        fields and selected summary metrics. Fields a run does not provide
        are ``None``. When the project yields no runs the frame is empty and
        has no columns.
    """
    path = f"{WANDB_ENTITY}/{project}"
    runs = api.runs(path, filters=filters or {})

    def section(cfg: dict, name: str) -> dict:
        # A config section may be absent, explicitly None, or a non-dict
        # value; treat all of those as "no fields" instead of crashing on
        # the subsequent .get() calls.
        value = cfg.get(name)
        return value if isinstance(value, dict) else {}

    records = []
    for run in runs:
        cfg = run.config
        # NOTE(review): `_json_dict` is a private attribute of the wandb
        # summary object; confirm it stays available when upgrading wandb.
        s = run.summary._json_dict

        strategy_cfg = section(cfg, "strategy")
        scorer_cfg = section(cfg, "scorer")
        model_cfg = section(cfg, "model")
        dataset_cfg = section(cfg, "dataset")
        system_cfg = section(cfg, "system")

        record = {
            # identifiers
            "run_id": run.id,
            "run_name": run.name,
            "group": run.group,
            "state": run.state,
            "project": project,
            # config fields
            "strategy": strategy_cfg.get("type"),
            "scorer": scorer_cfg.get("type"),
            "aggregation": strategy_cfg.get("aggregation"),
            "scoring_window": strategy_cfg.get("scoring_window"),
            "scoring_window_label": strategy_cfg.get("scoring_window_label"),
            # prefer the short model name, fall back to the full one
            "model": model_cfg.get("model_short_name") or model_cfg.get("model_name"),
            "dataset": dataset_cfg.get("data_name"),
            "seed": system_cfg.get("seed"),
            "beam_size": strategy_cfg.get("beam_size"),
            "candidates_per_beam": strategy_cfg.get("candidates_per_beam"),
            "num_paths": strategy_cfg.get("num_paths"),
            "num_candidates": strategy_cfg.get("num_candidates"),
            "max_steps": strategy_cfg.get("max_steps"),
            # summary metrics
            "exact_match": s.get("exact_match/accuracy"),
            "llm_judge_accuracy": s.get("llm_judge/accuracy"),
            "avg_reasoning_steps": s.get("avg_reasoning_steps_per_trajectory"),
            "total_tokens": s.get("compute/total_tokens"),
            "total_input_tokens": s.get("compute/total_input_tokens"),
            "total_output_tokens": s.get("compute/total_output_tokens"),
            "total_tflops": s.get("compute/total_tflops"),
            "avg_tokens_per_sample": s.get("compute/avg_tokens_per_sample"),
            "avg_output_tokens_per_sample": s.get("compute/avg_output_tokens_per_sample"),
            "avg_tflops_per_sample": s.get("compute/avg_tflops_per_sample"),
            "total_generations": s.get("compute/total_generations"),
            "prm_tflops": s.get("compute/prm_tflops"),
            "total_samples": s.get("total_samples"),
            "completed": s.get("completed"),
        }
        records.append(record)

    return pd.DataFrame(records)


# Download every configured project; a project that fails is reported and
# skipped so the remaining ones are still collected.
dfs = []
for label, proj in PROJECTS.items():
    print(f"Fetching {label} ({proj})...")
    try:
        df = fetch_runs(proj)
        # Remember which short benchmark label each row came from.
        df["project_label"] = label
    except Exception as e:
        print(f"  -> ERROR: {e}")
    else:
        dfs.append(df)
        print(f"  -> {len(df)} runs")

# Stack the per-project frames; fall back to an empty frame if nothing loaded.
if dfs:
    raw_df = pd.concat(dfs, ignore_index=True)
else:
    raw_df = pd.DataFrame()
print(f"\nTotal raw runs: {len(raw_df)}")
raw_df.head()

In [None]:
# ── Data Cleaning & Parsing ───────────────────────────────────────────────────

def parse_group_name(group: str | None) -> dict:
    """Best-effort extraction of structured fields from the group name.

    Expected patterns:
      {strategy}_{model}_{dataset}
      {strategy}_{model}_{dataset}_{scorer}
      {strategy}_{model}_{dataset}_{scorer}_{window}_{aggregation}
    """
    result = {"_group_strategy": None, "_group_model": None,
              "_group_dataset": None, "_group_scorer": None,
              "_group_window": None, "_group_aggregation": None}
    if not group:
        return result

    known_strategies = {
        "baseline", "chain_of_thought", "self_consistency",
        "online_bon", "offline_bon", "beam_search",
        "uncertainty_cot", "extended_thinking",
        "adaptive_scaling", "deepconf",
    }
    known_scorers = {
        "prm", "entropy", "perplexity", "sequence_prob",
        "uncertainty", "uncertainty_pd", "uncertainty_uhead",
    }
    known_aggregations = {"mean", "min", "max", "sum", "product", "median"}
    known_datasets = {
        "minerva_math", "math500", "aime2024", "aime2025",
        "gaokao2023en", "human_eval_plus", "olympiadbench",
    }

    parts = group.split("_")

    # Greedy match strategy prefix (try longest first)
    strategy = None
    for length in range(min(3, len(parts)), 0, -1):
        candidate = "_".join(parts[:length])
        if candidate in known_strategies:
            strategy = candidate
            parts = parts[length:]
            break
    result["_group_strategy"] = strategy

    # Scan remaining parts for known tokens
    remaining = "_".join(parts)
    for ds in sorted(known_datasets, key=len, reverse=True):
        if ds in remaining:
            result["_group_dataset"] = ds
            remaining = remaining.replace(ds, "", 1)
            break
    for sc in sorted(known_scorers, key=len, reverse=True):
        if f"_{sc}" in f"_{remaining}":
            result["_group_scorer"] = sc
            remaining = remaining.replace(sc, "", 1)
            break
    for ag in known_aggregations:
        if f"_{ag}" in f"_{remaining}":
            result["_group_aggregation"] = ag
            break
    # window: look for a bare integer or "all"
    for p in remaining.split("_"):
        if p.isdigit():
            result["_group_window"] = p
            break
        if p == "all":
            result["_group_window"] = "all"
            break

    # model: whatever remains after removing known tokens is likely the model
    for tok in [result["_group_dataset"], result["_group_scorer"],
                result["_group_aggregation"], result["_group_window"]]:
        if tok:
            remaining = remaining.replace(tok, "", 1)
    model_str = "_".join(p for p in remaining.split("_") if p)
    result["_group_model"] = model_str or None

    return result


df = raw_df.copy()

# Parse every group name into its structured fields. Building the frame
# directly (with a fixed column list) avoids the slow `.apply(pd.Series)` and
# keeps the helper columns present even when there are no rows. Non-string
# groups (e.g. missing values) are parsed as "no group".
parsed = pd.DataFrame(
    [parse_group_name(g if isinstance(g, str) else None) for g in df["group"]],
    index=df.index,
    columns=list(parse_group_name(None)),
)

# Fill config values the run did not log from the parsed group name.
# NOTE(review): a window parsed from the group name is a string while a
# logged `scoring_window` may be numeric — confirm the two never mix within
# one configuration, otherwise seed grouping will treat them as different.
for col, gcol in [("strategy", "_group_strategy"), ("scorer", "_group_scorer"),
                   ("aggregation", "_group_aggregation"),
                   ("scoring_window", "_group_window"),
                   ("dataset", "_group_dataset"), ("model", "_group_model")]:
    df[col] = df[col].fillna(parsed[gcol])

# Filter to finished runs only
n_before = len(df)
df = df[df["state"] == "finished"].copy()
print(f"Kept {len(df)}/{n_before} finished runs")

# Optional group filter: keep runs whose group contains any listed substring.
if GROUP_FILTERS:
    mask = df["group"].apply(lambda g: any(f in (g or "") for f in GROUP_FILTERS))
    df = df[mask].copy()
    print(f"After group filter: {len(df)} runs")

# Normalize accuracy to percentage
for col in ["exact_match", "llm_judge_accuracy"]:
    if col in df.columns:
        # A metric that no run reported arrives as an object column of None;
        # comparing that against 1.0 raises, so coerce to float first
        # (missing / non-numeric values become NaN).
        df[col] = pd.to_numeric(df[col], errors="coerce").astype(float)
        # Heuristic: values in [.., 1.0] are fractions and are scaled to
        # percent; NaN compares False and is left untouched.
        mask = df[col] <= 1.0
        df.loc[mask, col] = df.loc[mask, col] * 100

print(f"\nStrategies: {sorted(df['strategy'].dropna().unique())}")
print(f"Scorers:    {sorted(df['scorer'].dropna().unique())}")
print(f"Datasets:   {sorted(df['dataset'].dropna().unique())}")
df[["strategy", "scorer", "aggregation", "scoring_window", "dataset", "model",
    "exact_match", "total_tflops"]].head(10)

In [None]:
# ── Seed Averaging ────────────────────────────────────────────────────────────

# Columns that together identify one experimental configuration; runs that
# agree on all of them are treated as repeats (seeds) of the same experiment.
CONFIG_COLS = ["strategy", "scorer", "aggregation", "scoring_window",
               "model", "dataset", "project_label",
               "beam_size", "candidates_per_beam", "num_paths", "num_candidates"]

# Numeric metrics that are summarised per configuration.
METRIC_COLS = ["exact_match", "llm_judge_accuracy", "avg_reasoning_steps",
               "total_tokens", "total_tflops", "avg_tokens_per_sample",
               "avg_output_tokens_per_sample", "avg_tflops_per_sample"]


def _format_mean_std(mean, std, count) -> str:
    """Render ``mean +/- std`` (one decimal); drop the spread for single runs."""
    if pd.notna(std) and count > 1:
        return f"{mean:.1f} +/- {std:.1f}"
    if pd.notna(mean):
        return f"{mean:.1f}"
    return ""


def aggregate_seeds(df: pd.DataFrame) -> pd.DataFrame:
    """Group by config columns and compute mean/std over seeds.

    Args:
        df: One row per run. Only the ``CONFIG_COLS`` / ``METRIC_COLS`` that
            are actually present are used. The frame is not modified.

    Returns:
        One row per configuration (configurations with missing config values
        are kept). For every metric ``m`` present there are ``m_mean``,
        ``m_std`` and ``m_count`` columns plus a display string ``m_fmt``:
        ``"mean +/- std"`` when more than one run contributed, just the mean
        for a single run, and ``""`` when the metric is missing entirely.
    """
    present_cfg = [c for c in CONFIG_COLS if c in df.columns]
    present_met = [c for c in METRIC_COLS if c in df.columns]

    # Metrics that no run reported arrive as object columns of None, on which
    # mean/std fail; coerce to numeric on a copy (bad values become NaN).
    work = df.copy()
    for m in present_met:
        work[m] = pd.to_numeric(work[m], errors="coerce")

    grouped = work.groupby(present_cfg, dropna=False)
    agg = grouped[present_met].agg(["mean", "std", "count"]).reset_index()

    # Flatten the (metric, statistic) column pairs to "metric_statistic";
    # group-key columns carry an empty statistic and keep their plain name.
    flat_cols = []
    for col in agg.columns:
        if isinstance(col, tuple):
            flat_cols.append(f"{col[0]}_{col[1]}" if col[1] else col[0])
        else:
            flat_cols.append(col)
    agg.columns = flat_cols

    # Add a formatted "mean +/- std" column for every metric.
    for m in present_met:
        mean_col, std_col, count_col = f"{m}_mean", f"{m}_std", f"{m}_count"
        if mean_col not in agg.columns:
            continue
        agg[f"{m}_fmt"] = [
            _format_mean_std(mean, std, count)
            for mean, std, count in zip(agg[mean_col], agg[std_col], agg[count_col])
        ]
    return agg


# Collapse the cleaned per-run table into one row per configuration; `agg_df`
# is the input of every table and plot cell below.
agg_df = aggregate_seeds(df)
print(f"Aggregated configs: {len(agg_df)}")
# Last expression of the cell: shows a preview of the aggregated table.
agg_df.head()

In [None]:
# ── Pivot Table Helper ────────────────────────────────────────────────────────

def make_comparison_table(
    df: pd.DataFrame,
    row_field: str,
    col_field: str,
    value_field: str = "exact_match_fmt",
    filter_dict: dict | None = None,
    title: str | None = None,
) -> pd.DataFrame:
    """Build a pivot table from the aggregated DataFrame."""
    sub = df.copy()
    if filter_dict:
        for k, v in filter_dict.items():
            if isinstance(v, list):
                sub = sub[sub[k].isin(v)]
            else:
                sub = sub[sub[k] == v]

    if sub.empty:
        print("No data after filtering.")
        return pd.DataFrame()

    pivot = sub.pivot_table(
        index=row_field,
        columns=col_field,
        values=value_field,
        aggfunc="first",
    )
    if title:
        print(f"\n{'=' * len(title)}")
        print(title)
        print(f"{'=' * len(title)}")
    return pivot

In [None]:
# ── Table 1: Strategy x Scorer Grid ───────────────────────────────────────────

# One scorer-by-strategy grid of formatted exact-match accuracy per benchmark.
for dataset_label in sorted(agg_df["project_label"].dropna().unique()):
    tbl = make_comparison_table(
        agg_df,
        row_field="scorer",
        col_field="strategy",
        value_field="exact_match_fmt",
        filter_dict={"project_label": dataset_label},
        title=f"Exact Match (%) — {dataset_label}",
    )
    # Nothing to show for this benchmark.
    if tbl.empty:
        continue
    display(tbl)

In [None]:
# ── Table 2: Aggregation x Scoring Window (beam search only) ──────────────────

# Beam-search configurations only; also reused by the export and heatmap cells.
beam_df = agg_df[agg_df["strategy"] == "beam_search"].copy()

if not beam_df.empty:
    # One aggregation-by-window grid for every (scorer, benchmark) pair.
    for scorer in sorted(beam_df["scorer"].dropna().unique()):
        for dataset_label in sorted(beam_df["project_label"].dropna().unique()):
            tbl = make_comparison_table(
                beam_df,
                row_field="aggregation",
                col_field="scoring_window",
                value_field="exact_match_fmt",
                filter_dict={"scorer": scorer, "project_label": dataset_label},
                title=f"Beam Search — scorer={scorer}, dataset={dataset_label}",
            )
            # Skip pairs that have no runs.
            if tbl.empty:
                continue
            display(tbl)
else:
    print("No beam search runs found.")

In [None]:
# ── Table 3: Compute Efficiency ───────────────────────────────────────────────

# Columns shown in the efficiency overview; only those that exist in `agg_df`
# are used.
eff_cols = ["strategy", "scorer", "aggregation", "scoring_window",
            "project_label", "model",
            "exact_match_mean", "total_tflops_mean",
            "avg_tokens_per_sample_mean", "avg_reasoning_steps_mean"]
present = [c for c in eff_cols if c in agg_df.columns]

# Rename for readability (keys that are not present are simply ignored).
rename_map = {
    "exact_match_mean": "Accuracy (%)",
    "total_tflops_mean": "Total TFLOPS",
    "avg_tokens_per_sample_mean": "Tokens/Problem",
    "avg_reasoning_steps_mean": "Reasoning Steps",
}
eff_df = agg_df[present].rename(columns=rename_map)

# Best accuracy first. Guarded like the column selection above, so a missing
# accuracy column leaves the table unsorted instead of raising KeyError.
if "Accuracy (%)" in eff_df.columns:
    eff_df = eff_df.sort_values("Accuracy (%)", ascending=False)

print("Compute Efficiency Overview")
print("=" * 40)
display(eff_df.reset_index(drop=True))

In [None]:
# ── LaTeX Export ──────────────────────────────────────────────────────────────

def to_latex(df: pd.DataFrame, caption: str, label: str) -> str:
    """Render a DataFrame as a floating LaTeX table and return the source.

    The index is included, special characters in the table contents are
    escaped, missing values are printed as ``--`` and the float placement is
    ``htbp``. ``caption`` and ``label`` are handed to pandas as given.
    """
    export_options = {
        "index": True,
        "escape": True,
        "na_rep": "--",
        "caption": caption,
        "label": label,
        "position": "htbp",
    }
    # The rules (\toprule etc.) come from pandas itself; no post-processing.
    return df.to_latex(**export_options)


# Re-generate tables and export as LaTeX
latex_outputs = []

# Characters that are special in LaTeX text mode. Benchmark labels and scorer
# names contain underscores (e.g. "minerva_math", "uncertainty_pd"), which
# break compilation when they appear unescaped inside \caption{...}.
_LATEX_TEXT_ESCAPES = str.maketrans({
    "&": r"\&", "%": r"\%", "$": r"\$", "#": r"\#",
    "_": r"\_", "{": r"\{", "}": r"\}",
})


def _latex_text(value) -> str:
    """Escape a value for use in running LaTeX text such as a caption."""
    return str(value).translate(_LATEX_TEXT_ESCAPES)


# Strategy x Scorer tables
for dataset_label in sorted(agg_df["project_label"].dropna().unique()):
    tbl = make_comparison_table(
        agg_df,
        row_field="scorer",
        col_field="strategy",
        value_field="exact_match_fmt",
        filter_dict={"project_label": dataset_label},
    )
    if not tbl.empty:
        caption_dataset = _latex_text(dataset_label)
        ltx = to_latex(
            tbl,
            caption=f"Exact match accuracy (\\%) by strategy and scorer on {caption_dataset}.",
            # \label accepts underscores, so the raw name is kept here.
            label=f"tab:strategy_scorer_{dataset_label}",
        )
        latex_outputs.append((f"Strategy x Scorer — {dataset_label}", ltx))

# Beam search aggregation x window tables
if not beam_df.empty:
    for scorer in sorted(beam_df["scorer"].dropna().unique()):
        for dataset_label in sorted(beam_df["project_label"].dropna().unique()):
            tbl = make_comparison_table(
                beam_df,
                row_field="aggregation",
                col_field="scoring_window",
                value_field="exact_match_fmt",
                filter_dict={"scorer": scorer, "project_label": dataset_label},
            )
            if not tbl.empty:
                caption_scorer = _latex_text(scorer)
                caption_dataset = _latex_text(dataset_label)
                ltx = to_latex(
                    tbl,
                    caption=(
                        f"Beam search accuracy (\\%) — scorer={caption_scorer}, "
                        f"dataset={caption_dataset}."
                    ),
                    label=f"tab:beam_{scorer}_{dataset_label}",
                )
                latex_outputs.append((f"Beam {scorer} — {dataset_label}", ltx))

# Efficiency table
if not eff_df.empty:
    ltx = to_latex(
        eff_df.reset_index(drop=True),
        caption="Compute efficiency comparison across strategies.",
        label="tab:compute_efficiency",
    )
    latex_outputs.append(("Compute Efficiency", ltx))

# Print all LaTeX, each table preceded by a commented separator line so the
# output can be pasted into a .tex file as is.
for title, ltx in latex_outputs:
    print(f"% ── {title} " + "─" * (60 - len(title)))
    print(ltx)
    print()

In [None]:
# ── Visualization ─────────────────────────────────────────────────────────────

# Bar chart: accuracy by strategy (per dataset)
# Only configurations that actually have an exact-match score can be plotted.
plot_df = agg_df.dropna(subset=["exact_match_mean"]).copy()

if plot_df.empty:
    print("No data for bar chart.")
else:
    # One panel per benchmark, laid out side by side (6 inches wide each).
    n_panels = max(1, plot_df["project_label"].nunique())
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)
    for idx, dataset_label in enumerate(sorted(plot_df["project_label"].unique())):
        ax = axes[0][idx]
        sub = plot_df.loc[plot_df["project_label"] == dataset_label]
        # Mean accuracy per strategy over all of its scorers/configs,
        # smallest value first.
        per_strategy = sub.groupby("strategy")["exact_match_mean"].mean()
        bars = per_strategy.sort_values()
        bars.plot.barh(ax=ax, color="steelblue")
        ax.set_xlabel("Exact Match (%)")
        ax.set_title(dataset_label)
    plt.tight_layout()
    plt.show()

In [None]:
# Heatmap: beam search scorer x aggregation x window

if sns is None:
    print("Install seaborn for heatmap visualization: pip install seaborn")
elif beam_df.empty:
    print("No beam search data for heatmap.")
else:
    heat_df = beam_df.dropna(subset=["exact_match_mean"]).copy()
    # Row label combining aggregation and scoring window, e.g. "mean / w=5".
    heat_df["config"] = (
        heat_df["aggregation"].astype(str) + " / w=" + heat_df["scoring_window"].astype(str)
    )

    # One heatmap per benchmark, in sorted label order.
    for dataset_label in sorted(heat_df["project_label"].dropna().unique()):
        sub = heat_df.loc[heat_df["project_label"] == dataset_label]
        if sub.empty:
            continue
        pivot = sub.pivot_table(
            values="exact_match_mean",
            index="config",
            columns="scorer",
            aggfunc="first",
        )
        if pivot.empty:
            continue

        # Scale the figure with the grid, but never below 6 x 4 inches.
        n_rows, n_cols = pivot.shape
        fig_width = max(6, n_cols * 2)
        fig_height = max(4, n_rows * 0.6)
        fig, ax = plt.subplots(figsize=(fig_width, fig_height))
        sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
        ax.set_title(f"Beam Search Accuracy — {dataset_label}")
        plt.tight_layout()
        plt.show()