In [None]:
!pip install pandas numpy plotly nbformat

In [None]:
# Performance sweep viz (interactive Plotly) + per-run variance + outlier flagging
from __future__ import annotations

import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


## 0) Plotly renderer (VSCode)
# Pick the "vscode" renderer when VSCODE_PID is present in the environment,
# otherwise use "notebook_connected".
if os.environ.get("VSCODE_PID"):
    pio.renderers.default = "vscode"
else:
    pio.renderers.default = "notebook_connected"


## 1) Paths / load
DATA_DIR = Path(".")  # change to Path("...") if your CSVs live elsewhere

LOAD_TIMES_CSV = DATA_DIR / "load_times.csv"
REQUESTS_CSV = DATA_DIR / "requests.csv"
SUMMARY_CSV = DATA_DIR / "summary.csv"

# Read the three benchmark CSVs in a fixed order: load times, requests, summary.
load_df, req_df, sum_df = (
    pd.read_csv(csv_path)
    for csv_path in (LOAD_TIMES_CSV, REQUESTS_CSV, SUMMARY_CSV)
)

# Wherever a "timestamp_utc" column exists, parse it as tz-aware UTC;
# values that cannot be parsed become NaT instead of raising.
for df in (load_df, req_df, sum_df):
    if "timestamp_utc" not in df.columns:
        continue
    df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], errors="coerce", utc=True)


## 2) Per-run derived metrics (stream runs)
# Start from successful, non-warmup streaming requests, derive per-run timing
# and throughput columns, then drop rows whose derived values are unusable.
stream = (
    req_df.loc[
        ~req_df["warmup_run"]
        & req_df["run_mode"].eq("stream")
        & req_df["ok"].eq(True)
    ]
    .assign(
        # Time spent after the first token arrived.
        decode_time_s=lambda d: d["wall_s"] - d["ttft_s"],
        # Zero denominators become NaN so the divisions yield NaN, not inf.
        prefill_eff_tps=lambda d: d["prompt_tokens"] / d["ttft_s"].replace(0, np.nan),
        decode_eff_tps=lambda d: d["completion_tokens"] / d["decode_time_s"].replace(0, np.nan),
        # Projected wall time if the full output budget were generated.
        est_wall_s=lambda d: d["ttft_s"] + (d["max_output_tokens"] / d["decode_eff_tps"].replace(0, np.nan)),
        # Flag runs that produced fewer than 90% of the requested tokens.
        early_stop=lambda d: d["completion_tokens"] < 0.9 * d["max_output_tokens"],
    )
    .loc[
        lambda d: np.isfinite(d["ttft_s"])
        & np.isfinite(d["prefill_eff_tps"])
        & np.isfinite(d["decode_eff_tps"])
        & np.isfinite(d["est_wall_s"])
        & d["ttft_s"].gt(0)
        & d["decode_time_s"].gt(0)
    ]
    .copy()
)

# One row per (model, target prompt size) with count, mean/std of each metric,
# completion-token spread and the fraction of early-stopped runs.
agg = (
    stream.groupby(["model", "target_prompt_tokens"], as_index=False)
    .agg(
        n=("idx", "size"),
        ttft_mean=("ttft_s", "mean"),
        ttft_std=("ttft_s", "std"),
        prefill_mean=("prefill_eff_tps", "mean"),
        prefill_std=("prefill_eff_tps", "std"),
        decode_mean=("decode_eff_tps", "mean"),
        decode_std=("decode_eff_tps", "std"),
        est_wall_mean=("est_wall_s", "mean"),
        est_wall_std=("est_wall_s", "std"),
        completion_tokens_min=("completion_tokens", "min"),
        completion_tokens_med=("completion_tokens", "median"),
        completion_tokens_max=("completion_tokens", "max"),
        early_stop_frac=("early_stop", "mean"),
    )
    .sort_values(["model", "target_prompt_tokens"])
    .reset_index(drop=True)
    # Missing std values (e.g. single-run groups) are shown as zero spread.
    .fillna(
        {
            std_col: 0.0
            for std_col in ("ttft_std", "prefill_std", "decode_std", "est_wall_std")
        }
    )
)


## 3) Outlier detection (stream vs nonstream wall mismatches)
# Successful, non-warmup requests issued in non-streaming mode.
nonstream = req_df.loc[
    ~req_df["warmup_run"]
    & req_df["run_mode"].eq("nonstream")
    & req_df["ok"].eq(True)
].copy()

# Left-join each stream run to the nonstream run sharing (job_id, idx); the
# overlapping wall_s column is split into wall_s_stream / wall_s_nonstream.
paired = stream.merge(
    nonstream[["job_id", "idx", "wall_s"]],
    how="left",
    on=["job_id", "idx"],
    suffixes=("_stream", "_nonstream"),
).assign(wall_diff_s=lambda d: d["wall_s_stream"] - d["wall_s_nonstream"])

# Keep matched pairs whose wall times differ by more than 5 seconds.
timing_outliers = paired.loc[
    paired["wall_s_nonstream"].notna()
    & paired["wall_diff_s"].abs().gt(5)
].copy()


## 4) Shared colors
# Stable model order so every figure assigns the same color to the same model.
models = sorted(agg["model"].unique())
palette = [
    *px.colors.qualitative.Alphabet,
    *px.colors.qualitative.Dark24,
    *px.colors.qualitative.Light24,
]
# Cycle through the combined palette if there are more models than colors.
base_color_map = {
    model_name: palette[slot % len(palette)]
    for slot, model_name in enumerate(models)
}

def hex_to_rgba(hex_color: str, alpha: float) -> str:
    """Convert a hex color into a CSS ``rgba(r,g,b,alpha)`` string.

    Accepts 6-digit (``#RRGGBB``) and 3-digit shorthand (``#RGB``) hex colors,
    with or without the leading ``#``.

    Args:
        hex_color: Color string to convert.
        alpha: Opacity written verbatim as the fourth ``rgba`` component.

    Returns:
        The ``rgba(...)`` string, or ``hex_color`` unchanged when it is not a
        valid hex color (e.g. ``"rgb(1,2,3)"`` or a named color like
        ``"purple"``), so arbitrary color strings can be passed safely.
    """
    digits = hex_color.lstrip("#")
    if len(digits) == 3:
        # Expand shorthand: "abc" -> "aabbcc".
        digits = "".join(ch * 2 for ch in digits)
    # Validate characters explicitly: int(..., 16) would raise on six-letter
    # named colors and would accept sign/whitespace characters.
    if len(digits) != 6 or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
        return hex_color
    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f"rgba({r},{g},{b},{alpha})"

# Alpha values passed to hex_to_rgba when coloring traces.
LINE_ALPHA = 0.70  # mean line and its markers
FILL_ALPHA = 0.20  # fill of the ±1σ band
POINT_ALPHA = 0.35  # per-run scatter markers


## 5) Tick helpers
def _nice_step(raw_step: float) -> float:
    if not np.isfinite(raw_step) or raw_step <= 0:
        return 1.0
    exp = np.floor(np.log10(raw_step))
    base = raw_step / (10 ** exp)
    if base <= 1:
        nice = 1
    elif base <= 2:
        nice = 2
    elif base <= 5:
        nice = 5
    else:
        nice = 10
    return nice * (10 ** exp)

def dense_y_ticks(fig: go.Figure, y_min: float, y_max: float, target_ticks: int = 14):
    """Give ``fig`` linear y ticks at a "nice" spacing covering [y_min, y_max].

    The tick spacing aims for roughly ``target_ticks`` ticks, and the axis
    range is snapped outward to whole multiples of that spacing. The figure is
    returned untouched when either bound is non-finite or the bounds are equal.

    Returns:
        The same ``fig`` object, to allow chaining.
    """
    bounds_usable = np.isfinite(y_min) and np.isfinite(y_max) and y_min != y_max
    if bounds_usable:
        intervals = max(2, target_ticks - 1)
        dtick = _nice_step((y_max - y_min) / intervals)
        axis_lo = np.floor(y_min / dtick) * dtick
        axis_hi = np.ceil(y_max / dtick) * dtick
        fig.update_yaxes(tickmode="linear", tick0=axis_lo, dtick=dtick, range=[axis_lo, axis_hi])
    return fig

def _range(df: pd.DataFrame, col: str) -> tuple[float, float]:
    v = pd.to_numeric(df[col], errors="coerce").dropna().values
    if v.size == 0:
        return (0.0, 1.0)
    return (float(v.min()), float(v.max()))


## 6) Plot primitives
def _plotted_y_extent(
    agg_df: pd.DataFrame,
    run_df: pd.DataFrame,
    y_mean_col: str,
    y_std_col: str,
    run_y_col: str,
    include_spread: bool,
    include_runs: bool,
) -> tuple[float, float]:
    """Return (min, max) over the y values the figure draws; (0.0, 1.0) if none are finite."""
    mean = pd.to_numeric(agg_df[y_mean_col], errors="coerce")
    parts = [mean]
    if include_spread:
        std = pd.to_numeric(agg_df[y_std_col], errors="coerce").fillna(0.0)
        parts.extend([mean - std, mean + std])
    if include_runs:
        parts.append(pd.to_numeric(run_df[run_y_col], errors="coerce"))
    values = pd.concat(parts, ignore_index=True).to_numpy(dtype=float)
    values = values[np.isfinite(values)]
    if values.size == 0:
        return (0.0, 1.0)
    return (float(values.min()), float(values.max()))


def mean_line_with_band_and_errors(
    agg_df: pd.DataFrame,
    run_df: pd.DataFrame,
    y_mean_col: str,
    y_std_col: str,
    run_y_col: str,
    title: str,
    y_label: str,
    show_band: bool = True,
    show_errorbars: bool = True,
    show_runs: bool = True,
) -> go.Figure:
    """Plot one mean line per model against target prompt tokens.

    Optional overlays are per-run scatter points, a filled ±1σ band and ±1σ
    error bars. Models, colors and alpha levels come from the module-level
    ``models``, ``base_color_map``, ``LINE_ALPHA``, ``FILL_ALPHA`` and
    ``POINT_ALPHA``.

    Args:
        agg_df: Aggregated frame with ``model``, ``target_prompt_tokens``,
            ``n``, ``y_mean_col`` and ``y_std_col`` columns.
        run_df: Per-run frame with ``model``, ``target_prompt_tokens``,
            ``job_id``, ``idx`` and ``run_y_col`` columns.
        y_mean_col: Column of ``agg_df`` holding the mean to plot.
        y_std_col: Column of ``agg_df`` holding the standard deviation.
        run_y_col: Column of ``run_df`` holding the per-run value.
        title: Figure title.
        y_label: Y-axis title.
        show_band: Draw the filled mean ± std band.
        show_errorbars: Draw ± std error bars on the mean markers.
        show_runs: Draw one marker per individual run.

    Returns:
        The assembled figure. Its y-axis range covers every value that is
        drawn (means, mean ± std when a spread overlay is on, and per-run
        points when shown), not just the means.
    """
    fig = go.Figure()

    if show_runs:
        for m in models:
            d = run_df[run_df["model"] == m].sort_values("target_prompt_tokens")
            if d.empty:
                continue
            fig.add_trace(
                go.Scatter(
                    x=d["target_prompt_tokens"],
                    y=d[run_y_col],
                    mode="markers",
                    name=f"{m} (runs)",
                    legendgroup=m,
                    showlegend=False,
                    marker=dict(
                        color=hex_to_rgba(base_color_map[m], POINT_ALPHA),
                        size=7,
                    ),
                    hovertemplate=(
                        f"Model: {m}<br>"
                        "Prompt: %{x}<br>"
                        f"{run_y_col}: %{{y:.4g}}<br>"
                        "job_id: %{customdata[0]}<br>"
                        "idx: %{customdata[1]}<extra></extra>"
                    ),
                    # Run identifiers are carried along only for the hover text.
                    customdata=np.stack([d["job_id"].astype(str), d["idx"].astype(str)], axis=1),
                )
            )

    for m in models:
        g = agg_df[agg_df["model"] == m].sort_values("target_prompt_tokens")
        if g.empty:
            continue

        x = g["target_prompt_tokens"].values
        y = g[y_mean_col].values
        s = g[y_std_col].values
        color_line = hex_to_rgba(base_color_map[m], LINE_ALPHA)

        if show_band:
            upper = y + s
            lower = y - s
            # Invisible upper edge first; the lower edge then fills up to it
            # via fill="tonexty".
            fig.add_trace(
                go.Scatter(
                    x=x,
                    y=upper,
                    mode="lines",
                    line=dict(width=0),
                    name=f"{m} (+1σ)",
                    legendgroup=m,
                    showlegend=False,
                    hoverinfo="skip",
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=x,
                    y=lower,
                    mode="lines",
                    line=dict(width=0),
                    fill="tonexty",
                    fillcolor=hex_to_rgba(base_color_map[m], FILL_ALPHA),
                    name=f"{m} (±1σ)",
                    legendgroup=m,
                    showlegend=False,
                    hoverinfo="skip",
                )
            )

        err = dict(type="data", array=s, visible=True) if show_errorbars else None

        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                mode="lines+markers",
                name=m,
                legendgroup=m,
                line=dict(color=color_line, width=2),
                marker=dict(size=8, color=color_line),
                error_y=err,
                hovertemplate=(
                    f"Model: {m}<br>"
                    "Prompt: %{x}<br>"
                    f"{y_mean_col}: %{{y:.4g}}<br>"
                    f"{y_std_col}: %{{customdata[0]:.4g}}<br>"
                    "n: %{customdata[1]}<extra></extra>"
                ),
                customdata=np.stack([s, g["n"].values], axis=1),
            )
        )

    fig.update_layout(
        title=title,
        template="plotly_white",
        hovermode="x unified",
        xaxis_title="Target prompt tokens",
        yaxis_title=y_label,
        margin=dict(l=75, r=25, t=60, b=60),
        legend_title_text="Model",
    )
    fig.update_xaxes(tickmode="array", tickvals=sorted(agg_df["target_prompt_tokens"].unique()))

    # dense_y_ticks pins an explicit axis range, so the extent must include
    # everything drawn. Using the means alone would push per-run points, bands
    # and error bars outside the initial view.
    y_min, y_max = _plotted_y_extent(
        agg_df,
        run_df[run_df["model"].isin(models)],
        y_mean_col,
        y_std_col,
        run_y_col,
        include_spread=show_band or show_errorbars,
        include_runs=show_runs,
    )
    pad = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
    dense_y_ticks(fig, y_min - pad, y_max + pad, target_ticks=16)

    return fig


def heatmap_mean(
    agg_df: pd.DataFrame,
    value_col: str,
    title: str,
    z_fmt: str = ".4g"
) -> go.Figure:
    """Build a model × target-prompt-tokens heatmap of ``value_col``.

    Rows follow the module-level ``models`` order and columns are sorted by
    prompt size; cells sharing a (model, prompt size) pair are averaged.
    ``z_fmt`` is the format spec applied to the cell value in the hover text.
    """
    grid = (
        agg_df
        .pivot_table(index="model", columns="target_prompt_tokens", values=value_col, aggfunc="mean")
        .reindex(index=models)
        .sort_index(axis=1)
    )
    prompt_sizes = [int(col) for col in grid.columns]

    hover_text = (
        "Model: %{y}<br>"
        "Prompt tokens: %{x}<br>"
        f"{value_col}: %{{z:{z_fmt}}}<extra></extra>"
    )
    heat = go.Heatmap(
        x=prompt_sizes,
        y=grid.index.tolist(),
        z=grid.values,
        colorbar=dict(title=value_col),
        hovertemplate=hover_text,
    )

    fig = go.Figure(data=heat)
    fig.update_layout(
        title=title,
        template="plotly_white",
        xaxis_title="Target prompt tokens",
        yaxis_title="Model",
        margin=dict(l=180, r=25, t=60, b=60),
    )
    # Let the left margin grow to fit the model names.
    fig.update_yaxes(automargin=True)
    fig.update_xaxes(tickmode="array", tickvals=prompt_sizes)
    return fig


## Shared subset for 4096-target products
# Aggregated rows for the 4096-token prompt target, reused by later cells.
df_4096 = agg.loc[agg["target_prompt_tokens"].eq(4096)].copy()

## Figure 1: TTFT vs Prompt Length
Mean TTFT by model across prompt sizes, with per-run points, ±1σ bands, and error bars. Use this to compare startup responsiveness under increasing context.

In [None]:
# Figure: TTFT vs prompt length
# Band, error bars and per-run dots are left at their defaults (all enabled).
fig_ttft = mean_line_with_band_and_errors(
    agg,
    stream,
    y_mean_col="ttft_mean",
    y_std_col="ttft_std",
    run_y_col="ttft_s",
    title="TTFT vs prompt length (mean ±1σ, per-run dots)",
    y_label="TTFT (seconds)",
)
fig_ttft.show()

## Figure 2: Prefill Throughput vs Prompt Length
Mean prefill throughput (tokens/s) with variability overlays. This highlights how efficiently each model processes input context before decoding.

In [None]:
# Figure: Prefill throughput vs prompt length
# Band, error bars and per-run dots are left at their defaults (all enabled).
fig_prefill = mean_line_with_band_and_errors(
    agg,
    stream,
    y_mean_col="prefill_mean",
    y_std_col="prefill_std",
    run_y_col="prefill_eff_tps",
    title="Prefill throughput vs prompt length (mean ±1σ, per-run dots)",
    y_label="Prefill throughput (tokens/sec)",
)
fig_prefill.show()

## Figure 3: Decode Throughput vs Prompt Length
Mean decode throughput (tokens/s) with per-run scatter and ±1σ context. Use this for generation-speed comparisons across context sizes.

In [None]:
# Figure: Decode throughput vs prompt length
# Band, error bars and per-run dots are left at their defaults (all enabled).
fig_decode = mean_line_with_band_and_errors(
    agg,
    stream,
    y_mean_col="decode_mean",
    y_std_col="decode_std",
    run_y_col="decode_eff_tps",
    title="Decode throughput vs prompt length (mean ±1σ, per-run dots)",
    y_label="Decode throughput (tokens/sec)",
)
fig_decode.show()

## Figure 4: Estimated Wall Time vs Prompt Length
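Estimated completion time, defined as $\mathrm{TTFT} + \frac{\mathrm{max\_output\_tokens}}{\mathrm{decode\_throughput}}$: the projected wall time if the model generated its full output budget at the measured decode rate. Useful for end-to-end latency planning.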

In [None]:
# Figure: Estimated wall time vs prompt length
# Band, error bars and per-run dots are left at their defaults (all enabled).
fig_est_wall = mean_line_with_band_and_errors(
    agg,
    stream,
    y_mean_col="est_wall_mean",
    y_std_col="est_wall_std",
    run_y_col="est_wall_s",
    title="Estimated wall time vs prompt length (TTFT + max_output/decode) (mean ±1σ, per-run dots)",
    y_label="Estimated wall time (seconds)",
)
fig_est_wall.show()

## Figure 5: Prefill Throughput Heatmap (Mean)
Heatmap of average prefill throughput by model and prompt length for quick ranking of context-ingestion performance.

In [None]:
# Figure: Prefill throughput heatmap (mean)
fig_heat_prefill = heatmap_mean(
    agg_df=agg,
    value_col="prefill_mean",
    title="Prefill throughput heatmap (mean tokens/sec)",
)
fig_heat_prefill.show()

## Figure 6: Decode Throughput Heatmap (Mean)
Heatmap of average decode throughput by model and prompt length to compare generation speed at a glance.

In [None]:
# Figure: Decode throughput heatmap (mean)
fig_heat_decode = heatmap_mean(
    agg_df=agg,
    value_col="decode_mean",
    title="Decode throughput heatmap (mean tokens/sec)",
)
fig_heat_decode.show()

## Figure 7: Prefill Throughput Heatmap (Std Dev)
Variability heatmap for prefill throughput. Larger values indicate less stable performance across runs.

In [None]:
# Figure: Prefill throughput heatmap (std dev)
fig_heat_prefill_std = heatmap_mean(
    agg_df=agg,
    value_col="prefill_std",
    title="Prefill throughput heatmap (std dev tokens/sec)",
)
fig_heat_prefill_std.show()

## Figure 8: Decode Throughput Heatmap (Std Dev)
Variability heatmap for decode throughput to identify unstable generation behavior by model and context size.

In [None]:
# Figure: Decode throughput heatmap (std dev)
fig_heat_decode_std = heatmap_mean(
    agg_df=agg,
    value_col="decode_std",
    title="Decode throughput heatmap (std dev tokens/sec)",
)
fig_heat_decode_std.show()

## Figure 9: Tradeoff at 4096 Tokens
Scatter plot of TTFT versus decode throughput at 4096 prompt tokens with ±1σ error bars, showing latency/speed tradeoffs.

In [None]:
# Figure: Tradeoff @ 4096 prompt tokens
# One point per model: mean TTFT (x) against mean decode throughput (y),
# with ±1σ error bars on both axes.
fig_tradeoff_4096 = px.scatter(
    df_4096,
    x="ttft_mean",
    y="decode_mean",
    color="model",
    color_discrete_map=base_color_map,
    error_x="ttft_std",
    error_y="decode_std",
    hover_data={
        "model": True,
        "n": True,
        "ttft_mean": ":.4g",
        "ttft_std": ":.4g",
        "prefill_mean": ":.4g",
        "decode_mean": ":.4g",
        "decode_std": ":.4g",
        "est_wall_mean": ":.4g",
        "early_stop_frac": ":.3f",
    },
    title="Tradeoff @ 4096 prompt tokens: TTFT vs decode (error bars = ±1σ)",
)
fig_tradeoff_4096.update_layout(
    template="plotly_white",
    xaxis_title="TTFT mean (s) @ 4096",
    yaxis_title="Decode throughput mean (tokens/sec) @ 4096",
    margin=dict(l=75, r=25, t=60, b=60),
)
# dense_y_ticks pins an explicit y range, so size it from mean ± 1σ rather than
# the means alone; otherwise the decode error bars are clipped. The 5% padding
# keeps the outermost markers off the axis edges.
decode_extent = pd.DataFrame(
    {
        "decode": pd.concat(
            [
                df_4096["decode_mean"] - df_4096["decode_std"],
                df_4096["decode_mean"] + df_4096["decode_std"],
            ],
            ignore_index=True,
        )
    }
)
decode_lo, decode_hi = _range(decode_extent, "decode")
decode_pad = 0.05 * (decode_hi - decode_lo) if decode_hi > decode_lo else 1.0
dense_y_ticks(fig_tradeoff_4096, decode_lo - decode_pad, decode_hi + decode_pad, target_ticks=14)
fig_tradeoff_4096.show()

## Figure 10: Stream vs Non-Stream Wall-Time Outliers
Highlights runs where $|\mathrm{wall}_{\mathrm{stream}} - \mathrm{wall}_{\mathrm{nonstream}}| > 5\,\mathrm{s}$. Use this to investigate suspicious timing divergence.

In [None]:
# Figure: Outliers for stream vs nonstream wall mismatch
# Only drawn when at least one matched pair exceeded the mismatch threshold.
if timing_outliers.empty:
    print("No stream/nonstream wall-time outliers above threshold.")
else:
    fig_wall_mismatch = px.scatter(
        timing_outliers,
        x="target_prompt_tokens",
        y="wall_diff_s",
        color="model",
        symbol="model",
        color_discrete_map=base_color_map,
        hover_data=dict(
            job_id=True,
            idx=True,
            wall_s_stream=":.4g",
            wall_s_nonstream=":.4g",
            wall_diff_s=":.4g",
            ttft_s=":.4g",
            completion_tokens=True,
            max_output_tokens=True,
            early_stop=True,
        ),
        title="Outliers: stream vs nonstream wall-time mismatch (wall_stream - wall_nonstream)",
    )
    fig_wall_mismatch.update_layout(
        margin=dict(l=75, r=25, t=60, b=60),
        template="plotly_white",
        xaxis_title="Target prompt tokens",
        yaxis_title="Wall time difference (seconds)",
    )
    dense_y_ticks(fig_wall_mismatch, *_range(timing_outliers, "wall_diff_s"), target_ticks=12)
    fig_wall_mismatch.show()

## Product 1: Leaderboard Table at 4096
Sorted table of models by estimated wall time at 4096 prompt tokens, including TTFT/prefill/decode means, std dev, and early-stop fraction.

In [None]:
# Product: Leaderboard table @ 4096
# Models at the 4096-token target, fastest estimated wall time first.
leader_4096 = (
    df_4096.loc[
        :,
        [
            "model",
            "ttft_mean", "ttft_std",
            "prefill_mean", "prefill_std",
            "decode_mean", "decode_std",
            "est_wall_mean", "est_wall_std",
            "early_stop_frac",
            "n",
        ],
    ]
    .sort_values(by="est_wall_mean")
    .reset_index(drop=True)
)
display(leader_4096)

## Product 2: Early-Stop Runs
Lists runs whose completion token count is below the early-stop threshold (less than 90% of `max_output_tokens`), with timing and throughput fields for debugging.

In [None]:
# Product: Runs with early stops
# Per-run rows flagged as early stops, ordered by model, prompt size and run index.
early_stop_runs = stream.loc[
    stream["early_stop"],
    [
        "job_id", "model", "target_prompt_tokens", "idx",
        "completion_tokens", "max_output_tokens",
        "wall_s", "ttft_s", "decode_eff_tps", "prefill_eff_tps",
    ],
].sort_values(by=["model", "target_prompt_tokens", "idx"])
if early_stop_runs.empty:
    print("No early-stop runs found.")
else:
    display(early_stop_runs)

## Product 3: Timing Mismatch Table
Detailed table of stream/non-stream wall-time mismatches for direct inspection and filtering.

In [None]:
# Product: Stream vs nonstream mismatch table
# Same outlier rows as the mismatch figure, ordered by wall-time difference.
if timing_outliers.empty:
    print("No timing mismatches above threshold.")
else:
    display(
        timing_outliers.loc[
            :,
            [
                "job_id", "model", "target_prompt_tokens", "idx",
                "wall_s_stream", "wall_s_nonstream", "wall_diff_s",
                "ttft_s", "completion_tokens", "max_output_tokens", "early_stop",
            ],
        ].sort_values(by="wall_diff_s")
    )

## Product 4: Warmup Bookkeeping Reminder
Prints warmup row count and success rate to help interpret warmup-related anomalies in benchmark logs.

In [None]:
# Product: Warmup bookkeeping reminder
# Report how many warmup rows exist and what fraction of them are marked ok.
warmup_rows = req_df.loc[req_df["warmup_run"]]
if warmup_rows.empty:
    print("No warmup rows found in requests.csv.")
else:
    print(f"Note: warmup_run rows in requests.csv: {len(warmup_rows)}; ok-rate={warmup_rows['ok'].mean():.3f} (often 0.0 due to harness bookkeeping/errors).")