# Results Dashboard
Use this notebook to explore evaluation outputs generated under `results/`.
Update `RESULT_ROOT` to point at your experiment directory.

In [None]:
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, List

import pandas as pd
import plotly.express as px

# Root directory that is scanned for per-model sub-directories; metrics files are
# read from RESULT_ROOT/<model_tag>/<suite>/metrics.json.
RESULT_ROOT = Path('results')
# Task names looked up under the 'results' key of each model's 'core' suite metrics.
CORE_TASKS = ['mmlu', 'gsm8k', 'hellaswag', 'arc_easy', 'arc_challenge', 'boolq']
# NOTE(review): the three lists below are not referenced by any cell visible here;
# presumably they name other suites/datasets used by later cells -- confirm.
LM_DATASETS = ['wikitext', 'pg19']
LONG_SUITES = ['long', 'scrolls']
LRA_TASKS = ['listops', 'text', 'retrieval']


In [None]:
def load_metrics(path: Path) -> Dict:
    """Load a JSON metrics file, treating a missing file as "no metrics".

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document, or an empty dict when ``path`` does not
        exist (including when one of its parent components is a regular file).

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    try:
        # Open directly instead of checking exists() first: avoids the
        # check-then-open race. UTF-8 is given explicitly so decoding does not
        # depend on the platform's default locale encoding.
        handle = path.open(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        return {}
    with handle:
        return json.load(handle)

def collect_suite_metrics(model_tag: str, suite: str) -> Dict:
    """Read ``metrics.json`` for one model/suite pair under ``RESULT_ROOT``.

    Args:
        model_tag: Name of the model's directory inside ``RESULT_ROOT``.
        suite: Name of the suite directory inside the model directory.

    Returns:
        Whatever ``load_metrics`` returns for that file.
    """
    metrics_file = RESULT_ROOT.joinpath(model_tag, suite, 'metrics.json')
    return load_metrics(metrics_file)

def available_models() -> List[str]:
    """List the model tags found directly under ``RESULT_ROOT``.

    Returns:
        The names of the immediate sub-directories of ``RESULT_ROOT`` in sorted
        order, or an empty list when ``RESULT_ROOT`` is not an existing directory.
    """
    # A fresh checkout may have no results yet; report "no models" rather than
    # letting iterdir() raise.
    if not RESULT_ROOT.is_dir():
        return []
    # iterdir() yields entries in arbitrary order; sorting keeps MODELS[0] and
    # the plot ordering stable between runs.
    return sorted(entry.name for entry in RESULT_ROOT.iterdir() if entry.is_dir())

# Discover the model tags once; the cells below iterate over / index into MODELS.
MODELS = available_models()
# Trailing bare expression so the notebook echoes the discovered tags as cell output.
MODELS


In [None]:
# Build a long-format table with one row per (model, task) accuracy found in
# each model's 'core' suite metrics, then show it as a table and a grouped bar chart.
quality_rows = []
for model_tag in MODELS:
    core = collect_suite_metrics(model_tag, 'core')
    # Loop-invariant per model: look up the results mapping once.
    task_results = core.get('results', {})
    for task in CORE_TASKS:
        score = task_results.get(task, {}).get('accuracy')
        # Tasks without a recorded accuracy are skipped rather than plotted as gaps.
        if score is not None:
            quality_rows.append({'model': model_tag, 'task': task, 'score': score})
# Explicit columns keep the schema visible even when no rows were collected.
quality_df = pd.DataFrame(quality_rows, columns=['model', 'task', 'score'])
# A bare `quality_df` in the middle of a cell is not rendered (only a cell's final
# expression is echoed), so call display() explicitly, as the perf cell does.
display(quality_df)
if not quality_df.empty:
    fig = px.bar(quality_df, x='task', y='score', color='model', barmode='group', title='Core benchmark accuracy')
    fig.show()


In [None]:
# Plot latency against sequence length for the first discovered model, when its
# perf CSV is present; otherwise this cell produces no output.
perf_path = None
if MODELS:
    perf_path = RESULT_ROOT.joinpath(MODELS[0], 'perf', 'latency_mem.csv')

if perf_path is not None and perf_path.exists():
    perf_df = pd.read_csv(perf_path)
    display(perf_df)
    fig = px.line(
        perf_df,
        x='seq_len',
        y='latency_ms',
        color='batch_size',
        markers=True,
        title='Latency vs sequence length',
    )
    fig.show()
