![Banner](https://github.com/LittleHouse75/flatiron-resources/raw/main/NevitsBanner.png)
---
# Experiment 3 — Frontier LLMs via OpenRouter
### Zero-Shot Dialogue Summarization Using API Models
---

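This notebook evaluates **frontier large language models** via **OpenRouter**, using a *single* API interface.  
The models under test are configured in `OPENROUTER_MODELS` below (currently OpenAI models only); models from other providers available on OpenRouter (Anthropic, Google, Mistral, etc.) can be added there.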

We:
- Load the SAMSum validation set  
- Sample N examples  
- Send them to multiple frontier models  
- Score ROUGE  
- Save predictions + latencies  
- Produce qualitative examples  

This establishes the **upper-bound performance baseline** for the project.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pathlib import Path
import sys
import numpy as np
import pandas as pd

# The notebook is expected to run from a sub-folder of the project, so the
# parent directory is treated as the project root.
PROJECT_ROOT = Path("..").resolve()

# Make `src.*` importable, without adding a duplicate entry when the cell is
# re-run.
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

# All CSV outputs of this experiment are written here; create it up front.
RESULTS_DIR = PROJECT_ROOT.joinpath("experiments", "exp3_api_llm_results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

RESULTS_DIR

In [None]:
# %%
# Number of validation dialogues sent to each model. Every example costs one
# API call per configured model, so keep this small while iterating.
N_SAMPLES = 1       # cost control
# Seed used when sampling the evaluation subset and the qualitative examples.
SEED = 42

# Summary length for the frontier models
# (forwarded as `max_tokens` on every OpenRouter request).
MAX_OUT_TOKENS = 128

# Which OpenRouter models to evaluate
# Maps a short label (used for the progress bar, the results dict key, and the
# per-model CSV file name) to the model ID passed to the OpenRouter client.
OPENROUTER_MODELS = {
    "gpt5_nano": "openai/gpt-5-nano",
    "gpt5_mini": "openai/gpt-5-mini",
    "gpt_oss_120b": "openai/gpt-oss-120b",
    "gpt_oss_20b": "openai/gpt-oss-20b",
}

# Display the mapping as the cell output.
OPENROUTER_MODELS

In [None]:
# Project-local loader for the SAMSum dataset.
from src.data.load_data import load_samsum

# The loader returns three splits; only `val_df` is used for evaluation in this
# notebook. NOTE(review): the splits are presumably pandas DataFrames (they are
# later used with `.sample()` / `.copy()`) — confirm against `load_samsum`.
train_df, val_df, test_df = load_samsum()
# Display the size of each split as the cell output.
len(train_df), len(val_df), len(test_df)

In [None]:
# NOTE: `rng` is created here, but the sampling below seeds pandas directly
# through `random_state=SEED`; nothing in this cell draws from the generator.
rng = np.random.default_rng(SEED)

# Use the whole validation split (in its original order) when it has no more
# rows than requested; otherwise draw a reproducible subset of N_SAMPLES rows.
use_full_split = N_SAMPLES >= len(val_df)
selected = val_df.copy() if use_full_split else val_df.sample(n=N_SAMPLES, random_state=SEED)

# A fresh 0..n-1 index keeps positional and label-based access aligned.
eval_df = selected.reset_index(drop=True)

eval_df.head()

In [None]:
def build_summarization_prompt(dialogue: str) -> str:
    """Wrap a chat transcript in the zero-shot summarization instructions.

    The prompt consists of the task instructions, a blank line, the transcript
    fenced by ``-----`` lines under a ``DIALOGUE:`` header, a blank line, and a
    trailing ``SUMMARY:`` cue for the model to complete.

    Args:
        dialogue: Raw dialogue text, inserted verbatim between the fences.

    Returns:
        The complete prompt string (no trailing newline after ``SUMMARY:``).
    """
    instructions = (
        "Summarize the following chat conversation in 1–3 sentences. "
        "Focus on actions, decisions, and plans. "
        "Do not add information not supported by the text."
    )
    fence = "-----"
    # Empty entries become the blank lines separating the three sections.
    sections = [
        instructions,
        "",
        "DIALOGUE:",
        fence,
        f"{dialogue}",
        fence,
        "",
        "SUMMARY:",
    ]
    return "\n".join(sections)

# Sanity check: show the first 300 characters of the prompt built for the
# first sampled dialogue.
build_summarization_prompt(eval_df['dialogue'].iloc[0])[:300]

In [None]:
# Import the project's OpenRouter client module and force a reload so that
# edits to it are picked up without restarting the kernel.
# NOTE(review): `%autoreload 2` is enabled at the top of this notebook, so the
# explicit reload is likely redundant — confirm before removing.
import importlib
import src.utils.openrouter_client as openrouter_client
importlib.reload(openrouter_client)

In [None]:
from tqdm.auto import tqdm

# label -> DataFrame with one row per evaluated dialogue for that model.
results_by_model = {}

for label, model_id in OPENROUTER_MODELS.items():
    rows = []
    example_pairs = zip(eval_df["dialogue"], eval_df["summary"])

    for dialogue, reference in tqdm(example_pairs, total=len(eval_df), desc=label):
        request = {
            "model": model_id,
            "prompt": build_summarization_prompt(dialogue),
            "max_tokens": MAX_OUT_TOKENS,
            "temperature": 0.2,
        }

        # Best effort: a failed call must not abort the whole run, so the
        # failure is recorded in place of the summary, with no latency.
        try:
            pred, latency = openrouter_client.call_openrouter_llm(**request)
        except Exception as e:
            pred, latency = f"[ERROR: {e}]", np.nan

        rows.append(
            {
                "dialogue": dialogue,
                "reference_summary": reference,
                "model_summary": pred,
                "latency_seconds": latency,
            }
        )

    # Keep the results in memory for the following cells and persist them
    # as one CSV per model label.
    df_out = pd.DataFrame(rows)
    results_by_model[label] = df_out
    df_out.to_csv(RESULTS_DIR / f"{label}.csv", index=False)

results_by_model.keys()

In [None]:
from src.eval.rouge_eval import compute_rouge_from_df

# Failed API calls are stored by the inference loop as "[ERROR: ...]"
# placeholders in `model_summary`. Scoring those strings as if they were
# summaries would silently drag ROUGE down, so they are excluded (and
# reported) before scoring.
ERROR_PREFIX = "[ERROR: "

# label -> ROUGE scores as returned by `compute_rouge_from_df`, or None when
# no call for that model succeeded.
rouge_scores = {}

for label, df in results_by_model.items():
    print(f"\n=== ROUGE for {label} ===")

    # `astype(str)` guards against non-string predictions (e.g. None/NaN).
    failed = df["model_summary"].astype(str).str.startswith(ERROR_PREFIX)
    n_failed = int(failed.sum())
    if n_failed:
        print(f"Excluding {n_failed} of {len(df)} rows with failed API calls.")

    scored_df = df.loc[~failed].reset_index(drop=True)
    if scored_df.empty:
        # Nothing valid to score: record that explicitly instead of
        # computing ROUGE on error messages or on an empty frame.
        rouge_scores[label] = None
        print("No successful predictions to score.")
        continue

    scores = compute_rouge_from_df(scored_df)
    rouge_scores[label] = scores
    print(scores)

In [None]:
def latency_summary(df, label):
    """Print latency distribution statistics for one model's results.

    Infinite and missing values in the ``latency_seconds`` column are dropped
    before the statistics are computed.

    Args:
        df: Results frame containing a ``latency_seconds`` column.
        label: Model label shown in the printed header.

    Returns:
        None. The header and statistics are printed to stdout.
    """
    print(f"\n=== Latency stats: {label} ===")

    non_finite = [np.inf, -np.inf]
    latencies = df["latency_seconds"]
    usable = latencies.replace(non_finite, np.nan).dropna()

    # Median plus the two tail percentiles of interest.
    stats = usable.describe(percentiles=[0.5, 0.9, 0.95])
    print(stats)

# Print the latency statistics for every evaluated model, in the order the
# models were run.
for label, df in results_by_model.items():
    latency_summary(df, label)

In [None]:
def show_examples(df, n=5, max_chars=600):
    """Print a random selection of dialogues with human and model summaries.

    At most ``n`` rows are drawn (fewer when ``df`` is shorter), using the
    notebook-level ``SEED`` so the same rows are shown on every run.

    Args:
        df: Results frame with ``dialogue``, ``reference_summary`` and
            ``model_summary`` columns.
        n: Maximum number of examples to print.
        max_chars: Dialogues longer than this are cut to ``max_chars``
            characters and marked as truncated.

    Returns:
        None. Everything is printed to stdout.
    """
    how_many = min(n, len(df))
    picked = df.sample(n=how_many, random_state=SEED)
    divider = "\n" + "-"*60

    columns = zip(
        picked["dialogue"],
        picked["reference_summary"],
        picked["model_summary"],
    )
    for dialogue, human_summary, model_summary in columns:
        # Keep long transcripts readable in the notebook output.
        if len(dialogue) > max_chars:
            dialogue = dialogue[:max_chars] + " ... [truncated]"

        print("\n=== Example ===")
        print("[DIALOGUE]")
        print(dialogue)
        print("\n[HUMAN SUMMARY]")
        print(human_summary)
        print("\n[MODEL SUMMARY]")
        print(model_summary)
        print(divider)

# Print up to five qualitative examples per model under a per-model banner.
for label, df in results_by_model.items():
    print(f"\n##### Examples for {label} #####")
    show_examples(df, n=5)

---
# Key Takeaways — Experiment 3 (Frontier LLMs via OpenRouter)
---

Fill this in after running:

- ROUGE performance of each frontier LLM  
- Latency comparisons  
- Style differences (concise vs narrative)  
- Error cases or hallucinations  
- Cost/latency trade-offs compared to local models  
- Which model will be used in the final comparison notebook  

This experiment establishes the upper-bound performance for the project.