![Banner](https://github.com/LittleHouse75/flatiron-resources/raw/main/NevitsBanner.png)
---
# Experiment 3 — Frontier LLMs via OpenRouter
### Zero-Shot Dialogue Summarization Using API Models
---

This notebook evaluates **frontier large language models** (OpenAI, Anthropic, Google, Qwen, etc.)  
via **OpenRouter**, using a *single* API interface.

We:
- Load the SAMSum validation set  
- Sample N examples  
- Send them to multiple frontier models  
- Score ROUGE  
- Save predictions + latencies  
- Produce qualitative examples  

This establishes the **upper-bound performance baseline** for the project.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pathlib import Path
import sys
import numpy as np
import pandas as pd

# Resolve the parent of the current working directory as the project root and
# put it on sys.path so the `src.*` imports in later cells resolve.
# NOTE(review): this assumes the notebook is launched from a directory directly
# under the repository root — confirm when running from elsewhere.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Output directory for this experiment; the per-model prediction CSVs are written here.
RESULTS_DIR = PROJECT_ROOT / "experiments" / "exp3_api_llm_results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

RESULTS_DIR

PosixPath('/Users/timnevits/Projects/flatiron-language-models-for-ai/experiments/exp3_api_llm_results')

In [None]:
# %%
# Evaluation configuration: sample size, seed, output budget, and the model list.
# NOTE(review): the saved progress bars and latency counts in this notebook show
# 10 examples per model, not 100 — re-run all cells so the outputs match N_SAMPLES.
N_SAMPLES = 100       # number of validation dialogues sent to every model (cost control)
SEED = 42             # random_state for the evaluation subsample and the example picker

# Upper bound on generated tokens per API call (passed as `max_tokens`).
# This is far above the 15–30 word target stated in the prompt.
# NOTE(review): presumably headroom for models that spend tokens on reasoning — confirm.
MAX_OUT_TOKENS = 512

# Which OpenRouter models to evaluate.
# Keys are short labels used for progress-bar descriptions, CSV filenames and
# table rows; values are the model ids passed to the OpenRouter client.
OPENROUTER_MODELS = {
    # OpenAI family – small → big
    "gpt5_nano":       "openai/gpt-5-nano",
    "gpt5_mini":       "openai/gpt-5-mini",
    "gpt5_full":       "openai/gpt-5",          # flagship upper bound

    # OpenAI open-weight models
    "gpt_oss_20b":     "openai/gpt-oss-20b",
    "gpt_oss_120b":    "openai/gpt-oss-120b",

    # Google Gemini – fast, very strong general model
    "gemini_25_flash": "google/gemini-2.5-flash",

    # Anthropic Claude – strong competitor
    "claude_45_sonnet": "anthropic/claude-4.5-sonnet-20250929",

    # Qwen – top-tier open(-ish) model
    "qwen25_72b":      "qwen/qwen-2.5-72b-instruct",
}

OPENROUTER_MODELS

{'gpt5_nano': 'openai/gpt-5-nano',
 'gpt5_mini': 'openai/gpt-5-mini',
 'gpt5_full': 'openai/gpt-5',
 'gpt_oss_20b': 'openai/gpt-oss-20b',
 'gpt_oss_120b': 'openai/gpt-oss-120b',
 'gemini_25_flash': 'google/gemini-2.5-flash',
 'claude_45_sonnet': 'anthropic/claude-4.5-sonnet-20250929',
 'qwen25_72b': 'qwen/qwen-2.5-72b-instruct'}

In [4]:
from src.data.load_data import load_samsum

# load_samsum() returns three frames, unpacked here as train / validation / test;
# display their row counts as a quick sanity check.
train_df, val_df, test_df = load_samsum()
len(train_df), len(val_df), len(test_df)

(14731, 818, 819)

In [5]:
# Seeded NumPy generator. It is not consumed by the sampling below, which is
# seeded directly through `random_state=SEED`.
rng = np.random.default_rng(SEED)

# Use the whole validation split when it has no more than N_SAMPLES rows;
# otherwise draw a reproducible random subset. Either way the index is reset
# so rows are numbered 0..len-1.
eval_df = (
    val_df.copy()
    if N_SAMPLES >= len(val_df)
    else val_df.sample(n=N_SAMPLES, random_state=SEED)
).reset_index(drop=True)

eval_df.head()

Unnamed: 0,id,dialogue,summary
0,13680857,"Edd: wow, did you hear that they're transferri...",Rose and Edd will be transferred to a new depa...
1,13716124,"Tom: Where is the ""Sala del Capitolo""\nKevin: ...","""Sala del Capitolo"" Tom is looking for is in t..."
2,13864418,Patricia: The rowing practice is cancelled!\nK...,The rowing practice is cancelled. A few member...
3,13729340,"Tom: U OK?\nAlex: Yeah, pretty good. U?\nTom: ...",Tom and Alex had fun last night. They drank a ...
4,13818813,"Patricia: Hello, here's the fair-trade brand I...",Patricia recommends a fair-trade brand she tal...


In [6]:
def build_summarization_prompt(dialogue: str) -> str:
    """Wrap a dialogue in the zero-shot summarization prompt sent to every model.

    The prompt consists of a fixed instruction paragraph, the dialogue fenced
    between two ``-----`` lines under a ``DIALOGUE:`` header, and a trailing
    ``SUMMARY:`` cue.

    Args:
        dialogue: Raw conversation text, inserted verbatim.

    Returns:
        The complete prompt string.
    """
    instructions = " ".join(
        [
            "Summarize the following conversation in 1–2 sentences.",
            "Keep it brief—aim for 15–30 words.",
            "Focus on the main point, decisions, requests, or outcomes.",
            "Ignore small talk and do not add details that aren't supported by the text.",
        ]
    )
    fence = "-----"
    return f"{instructions}\n\nDIALOGUE:\n{fence}\n{dialogue}\n{fence}\n\nSUMMARY:"

# Preview the first 300 characters of the prompt built for the first evaluation dialogue.
build_summarization_prompt(eval_df['dialogue'].iloc[0])[:300]

"Summarize the following conversation in 1–2 sentences. Keep it brief—aim for 15–30 words. Focus on the main point, decisions, requests, or outcomes. Ignore small talk and do not add details that aren't supported by the text.\n\nDIALOGUE:\n-----\nEdd: wow, did you hear that they're transferring us to a d"

In [7]:
# Import the OpenRouter client module and force a reload so that edits made to
# the module since the kernel started are picked up.
# NOTE(review): likely redundant with `%autoreload 2` enabled in the first cell —
# confirm before removing.
import importlib
import src.utils.openrouter_client as openrouter_client
importlib.reload(openrouter_client)

<module 'src.utils.openrouter_client' from '/Users/timnevits/Projects/flatiron-language-models-for-ai/src/utils/openrouter_client.py'>

In [8]:
from tqdm.auto import tqdm

# Query every configured model on every evaluation dialogue.
# For each model this builds a frame with the dialogue, the human reference,
# the model output, the call latency and (new) a structured `error` column,
# stores it in `results_by_model[label]` and writes it to RESULTS_DIR/<label>.csv.
results_by_model = {}

for label, model_id in OPENROUTER_MODELS.items():
    rows = []
    n_failed = 0
    for _row_idx, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc=label):
        dialogue = row["dialogue"]
        reference = row["summary"]

        prompt = build_summarization_prompt(dialogue)

        error = None
        try:
            pred, latency = openrouter_client.call_openrouter_llm(
                model=model_id,
                prompt=prompt,
                max_tokens=MAX_OUT_TOKENS,
                temperature=0.2,  # low temperature to keep summaries close to deterministic
            )
        except Exception as e:
            # Best-effort: one failed call must not abort a paid multi-model run.
            # The placeholder text and NaN latency are kept for backward
            # compatibility; the failure is additionally recorded in `error`
            # so downstream cells can filter on it instead of parsing text.
            pred = f"[ERROR: {e}]"
            latency = np.nan
            error = f"{type(e).__name__}: {e}"
            n_failed += 1

        rows.append({
            "dialogue": dialogue,
            "reference_summary": reference,
            "model_summary": pred,
            "latency_seconds": latency,
            "error": error,
        })

    df_out = pd.DataFrame(rows)
    results_by_model[label] = df_out
    df_out.to_csv(RESULTS_DIR / f"{label}.csv", index=False)

    # Surface failures immediately; otherwise they are only visible by
    # reading individual rows of the saved predictions.
    if n_failed:
        print(f"[WARN] {label}: {n_failed}/{len(df_out)} API calls failed (see 'error' column)")

results_by_model.keys()

gpt5_nano:   0%|          | 0/10 [00:00<?, ?it/s]

gpt5_mini:   0%|          | 0/10 [00:00<?, ?it/s]

gpt5_full:   0%|          | 0/10 [00:00<?, ?it/s]

gpt_oss_20b:   0%|          | 0/10 [00:00<?, ?it/s]

gpt_oss_120b:   0%|          | 0/10 [00:00<?, ?it/s]

gemini_25_flash:   0%|          | 0/10 [00:00<?, ?it/s]

claude_45_sonnet:   0%|          | 0/10 [00:00<?, ?it/s]

qwen25_72b:   0%|          | 0/10 [00:00<?, ?it/s]

dict_keys(['gpt5_nano', 'gpt5_mini', 'gpt5_full', 'gpt_oss_20b', 'gpt_oss_120b', 'gemini_25_flash', 'claude_45_sonnet', 'qwen25_72b'])

In [9]:
import pandas as pd
from src.eval.rouge_eval import compute_rouge_from_df

# ROUGE per model, computed only over rows where the API call succeeded.
rouge_scores = {}

for label, df in results_by_model.items():
    # The generation loop stores a failed call as "[ERROR: ...]" text with a NaN
    # latency. Scoring that placeholder as if it were a summary would lower the
    # model's ROUGE for reasons unrelated to summary quality, so exclude it.
    failed = (
        df["model_summary"].astype(str).str.startswith("[ERROR:")
        & df["latency_seconds"].isna()
    )
    n_failed = int(failed.sum())
    if n_failed:
        print(f"[WARN] {label}: excluding {n_failed}/{len(df)} failed API calls from ROUGE")

    scored_df = df.loc[~failed].reset_index(drop=True)
    if scored_df.empty:
        # Nothing to score for this model; leave it out of the table.
        print(f"[WARN] {label}: no successful predictions; skipping ROUGE")
        continue

    scores = compute_rouge_from_df(scored_df)
    rouge_scores[label] = scores

# Make a single DataFrame: one row per model, one column per ROUGE variant
rouge_df = pd.DataFrame.from_dict(rouge_scores, orient="index")
rouge_df.index.name = "model"
# Guard the sort: if every model was skipped the frame has no columns.
if "rougeL" in rouge_df.columns:
    rouge_df = rouge_df.sort_values(by="rougeL", ascending=False)

display(rouge_df)

Unnamed: 0_level_0,rouge1,rouge2,rougeL,rougeLsum
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
qwen25_72b,0.397278,0.148522,0.355726,0.355392
gemini_25_flash,0.394541,0.125473,0.323182,0.321349
gpt5_full,0.390875,0.172374,0.318699,0.317469
gpt_oss_20b,0.359935,0.136262,0.314601,0.313288
gpt5_mini,0.365505,0.162422,0.311356,0.307065
gpt_oss_120b,0.364635,0.135672,0.296989,0.293644
gpt5_nano,0.34307,0.1433,0.295669,0.293645
claude_45_sonnet,0.365731,0.14966,0.281742,0.282952


In [10]:
import numpy as np
import pandas as pd

# Per-model latency summary: count, mean, std, min, p50, p90, p95, max.
latency_stats = {}

for label, df in results_by_model.items():
    # Failed calls carry a NaN latency; drop those (and any ±inf) before summarising.
    finite_latencies = (
        df["latency_seconds"]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    latency_stats[label] = finite_latencies.describe(percentiles=[0.5, 0.9, 0.95])

# Models as rows, statistics as columns, fastest mean latency first.
# Only the percentile labels need renaming; the other describe() names are kept.
latency_df = (
    pd.DataFrame(latency_stats)
    .T
    .rename_axis("model")
    .rename(columns={"50%": "p50", "90%": "p90", "95%": "p95"})
    .sort_values(by="mean", ascending=True)
)

display(latency_df)

Unnamed: 0_level_0,count,mean,std,min,p50,p90,p95,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gemini_25_flash,10.0,0.642808,0.0609,0.548486,0.661273,0.704779,0.708731,0.712683
qwen25_72b,10.0,0.849944,0.271886,0.581328,0.835179,1.040987,1.272119,1.503251
gpt5_mini,10.0,1.314891,0.30771,1.042807,1.201313,1.536704,1.805291,2.073877
gpt5_nano,10.0,1.355169,0.39795,0.84472,1.355637,1.637445,1.96327,2.289095
gpt5_full,10.0,1.442256,0.677555,1.061309,1.238382,1.631731,2.484627,3.337523
gpt_oss_120b,10.0,1.531679,0.787024,0.776933,1.07886,2.548445,2.573005,2.597564
gpt_oss_20b,10.0,2.837158,2.14503,0.924915,2.184061,5.086322,6.524129,7.961935
claude_45_sonnet,10.0,4.456505,1.461348,2.915412,3.930015,6.774541,7.063994,7.353447


In [11]:
def show_examples(df, n=5, max_chars=600, seed=None):
    """Print a random sample of dialogues with their human and model summaries.

    Args:
        df: Frame with "dialogue", "reference_summary" and "model_summary" columns.
        n: Maximum number of rows to print; capped at ``len(df)``.
        max_chars: Dialogues longer than this many characters are cut and
            suffixed with " ... [truncated]".
        seed: ``random_state`` used to pick the rows. When ``None`` (default)
            the notebook-level ``SEED`` is used, matching the previous behavior.

    Returns:
        None. Everything is written to stdout.
    """
    # Resolve the seed lazily so the function only touches the global SEED
    # when the caller did not supply one.
    random_state = SEED if seed is None else seed
    sample = df.sample(n=min(n, len(df)), random_state=random_state)

    for _, row in sample.iterrows():
        d = row["dialogue"]
        if len(d) > max_chars:
            d = d[:max_chars] + " ... [truncated]"

        print("\n=== Example ===")
        print("[DIALOGUE]")
        print(d)
        print("\n[HUMAN SUMMARY]")
        print(row["reference_summary"])
        print("\n[MODEL SUMMARY]")
        print(row["model_summary"])
        print("\n" + "-"*60)

# Print up to five qualitative examples for each evaluated model, under a per-model header.
for label, df in results_by_model.items():
    print(f"\n##### Examples for {label} #####")
    show_examples(df, n=5)


##### Examples for gpt5_nano #####

=== Example ===
[DIALOGUE]
Iza: <file_other>
Monica: OMG
Monica: yesssssss!!!
Iza: I know 
Iza: I have been waiting for this for so long!
Monica: me too!
Monica: :D 

[HUMAN SUMMARY]
Iza has good news.

[MODEL SUMMARY]
Iza and Monica express excitement and anticipation together, acknowledging they’ve both waited for this moment.

------------------------------------------------------------

=== Example ===
[DIALOGUE]
Tom: Where is the "Sala del Capitolo"
Kevin: it's in the main building
Martin: The one with the huge round table
Tom: ok! I know.
Tom: Thx

[HUMAN SUMMARY]
"Sala del Capitolo" Tom is looking for is in the main building.

[MODEL SUMMARY]
Tom asked where the Sala del Capitolo is; Kevin says it's in the main building, and Martin confirms it's the one with the huge round table. Tom acknowledges.

------------------------------------------------------------

=== Example ===
[DIALOGUE]
Mark: What time is the breakfast?
Susanne: 8-11 a.m.
Mark

---
# Key Takeaways — Experiment 3 (Frontier LLMs via OpenRouter)
---

**TODO:** Fill this in after running the notebook:

- ROUGE performance of each frontier LLM  
- Latency comparisons  
- Style differences (concise vs narrative)  
- Error cases or hallucinations  
- Cost/latency trade-offs compared to local models  
- Which model will be used in the final comparison notebook  

This experiment establishes the upper-bound performance for the project.