![Banner](https://github.com/LittleHouse75/flatiron-resources/raw/main/NevitsBanner.png)
---
# Experiment 3 — Frontier LLMs via OpenRouter
### Zero-Shot Dialogue Summarization Using API Models
---

This notebook evaluates **frontier large language models** (OpenAI, Anthropic, Google, Mistral, etc.)  
via **OpenRouter**, using a *single* API interface.

We:
- Load the SAMSum validation set  
- Sample N examples  
- Send them to multiple frontier models  
- Score ROUGE  
- Save predictions + latencies  
- Produce qualitative examples  

This establishes the **upper-bound performance baseline** for the project.

In [None]:
%load_ext autoreload
%autoreload 2

## 1. Environment Setup

In [None]:
import os
# Exported before the project imports further down; presumably disables
# Hugging Face tokenizers parallelism (and its fork warning) — confirm it is
# still needed for this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pathlib import Path
import sys
import numpy as np
import pandas as pd

# Absolute path of the parent of the current working directory. Putting it at
# the front of sys.path is what lets the later `from src...` imports resolve.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

## 2. Configuration

**Important:** Set `RUN_API_CALLS` to `False` to load cached results instead of 
making new API calls. This saves money and time when you just want to analyze
previous results.

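You can also control individual models with the `MODELS_TO_RUN` dictionary. It only takes effect when `RUN_API_CALLS` is `True`; a model switched off there is loaded from its cached results if any exist.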

In [None]:
# %%
# =============================================================================
# API CALL FLAGS - Set these to control what runs
# =============================================================================

# Master switch: Set False to load ALL results from cache (no API calls)
RUN_API_CALLS = False

# Per-model control, consulted only when RUN_API_CALLS is True.
# A model set to False is not called; its cached CSV is loaded instead if one
# exists. Keys must match the labels in OPENROUTER_MODELS; a label missing
# from this dict is treated as True by the evaluation loop.
MODELS_TO_RUN = {
    "gpt5_nano":        True,
    "gpt5_mini":        True,
    "gpt5_full":        True,
    "gpt_oss_20b":      True,
    "gpt_oss_120b":     True,
    "gemini_25_flash":  True,
    "claude_45_sonnet": True,
    "qwen25_72b":       True,
}

# =============================================================================
# EXPERIMENT PARAMETERS
# =============================================================================
N_SAMPLES = 100       # Number of dialogues to evaluate (cost control)
SEED = 42             # random_state for every DataFrame.sample() call in this notebook
MAX_OUT_TOKENS = 512  # Max tokens for model responses

# =============================================================================
# PATHS
# =============================================================================
# Per-model result CSVs, the summary CSVs and the comparison figure all live
# here. The directory is created as a side effect of running this cell.
RESULTS_DIR = PROJECT_ROOT / "experiments" / "exp3_api_llm_results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

ROUGE_SUMMARY_PATH = RESULTS_DIR / "rouge_summary.csv"
LATENCY_SUMMARY_PATH = RESULTS_DIR / "latency_summary.csv"

# =============================================================================
# MODEL DEFINITIONS
# =============================================================================
# Maps a short label to the model id passed to the OpenRouter client.
# The label doubles as the cache file name ("<label>.csv" in RESULTS_DIR),
# and the dict's insertion order is the order in which models are evaluated.
OPENROUTER_MODELS = {
    # OpenAI family – small → big
    "gpt5_nano":       "openai/gpt-5-nano",
    "gpt5_mini":       "openai/gpt-5-mini",
    "gpt5_full":       "openai/gpt-5",          # flagship upper bound

    # OpenAI open-weight models
    "gpt_oss_20b":     "openai/gpt-oss-20b",
    "gpt_oss_120b":    "openai/gpt-oss-120b",

    # Google Gemini – fast, very strong general model
    "gemini_25_flash": "google/gemini-2.5-flash",

    # Anthropic Claude – strong competitor
    "claude_45_sonnet": "anthropic/claude-4.5-sonnet-20250929",

    # Qwen – top-tier open(-ish) model
    "qwen25_72b":      "qwen/qwen-2.5-72b-instruct",
}

# Echo the effective configuration so the notebook output records it.
print(f"Results directory: {RESULTS_DIR}")
print(f"RUN_API_CALLS: {RUN_API_CALLS}")
print(f"N_SAMPLES: {N_SAMPLES}")
print(f"Models configured: {len(OPENROUTER_MODELS)}")

## 3. Load SAMSum Data

In [None]:
from src.data.load_data import load_samsum

# load_samsum() is unpacked into three splits. Only val_df is used by the rest
# of this notebook; train_df and test_df are loaded just to report their sizes.
train_df, val_df, test_df = load_samsum()
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

In [None]:
# Build the evaluation subset.
# Sampling is seeded through `random_state=SEED`, so re-running this cell
# always selects the same dialogues. (The NumPy generator that used to be
# created here was never used and has been dropped.)
if N_SAMPLES >= len(val_df):
    # Asking for at least the whole split: keep every row, in original order.
    eval_df = val_df.copy().reset_index(drop=True)
    print(f"Using full validation set: {len(eval_df)} examples")
else:
    eval_df = val_df.sample(n=N_SAMPLES, random_state=SEED).reset_index(drop=True)
    print(f"Sampled {len(eval_df)} examples from validation set")

# Last expression of the cell: renders a preview of the sampled rows.
eval_df.head()

## 4. Prompt Construction

In [None]:
def build_summarization_prompt(dialogue: str) -> str:
    """
    Return the zero-shot summarization prompt for one conversation.

    The prompt is a fixed instruction paragraph, followed by the dialogue
    fenced between two "-----" lines, and ends with a "SUMMARY:" cue for the
    model to complete. The dialogue text is inserted verbatim.
    """
    instructions = " ".join(
        [
            "Summarize the following conversation in 1–2 sentences.",
            "Keep it brief—aim for 15–30 words.",
            "Focus on the main point, decisions, requests, or outcomes.",
            "Ignore small talk and do not add details that aren't supported by the text.",
        ]
    )
    fence = "-----"
    return f"{instructions}\n\nDIALOGUE:\n{fence}\n{dialogue}\n{fence}\n\nSUMMARY:"

# Smoke-test the template on the first sampled dialogue. Only the first 400
# characters are shown; the trailing ellipsis is appended unconditionally.
print("Example prompt (truncated):")
prompt_preview = build_summarization_prompt(eval_df["dialogue"].iloc[0])[:400]
print(f"{prompt_preview}...")


## 5. Run API Calls or Load Cached Results

In [None]:
from src.utils.logging import heading

def get_cached_result_path(model_label: str) -> Path:
    """Return the CSV path inside RESULTS_DIR used to cache *model_label*'s results."""
    cache_filename = f"{model_label}.csv"
    return RESULTS_DIR / cache_filename


def load_cached_results(model_label: str) -> pd.DataFrame | None:
    """
    Load the cached results CSV for a model, or return None if there is none.

    Text columns are normalized after reading. An empty model reply is
    written to the CSV as an empty field, and `pd.read_csv` reads empty fields
    back as NaN. The analysis cells filter with `.str.startswith(...)`, which
    breaks on NaN, so missing cells in the text columns are restored to ""
    and every value is coerced to `str`. The latency column is left untouched
    so failed calls keep their NaN latency.
    """
    path = get_cached_result_path(model_label)
    if not path.exists():
        return None

    cached = pd.read_csv(path)
    for column in ("dialogue", "reference_summary", "model_summary"):
        if column in cached.columns:
            cached[column] = cached[column].map(
                lambda value: "" if pd.isna(value) else str(value)
            )
    return cached


def run_model_evaluation(model_label: str, model_id: str, eval_df: pd.DataFrame) -> pd.DataFrame:
    """
    Query one model for every row of *eval_df* and collect the outcomes.

    Each row's "dialogue" is turned into a prompt and sent through
    `openrouter_client.call_openrouter_llm`, whose result is unpacked as a
    (summary text, latency) pair. A call that raises does not abort the run:
    the row is recorded with a "[ERROR: ...]" summary and NaN latency instead.

    Returns a DataFrame with the columns "dialogue", "reference_summary",
    "model_summary" and "latency_seconds", one row per input row.
    """
    from tqdm.auto import tqdm
    import src.utils.openrouter_client as openrouter_client

    records = []
    progress = tqdm(eval_df.iterrows(), total=len(eval_df), desc=model_label)
    for _, example in progress:
        dialogue, reference = example["dialogue"], example["summary"]
        prompt = build_summarization_prompt(dialogue)

        try:
            summary, latency = openrouter_client.call_openrouter_llm(
                model=model_id,
                prompt=prompt,
                max_tokens=MAX_OUT_TOKENS,
                temperature=0.2,
            )
        except Exception as exc:
            # Best effort: keep going and mark the row so it can be filtered later.
            summary, latency = f"[ERROR: {exc}]", np.nan

        records.append(
            {
                "dialogue": dialogue,
                "reference_summary": reference,
                "model_summary": summary,
                "latency_seconds": latency,
            }
        )

    return pd.DataFrame(records)

In [None]:
heading("Loading/Running Model Evaluations")

# label -> results DataFrame, plus bookkeeping lists for the summary below.
results_by_model = {}
models_run, models_loaded, models_skipped = [], [], []

for label, model_id in OPENROUTER_MODELS.items():
    cache_path = get_cached_result_path(label)
    call_api = RUN_API_CALLS and MODELS_TO_RUN.get(label, True)

    if not call_api:
        # Cache-only path: use the saved CSV if there is one, otherwise skip.
        cached = load_cached_results(label)
        if cached is None:
            models_skipped.append(label)
            print(f"{label}: No cached results found (skipping)")
        else:
            results_by_model[label] = cached
            models_loaded.append(label)
            print(f"{label}: Loaded {len(cached)} cached results")
        continue

    # API path: an existing cache file is announced, then overwritten.
    status = (
        "Cache exists. Re-running anyway (RUN_API_CALLS=True)"
        if cache_path.exists()
        else "Running API calls..."
    )
    print(f"\n{label}: {status}")

    df_out = run_model_evaluation(label, model_id, eval_df)
    df_out.to_csv(cache_path, index=False)
    print(f"  ✓ Saved {len(df_out)} results to {cache_path.name}")

    results_by_model[label] = df_out
    models_run.append(label)

# Recap of what happened to each configured model.
print("\n" + "=" * 60)
print("SUMMARY:")
print(f"  Models run (API calls):  {len(models_run)} - {models_run}")
print(f"  Models loaded (cache):   {len(models_loaded)} - {models_loaded}")
print(f"  Models skipped:          {len(models_skipped)} - {models_skipped}")
print(f"  Total models available:  {len(results_by_model)}")

## 6. Compute ROUGE Scores

In [None]:
heading("ROUGE Evaluation")

from src.eval.rouge_eval import compute_rouge_from_df

# label -> score mapping returned by compute_rouge_from_df.
rouge_scores = {}

for label, df in results_by_model.items():
    # An empty model reply is stored in the cache CSV as an empty field and
    # read back as NaN. `.str.startswith` then yields NaN for that row and the
    # `~` below raises. Restore such cells to "" so cached results are
    # filtered and scored exactly like freshly generated ones.
    summaries = df["model_summary"].map(lambda s: "" if pd.isna(s) else str(s))
    df = df.assign(model_summary=summaries)

    # Rows whose API call failed carry an "[ERROR: ...]" marker; drop them.
    valid_responses = df[~df["model_summary"].str.startswith("[ERROR")]

    if len(valid_responses) == 0:
        print(f"{label}: No valid responses to evaluate")
        continue

    if len(valid_responses) < len(df):
        print(f"{label}: {len(df) - len(valid_responses)} errors, evaluating {len(valid_responses)} valid responses")

    scores = compute_rouge_from_df(valid_responses)
    rouge_scores[label] = scores

# Create summary DataFrame (one row per model, best ROUGE-L first)
if rouge_scores:
    rouge_df = pd.DataFrame.from_dict(rouge_scores, orient="index")
    rouge_df.index.name = "model"
    rouge_df = rouge_df.sort_values(by="rougeL", ascending=False)

    # Save summary
    rouge_df.to_csv(ROUGE_SUMMARY_PATH)
    print(f"\nSaved ROUGE summary to: {ROUGE_SUMMARY_PATH}")

    display(rouge_df)
else:
    # Later cells test `rouge_df is not None` before using it.
    rouge_df = None
    print("No ROUGE scores to display.")

## 7. Latency Analysis

In [None]:
heading("Latency Analysis")

# label -> describe() output over that model's usable latencies.
latency_stats = {}
for label, df in results_by_model.items():
    # Infinite values are treated like missing ones and dropped.
    finite_latencies = df["latency_seconds"].replace([np.inf, -np.inf], np.nan).dropna()
    if finite_latencies.empty:
        continue
    latency_stats[label] = finite_latencies.describe(percentiles=[0.5, 0.9, 0.95])

if not latency_stats:
    # Later cells test `latency_df is not None` before using it.
    latency_df = None
    print("No latency data to display.")
else:
    # One row per model; percentile columns get short names and the table is
    # ordered fastest-first by mean latency.
    latency_df = (
        pd.DataFrame(latency_stats)
        .T
        .rename_axis("model")
        .rename(columns={"50%": "p50", "90%": "p90", "95%": "p95"})
        .sort_values(by="mean", ascending=True)
    )

    latency_df.to_csv(LATENCY_SUMMARY_PATH)
    print(f"Saved latency summary to: {LATENCY_SUMMARY_PATH}")

    display(latency_df)

## 8. Visualizations

In [None]:
# Two-panel figure: ROUGE scores on the left, mean latency on the right.
# Nothing is drawn unless ROUGE scores were computed.
if rouge_df is not None and len(rouge_df) > 0:
    import matplotlib.pyplot as plt

    figure_path = RESULTS_DIR / "rouge_latency_comparison.png"
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    rouge_ax, latency_ax = axes

    # Left panel: grouped horizontal bars, ordered by ROUGE-L.
    rouge_columns = ["rouge1", "rouge2", "rougeL"]
    rouge_df[rouge_columns].sort_values("rougeL", ascending=True).plot(kind="barh", ax=rouge_ax)
    rouge_ax.set_xlabel("Score")
    rouge_ax.set_title("ROUGE Scores by Model")
    rouge_ax.legend(loc="lower right")
    rouge_ax.grid(True, alpha=0.3, axis="x")

    # Right panel: mean latency per model, or a placeholder when none exists.
    has_latency = latency_df is not None and len(latency_df) > 0
    if not has_latency:
        latency_ax.text(0.5, 0.5, "No latency data", ha="center", va="center", transform=latency_ax.transAxes)
        latency_ax.set_title("API Latency by Model")
    else:
        mean_latency = latency_df["mean"].sort_values(ascending=True)
        mean_latency.plot(kind="barh", ax=latency_ax, color="coral")
        latency_ax.set_xlabel("Mean Latency (seconds)")
        latency_ax.set_title("API Latency by Model")
        latency_ax.grid(True, alpha=0.3, axis="x")

    plt.tight_layout()
    plt.savefig(figure_path, dpi=150, bbox_inches="tight")
    plt.show()

    print(f"Saved figure to: {figure_path}")


## 9. Qualitative Examples

In [None]:
heading("Qualitative Examples")

def show_examples(df, model_label, n=5, max_chars=600):
    """
    Print up to *n* randomly chosen, non-error examples from a model's results.

    Parameters:
        df: results with "dialogue", "reference_summary" and "model_summary" columns.
        model_label: name printed in the section header.
        n: maximum number of examples to print.
        max_chars: dialogues longer than this are cut and marked "[truncated]".

    The sample is drawn with `random_state=SEED`, so it is reproducible.
    """
    # Empty replies come back from the cache CSV as NaN, which makes the
    # `.str.startswith` mask non-boolean and the `~` below raise. Restore them
    # to "" first so the filter works and the printout shows an empty summary.
    summaries = df["model_summary"].map(lambda s: "" if pd.isna(s) else str(s))
    df = df.assign(model_summary=summaries)

    # Filter out rows whose API call failed.
    valid_df = df[~df["model_summary"].str.startswith("[ERROR")]

    if len(valid_df) == 0:
        print(f"No valid examples for {model_label}")
        return

    sample = valid_df.sample(n=min(n, len(valid_df)), random_state=SEED)

    print(f"\n{'='*60}")
    print(f"Examples for: {model_label}")
    print(f"{'='*60}")

    for _, row in sample.iterrows():
        d = row["dialogue"]
        d = d[:max_chars] + " ... [truncated]" if len(d) > max_chars else d

        print("\n--- Example ---")
        print(f"[DIALOGUE]\n{d}")
        print(f"\n[HUMAN SUMMARY]\n{row['reference_summary']}")
        print(f"\n[MODEL SUMMARY]\n{row['model_summary']}")
        print("-" * 40)


# Show examples for each model
for label, df in results_by_model.items():
    show_examples(df, label, n=3)

## 10. Side-by-Side Model Comparison

In [None]:
heading("Side-by-Side Comparison (Same Examples)")

# Pick a few examples to compare across all models
if len(results_by_model) > 1:
    # Row labels are sampled from the first model's results and reused for the others.
    first_model = next(iter(results_by_model))
    first_df = results_by_model[first_model]

    # Sample 3 indices (fewer if the first model has fewer rows)
    sample_indices = first_df.sample(n=min(3, len(first_df)), random_state=SEED).index.tolist()

    for idx in sample_indices:
        print(f"\n{'='*70}")
        print(f"EXAMPLE (index {idx})")
        print(f"{'='*70}")

        # Dialogue and reference come from the first model's row.
        # NOTE(review): assumes every model's results were produced from the
        # same eval_df, so a given index is the same dialogue everywhere.
        row = first_df.loc[idx]
        dialogue = row["dialogue"]
        reference = row["reference_summary"]

        print(f"\n[DIALOGUE]\n{dialogue[:500]}{'...' if len(dialogue) > 500 else ''}")
        print(f"\n[HUMAN SUMMARY]\n{reference}")
        print("\n[MODEL SUMMARIES]")

        for label, df in results_by_model.items():
            if idx not in df.index:
                continue

            # An empty reply is NaN after a CSV round trip; len() on that
            # float would raise, so coerce to text before truncating.
            raw_summary = df.loc[idx, "model_summary"]
            model_summary = "" if pd.isna(raw_summary) else str(raw_summary)

            # Truncate long summaries
            if len(model_summary) > 200:
                model_summary = model_summary[:200] + "..."
            print(f"  {label:20s}: {model_summary}")

        print("-" * 70)
else:
    print("Need at least 2 models for side-by-side comparison.")

## 11. Error Analysis

In [None]:
heading("Error Analysis")

# One dict per model; becomes the rows of error_df.
error_summary = []

for label, df in results_by_model.items():
    total = len(df)

    # Empty replies are NaN after a CSV round trip. A NaN inside the mask
    # makes boolean indexing raise, so normalize to text and build the error
    # mask once for both the count and the samples.
    summaries = df["model_summary"].map(lambda s: "" if pd.isna(s) else str(s))
    is_error = summaries.str.startswith("[ERROR")

    errors = int(is_error.sum())
    error_rate = errors / total * 100 if total > 0 else 0

    error_summary.append({
        "model": label,
        "total_requests": total,
        "errors": errors,
        "error_rate_pct": f"{error_rate:.1f}%",
    })

    # Show sample errors if any (first two, each cut to 100 characters)
    if errors > 0:
        error_samples = summaries[is_error].head(2).tolist()
        print(f"{label}: {errors} errors ({error_rate:.1f}%)")
        for err in error_samples:
            print(f"  - {err[:100]}...")

error_df = pd.DataFrame(error_summary)
display(error_df)

## 12. Summary Table

In [None]:
heading("Final Summary")

if rouge_df is not None and latency_df is not None:
    # Combine ROUGE and latency into one summary.
    # Rank on the numeric ROUGE-L values *before* formatting. Sorting the
    # formatted strings afterwards compares text, which misorders values of
    # different widths (e.g. "9.500" sorts above "12.300").
    ranked_rouge = rouge_df.sort_values(by="rougeL", ascending=False)

    summary_data = []

    for model in ranked_rouge.index:
        row = {
            "Model": model,
            "ROUGE-1": f"{ranked_rouge.loc[model, 'rouge1']:.3f}",
            "ROUGE-2": f"{ranked_rouge.loc[model, 'rouge2']:.3f}",
            "ROUGE-L": f"{ranked_rouge.loc[model, 'rougeL']:.3f}",
        }

        # A model can have ROUGE scores but no usable latency values.
        if model in latency_df.index:
            row["Mean Latency (s)"] = f"{latency_df.loc[model, 'mean']:.2f}"
            row["P95 Latency (s)"] = f"{latency_df.loc[model, 'p95']:.2f}"
        else:
            row["Mean Latency (s)"] = "N/A"
            row["P95 Latency (s)"] = "N/A"

        summary_data.append(row)

    # Rows are already in best-first order.
    final_summary_df = pd.DataFrame(summary_data)

    # Save final summary
    final_summary_path = RESULTS_DIR / "final_summary.csv"
    final_summary_df.to_csv(final_summary_path, index=False)
    print(f"Saved final summary to: {final_summary_path}")

    display(final_summary_df)
else:
    print("Insufficient data for final summary.")

---
# 13. Key Takeaways — Experiment 3 (Frontier LLMs via OpenRouter)
---

*Fill this in after running:*

**ROUGE Performance:**
- Best performing model: [TBD]
- Worst performing model: [TBD]
- Score range: [TBD]

**Latency:**
- Fastest model: [TBD]
- Slowest model: [TBD]
- Latency range: [TBD]

**Quality Observations:**
- Style differences (concise vs narrative): [TBD]
- Error cases or hallucinations: [TBD]
- Which model follows instructions best: [TBD]

**Cost/Latency Trade-offs:**
- Best value (quality/latency ratio): [TBD]
- Comparison to local fine-tuned models: [TBD]

**Recommendation for Final Comparison:**
- Model to use as "frontier baseline": [TBD]
- Reasoning: [TBD]

---