![Banner](https://github.com/LittleHouse75/flatiron-resources/raw/main/NevitsBanner.png)
---
# Model Evaluation and Comparison
### Final Test Set Evaluation Across All Experiments
---

This notebook provides a fair comparison of all models on the **held-out test set**.

We evaluate:
- **Experiment 1:** BERT → GPT-2 (custom encoder-decoder)
- **Experiment 2:** BART and T5 (pretrained seq2seq)
- **Experiment 3:** Frontier LLMs via API

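All locally trained models (Experiments 1 and 2) are evaluated on the exact same
test examples using consistent ROUGE computation settings. Scores for the API
models (Experiment 3) are loaded from that experiment's saved results, and the
notebook prints a warning if they were computed on a different split.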

## 1. Setup

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import json

# The parent of the current working directory is treated as the project root
# and is put at the front of sys.path (once) so that `src.*` can be imported.
PROJECT_ROOT = Path("..").resolve()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

In [None]:
from src.data.load_data import load_samsum
from src.eval.rouge_eval import compute_rouge_from_lists
from src.eval.qualitative import generate_summary
from src.utils.logging import heading

## 2. Configuration

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# --- Saved local models -------------------------------------------------------
# Every local model is loaded from a "best" directory under <project>/models.
_MODELS_ROOT = PROJECT_ROOT / "models"
BERT_GPT2_DIR = _MODELS_ROOT / "bert-gpt2" / "best"
BART_DIR = _MODELS_ROOT / "bart" / "best"
T5_DIR = _MODELS_ROOT / "t5" / "best"

# --- Experiment folders -------------------------------------------------------
_EXPERIMENTS_ROOT = PROJECT_ROOT / "experiments"

# Where Experiment 3 stored its API results.
API_RESULTS_DIR = _EXPERIMENTS_ROOT / "exp3_api_llm_results"

# Where this notebook writes its outputs; created on the spot if missing.
RESULTS_DIR = _EXPERIMENTS_ROOT / "final_comparison"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Evaluation settings ------------------------------------------------------
MAX_SOURCE_LEN = 512
MAX_TARGET_LEN = 128
# NOTE(review): BATCH_SIZE is not referenced by the cells visible in this
# part of the notebook - confirm whether it is still needed.
BATCH_SIZE = 8

# --- Model switches (set a value to False to skip that model) -----------------
EVALUATE_MODELS = dict(
    bert_gpt2=True,
    bart=True,
    t5=True,
    api_models=True,  # uses the best API model from Experiment 3
)

print("Configuration loaded.")
print(f"Results will be saved to: {RESULTS_DIR}")

## 3. Load Test Data

In [None]:
heading("Loading Test Data")

# The loader's result is unpacked as (train, validation, test); only the test
# split is used for scoring in this notebook.
train_df, val_df, test_df = load_samsum()
print(f"Test set size: {len(test_df)} examples")

# Preview the first test example; the dialogue is cut to its first 200 characters.
_first_example = test_df.iloc[0]
_dialogue_preview = _first_example["dialogue"][:200] + "..."

print(f"\nSample dialogue:")
print(_dialogue_preview)
print(f"\nSample summary:")
print(_first_example["summary"])

## 4. Build Evaluation

In [None]:
# Shared helper: runs one locally loaded model over the whole test split.
def evaluate_model_on_test(
    model,
    encoder_tokenizer,
    decoder_tokenizer,
    test_df,
    device,
    max_source_len,
    max_target_len,
    source_prefix="",
    model_name="model",
):
    """
    Generate summaries for all test examples and compute ROUGE.

    Before generating, the size of the decoder tokenizer is compared with the
    model's vocabulary size as a cheap "wrong tokenizer" check. A mismatch
    only prints a warning; evaluation always proceeds.

    Parameters
    ----------
    model : transformers model
        The model to evaluate. It is switched to eval mode here.
    encoder_tokenizer : tokenizer
        Tokenizer for encoding input (dialogue)
    decoder_tokenizer : tokenizer
        Tokenizer for decoding output (summary)
    test_df : pd.DataFrame
        Test data with 'dialogue' and 'summary' columns
    device : torch.device
        CPU or GPU
    max_source_len : int
        Maximum input length
    max_target_len : int
        Maximum output length
    source_prefix : str
        Prefix to add to input (e.g., "summarize: " for T5)
    model_name : str
        Name used for the progress bar and in the warning text

    Returns
    -------
    tuple : (results_df, rouge_scores)
        ``results_df`` is a copy of ``test_df`` (same rows, same index) with
        an added ``model_prediction`` column. ``rouge_scores`` is whatever
        ``compute_rouge_from_lists(predictions, references)`` returns.
    """
    from tqdm.auto import tqdm

    # =========================================================================
    # Validate tokenizer compatibility with model
    # =========================================================================
    # Embedding matrices are frequently padded beyond the tokenizer's size
    # (T5 checkpoints, for example, ship 32128 rows for 32100 tokens), so a
    # model that is somewhat LARGER than its tokenizer is normal and must not
    # trigger the warning. A tokenizer that is larger than the model is far
    # more suspicious, so that direction keeps the tight limit.
    max_padded_model_rows = 128
    max_extra_tokenizer_tokens = 10

    # Different model types store vocab size differently: directly on the
    # config, or on the nested decoder config.
    config = model.config
    if hasattr(config, "vocab_size"):
        model_vocab_size = config.vocab_size
    elif hasattr(config, "decoder") and hasattr(config.decoder, "vocab_size"):
        model_vocab_size = config.decoder.vocab_size
    else:
        model_vocab_size = None

    if model_vocab_size is not None:
        tokenizer_vocab_size = len(decoder_tokenizer)
        # Positive: model has spare rows. Negative: tokenizer has extra tokens.
        surplus_model_rows = model_vocab_size - tokenizer_vocab_size
        if (
            surplus_model_rows > max_padded_model_rows
            or -surplus_model_rows > max_extra_tokenizer_tokens
        ):
            print(f"⚠️  WARNING: Vocab size mismatch for {model_name}!")
            print(f"   Model vocab: {model_vocab_size}, Tokenizer vocab: {tokenizer_vocab_size}")
            print(f"   This may indicate the wrong tokenizer is being used.")

    # =========================================================================
    # Generate one summary per test example
    # =========================================================================
    dialogues = test_df["dialogue"].tolist()
    references = test_df["summary"].tolist()

    model.eval()

    predictions = [
        generate_summary(
            model=model,
            encoder_tokenizer=encoder_tokenizer,   # encodes the dialogue
            decoder_tokenizer=decoder_tokenizer,   # decodes the summary
            text=dialogue,
            device=device,
            max_source_len=max_source_len,
            max_target_len=max_target_len,
            source_prefix=source_prefix,
        )
        for dialogue in tqdm(dialogues, total=len(dialogues), desc=model_name)
    ]

    # Compute ROUGE over the aligned prediction/reference lists
    rouge_scores = compute_rouge_from_lists(predictions, references)

    # Attach predictions to a copy so the caller's DataFrame is left untouched
    results_df = test_df.copy()
    results_df["model_prediction"] = predictions

    return results_df, rouge_scores


## 5. Evaluate BERT-GPT2

In [None]:
# One entry per evaluated model, keyed by display name. Later cells read the
# "rouge" and "predictions" fields of each entry.
all_results = {}

if not (EVALUATE_MODELS["bert_gpt2"] and BERT_GPT2_DIR.exists()):
    print("Skipping BERT-GPT2 (not found or disabled)")
else:
    heading("Evaluating BERT → GPT-2")

    from transformers import EncoderDecoderModel, BertTokenizer, GPT2Tokenizer

    # The model and both tokenizers are read from the same saved directory.
    bert_gpt2_model = EncoderDecoderModel.from_pretrained(BERT_GPT2_DIR)
    bert_gpt2_model = bert_gpt2_model.to(device)
    bert_tokenizer = BertTokenizer.from_pretrained(BERT_GPT2_DIR)
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(BERT_GPT2_DIR)

    # If the GPT-2 tokenizer has no pad token, reuse its end-of-sequence token.
    if gpt2_tokenizer.pad_token is None:
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    # Encoder and decoder use different tokenizers, so each is passed explicitly.
    results_df, rouge = evaluate_model_on_test(
        model=bert_gpt2_model,
        encoder_tokenizer=bert_tokenizer,   # dialogues are encoded with BERT
        decoder_tokenizer=gpt2_tokenizer,   # summaries are decoded with GPT-2
        test_df=test_df,
        device=device,
        max_source_len=MAX_SOURCE_LEN,
        max_target_len=MAX_TARGET_LEN,
        source_prefix="",
        model_name="BERT-GPT2",
    )

    all_results["BERT-GPT2"] = dict(rouge=rouge, predictions=results_df)

    print(f"\nBERT-GPT2 Results:")
    print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
    print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
    print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

    # Keep the per-example predictions on disk next to the other outputs.
    results_df.to_csv(RESULTS_DIR / "bert_gpt2_predictions.csv", index=False)


## 6. Evaluate BART

In [None]:
if not (EVALUATE_MODELS["bart"] and BART_DIR.exists()):
    print("Skipping BART (not found or disabled)")
else:
    heading("Evaluating BART")

    from transformers import BartForConditionalGeneration, BartTokenizer

    bart_model = BartForConditionalGeneration.from_pretrained(BART_DIR)
    bart_model = bart_model.to(device)
    bart_tokenizer = BartTokenizer.from_pretrained(BART_DIR)

    # A single tokenizer serves both sides, so it is passed twice.
    results_df, rouge = evaluate_model_on_test(
        model=bart_model,
        encoder_tokenizer=bart_tokenizer,
        decoder_tokenizer=bart_tokenizer,
        test_df=test_df,
        device=device,
        max_source_len=MAX_SOURCE_LEN,
        max_target_len=MAX_TARGET_LEN,
        source_prefix="",
        model_name="BART",
    )

    all_results["BART"] = dict(rouge=rouge, predictions=results_df)

    print(f"\nBART Results:")
    print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
    print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
    print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

    # Keep the per-example predictions on disk next to the other outputs.
    results_df.to_csv(RESULTS_DIR / "bart_predictions.csv", index=False)


## 7. Evaluate T5

In [None]:
if EVALUATE_MODELS["t5"] and T5_DIR.exists():
    heading("Evaluating T5")

    from transformers import T5ForConditionalGeneration, T5Tokenizer

    t5_model = T5ForConditionalGeneration.from_pretrained(T5_DIR).to(device)
    t5_tokenizer = T5Tokenizer.from_pretrained(T5_DIR)

    # Load the prefix used during training
    prefix_path = T5_DIR / "source_prefix.txt"
    if prefix_path.exists():
        # Remove line terminators only. A bare .strip() would also drop the
        # trailing space of a prefix such as "summarize: ", which would make
        # this branch disagree with the default below.
        t5_prefix = prefix_path.read_text().strip("\r\n")
        print(f"Using saved prefix: '{t5_prefix}'")
    else:
        t5_prefix = "summarize: "
        print(f"No prefix file found, using default: '{t5_prefix}'")

    # T5 uses the same tokenizer for both encoder and decoder
    results_df, rouge = evaluate_model_on_test(
        model=t5_model,
        encoder_tokenizer=t5_tokenizer,
        decoder_tokenizer=t5_tokenizer,  # Same tokenizer for both
        test_df=test_df,
        device=device,
        max_source_len=MAX_SOURCE_LEN,
        max_target_len=MAX_TARGET_LEN,
        source_prefix=t5_prefix,
        model_name="T5",
    )

    # Register the scores and per-example predictions for the later
    # comparison cells.
    all_results["T5"] = {
        "rouge": rouge,
        "predictions": results_df,
    }

    print(f"\nT5 Results:")
    print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
    print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
    print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

    # Keep the per-example predictions on disk next to the other outputs.
    results_df.to_csv(RESULTS_DIR / "t5_predictions.csv", index=False)

else:
    print("Skipping T5 (not found or disabled)")

## 8. Evaluate API Results

In [None]:
if EVALUATE_MODELS["api_models"]:
    heading("Loading API Model Results")

    # Experiment 3 records which split it scored and how many examples it
    # used; both are needed to judge whether its numbers are comparable.
    api_metadata_path = API_RESULTS_DIR / "evaluation_metadata.json"

    if api_metadata_path.exists():
        with open(api_metadata_path, "r", encoding="utf-8") as f:
            api_metadata = json.load(f)
        api_split = api_metadata.get("split_used", "unknown")
        api_n_samples = api_metadata.get("n_samples", "unknown")
        print(f"API models were evaluated on: {api_split} set ({api_n_samples} samples)")
    else:
        api_split = "unknown"
        api_n_samples = "unknown"
        print("⚠ Could not determine which split API models were evaluated on")

    # Load the ROUGE summary from Experiment 3 (one row per API model)
    rouge_summary_path = API_RESULTS_DIR / "rouge_summary.csv"

    if rouge_summary_path.exists():
        api_rouge_df = pd.read_csv(rouge_summary_path, index_col=0)

        # Rows without a ROUGE-L value cannot be ranked; an empty or all-NaN
        # table is reported instead of failing inside idxmax().
        ranked_api_df = api_rouge_df.dropna(subset=["rougeL"])

        if ranked_api_df.empty:
            print(f"No usable ROUGE-L scores found in {rouge_summary_path}")
        else:
            # Find the best API model by ROUGE-L
            best_api_model = ranked_api_df["rougeL"].idxmax()
            best_api_scores = api_rouge_df.loc[best_api_model]

            print(f"\nBest API model: {best_api_model}")
            print(f"  ROUGE-1: {best_api_scores['rouge1']:.4f}")
            print(f"  ROUGE-2: {best_api_scores['rouge2']:.4f}")
            print(f"  ROUGE-L: {best_api_scores['rougeL']:.4f}")

            # CRITICAL: Check if we can fairly compare
            if api_split != "test":
                print("\n" + "="*60)
                print("⚠️  WARNING: COMPARISON MAY NOT BE VALID")
                print("="*60)
                print(f"API models were evaluated on '{api_split}' set,")
                print(f"but local models are evaluated on 'test' set.")
                print("\nTo fix this:")
                print("1. Open 04_experiment3_api_models.ipynb")
                print("2. Set USE_TEST_SET = True")
                print("3. Set RUN_API_CALLS = True")
                print("4. Re-run the notebook")
                print("="*60)

                # Mark the comparison as provisional
                comparison_note = "(evaluated on different splits - not directly comparable)"
            else:
                comparison_note = ""

            all_results[f"API: {best_api_model}"] = {
                "rouge": {
                    "rouge1": best_api_scores["rouge1"],
                    "rouge2": best_api_scores["rouge2"],
                    "rougeL": best_api_scores["rougeL"],
                    "rougeLsum": best_api_scores.get("rougeLsum", np.nan),
                },
                # No per-example predictions are loaded for API models.
                "predictions": None,
                "note": comparison_note,
                # Sample count as recorded by Experiment 3 ("unknown" if absent).
                "n_samples": api_n_samples,
            }
    else:
        print(f"API results not found at {rouge_summary_path}")


## 11. Create Comparison Table

In [None]:
heading("Final Comparison")

if all_results:
    comparison_data = []

    for model_name, result in all_results.items():
        rouge = result["rouge"]

        # Count how many examples this model was evaluated on
        if result["predictions"] is not None:
            n_evaluated = len(result["predictions"])
        else:
            # Entries loaded from file carry no predictions; use the count
            # stored on the entry when there is one, else point to notebook 04.
            n_evaluated = result.get("n_samples", "see notebook 04")

        comparison_data.append({
            "Model": model_name,
            "N_Samples": n_evaluated,
            "ROUGE-1": rouge["rouge1"],
            "ROUGE-2": rouge["rouge2"],
            "ROUGE-L": rouge["rougeL"],
            "Note": result.get("note", ""),  # Any caveats
        })

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values("ROUGE-L", ascending=False)
    comparison_df = comparison_df.reset_index(drop=True)

    # Only real integer counts can be compared with each other. A placeholder
    # string must not count as "a different number of examples", otherwise the
    # warning fires every time a file-loaded entry is present.
    known_counts = sorted({
        entry["N_Samples"]
        for entry in comparison_data
        if isinstance(entry["N_Samples"], int)
    })
    models_without_count = [
        entry["Model"]
        for entry in comparison_data
        if not isinstance(entry["N_Samples"], int)
    ]

    if len(known_counts) > 1:
        print("⚠️  WARNING: Models were evaluated on different numbers of examples!")
        print(f"   Sample counts: {known_counts}")
        print("   Scores may not be directly comparable.\n")

    if models_without_count:
        print(f"ℹ️  Sample count not recorded for: {', '.join(models_without_count)}")
        print("   See the N_Samples and Note columns for these entries.\n")

    # Format for display (the numeric comparison_df is kept for later cells)
    display_df = comparison_df.copy()
    for col in ["ROUGE-1", "ROUGE-2", "ROUGE-L"]:
        display_df[col] = display_df[col].map("{:.4f}".format)

    print("\n" + "=" * 70)
    print("FINAL MODEL COMPARISON (Sorted by ROUGE-L)")
    print("=" * 70)
    display(display_df)


## 12. Visualization

In [None]:
# A grouped bar chart is only drawn when there are at least two entries.
if all_results and len(all_results) > 1:
    import matplotlib.pyplot as plt

    heading("Visualization")

    fig, ax = plt.subplots(figsize=(10, 6))

    # One group of three bars per model, in the table's (sorted) order.
    models = comparison_df["Model"].tolist()
    x = np.arange(len(models))
    width = 0.25

    # (metric column / legend label, bar positions, colour) per series
    series_specs = (
        ("ROUGE-1", x - width, "#2ecc71"),
        ("ROUGE-2", x, "#3498db"),
        ("ROUGE-L", x + width, "#9b59b6"),
    )
    bar_groups = [
        ax.bar(positions, comparison_df[metric].tolist(), width, label=metric, color=colour)
        for metric, positions, colour in series_specs
    ]

    ax.set_xlabel("Model")
    ax.set_ylabel("Score")
    ax.set_title("Model Comparison: ROUGE Scores on Test Set")
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.legend()
    ax.grid(True, alpha=0.3, axis="y")

    # Write each bar's value just above its top edge
    for bar in (bar for group in bar_groups for bar in group):
        height = bar.get_height()
        label_anchor = (bar.get_x() + bar.get_width() / 2, height)
        ax.annotate(
            f'{height:.2f}',
            xy=label_anchor,
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=8,
        )

    plt.tight_layout()
    plt.savefig(RESULTS_DIR / "model_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()

    print(f"Saved figure to: {RESULTS_DIR / 'model_comparison.png'}")

## 13. Qualitative Comparison

In [None]:
heading("Qualitative Comparison: Same Examples Across Models")

# Three test rows drawn with a fixed seed, so the same rows are picked on each run
sample_indices = test_df.sample(3, random_state=42).index.tolist()

_heavy_rule = "=" * 70
_light_rule = "-" * 70

for idx in sample_indices:
    row = test_df.loc[idx]

    print("\n" + _heavy_rule)
    print(f"TEST EXAMPLE (index {idx})")
    print(_heavy_rule)

    # The dialogue is shown cut to 500 characters, with "..." when truncated
    print(f"\n[DIALOGUE]\n{row['dialogue'][:500]}{'...' if len(row['dialogue']) > 500 else ''}")
    print(f"\n[HUMAN SUMMARY]\n{row['summary']}")

    print(f"\n[MODEL PREDICTIONS]")
    for model_name, result in all_results.items():
        # Entries without stored predictions get a placeholder line
        if result["predictions"] is None:
            print(f"  {model_name:15s}: (predictions not available)")
            continue
        pred = result["predictions"].loc[idx, "model_prediction"]
        print(f"  {model_name:15s}: {pred}")

    print(_light_rule)


## 14. Summary and Conclusions

TBD
