# Meta-Generation Experiments: Cross-Model Likelihood Analysis

By Graham Neubig for [11-664/763 Inference Algorithms for Language Modeling](https://phontron.com/class/lminference-fall2025/)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neubig/lminference-fall2025-code/blob/main/02-generation-basics/meta_generation.ipynb)

This notebook implements fascinating meta-generation experiments that explore how different
language models evaluate text generated by other models. We'll investigate the complex
relationships between generators and evaluators in the language modeling space.

## Research Questions

Our experiments address several intriguing questions:

1. **Cross-Model Evaluation**: How do GPT-2 Small and GPT-2 Medium differ when evaluating the same generated text?
2. **Self-Preference Bias**: Do models prefer text they generated themselves?
3. **Model Capability Patterns**: What patterns emerge in cross-model preferences?
4. **Quality Assessment**: Can we use cross-model agreement as a quality signal?

## Why This Matters

Understanding cross-model evaluation helps us:
- Identify model biases and preferences
- Explore the relationship between model size and text evaluation
- Develop better quality assessment methods using multiple model perspectives
- Gain meta-learning insights from model disagreements

These experiments reveal deep insights about how language models "think" about text quality.

In [None]:
# %%
# Install required packages
%pip install torch>=2.5.0 numpy>=2.0.0 matplotlib>=3.9.0 seaborn>=0.13.0

%%

In [None]:
from __future__ import annotations

In [None]:
import json
import time
from dataclasses import asdict, dataclass
from typing import Any

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from nanogpt import GPT2Tokenizer
from nanogpt import (
    GPT2,
    GPT2Config,
)
from plotting_utils import plot_meta_generation_dashboard, save_figure

In [None]:
@dataclass
class MetaGenerationResult:
    """One generated continuation together with its scores under both models.

    A record is produced per (prompt, generator, sampling strategy) and holds
    the metrics obtained when the small and the medium model each score the
    prompt plus the generated continuation.
    """

    # --- what was generated -------------------------------------------------
    prompt: str  # prompt text the generation started from
    generator_model: str  # label of the generating model (and strategy)
    generated_text: str  # continuation only, without the prompt

    # --- how each model scored it ------------------------------------------
    small_likelihood: float  # "likelihood" metric under the small model
    medium_likelihood: float  # "likelihood" metric under the medium model
    likelihood_ratio: float  # medium - small (in log space)
    perplexity_small: float  # perplexity under the small model
    perplexity_medium: float  # perplexity under the medium model

    # --- bookkeeping --------------------------------------------------------
    generation_time: float  # wall-clock duration of the generate call
    text_length: int  # size of the generated text as counted by the caller
    cross_entropy_small: float  # cross-entropy loss under the small model
    cross_entropy_medium: float  # cross-entropy loss under the medium model

In [None]:
@dataclass
class CrossModelComparison:
    """Scores that several models assigned to one piece of text, plus a verdict.

    ``evaluations`` maps each model name to that model's metric dictionary
    (likelihood, perplexity, etc.). ``preference_winner`` names the model that
    won the likelihood comparison and ``preference_strength`` is the size of
    the likelihood gap behind that verdict.
    """

    text: str  # the generated text being judged
    prompt: str  # the prompt that preceded ``text``
    generator: str  # label of whatever produced ``text``
    evaluations: dict[str, dict[str, float]]  # model_name -> metrics
    preference_winner: str  # model name that won the comparison
    preference_strength: float  # likelihood gap supporting the verdict

In [None]:
class LikelihoodAnalyzer:
    """Score text under a set of named language models.

    Holds a ``name -> model`` mapping and a ``name -> tokenizer`` mapping that
    are looked up with the same keys, and offers per-model likelihood metrics
    plus a comparison of every registered model on one piece of text.
    """

    def __init__(
        self,
        models: dict[str, GPT2],
        tokenizers: dict[str, GPT2Tokenizer],
        device: str,
    ) -> None:
        """Store the registries and the device inputs are moved to.

        Args:
            models: Mapping from model name to model object.
            tokenizers: Mapping from model name to the tokenizer for that model.
            device: Device string passed to ``Tensor.to`` for the input ids.
        """
        self.models = models
        self.tokenizers = tokenizers
        self.device = device

    def compute_likelihood_metrics(
        self, model_name: str, text: str
    ) -> dict[str, float]:
        """Compute likelihood metrics for ``text`` under one registered model.

        Args:
            model_name: Key into ``self.models`` / ``self.tokenizers``.
            text: Text to score.

        Returns:
            Dictionary with keys ``likelihood`` (mean log-probability of each
            token given its prefix), ``perplexity`` (``exp`` of the model's
            reported loss), ``cross_entropy`` (the model's reported loss),
            ``token_count`` (number of predicted tokens) and
            ``avg_token_prob`` (mean probability of the actual tokens).
            When the text encodes to one token or fewer there is nothing to
            predict, and sentinel values (``-inf`` / ``inf`` / 0) are returned.

        Raises:
            KeyError: If ``model_name`` is not registered.
        """
        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]

        # Tokenize into a batch of size one.
        inputs = torch.tensor([tokenizer.encode(text)]).to(self.device)

        # Next-token scoring needs at least one (context, target) pair.
        if len(inputs[0]) <= 1:
            return {
                "likelihood": float("-inf"),
                "perplexity": float("inf"),
                "cross_entropy": float("inf"),
                "token_count": 0,
                "avg_token_prob": 0.0,
            }

        with torch.no_grad():
            # NOTE(review): relies on the model accepting ``labels=`` and
            # returning an object with ``.loss`` and ``.logits`` -- confirm
            # against the nanogpt GPT2 implementation.
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss.item()
            logits = outputs.logits

            # Shift so that the logits at position i are compared with the
            # token at position i + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()

            # Log-probability assigned to each actual next token.
            log_probs = F.log_softmax(shift_logits, dim=-1)
            token_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(
                -1
            )

            avg_log_likelihood = token_log_probs.mean().item()
            # torch.exp saturates to inf on huge losses instead of raising.
            perplexity = torch.exp(torch.tensor(loss)).item()
            cross_entropy = loss
            token_count = len(shift_labels[0])
            avg_token_prob = torch.exp(token_log_probs).mean().item()

        return {
            "likelihood": avg_log_likelihood,
            "perplexity": perplexity,
            "cross_entropy": cross_entropy,
            "token_count": token_count,
            "avg_token_prob": avg_token_prob,
        }

    def compare_models_on_text(
        self, text: str, prompt: str, generator: str
    ) -> CrossModelComparison:
        """Score ``prompt + " " + text`` under every registered model.

        The winner is the model with the highest ``likelihood`` metric (ties
        go to the model registered first) and the preference strength is the
        absolute likelihood gap between the winner and the best of the
        remaining models. With exactly two models this is the absolute
        difference of their likelihoods.

        Args:
            text: Generated text to evaluate.
            prompt: Prompt that is prepended (with a single space) to ``text``.
            generator: Label of whatever produced ``text``; stored unchanged.

        Returns:
            A ``CrossModelComparison`` holding every model's metrics and the
            verdict. With a single model the strength is 0.0; with no models
            the winner is ``"none"``.
        """
        # The scored string is the same for every model, so build it once.
        full_text = prompt + " " + text
        evaluations = {
            model_name: self.compute_likelihood_metrics(model_name, full_text)
            for model_name in self.models
        }

        model_names = list(evaluations)
        if len(model_names) >= 2:
            # max() keeps the first maximal element, so ties resolve to the
            # earliest-registered model.
            preference_winner = max(
                model_names, key=lambda name: evaluations[name]["likelihood"]
            )
            runner_up_likelihood = max(
                evaluations[name]["likelihood"]
                for name in model_names
                if name != preference_winner
            )
            preference_strength = abs(
                evaluations[preference_winner]["likelihood"] - runner_up_likelihood
            )
        else:
            preference_winner = model_names[0] if model_names else "none"
            preference_strength = 0.0

        return CrossModelComparison(
            text=text,
            prompt=prompt,
            generator=generator,
            evaluations=evaluations,
            preference_winner=preference_winner,
            preference_strength=preference_strength,
        )

In [None]:
class MetaGenerationExperiment:
    """Generate text with each model and score it under both models.

    The experiment builds a small and a medium GPT-2 configuration, generates
    continuations for a fixed prompt list with each of them, scores every
    continuation under both models, and then analyses, plots and saves the
    outcome.
    """

    # Registry keys for the two model sizes. Every lookup of "the small model"
    # or "the medium model" goes through these so it agrees with the keys
    # that ``_load_models`` registers.
    SMALL_MODEL = "nanogpt-small"
    MEDIUM_MODEL = "nanogpt-medium"

    def __init__(self, device: str = "auto") -> None:
        """Resolve the device, build the models and set up the prompt list.

        Args:
            device: Device string, or ``"auto"`` to pick one automatically.
        """
        self.device = self._get_device(device)
        self.models = {}
        self.tokenizers = {}
        self.results = []
        self.analyzer = None

        # Load models
        self._load_models()

        # Initialize analyzer
        self.analyzer = LikelihoodAnalyzer(self.models, self.tokenizers, self.device)

        # Test prompts - diverse to see different behaviors
        self.prompts = [
            "The scientific method involves",
            "In the depths of the ocean",
            "Artificial intelligence will",
            "The history of civilization shows",
            "When I was a child",
            "The economy is affected by",
            "Space exploration has revealed",
            "The human brain is",
            "Climate change represents",
            "Technology has transformed",
            "The future of education",
            "In quantum mechanics",
            "The art of storytelling",
            "Democracy requires",
            "The universe contains",
        ]

    def _get_device(self, device: str) -> str:
        """Return ``device`` unchanged, or pick one when it is ``"auto"``.

        For ``"auto"`` the order of preference is MPS, then CUDA, then CPU.
        """
        if device != "auto":
            return device
        if torch.backends.mps.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"

    def _load_models(self) -> None:
        """Create both model configurations and register them by name.

        A single tokenizer instance is shared by both models. Each model is
        moved to ``self.device`` and put in eval mode.
        """
        print("🔄 Loading GPT-2 models for meta-generation...")

        # Create tokenizer
        tokenizer = GPT2Tokenizer()

        # Load different model configurations
        configs = {
            self.SMALL_MODEL: GPT2Config(n_layer=6, n_head=6, n_embd=384),
            self.MEDIUM_MODEL: GPT2Config(n_layer=12, n_head=12, n_embd=768),
        }

        for model_name, config in configs.items():
            print(f"  Creating {model_name}...")

            # Store tokenizer
            self.tokenizers[model_name] = tokenizer

            # Create model
            model = GPT2(config)
            model.to(self.device)
            model.eval()
            self.models[model_name] = model

            print(f"    {model_name} created on {self.device}")

    def generate_text(
        self,
        model_name: str,
        prompt: str,
        max_length: int = 50,
        temperature: float = 0.8,
        strategy: str = "temperature",
    ) -> tuple[str, float]:
        """Generate a continuation of ``prompt`` with one registered model.

        Args:
            model_name: Key into ``self.models`` / ``self.tokenizers``.
            prompt: Text to continue.
            max_length: Number of tokens allowed beyond the prompt length.
            temperature: Sampling temperature for the sampling strategies.
            strategy: ``"greedy"``, ``"temperature"`` or ``"top_p"``. Any
                other value leaves the sampling options unset.

        Returns:
            ``(generated_only, generation_time)``: the decoded output with the
            first ``len(prompt)`` characters removed and surrounding
            whitespace stripped, and the wall-clock seconds spent in
            ``model.generate``.

        Raises:
            KeyError: If ``model_name`` is not registered.
        """
        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]

        # NOTE(review): ``return_tensors``, ``eos_token_id``,
        # ``skip_special_tokens`` and the generate kwargs below resemble the
        # Hugging Face API -- confirm the nanogpt tokenizer/model accept them.
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(self.device)

        # Prepare generation kwargs
        gen_kwargs = {
            "input_ids": inputs,
            "max_length": len(inputs[0]) + max_length,
            "pad_token_id": tokenizer.eos_token_id,
            "num_return_sequences": 1,
        }

        if strategy == "greedy":
            gen_kwargs["do_sample"] = False
        elif strategy == "temperature":
            gen_kwargs["do_sample"] = True
            gen_kwargs["temperature"] = temperature
        elif strategy == "top_p":
            gen_kwargs["do_sample"] = True
            gen_kwargs["top_p"] = 0.9
            gen_kwargs["temperature"] = temperature

        # Generate
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(**gen_kwargs)
        generation_time = time.time() - start_time

        # Decode
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Remove the prompt from the generated text. This slices by character
        # count, so it assumes the decoded output starts with the prompt.
        generated_only = generated_text[len(prompt) :].strip()

        return generated_only, generation_time

    def run_cross_model_evaluation(
        self, max_length: int = 40
    ) -> list[MetaGenerationResult]:
        """Generate with every model/strategy and score under both models.

        For each prompt, each registered model and each sampling strategy, a
        continuation is generated and then scored under the small and the
        medium model. Failures for an individual generation are reported and
        skipped so the remaining experiments still run.

        Args:
            max_length: Number of new tokens allowed per generation.

        Returns:
            The collected results, which are also stored on ``self.results``.
        """
        print("🔄 CROSS-MODEL EVALUATION EXPERIMENT")
        print("=" * 60)
        print("Comparing how GPT-2 small and medium evaluate generated text\n")

        strategies = [
            ("temperature", {"temperature": 0.8}),
            ("top_p", {"temperature": 0.8}),
        ]

        results = []
        # Derived from the strategy list so the progress total stays correct
        # if strategies are added or removed.
        total_experiments = len(self.prompts) * len(self.models) * len(strategies)
        experiment_count = 0

        for prompt in self.prompts:
            print(f"\nPrompt: '{prompt}'")
            print("-" * 50)

            # Generate with each model using different strategies
            for generator_model in self.models:
                for strategy_name, strategy_kwargs in strategies:
                    experiment_count += 1
                    print(
                        f"[{experiment_count}/{total_experiments}] {generator_model} + {strategy_name}..."
                    )

                    try:
                        # Generate text
                        generated_text, gen_time = self.generate_text(
                            generator_model,
                            prompt,
                            max_length,
                            strategy=strategy_name,
                            **strategy_kwargs,
                        )

                        if not generated_text.strip():
                            print("  Empty generation, skipping...")
                            continue

                        full_text = prompt + " " + generated_text
                        print(f"  Generated: '{generated_text[:60]}...'")

                        # Compute likelihoods under both models
                        print("  Computing cross-model likelihoods...")
                        if self.analyzer is None:
                            print(
                                "  Analyzer not available, skipping likelihood computation..."
                            )
                            continue
                        # Look the evaluators up under the names they were
                        # registered with; any other key raises KeyError,
                        # which the handler below would silently swallow.
                        small_metrics = self.analyzer.compute_likelihood_metrics(
                            self.SMALL_MODEL, full_text
                        )
                        medium_metrics = self.analyzer.compute_likelihood_metrics(
                            self.MEDIUM_MODEL, full_text
                        )

                        # Calculate likelihood ratio (medium - small in log space)
                        likelihood_ratio = (
                            medium_metrics["likelihood"] - small_metrics["likelihood"]
                        )

                        result = MetaGenerationResult(
                            prompt=prompt,
                            generator_model=f"{generator_model}_{strategy_name}",
                            generated_text=generated_text,
                            small_likelihood=small_metrics["likelihood"],
                            medium_likelihood=medium_metrics["likelihood"],
                            likelihood_ratio=likelihood_ratio,
                            perplexity_small=small_metrics["perplexity"],
                            perplexity_medium=medium_metrics["perplexity"],
                            generation_time=gen_time,
                            text_length=len(generated_text.split()),
                            cross_entropy_small=small_metrics["cross_entropy"],
                            cross_entropy_medium=medium_metrics["cross_entropy"],
                        )

                        results.append(result)

                        # Show likelihood comparison
                        preference = "Medium" if likelihood_ratio > 0 else "Small"
                        print(f"  Small likelihood: {small_metrics['likelihood']:.3f}")
                        print(
                            f"  Medium likelihood: {medium_metrics['likelihood']:.3f}"
                        )
                        print(
                            f"  Ratio (M-S): {likelihood_ratio:.3f} → {preference} prefers this text"
                        )

                    except Exception as e:
                        # Deliberate best-effort: one failed generation must
                        # not abort the whole sweep.
                        print(f"  ❌ Error: {e}")

        self.results = results
        return results

    def run_self_vs_other_evaluation(self, max_length: int = 30) -> dict[str, Any]:
        """Compare each model's score for its own text against others' text.

        Uses the first five prompts. Every registered model generates one
        continuation per prompt, and every model then scores every
        continuation. Scores are grouped per evaluator into "own generation"
        and "other model's generation".

        Args:
            max_length: Number of new tokens allowed per generation.

        Returns:
            Mapping from evaluator name to ``self_avg_likelihood``,
            ``cross_avg_likelihood``, ``self_bias`` (own minus other) and
            ``prefers_own``. Evaluators lacking either group are omitted.
        """
        print("\n🔄 SELF VS OTHER EVALUATION")
        print("=" * 60)

        # Keyed by the registered model names so the lookups below cannot
        # miss, whatever the models are called.
        self_preferences = {model_name: [] for model_name in self.models}
        cross_preferences = {model_name: [] for model_name in self.models}

        test_prompts = self.prompts[:5]  # Use subset for this analysis

        for prompt in test_prompts:
            print(f"\nAnalyzing: '{prompt[:40]}...'")

            # Generate with both models
            generations = {}
            for model_name in self.models:
                text, _ = self.generate_text(model_name, prompt, max_length)
                generations[model_name] = text

            # Evaluate each generation under both models
            if self.analyzer is None:
                print("  Analyzer not available, skipping likelihood computation...")
                continue
            for generator, generated in generations.items():
                full_text = prompt + " " + generated
                for evaluator in self.models:
                    metrics = self.analyzer.compute_likelihood_metrics(
                        evaluator, full_text
                    )

                    if generator == evaluator:
                        # Self-evaluation
                        self_preferences[evaluator].append(metrics["likelihood"])
                    else:
                        # Cross-evaluation
                        cross_preferences[evaluator].append(metrics["likelihood"])

        # Compute statistics
        results = {}
        for model in self.models:
            if self_preferences[model] and cross_preferences[model]:
                # Cast to built-in types so the result is plain Python data.
                self_avg = float(np.mean(self_preferences[model]))
                cross_avg = float(np.mean(cross_preferences[model]))
                self_bias = self_avg - cross_avg

                results[model] = {
                    "self_avg_likelihood": self_avg,
                    "cross_avg_likelihood": cross_avg,
                    "self_bias": self_bias,
                    "prefers_own": self_bias > 0,
                }

                print(f"\n{model} evaluation:")
                print(f"  Own generations: {self_avg:.3f}")
                print(f"  Other generations: {cross_avg:.3f}")
                print(
                    f"  Self-bias: {self_bias:.3f} ({'prefers own' if self_bias > 0 else 'prefers other'})"
                )

        return results

    def analyze_results(self) -> dict[str, Any]:
        """Print and return summary statistics over ``self.results``.

        Returns:
            An empty dict when there are no results. Otherwise a dict with
            ``total_results``, ``medium_preferred``, ``small_preferred``,
            ``generator_stats`` (per-generator counts), ``correlations`` (of
            text length and generation time with the likelihood ratio) and
            ``perplexity_stats`` (average perplexities and their difference).
        """
        print("\nMETA-GENERATION ANALYSIS")
        print("=" * 60)

        if not self.results:
            print("No results to analyze")
            return {}

        # Basic statistics
        total_results = len(self.results)
        medium_preferred = sum(1 for r in self.results if r.likelihood_ratio > 0)
        small_preferred = total_results - medium_preferred

        print(f"Total comparisons: {total_results}")
        print(
            f"Medium model prefers: {medium_preferred} ({medium_preferred / total_results * 100:.1f}%)"
        )
        print(
            f"Small model prefers: {small_preferred} ({small_preferred / total_results * 100:.1f}%)"
        )

        # Analyze by generator
        print("\nBy generator model:")
        generator_stats = {}
        for result in self.results:
            stats = generator_stats.setdefault(
                result.generator_model, {"total": 0, "medium_preferred": 0}
            )
            stats["total"] += 1
            if result.likelihood_ratio > 0:
                stats["medium_preferred"] += 1

        for gen_model, stats in generator_stats.items():
            pct = stats["medium_preferred"] / stats["total"] * 100
            print(
                f"  {gen_model}: {stats['medium_preferred']}/{stats['total']} ({pct:.1f}%) preferred by medium"
            )

        # Find extreme cases
        print("\nExtreme preferences:")

        # Strongest medium preference
        max_medium = max(self.results, key=lambda r: r.likelihood_ratio)
        print(
            f"  Strongest medium preference (ratio: {max_medium.likelihood_ratio:.3f}):"
        )
        print(f"    Generator: {max_medium.generator_model}")
        print(f"    Text: '{max_medium.generated_text[:80]}...'")

        # Strongest small preference
        min_small = min(self.results, key=lambda r: r.likelihood_ratio)
        print(
            f"  Strongest small preference (ratio: {min_small.likelihood_ratio:.3f}):"
        )
        print(f"    Generator: {min_small.generator_model}")
        print(f"    Text: '{min_small.generated_text[:80]}...'")

        # Analyze correlations
        lengths = [r.text_length for r in self.results]
        ratios = [r.likelihood_ratio for r in self.results]
        gen_times = [r.generation_time for r in self.results]

        # A correlation needs at least two points; report 0 otherwise.
        length_correlation = (
            np.corrcoef(lengths, ratios)[0, 1] if len(lengths) > 1 else 0
        )
        time_correlation = (
            np.corrcoef(gen_times, ratios)[0, 1] if len(gen_times) > 1 else 0
        )

        print("\nCorrelations:")
        print(f"  Text length vs likelihood ratio: {length_correlation:.3f}")
        print(f"  Generation time vs likelihood ratio: {time_correlation:.3f}")

        # Quality metrics comparison
        small_avg = np.mean([r.perplexity_small for r in self.results])
        medium_avg = np.mean([r.perplexity_medium for r in self.results])
        improvement = small_avg - medium_avg

        print("\nPerplexity comparison:")
        print(f"  Small model avg perplexity: {small_avg:.2f}")
        print(f"  Medium model avg perplexity: {medium_avg:.2f}")
        print(f"  Perplexity improvement: {improvement:.2f}")

        return {
            "total_results": total_results,
            "medium_preferred": medium_preferred,
            "small_preferred": small_preferred,
            "generator_stats": generator_stats,
            "correlations": {"length": length_correlation, "time": time_correlation},
            "perplexity_stats": {
                "small_avg": small_avg,
                "medium_avg": medium_avg,
                "improvement": improvement,
            },
        }

    def create_visualizations(self) -> None:
        """Build the dashboard inputs from ``self.results`` and save the figure.

        Does nothing except print a notice when there are no results.
        """
        if not self.results:
            print("No results to visualize")
            return

        print("\nCreating visualizations...")

        # Prepare data for dashboard
        ratios = [r.likelihood_ratio for r in self.results]
        lengths = [r.text_length for r in self.results]
        gen_times = [r.generation_time for r in self.results]
        small_perp = [r.perplexity_small for r in self.results]
        medium_perp = [r.perplexity_medium for r in self.results]

        # Compute generator statistics
        generator_stats = {}
        generator_ratios = {}
        for result in self.results:
            gen_model = result.generator_model
            if gen_model not in generator_stats:
                generator_stats[gen_model] = {"total": 0, "medium_preferred": 0}
                generator_ratios[gen_model] = []
            generator_stats[gen_model]["total"] += 1
            if result.likelihood_ratio > 0:
                generator_stats[gen_model]["medium_preferred"] += 1
            generator_ratios[gen_model].append(result.likelihood_ratio)

        # Create preference matrix for heatmap. Generators are listed in
        # first-seen order (dict insertion order) so the column order is the
        # same on every run, unlike iteration over a set of strings.
        unique_generators = list(generator_stats)
        preference_matrix = np.zeros((2, len(unique_generators)))

        for i, gen in enumerate(unique_generators):
            gen_results = [r for r in self.results if r.generator_model == gen]
            # Row 0: average small-model likelihood; row 1: medium-model.
            preference_matrix[0, i] = np.mean([r.small_likelihood for r in gen_results])
            preference_matrix[1, i] = np.mean(
                [r.medium_likelihood for r in gen_results]
            )

        # Prepare data structure for dashboard
        results_data = {
            "likelihood_ratios": ratios,
            "text_lengths": lengths,
            "generation_times": gen_times,
            "small_perplexity": small_perp,
            "medium_perplexity": medium_perp,
            "generator_stats": generator_stats,
            "generator_ratios": generator_ratios,
            "preference_matrix": preference_matrix,
            "unique_generators": unique_generators,
        }

        # Create dashboard using plotting utilities
        fig = plot_meta_generation_dashboard(results_data, list(self.models.keys()))
        save_figure(fig, "meta_generation_experiments_results")
        print("Visualizations saved: meta_generation_experiments_results.png/pdf")

    def save_results(
        self,
        filename: str = "meta_generation_results.json",
        analysis: dict[str, Any] | None = None,
    ) -> None:
        """Write the results and their analysis to a JSON file.

        Args:
            filename: Path of the JSON file to write.
            analysis: Analysis already computed by ``analyze_results``. When
                omitted it is computed here, which also prints the report.
        """
        if not self.results:
            print("No results to save")
            return

        # Convert results to dictionaries
        results_dict = [asdict(result) for result in self.results]

        # Compute summary statistics unless the caller already has them.
        if analysis is None:
            analysis = self.analyze_results()

        data = {
            "device": self.device,
            "total_prompts": len(self.prompts),
            "total_results": len(self.results),
            "results": results_dict,
            "analysis": analysis,
            "experiment_info": {
                "models_used": list(self.models.keys()),
                "prompts_used": len(self.prompts),
            },
        }

        with open(filename, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot encode natively.
            json.dump(data, f, indent=2, default=str)

        print(f"💾 Results saved: {filename}")

    def run_full_experiment(self) -> tuple[list[MetaGenerationResult], dict[str, Any]]:
        """Run every stage: evaluate, analyse, plot, save and summarise.

        Returns:
            ``(results, info)`` where ``results`` is the cross-model result
            list and ``info`` has the keys ``analysis`` and ``self_vs_other``.
        """
        print("META-GENERATION EXPERIMENTS: COMPREHENSIVE ANALYSIS")
        print("=" * 70)

        # Run cross-model evaluation
        results = self.run_cross_model_evaluation(max_length=35)

        # Run self vs other evaluation
        self_vs_other = self.run_self_vs_other_evaluation(max_length=30)

        # Analyze results
        analysis = self.analyze_results()

        # Create visualizations
        self.create_visualizations()

        # Save results, reusing the analysis so the report is printed once.
        self.save_results(analysis=analysis)

        # Print summary
        self._print_summary(analysis, self_vs_other)

        return results, {"analysis": analysis, "self_vs_other": self_vs_other}

    def _print_summary(self, analysis: dict, self_vs_other: dict) -> None:
        """Print a short recap of the analysis and the self-bias findings.

        Args:
            analysis: Output of ``analyze_results`` (may be empty).
            self_vs_other: Output of ``run_self_vs_other_evaluation``.
        """
        print("\nEXPERIMENT SUMMARY")
        print("=" * 60)

        total_results = analysis.get("total_results", 0)
        # Guard against an empty analysis or a zero total.
        medium_rate = (
            analysis.get("medium_preferred", 0) / total_results * 100
            if total_results
            else 0.0
        )
        print(f"Total cross-evaluations: {total_results}")
        print(f"Medium model preference rate: {medium_rate:.1f}%")

        if self_vs_other:
            print("\nSelf-bias analysis:")
            for model, stats in self_vs_other.items():
                bias_direction = (
                    "prefers own" if stats["prefers_own"] else "prefers other"
                )
                print(f"  {model}: {bias_direction} (bias: {stats['self_bias']:.3f})")

        print("\nKey insights:")
        length_correlation = analysis.get("correlations", {}).get("length", 0)
        if length_correlation > 0.1:
            print("  • Longer texts tend to be preferred by medium model")
        elif length_correlation < -0.1:
            print("  • Shorter texts tend to be preferred by medium model")
        else:
            print("  • Text length has little correlation with model preference")

        perp_improvement = analysis.get("perplexity_stats", {}).get("improvement", 0)
        if perp_improvement > 0:
            print(
                f"  • Medium model shows {perp_improvement:.2f} lower perplexity on average"
            )

        print("\nMeta-generation experiment complete!")

In [None]:
def main() -> None:
    """Run the full experiment, then print a few standout generations."""
    runner = MetaGenerationExperiment(device="auto")
    results, _ = runner.run_full_experiment()

    print("\nINTERESTING EXAMPLES")
    print("=" * 60)

    # Nothing to showcase when the experiment produced no results.
    if not results:
        return

    # Text on which the two models disagree the most, in either direction.
    most_controversial = max(results, key=lambda item: abs(item.likelihood_ratio))
    print(
        f"\nMost controversial text (ratio: {most_controversial.likelihood_ratio:.3f}):"
    )
    print(f"Generator: {most_controversial.generator_model}")
    print(f"Prompt: '{most_controversial.prompt}'")
    print(f"Text: '{most_controversial.generated_text}'")

    # Among texts the medium model scored higher, the one with the widest gap.
    medium_leaning = [item for item in results if item.likelihood_ratio > 0]
    if medium_leaning:
        medium_favorite = max(medium_leaning, key=lambda item: item.likelihood_ratio)
        print(
            f"\nMedium model's strongest preference (ratio: {medium_favorite.likelihood_ratio:.3f}):"
        )
        print(f"Generator: {medium_favorite.generator_model}")
        print(f"Text: '{medium_favorite.generated_text[:100]}...'")

In [None]:
# Run the experiments only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()