# Measure GPT-5 (or any model) Baseline in Google Colab

This notebook measures complete baseline metrics for a new OpenAI model and adds it to the CERT registry.

**What you'll measure:**
- Behavioral Consistency (C): How stable is the model?
- Performance Distribution (μ, σ): Mean quality and variability
- Context Propagation Effect (γ): Benefit from sequential processing

**Requirements:**
- OpenAI API key
- 10-15 minutes
- ~$2-5 in API costs

## 1. Install CERT SDK

In [None]:
!pip install -q git+https://github.com/Javihaus/CERT.git
print("✓ CERT SDK installed")

## 2. Import Libraries

In [None]:
import asyncio
import numpy as np
from getpass import getpass

import cert
from cert.models import ModelRegistry
from cert.providers import OpenAIProvider
from cert.providers.base import ProviderConfig
from cert.analysis.semantic import SemanticAnalyzer
from cert.analysis.quality import QualityScorer
from cert.core.metrics import (
    behavioral_consistency,
    empirical_performance_distribution,
    coordination_effect,
)

print("✓ Libraries imported")

## 3. Configuration

In [None]:
# Get API key
# getpass prompts without echoing, so the key does not end up in the saved
# notebook output.
api_key = getpass("Enter your OpenAI API key: ")

# Model to measure
# MODEL_NAME is passed to ProviderConfig and used as the registry model_id;
# MODEL_FAMILY is only used as a display/registry label in later cells.
MODEL_NAME = "gpt-4o"  # Change to "gpt-5" or whatever model you're using
MODEL_FAMILY = "GPT-4o"  # Change to "GPT-5" for display

print(f"✓ Will measure baseline for: {MODEL_NAME}")

## 4. Analytical Prompts (from CERT paper)

In [None]:
# These are the same prompts used in the paper for all baseline measurements
# How the later cells consume this list:
#   - index 0 is the single prompt repeated for the consistency measurement,
#   - all 15 prompts feed the performance-distribution measurement,
#   - indices 0-9 form the 5 consecutive pairs of the 2-agent measurement.
# Reordering or shortening the list therefore changes what is measured.
ANALYTICAL_PROMPTS = [
    "Analyze the key factors in effective team communication.",
    "Evaluate the main considerations for project risk management.",
    "Assess the critical elements of successful strategic planning.",
    "Identify the primary aspects of organizational change management.",
    "Examine the essential components of decision-making frameworks.",
    "Analyze the challenges in cross-functional collaboration.",
    "Evaluate process optimization strategies in complex systems.",
    "Assess quality assurance methodologies and their effectiveness.",
    "Identify barriers to innovation in established organizations.",
    "Examine factors influencing stakeholder engagement.",
    "Analyze the relationship between leadership style and outcomes.",
    "Evaluate resource allocation strategies in constrained environments.",
    "Assess the impact of communication channels on information flow.",
    "Identify success metrics for collaborative initiatives.",
    "Examine the role of feedback mechanisms in performance improvement.",
]

print(f"✓ Using {len(ANALYTICAL_PROMPTS)} analytical prompts")

## 5. Initialize Provider

In [None]:
# Build the provider configuration from the key and model chosen in the
# configuration cell.
# NOTE(review): the measurement functions below also pass temperature=0.7 on
# every generate_response call; presumably that per-call value overrides the
# one set here — confirm against OpenAIProvider.
config = ProviderConfig(
    api_key=api_key,
    model_name=MODEL_NAME,
    temperature=0.7,  # Paper uses 0.7
    max_tokens=1024,
)

# Single provider instance shared by all three measurement steps.
provider = OpenAIProvider(config)

print(f"✓ Provider initialized for {MODEL_NAME}")

## 6. STEP 1: Measure Behavioral Consistency (C)

Generate 20 responses to the same prompt, compute the pairwise semantic distances between them, and measure consistency from how much those distances vary.

**Formula:** C = 1 - (Std[distances] / Mean[distances])

**Time:** ~2-3 minutes

In [None]:
async def measure_consistency(provider, prompt, n_trials=20, temperature=0.7):
    """Measure Behavioral Consistency C (Equation 1) for a single prompt.

    Sends ``prompt`` to the provider ``n_trials`` times, one request after
    another, computes the pairwise semantic distances between the responses
    and passes them to ``behavioral_consistency``.

    Args:
        provider: Object exposing an awaitable
            ``generate_response(prompt=..., temperature=...)``.
        prompt: Prompt that is repeated on every trial.
        n_trials: Number of responses to generate. Must be at least 2,
            because a pairwise distance needs two responses.
        temperature: Sampling temperature forwarded to every request.
            Defaults to 0.7, the value used throughout this notebook.

    Returns:
        The consistency value returned by ``behavioral_consistency``.

    Raises:
        ValueError: If ``n_trials`` is less than 2.
    """
    # Fail before spending any API calls: with fewer than two responses
    # there are no pairwise distances to analyse.
    if n_trials < 2:
        raise ValueError(
            f"n_trials must be at least 2 to compute pairwise distances, got {n_trials}"
        )

    print("="*70)
    print("MEASURING BEHAVIORAL CONSISTENCY")
    print("="*70)
    print(f"Prompt: {prompt[:60]}...")
    print(f"Trials: {n_trials}\n")

    responses = []

    for completed in range(1, n_trials + 1):
        response = await provider.generate_response(
            prompt=prompt,
            temperature=temperature,
        )
        responses.append(response)

        # Report progress every fifth response.
        if completed % 5 == 0:
            print(f"  Progress: {completed}/{n_trials} responses generated")

    print("\n  Calculating semantic distances...")
    analyzer = SemanticAnalyzer()
    distances = analyzer.pairwise_distances(responses)

    consistency = behavioral_consistency(distances)

    print(f"\n  ✓ Behavioral Consistency: C = {consistency:.3f}")
    print(f"    Mean distance: {np.mean(distances):.3f}")
    print(f"    Std distance:  {np.std(distances):.3f}")

    return consistency

# Run measurement
# Top-level `await` relies on the notebook kernel (IPython/Colab) running
# cells inside an event loop; in a plain script, wrap the call in
# asyncio.run(...) instead.
# Only the first analytical prompt is used: consistency is measured on
# repeated answers to one prompt.
consistency = await measure_consistency(
    provider=provider,
    prompt=ANALYTICAL_PROMPTS[0],
    n_trials=20,
)

## 7. STEP 2: Measure Performance Distribution (μ, σ)

Generate responses to 15 different prompts and score quality.

**Formula:** μ = Mean[quality_scores], σ = Std[quality_scores]

**Time:** ~3-5 minutes

In [None]:
async def measure_performance(provider, prompts, temperature=0.7):
    """Measure the Performance Distribution (μ, σ) from Equation 2.

    Generates one response per prompt, scores each with ``QualityScorer``
    and summarises the composite scores with
    ``empirical_performance_distribution``.

    Args:
        provider: Object exposing an awaitable
            ``generate_response(prompt=..., temperature=...)``.
        prompts: Non-empty iterable of prompts; each is sent once.
        temperature: Sampling temperature forwarded to every request.
            Defaults to 0.7, the value used throughout this notebook.

    Returns:
        Tuple ``(mu, sigma, quality_scores)``, where ``quality_scores`` is
        the list of per-prompt composite scores in prompt order.

    Raises:
        ValueError: If ``prompts`` is empty.
    """
    # Materialise once so len() and iteration work for any iterable.
    prompts = list(prompts)
    # Without prompts there are no scores, and the min/max summary below
    # would fail on an empty list.
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")

    print("\n" + "="*70)
    print("MEASURING PERFORMANCE DISTRIBUTION")
    print("="*70)
    print(f"Prompts: {len(prompts)}\n")

    scorer = QualityScorer()
    quality_scores = []

    for position, prompt in enumerate(prompts, start=1):
        response = await provider.generate_response(
            prompt=prompt,
            temperature=temperature,
        )

        components = scorer.score(prompt, response)
        quality_scores.append(components.composite_score)

        print(f"  Prompt {position}/{len(prompts)}: Q = {components.composite_score:.3f}")

    mu, sigma = empirical_performance_distribution(np.array(quality_scores))

    print("\n  ✓ Performance Distribution:")
    print(f"    Mean (μ):     {mu:.3f}")
    print(f"    Std Dev (σ):  {sigma:.3f}")
    print(f"    Min quality:  {min(quality_scores):.3f}")
    print(f"    Max quality:  {max(quality_scores):.3f}")

    return mu, sigma, quality_scores

# Run measurement
# All analytical prompts are scored once each; quality_scores keeps the
# per-prompt composite scores in prompt order.
# (Top-level `await` relies on the notebook kernel's event loop.)
mu, sigma, quality_scores = await measure_performance(
    provider=provider,
    prompts=ANALYTICAL_PROMPTS,
)

## 8. STEP 3: Measure Context Propagation Effect (γ) for 2-agent pipelines

Simulate a 2-agent sequential pipeline, in which agent 2 receives agent 1's output as context, and compare its quality to that of the same two prompts answered independently.

**Formula:** γ = P_sequential / (P_agent1 × P_agent2)

**Time:** ~5-7 minutes

In [None]:
async def measure_coordination_2agent(provider, prompts, n_pairs=5, temperature=0.7):
    """Measure the Context Propagation Effect γ for 2-agent pipelines.

    Consecutive prompts are paired: ``(prompts[0], prompts[1])``,
    ``(prompts[2], prompts[3])``, and so on. For each pair the function:

    1. answers both prompts independently and scores each answer;
    2. answers the first prompt again, feeds that answer to the second
       prompt as context, and scores the result against the *original*
       second prompt;
    3. combines the three scores with ``coordination_effect`` (Equation 3).

    Args:
        provider: Object exposing an awaitable
            ``generate_response(prompt=..., temperature=...)``.
        prompts: Indexable sequence holding at least ``2 * n_pairs`` prompts.
        n_pairs: Number of prompt pairs to test; must be at least 1.
        temperature: Sampling temperature forwarded to every request.
            Defaults to 0.7, the value used throughout this notebook.

    Returns:
        The mean γ over all tested pairs.

    Raises:
        ValueError: If ``n_pairs`` is less than 1, or ``prompts`` holds
            fewer than ``2 * n_pairs`` entries.
    """
    # Validate up front so a bad configuration fails before any API call
    # instead of part-way through the run.
    if n_pairs < 1:
        raise ValueError(f"n_pairs must be at least 1, got {n_pairs}")
    required_prompts = 2 * n_pairs
    if len(prompts) < required_prompts:
        raise ValueError(
            f"n_pairs={n_pairs} needs at least {required_prompts} prompts, "
            f"got {len(prompts)}"
        )

    print("\n" + "="*70)
    print("MEASURING CONTEXT PROPAGATION EFFECT (2-agent)")
    print("="*70)
    print(f"Testing {n_pairs} prompt pairs\n")

    scorer = QualityScorer()
    gamma_values = []

    for i in range(n_pairs):
        prompt1 = prompts[i * 2]
        prompt2 = prompts[i * 2 + 1]

        print(f"\n  Pair {i+1}/{n_pairs}:")
        print(f"    Prompt 1: {prompt1[:50]}...")
        print(f"    Prompt 2: {prompt2[:50]}...")

        # Independent execution
        response1_indep = await provider.generate_response(prompt=prompt1, temperature=temperature)
        response2_indep = await provider.generate_response(prompt=prompt2, temperature=temperature)

        quality1 = scorer.score(prompt1, response1_indep).composite_score
        quality2 = scorer.score(prompt2, response2_indep).composite_score

        print(f"      Independent Q1: {quality1:.3f}, Q2: {quality2:.3f}")

        # Sequential execution (agent 2 sees agent 1's output).
        # Agent 1 is sampled afresh here rather than reusing response1_indep.
        response1_seq = await provider.generate_response(prompt=prompt1, temperature=temperature)

        # Agent 2 prompt includes agent 1's output as context
        prompt2_with_context = f"Building on this analysis:\n\n{response1_seq}\n\nNow: {prompt2}"
        response2_seq = await provider.generate_response(prompt=prompt2_with_context, temperature=temperature)

        # Scored against the bare prompt2, not the context-augmented prompt.
        quality_seq = scorer.score(prompt2, response2_seq).composite_score

        print(f"      Sequential Q:   {quality_seq:.3f}")

        # Calculate γ (Equation 3)
        gamma = coordination_effect(
            coordinated_performance=quality_seq,
            independent_performances=[quality1, quality2]
        )

        gamma_values.append(gamma)
        print(f"      γ = {gamma:.3f}")

    mean_gamma = np.mean(gamma_values)

    print("\n  ✓ Context Propagation Effect (2-agent):")
    print(f"    Mean γ:  {mean_gamma:.3f}")
    print(f"    Std γ:   {np.std(gamma_values):.3f}")
    print(f"    Min γ:   {min(gamma_values):.3f}")
    print(f"    Max γ:   {max(gamma_values):.3f}")

    return mean_gamma

# Run measurement
# With n_pairs=5 the function consumes prompts at indices 0-9 as five
# consecutive pairs; the remaining prompts are not used in this step.
# (Top-level `await` relies on the notebook kernel's event loop.)
gamma_2agent = await measure_coordination_2agent(
    provider=provider,
    prompts=ANALYTICAL_PROMPTS,
    n_pairs=5,
)

## 9. Results Summary

In [None]:
# Final report: banner, model identity, then the four baseline metrics
# measured by the previous cells, emitted with a single print call.
_summary_rule = "=" * 70
_summary_lines = [
    "",
    _summary_rule,
    "MEASUREMENT COMPLETE",
    _summary_rule,
    "",
    f"Model: {MODEL_NAME} ({MODEL_FAMILY})",
    "",
    "Baseline Metrics:",
    f"  Consistency (C):            {consistency:.3f}",
    f"  Mean Performance (μ):       {mu:.3f}",
    f"  Std Performance (σ):        {sigma:.3f}",
    f"  Context Effect γ (2-agent): {gamma_2agent:.3f}",
]
print("\n".join(_summary_lines))

## 10. Comparison to Paper Baselines

In [None]:
import pandas as pd

_RULE = "=" * 70


def _banner(title):
    """Print a section title framed by two rules, preceded by a blank line."""
    print("\n" + _RULE)
    print(title)
    print(_RULE)


def _interpret(value, symbol, tiers, fallback):
    """Print the verdict for the first tier whose floor ``value`` exceeds.

    ``tiers`` is an ordered sequence of ``(floor, headline, detail)`` tuples
    checked from the highest floor down; ``fallback`` is the
    ``(headline, detail)`` pair used when ``value`` exceeds none of them.
    """
    headline, detail = fallback
    for floor, tier_headline, tier_detail in tiers:
        if value > floor:
            headline, detail = tier_headline, tier_detail
            break
    print(f"\n{headline} ({symbol}={value:.3f})")
    print(detail)


# Paper baselines, with the freshly measured model appended as the last row
paper_data = {
    "Model": ["Claude 3 Haiku", "GPT-4o", "Grok 3", "Gemini 3.5 Pro", f"{MODEL_FAMILY} (NEW)"],
    "C": [0.831, 0.831, 0.863, 0.895, consistency],
    "μ": [0.595, 0.638, 0.658, 0.831, mu],
    "γ": [1.462, 1.562, 1.625, 1.137, gamma_2agent],
}

df = pd.DataFrame(paper_data)
_banner("COMPARISON TO PAPER BASELINES")
print()
print(df.to_string(index=False))

# Interpretation: one verdict per metric, chosen by strict ">" thresholds
_banner("INTERPRETATION")

_interpret(
    consistency,
    "C",
    tiers=[
        (0.85, "✓ HIGH consistency",
         "  Very stable, predictable behavior. Low monitoring overhead."),
        (0.80, "✓ GOOD consistency",
         "  Stable behavior with minor variance. Standard monitoring."),
    ],
    fallback=("⚠ MODERATE consistency",
              "  Higher variance. Needs enhanced monitoring."),
)

_interpret(
    mu,
    "μ",
    tiers=[
        (0.70, "✓ HIGH baseline performance",
         "  Strong individual task performance."),
        (0.60, "✓ GOOD baseline performance",
         "  Solid performance."),
    ],
    fallback=("⚠ MODERATE baseline performance",
              "  Room for improvement."),
)

_interpret(
    gamma_2agent,
    "γ",
    tiers=[
        (1.5, "✓ STRONG context propagation",
         "  Benefits greatly from sequential processing. Excellent for multi-agent pipelines."),
        (1.2, "✓ MODERATE context propagation",
         "  Benefits from context accumulation."),
        (1.0, "⚠ WEAK context propagation",
         "  Minimal benefit from sequential processing. May work better independently."),
    ],
    fallback=("⚠ NEGATIVE context propagation",
              "  Context may be degrading performance. Avoid sequential pipelines."),
)

## 11. Register in ModelRegistry

In [None]:
# Register the new baseline
baseline = ModelRegistry.register_custom_baseline(
    model_id=MODEL_NAME,
    provider="openai",
    model_family=MODEL_FAMILY,
    consistency=consistency,
    mean_performance=mu,
    std_performance=sigma,
    coordination_2agent=gamma_2agent,
)

print(f"✓ Registered: {baseline}")
print(f"\nYou can now use this baseline:")
print(f"  baseline = ModelRegistry.get_model('{MODEL_NAME}')")
print(f"  print(baseline.consistency)  # {consistency:.3f}")
print(f"  print(baseline.coordination_2agent)  # {gamma_2agent:.3f}")

## 12. Code to Add Permanently

To add this model permanently to the SDK, run the cell below and copy the snippet it prints into the `_VALIDATED_MODELS` dictionary in `src/cert/models.py`:

In [None]:
# Imported locally so this cell does not depend on the imports cell.
from datetime import date

print("\n" + "="*70)
print("TO ADD PERMANENTLY TO src/cert/models.py")
print("="*70)
print("\nAdd this to the _VALIDATED_MODELS dictionary:\n")

# Ready-to-paste ModelBaseline entry built from the values measured above.
# validation_date records the day of this measurement; the original cell
# filled it with cert.__version__, which is a version string, not a date.
# NOTE(review): confirm the date format used by the existing
# _VALIDATED_MODELS entries and adjust if they use a different one.
code = f'''    # {MODEL_FAMILY}
    "{MODEL_NAME}": ModelBaseline(
        model_id="{MODEL_NAME}",
        provider="openai",
        model_family="{MODEL_FAMILY}",
        consistency={consistency:.3f},
        mean_performance={mu:.3f},
        std_performance={sigma:.3f},
        coordination_2agent={gamma_2agent:.3f},
        coordination_5agent=None,  # Measure separately if needed
        paper_section="Custom Measurement",
        validation_date="{date.today().isoformat()}",
    ),'''

print(code)

print("\n" + "="*70)
print("MEASUREMENT COMPLETE! 🎉")
print("="*70)

## 13. Test the New Baseline

Let's verify the baseline is working correctly:

In [None]:
# Retrieve the baseline
retrieved_baseline = ModelRegistry.get_model(MODEL_NAME)

# γ measured on your own 3-agent pipeline. 3.632 is the value that was
# hard-coded in the original notebook; replace it with your own measurement.
PIPELINE_GAMMA_3AGENT = 3.632

if retrieved_baseline:
    print("✓ Baseline successfully registered and retrieved!")
    print(f"\n{retrieved_baseline}")

    baseline_gamma = retrieved_baseline.coordination_2agent
    print(f"\nγ (2-agent): {baseline_gamma:.3f}")

    # Compare the registered 2-agent baseline with the 3-agent pipeline value
    print(f"\n{'='*70}")
    print("EXPECTED vs MEASURED γ")
    print(f"{'='*70}")
    print(f"\nBaseline γ (2-agent): {baseline_gamma:.3f}")
    print(f"Your pipeline γ (3-agent): {PIPELINE_GAMMA_3AGENT:.3f}")

    # Only claim a stronger 3-agent effect when the numbers support it.
    if PIPELINE_GAMMA_3AGENT > baseline_gamma:
        print("\nDifference: The 3-agent pipeline shows STRONGER context propagation")
        print("because each agent builds on accumulated context from MULTIPLE")
        print("previous agents, creating a compounding effect.")
    else:
        print("\nDifference: The 3-agent pipeline does NOT show stronger context")
        print("propagation than the 2-agent baseline measured for this model.")
else:
    print("❌ Error: Baseline not found!")