# Multi-Agent Lead Agent Evaluation Testing

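This notebook contains unit tests and integration tests for the evaluation pipeline. The judge and pipeline tests call the configured language model through OpenRouter, so `OPENROUTER_API_KEY` must be set in the environment (or in a `.env` file) before running the cells in order.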

In [None]:
# Import required modules
import sys
import os
sys.path.append('.')

from eval import (
    DelegationQualityJudge, ResourceEfficiencyJudge, 
    QueryClassificationJudge, PlanCoherenceJudge,
    LeadAgentEvaluator, EvaluationExample, EvaluationResult,
    create_sample_evaluation_data, composite_metric
)
from agent import QueryAnalysis, ResearchPlan, PlanStep
import dspy
from dotenv import load_dotenv

# Load environment variables via python-dotenv so the configuration cell
# below can read them with os.getenv.
load_dotenv()
print("✅ Imports successful")

## Test 1: DSPy Configuration

In [None]:
# Test DSPy configuration
# Connection settings are read from the environment; the base URL and the model
# name fall back to the defaults given here when the variables are unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
EVAL_MODEL = os.getenv("EVAL_MODEL", "anthropic/claude-3.5-sonnet")

assert OPENROUTER_API_KEY, "OPENROUTER_API_KEY not found in environment"
# Never echo any part of the key: cell output is stored inside the notebook
# file and is easily committed or shared together with it.
print("✅ API Key found (value hidden)")
print(f"✅ Base URL: {OPENROUTER_BASE_URL}")
print(f"✅ Model: {EVAL_MODEL}")

# Configure DSPy
# NOTE(review): dspy.LM selects a provider from the prefix of `model`; confirm
# that EVAL_MODEL is routed to the OpenRouter endpoint as intended (it may need
# an explicit provider prefix) - TODO confirm against the DSPy/LiteLLM docs.
eval_lm = dspy.LM(
    model=EVAL_MODEL,
    api_key=OPENROUTER_API_KEY,
    api_base=OPENROUTER_BASE_URL
)
# Make this LM the default for every DSPy module created in later cells.
dspy.configure(lm=eval_lm)
print("✅ DSPy configured successfully")

## Test 2: Sample Data Creation

In [None]:
# Build the bundled sample examples; every later test cell reads `sample_data`.
sample_data = create_sample_evaluation_data()

# An empty dataset would make the remaining cells meaningless, so stop here.
assert len(sample_data) > 0, "No sample data created"
print(f"✅ Created {len(sample_data)} sample examples")

# Print the headline fields of the first example. Reading these attributes is
# the only structural check performed here: a missing field raises.
first_example = sample_data[0]
print("\nFirst example:")
print(f"  Query: {first_example.query}")
print(f"  Type: {first_example.analysis.query_type}")
print(f"  Complexity: {first_example.analysis.complexity}")
print(f"  Steps: {len(first_example.plan.steps)}")
print("✅ Sample data structure is correct")

## Test 3: Individual Judge Signatures

In [None]:
# Shared inputs for the individual judge cells that follow.
example = sample_data[0]  # Simple query: "What is the population of Tokyo?"

# The judges are given the analysis and plan serialized as JSON strings.
plan_str = example.plan.model_dump_json()
analysis_str = example.analysis.model_dump_json()

print(f"Testing with query: {example.query}")
print(f"Analysis JSON length: {len(analysis_str)} chars")
print(f"Plan JSON length: {len(plan_str)} chars")

In [None]:
# Run the delegation-quality judge once on the shared example.
print("\n🧪 Testing DelegationQualityJudge...")
delegation_judge = dspy.ChainOfThought(DelegationQualityJudge)

try:
    # Inputs: the raw query plus the JSON-serialized analysis and plan.
    delegation_result = delegation_judge(query=example.query, analysis=analysis_str, plan=plan_str)

    print(f"✅ Delegation Score: {delegation_result.delegation_score}")
    print(f"✅ Reasoning: {delegation_result.reasoning}")
    # The score must lie within the closed interval [0, 1].
    assert 0.0 <= delegation_result.delegation_score <= 1.0, "Score out of range"

except Exception as e:
    # Report which judge failed, then re-raise so the cell still errors out.
    print(f"❌ DelegationQualityJudge failed: {e}")
    raise

In [None]:
# Run the resource-efficiency judge once on the shared example.
print("\n🧪 Testing ResourceEfficiencyJudge...")
efficiency_judge = dspy.ChainOfThought(ResourceEfficiencyJudge)

try:
    # This judge takes the analysis' complexity value rather than the full analysis JSON.
    efficiency_result = efficiency_judge(query=example.query, complexity=example.analysis.complexity, plan=plan_str)

    print(f"✅ Efficiency Score: {efficiency_result.efficiency_score}")
    print(f"✅ Reasoning: {efficiency_result.reasoning}")
    # The score must lie within the closed interval [0, 1].
    assert 0.0 <= efficiency_result.efficiency_score <= 1.0, "Score out of range"

except Exception as e:
    # Report which judge failed, then re-raise so the cell still errors out.
    print(f"❌ ResourceEfficiencyJudge failed: {e}")
    raise

In [None]:
# Run the query-classification judge once on the shared example.
print("\n🧪 Testing QueryClassificationJudge...")
classification_judge = dspy.ChainOfThought(QueryClassificationJudge)

try:
    # The predicted type is passed on its own, alongside the full analysis JSON.
    classification_result = classification_judge(query=example.query, predicted_type=example.analysis.query_type, analysis=analysis_str)

    print(f"✅ Classification Score: {classification_result.classification_score}")
    print(f"✅ Reasoning: {classification_result.reasoning}")
    # The score must lie within the closed interval [0, 1].
    assert 0.0 <= classification_result.classification_score <= 1.0, "Score out of range"

except Exception as e:
    # Report which judge failed, then re-raise so the cell still errors out.
    print(f"❌ QueryClassificationJudge failed: {e}")
    raise

In [None]:
# Run the plan-coherence judge once on the shared example.
print("\n🧪 Testing PlanCoherenceJudge...")
coherence_judge = dspy.ChainOfThought(PlanCoherenceJudge)

try:
    # Inputs: the raw query plus the JSON-serialized analysis and plan.
    coherence_result = coherence_judge(query=example.query, analysis=analysis_str, plan=plan_str)

    print(f"✅ Coherence Score: {coherence_result.coherence_score}")
    print(f"✅ Reasoning: {coherence_result.reasoning}")
    # The score must lie within the closed interval [0, 1].
    assert 0.0 <= coherence_result.coherence_score <= 1.0, "Score out of range"

except Exception as e:
    # Report which judge failed, then re-raise so the cell still errors out.
    print(f"❌ PlanCoherenceJudge failed: {e}")
    raise

# Printed only when this cell completed without raising.
print("\n✅ All individual judges working correctly!")

## Test 4: Full Evaluation Pipeline

In [None]:
# Run the combined evaluator on the shared example and check the result shape.
print("\n🧪 Testing Full Evaluation Pipeline...")

evaluator = LeadAgentEvaluator()

try:
    result = evaluator.forward(example)

    # One line per score, in the order: overall, then the four judge scores.
    print("\n".join(
        f"✅ {label}: {getattr(result, field):.3f}"
        for label, field in (
            ("Overall Score", "overall_score"),
            ("Delegation", "delegation_score"),
            ("Efficiency", "efficiency_score"),
            ("Classification", "classification_score"),
            ("Coherence", "coherence_score"),
        )
    ))

    # Verify result structure: type, overall range, and four reasoning entries.
    assert isinstance(result, EvaluationResult), "Wrong result type"
    assert 0.0 <= result.overall_score <= 1.0, "Overall score out of range"
    assert len(result.reasoning) == 4, "Missing reasoning entries"

    print("\n✅ Full evaluation pipeline working correctly!")

except Exception as e:
    # Report the failure, then re-raise so the cell still errors out.
    print(f"❌ Full evaluation failed: {e}")
    raise

## Test 5: Evaluate All Sample Data

In [None]:
# Test evaluation on all sample data
print("\n🧪 Testing All Sample Data...")

# The loop uses its own names (`sample`, `sample_result`) so it does not
# overwrite the notebook-level `example` / `result` that earlier cells rely on;
# re-running those cells after this one then still tests the same example.
results = []
for index, sample in enumerate(sample_data, start=1):
    print(f"\nEvaluating Example {index}: {sample.query[:50]}...")

    try:
        sample_result = evaluator.forward(sample)
        results.append(sample_result)

        print(f"  Overall: {sample_result.overall_score:.3f}")
        print(f"  Delegation: {sample_result.delegation_score:.3f}")
        print(f"  Efficiency: {sample_result.efficiency_score:.3f}")
        print(f"  Classification: {sample_result.classification_score:.3f}")
        print(f"  Coherence: {sample_result.coherence_score:.3f}")

    except Exception as e:
        # Report which example failed, then re-raise so the cell errors out.
        print(f"  ❌ Failed: {e}")
        raise

# Guard the divisions below with a clear message instead of ZeroDivisionError.
assert results, "No evaluation results collected"

# Calculate averages (arithmetic mean of each score across all results)
num_results = len(results)
avg_overall = sum(r.overall_score for r in results) / num_results
avg_delegation = sum(r.delegation_score for r in results) / num_results
avg_efficiency = sum(r.efficiency_score for r in results) / num_results
avg_classification = sum(r.classification_score for r in results) / num_results
avg_coherence = sum(r.coherence_score for r in results) / num_results

print("\n📊 Average Scores:")
print(f"  Overall: {avg_overall:.3f}")
print(f"  Delegation: {avg_delegation:.3f}")
print(f"  Efficiency: {avg_efficiency:.3f}")
print(f"  Classification: {avg_classification:.3f}")
print(f"  Coherence: {avg_coherence:.3f}")

print("\n✅ All sample data evaluated successfully!")

## Test 6: Composite Metric Function

In [None]:
# Composite metric test: banner plus a stand-in for a prediction object.
print("\n🧪 Testing Composite Metric Function...")


class MockPrediction:
    """Minimal prediction stand-in that carries an analysis and a plan.

    Both constructor arguments are stored unchanged as attributes of the
    same name.
    """

    def __init__(self, analysis, plan):
        self.analysis, self.plan = analysis, plan

# Feed the first sample's own analysis and plan back in as the "prediction".
example = sample_data[0]
mock_prediction = MockPrediction(analysis=example.analysis, plan=example.plan)

try:
    score = composite_metric(example, mock_prediction)

    print(f"✅ Composite metric score: {score:.3f}")
    # The metric must return a float inside the closed interval [0, 1].
    assert 0.0 <= score <= 1.0, "Composite score out of range"
    assert isinstance(score, float), "Score is not a float"

    print("✅ Composite metric working correctly!")

except Exception as e:
    # Report the failure, then re-raise so the cell still errors out.
    print(f"❌ Composite metric failed: {e}")
    raise

## Test 7: MIPROv2 Optimizer Setup (Basic)

In [None]:
# Test MIPROv2 optimizer setup (without running full optimization)
print("\n🧪 Testing MIPROv2 Optimizer Setup...")

# Only the optional DSPy import may be skipped. It is guarded separately so
# that an ImportError from the project's own `eval` module is not reported as
# a DSPy-version problem and silently ignored.
try:
    from dspy.teleprompt import MIPROv2
except ImportError as e:
    MIPROv2 = None
    print(f"⚠️  MIPROv2 import failed - might not be available in this DSPy version: {e}")

if MIPROv2 is not None:
    try:
        from eval import create_optimizer

        # Create optimizer with minimal settings
        optimizer = create_optimizer(sample_data)

        print(f"✅ MIPROv2 optimizer created successfully")
        print(f"  Type: {type(optimizer)}")
        # NOTE(review): the two values below are hard-coded text, not read from
        # `optimizer`; confirm they match what create_optimizer configures.
        print(f"  Auto mode: light")
        print(f"  Trials: 20")

        # Verify it's the right type
        assert isinstance(optimizer, MIPROv2), "Wrong optimizer type"

        print("✅ MIPROv2 setup working correctly!")

    except Exception as e:
        # Report the failure, then re-raise so the cell still errors out.
        print(f"❌ MIPROv2 setup failed: {e}")
        raise

## Test Summary

In [None]:
# Closing banner: a checklist of the test sections, then suggested follow-ups.
# NOTE: both lists are static text; they are not derived from the outcome of
# the cells above.
print("\n🎉 All Tests Complete!")
print("\n✅ Passed Tests:")
print("\n".join(
    f"  - {title}"
    for title in (
        "DSPy configuration",
        "Sample data creation",
        "Individual judge signatures",
        "Full evaluation pipeline",
        "All sample data evaluation",
        "Composite metric function",
        "MIPROv2 optimizer setup",
    )
))

print("\n🚀 Ready for production use!")
print("\n📝 Next steps:")
print("\n".join(
    f"  - {step}"
    for step in (
        "Run actual MIPROv2 optimization with larger dataset",
        "Integrate with agent.py for end-to-end optimization",
        "Add more diverse evaluation examples",
        "Tune optimization hyperparameters",
    )
))