In [None]:
from datetime import datetime
from typing import Dict, List, Optional

class PromptEvaluator:
    """
    Unified evaluator combining custom functions and LLM judge.
    Provides comprehensive prompt evaluation on 0-150 scale.

    Custom-function metrics are scored by a ``CustomMetricEvaluator`` and
    LLM-judged metrics by an ``LLMJudgeEvaluator``; both result sets are
    merged and summarised into a single result dictionary.

    Each per-metric result is read as a dict providing ``score``,
    ``max_points`` and ``below_threshold``; report generation additionally
    reads ``threshold``, ``type`` and either ``justification`` (when ``type``
    is ``'llm_judge'``) or ``evidence`` (any other type).
    """

    # Group percentage at or above which the console summary shows a pass mark.
    GROUP_PASS_PERCENTAGE = 70
    # Maximum number of below-threshold metrics listed in the console summary.
    SUMMARY_METRIC_LIMIT = 10

    def __init__(self, metric_config: dict, config: dict,
                 api_key: Optional[str] = None):
        """
        Store the configuration and build both sub-evaluators.

        Args:
            metric_config: Mapping of metric name to its settings. This class
                reads the ``'type'`` and ``'group'`` entry of each metric.
            config: General configuration, forwarded to the LLM judge evaluator.
            api_key: Optional API key, forwarded to the LLM judge evaluator.
        """
        self.metric_config = metric_config
        self.config = config

        # Initialize sub-evaluators
        self.custom_evaluator = CustomMetricEvaluator(metric_config)
        self.llm_evaluator = LLMJudgeEvaluator(metric_config, config, api_key)

    @staticmethod
    def _score_totals(metric_results) -> tuple:
        """Return ``(earned, possible)`` summed over an iterable of metric results."""
        metric_results = list(metric_results)
        earned = sum(r['score'] for r in metric_results)
        possible = sum(r['max_points'] for r in metric_results)
        return earned, possible

    def _select_llm_metrics(self, metric_names: Optional[List[str]]) -> List[str]:
        """
        Return the entries of *metric_names* configured as LLM-judge metrics.

        ``None`` or an empty list yields an empty list. A name missing from
        ``metric_config`` raises ``KeyError``.
        """
        if not metric_names:
            return []
        return [
            name for name in metric_names
            if self.metric_config[name]['type'] == 'llm_judge'
        ]

    def evaluate(self, prompt: str, first_run: bool = True,
                 below_threshold_metrics: Optional[List[str]] = None) -> Dict:
        """
        Evaluate a prompt across all metrics.

        Custom-function metrics are always evaluated in full. LLM-judge
        metrics are evaluated in full on the first run; on later runs only the
        LLM-judge metrics named in ``below_threshold_metrics`` are
        re-evaluated, so the returned totals then cover just the custom
        metrics plus those re-evaluated metrics.

        Args:
            prompt: The prompt text to evaluate
            first_run: If True, evaluate all metrics. If False, only re-evaluate specified metrics
            below_threshold_metrics: List of metric names to re-evaluate (used when first_run=False).
                ``None`` or an empty list means no LLM metric is re-evaluated.

        Returns:
            Comprehensive evaluation results with scores and analysis

        Raises:
            KeyError: If a name in ``below_threshold_metrics`` is not present
                in ``metric_config``.
        """
        print(f"\n{'='*70}")
        print("EVALUATING PROMPT")
        print(f"{'='*70}")
        print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"First run: {first_run}")
        if not first_run and below_threshold_metrics:
            print(f"Re-evaluating {len(below_threshold_metrics)} metrics")
        print()

        results = {}

        # Step 1: Custom Function Metrics (always evaluate all)
        print("Step 1: Evaluating Custom Function Metrics...")
        custom_results = self.custom_evaluator.evaluate_all(prompt)
        results.update(custom_results)
        custom_score, custom_max = self._score_totals(custom_results.values())
        print(f"✅ Custom metrics complete: {custom_score:.2f}/{custom_max:.2f}")

        # Step 2: LLM Judge Metrics
        print("\nStep 2: Evaluating LLM Judge Metrics...")

        if first_run:
            # First run: evaluate all LLM metrics
            llm_results = self.llm_evaluator.evaluate_all(
                prompt, below_threshold_only=False)
        else:
            # Subsequent runs: only evaluate below-threshold LLM metrics.
            # A missing list is treated as "nothing to re-evaluate" instead of
            # failing while iterating None.
            llm_metrics_to_eval = self._select_llm_metrics(below_threshold_metrics)

            if llm_metrics_to_eval:
                llm_results = self.llm_evaluator.evaluate_all(
                    prompt,
                    below_threshold_only=True,
                    below_threshold_metrics=llm_metrics_to_eval)
            else:
                llm_results = {}

        results.update(llm_results)
        llm_score, llm_max = self._score_totals(llm_results.values())
        print(f"✅ LLM judge complete: {llm_score:.2f}/{llm_max:.2f}")

        # Step 3: Calculate totals and analysis
        return self._compile_results(prompt, results, first_run)

    def _compile_results(self, prompt: str, metrics_results: Dict,
                         first_run: bool) -> Dict:
        """
        Compile comprehensive evaluation results.

        Computes overall and per-group totals, collects the names of metrics
        flagged ``below_threshold``, prints the console summary and returns
        the assembled dictionary with the keys ``prompt``, ``timestamp``,
        ``first_run``, ``metrics``, ``total_score``, ``total_max``,
        ``percentage``, ``below_threshold_metrics``, ``below_threshold_count``
        and ``group_scores``.
        """
        # Calculate total score
        total_score, total_max = self._score_totals(metrics_results.values())
        percentage = (total_score / total_max * 100) if total_max > 0 else 0

        # Identify below-threshold metrics
        below_threshold = [
            name for name, metric in metrics_results.items()
            if metric['below_threshold']
        ]

        # Calculate group scores, following the order of GROUP_TOTALS.
        # Groups with no evaluated metric are left out of the mapping.
        group_scores = {}
        for group_name in GROUP_TOTALS:
            members = [
                metric for name, metric in metrics_results.items()
                if self.metric_config[name]['group'] == group_name
            ]
            if not members:
                continue
            group_score, group_max = self._score_totals(members)
            group_scores[group_name] = {
                'score': round(group_score, 2),
                'max': group_max,
                'percentage': round(group_score / group_max * 100, 1) if group_max > 0 else 0
            }

        # Compile result
        result = {
            'prompt': prompt,
            'timestamp': datetime.now().isoformat(),
            'first_run': first_run,
            'metrics': metrics_results,
            'total_score': round(total_score, 2),
            'total_max': total_max,
            'percentage': round(percentage, 1),
            'below_threshold_metrics': below_threshold,
            'below_threshold_count': len(below_threshold),
            'group_scores': group_scores
        }

        # Print summary
        self._print_summary(result)

        return result

    def _print_summary(self, result: Dict):
        """Print evaluation summary: totals, per-group scores and failing metrics."""
        print(f"\n{'='*70}")
        print("EVALUATION SUMMARY")
        print(f"{'='*70}")
        print(f"Total Score: {result['total_score']:.2f} / {result['total_max']} ({result['percentage']:.1f}%)")
        print(f"Below Threshold: {result['below_threshold_count']} metrics")

        print("\nGroup Scores:")
        for group_name, group_data in result['group_scores'].items():
            status = "✅" if group_data['percentage'] >= self.GROUP_PASS_PERCENTAGE else "⚠️"
            print(f"  {status} {group_name.replace('_', ' ').title()}: "
                  f"{group_data['score']:.2f}/{group_data['max']} ({group_data['percentage']:.1f}%)")

        failing = result['below_threshold_metrics']
        if failing:
            print(f"\nMetrics Below Threshold ({len(failing)}):")
            # Only the first few are listed to keep the console output short.
            for metric in failing[:self.SUMMARY_METRIC_LIMIT]:
                metric_result = result['metrics'][metric]
                print(f"  ❌ {metric}: {metric_result['score']:.2f}/{metric_result['max_points']:.2f}")
            if len(failing) > self.SUMMARY_METRIC_LIMIT:
                print(f"  ... and {len(failing) - self.SUMMARY_METRIC_LIMIT} more")

    def generate_detailed_report(self, result: Dict) -> str:
        """
        Generate a detailed text report of evaluation results.

        Args:
            result: A result dictionary as returned by ``evaluate``.

        Returns:
            The report as one newline-joined string: header, the prompt, a
            section per group (in GROUP_TOTALS order) listing each metric, and
            a closing list of metrics that are below threshold.
        """
        report_lines = [
            "=" * 80,
            "PROMPT EVALUATION REPORT",
            "=" * 80,
            f"Timestamp: {result['timestamp']}",
            f"Total Score: {result['total_score']:.2f} / {result['total_max']} ({result['percentage']:.1f}%)",
            "",
            # Prompt
            "PROMPT:",
            "-" * 80,
            result['prompt'],
            "",
        ]

        # Group-by-group breakdown
        for group_name in GROUP_TOTALS:
            group_data = result['group_scores'].get(group_name)
            if not group_data:
                continue

            report_lines.append("")
            report_lines.append(group_name.upper().replace('_', ' '))
            report_lines.append(f"Score: {group_data['score']:.2f}/{group_data['max']} ({group_data['percentage']:.1f}%)")
            report_lines.append("-" * 80)

            # Walk the metrics that belong to this group
            for metric_name, metric_result in result['metrics'].items():
                if self.metric_config[metric_name]['group'] != group_name:
                    continue

                status = "✅" if not metric_result['below_threshold'] else "❌"
                report_lines.append(f"\n{status} {metric_name}")
                report_lines.append(f"   Score: {metric_result['score']:.2f}/{metric_result['max_points']:.2f} "
                                    f"(Threshold: {metric_result['threshold']:.2f})")

                if metric_result['type'] == 'llm_judge':
                    report_lines.append(f"   Justification: {metric_result['justification']}")
                else:
                    report_lines.append(f"   Evidence: {metric_result['evidence']}")

        # Summary of improvements needed
        report_lines.append("")
        report_lines.append("=" * 80)
        report_lines.append("IMPROVEMENTS NEEDED")
        report_lines.append("=" * 80)

        if result['below_threshold_metrics']:
            report_lines.append(f"\n{len(result['below_threshold_metrics'])} metrics are below threshold:\n")
            for metric in result['below_threshold_metrics']:
                metric_result = result['metrics'][metric]
                # Points still missing to reach the metric's threshold
                gap = metric_result['threshold'] - metric_result['score']
                report_lines.append(f"  • {metric}: {metric_result['score']:.2f}/{metric_result['max_points']:.2f} "
                                    f"(need +{gap:.2f} points)")
        else:
            report_lines.append("\n✅ All metrics meet or exceed threshold!")

        return "\n".join(report_lines)


# Cell-level confirmation: reaching this line means the PromptEvaluator class
# definition in this cell executed without raising.
print("✅ Full Evaluator Pipeline Created!")

In [None]:
# Test prompts: three fixtures of increasing detail, keyed by a short label.
test_prompts = {
    # Bare one-line request with no structure or constraints.
    "minimal": "Write about AI",
    
    # Short task naming an audience plus a bulleted outline.
    "moderate": """
Write a blog post about artificial intelligence for beginners.
Include:
- What is AI?
- Current applications
- Future trends

Keep it simple and engaging.
""",
    
    # Fully sectioned prompt: role, objective, requirements, output format,
    # constraints and process.
    "comprehensive": """
# Task: Technical Article on Transformer Architecture

You are an expert ML engineer writing for a technical audience of software engineers and data scientists.

## Objective
Create a comprehensive technical guide comparing different transformer architectures (BERT, GPT, T5) for natural language processing tasks.

## Requirements
1. Start with transformer fundamentals (attention mechanism)
2. Compare architectural differences between BERT, GPT, and T5
3. Provide code examples for each architecture
4. Discuss use cases and trade-offs
5. Include performance benchmarks (cite sources)

## Output Format
- Use clear section headers
- Include code blocks with explanations
- Provide a comparison table
- Add a "Further Reading" section

## Constraints
- Keep technical accuracy paramount
- Validate all claims against published research
- If uncertain about recent developments, state limitations clearly
- Estimate reading time for the article

## Process
1. Think step-by-step through the architecture comparisons
2. Verify technical claims
3. Iterate on clarity and completeness
4. Self-review for accuracy

Remember our previous discussion about balancing depth with accessibility for intermediate practitioners.
"""
}

# Test function
def test_full_evaluator(api_key: Optional[str] = None, use_mock: bool = True):
    """
    Test the full evaluator pipeline.

    Runs a first-run evaluation of the "comprehensive" test prompt, builds the
    detailed report and prints its first 50 lines.

    Args:
        api_key: API key handed to the evaluator; required when ``use_mock``
            is False.
        use_mock: If True, the LLM judge's ``evaluate_all`` is replaced by a
            wrapper around ``mock_llm_evaluation``.

    Returns:
        A ``(result, report)`` tuple. When ``use_mock`` is False and no
        ``api_key`` is given, ``(None, None)`` is returned so that callers can
        always unpack two values.
    """
    # Choose which prompt to test
    test_name = "comprehensive"
    test_prompt = test_prompts[test_name]

    print(f"Testing Full Evaluator with '{test_name}' prompt")
    print(f"Using {'MOCK' if use_mock else 'REAL'} LLM evaluation")
    print()

    # Initialize evaluator
    if use_mock:
        evaluator = PromptEvaluator(METRIC_CONFIG, CONFIG, api_key=None)

        def mock_evaluate_all(prompt, below_threshold_only=False, below_threshold_metrics=None):
            """Stand-in for evaluate_all that routes to mock_llm_evaluation."""
            llm_metric_names = [
                name for name, metric_cfg in METRIC_CONFIG.items()
                if metric_cfg['type'] == 'llm_judge'
            ]
            # Narrow to the requested metrics only when a re-evaluation list
            # was actually supplied.
            if below_threshold_only and below_threshold_metrics:
                llm_metric_names = [
                    name for name in llm_metric_names
                    if name in below_threshold_metrics
                ]
            return mock_llm_evaluation(prompt, llm_metric_names)

        # Override the LLM evaluator's evaluate_all method with the mock
        evaluator.llm_evaluator.evaluate_all = mock_evaluate_all
    else:
        if not api_key:
            print("⚠️  API key required for real LLM evaluation")
            # Same arity as the success path, so `result, report = ...` works.
            return None, None
        evaluator = PromptEvaluator(METRIC_CONFIG, CONFIG, api_key=api_key)

    # Run evaluation
    result = evaluator.evaluate(test_prompt, first_run=True)

    # Generate detailed report
    print("\n" + "="*80)
    print("GENERATING DETAILED REPORT")
    print("="*80)
    report = evaluator.generate_detailed_report(result)

    # Show first part of report
    report_lines = report.split('\n')
    print('\n'.join(report_lines[:50]))  # Show first 50 lines
    if len(report_lines) > 50:
        print(f"\n... ({len(report_lines) - 50} more lines) ...")

    return result, report

# Kick off the mock-backed run of the full pipeline
print("=" * 80, "RUNNING FULL EVALUATOR TEST", "=" * 80, sep="\n")
result, report = test_full_evaluator(use_mock=True)

# Both values stay bound at module level for the cells that follow
if result:
    print(
        "\n✅ Evaluation complete!",
        "   Result stored in 'result' variable",
        "   Full report stored in 'report' variable",
        sep="\n",
    )