In [None]:
import json
import math
import re
from typing import Dict, List, Optional, Tuple

import anthropic

class LLMJudgeEvaluator:
    """
    LLM-based evaluator for subjective metrics.

    Sends the prompt under evaluation to the Claude API together with a
    description of every metric whose ``type`` is ``'llm_judge'`` and parses
    the JSON scores that come back.

    Every result returned by this class is a dict with the keys ``score``,
    ``max_points``, ``threshold``, ``below_threshold``, ``justification`` and
    ``type`` (always ``'llm_judge'``). API or parsing failures never raise:
    the affected metrics are returned with a score of 0 and an explanatory
    justification instead.
    """

    # Model and output budget used for every judge request.
    MODEL = "claude-sonnet-4-20250514"
    MAX_TOKENS = 4000

    # Matches a JSON object wrapped in a ```json fenced block.
    _FENCED_JSON_RE = re.compile(r'```json\s*(\{[\s\S]*?\})\s*```')
    # Fallback: everything from the first '{' to the last '}' in the text.
    _RAW_JSON_RE = re.compile(r'\{[\s\S]*\}')

    def __init__(self, metric_config: dict, config: dict, api_key: Optional[str] = None):
        """
        Args:
            metric_config: Mapping of metric name to its settings. Each entry
                is read for ``type``, ``max_points``, ``threshold`` and
                ``description``.
            config: Global settings; ``llm_batch_size`` is read from it.
            api_key: Anthropic API key. When omitted no client is created and
                every evaluation returns default (zero) scores.
        """
        self.metric_config = metric_config
        self.config = config
        self.client = anthropic.Anthropic(api_key=api_key) if api_key else None

    def evaluate_all(self, prompt: str, below_threshold_only: bool = False,
                     below_threshold_metrics: Optional[List[str]] = None) -> Dict[str, Dict]:
        """
        Evaluate LLM judge metrics.

        Args:
            prompt: The prompt to evaluate
            below_threshold_only: If True, only evaluate specified metrics
            below_threshold_metrics: List of metric names to re-evaluate

        Returns:
            Mapping of metric name to its result dict.

        Raises:
            ValueError: If ``config['llm_batch_size']`` is smaller than 1.

        Note:
            The filter is applied only when ``below_threshold_only`` is True
            AND ``below_threshold_metrics`` is non-empty; an empty or missing
            list therefore evaluates every LLM judge metric.
        """
        # Get LLM judge metrics
        llm_metrics = {
            name: metric for name, metric in self.metric_config.items()
            if metric['type'] == 'llm_judge'
        }

        # Filter to below-threshold metrics if requested
        if below_threshold_only and below_threshold_metrics:
            wanted = set(below_threshold_metrics)
            llm_metrics = {
                name: metric for name, metric in llm_metrics.items()
                if name in wanted
            }

        # Batch metrics for evaluation
        metric_names = list(llm_metrics)
        batches = self._create_batches(metric_names, self.config['llm_batch_size'])

        print(f"Evaluating {len(metric_names)} LLM judge metrics in {len(batches)} batches...")

        # Evaluate each batch
        all_results = {}
        for i, batch in enumerate(batches, 1):
            print(f"  Processing batch {i}/{len(batches)}...")
            all_results.update(self._evaluate_batch(prompt, batch, llm_metrics))

        return all_results

    def _create_batches(self, items: List[str], batch_size: int) -> List[List[str]]:
        """
        Split items into consecutive batches of at most ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. A negative size
                would otherwise silently produce no batches at all.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

    @staticmethod
    def _default_results(metric_names: List[str], llm_metrics: Dict,
                         justification: str) -> Dict[str, Dict]:
        """Build zero-score, below-threshold results for every named metric."""
        return {
            name: {
                'score': 0,
                'max_points': llm_metrics[name]['max_points'],
                'threshold': llm_metrics[name]['threshold'],
                'below_threshold': True,
                'justification': justification,
                'type': 'llm_judge'
            }
            for name in metric_names
        }

    def _evaluate_batch(self, prompt: str, metric_names: List[str],
                        llm_metrics: Dict) -> Dict[str, Dict]:
        """
        Evaluate a batch of metrics using Claude API.

        Any failure while calling the API or reading its response is reported
        on stdout and converted into default (zero) scores for the batch.
        """
        # Build evaluation prompt
        eval_prompt = self._build_evaluation_prompt(prompt, metric_names, llm_metrics)

        # Without a client the call below could only fail with an opaque
        # AttributeError; report the real cause instead.
        if self.client is None:
            reason = "No API client configured (api_key was not provided)"
            print(f"Error calling Claude API: {reason}")
            return self._default_results(
                metric_names, llm_metrics, f"Error during evaluation: {reason}"
            )

        # Call Claude API
        try:
            response = self.client.messages.create(
                model=self.MODEL,
                max_tokens=self.MAX_TOKENS,
                temperature=0,
                messages=[
                    {"role": "user", "content": eval_prompt}
                ]
            )
            response_text = response.content[0].text

            # Parse response
            return self._parse_evaluation_response(response_text, metric_names, llm_metrics)

        except Exception as e:
            # Deliberately broad: this is the network boundary and a failed
            # batch must not abort the remaining batches.
            print(f"Error calling Claude API: {e}")
            return self._default_results(
                metric_names, llm_metrics, f"Error during evaluation: {str(e)}"
            )

    def _build_evaluation_prompt(self, prompt: str, metric_names: List[str],
                                 llm_metrics: Dict) -> str:
        """
        Build the evaluation prompt for Claude.

        The prompt embeds the text under evaluation, one description entry per
        metric, scoring guidelines, and the JSON output format that
        ``_parse_evaluation_response`` expects.
        """
        # NOTE(review): "(70%)" is hard-coded in the text sent to the model;
        # confirm every configured threshold really is 70% of max_points.
        metrics_description = [
            f"**{name}** (max: {llm_metrics[name]['max_points']} points)\n"
            f"Description: {llm_metrics[name]['description']}\n"
            f"Threshold: {llm_metrics[name]['threshold']} points (70%)"
            for name in metric_names
        ]
        metrics_block = "\n".join(metrics_description)

        eval_prompt = f"""You are an expert prompt engineer evaluating the quality of a prompt across multiple dimensions.

PROMPT TO EVALUATE:
```
{prompt}
```

YOUR TASK:
Evaluate the above prompt on the following {len(metric_names)} metrics. For each metric, provide:
1. A score from 0 to the maximum points (be precise, use decimals)
2. A brief justification (1-2 sentences)

METRICS TO EVALUATE:

{metrics_block}

SCORING GUIDELINES:
- 0 points: Metric completely absent or fails entirely
- 25% of max: Metric barely present, minimal effort
- 50% of max: Metric partially present, moderate effort
- 75% of max: Metric well-implemented, meets threshold
- 100% of max: Metric excellently implemented, exemplary

IMPORTANT:
- Be objective and consistent
- Consider the prompt's actual content, not potential
- A simple prompt may legitimately score low on some metrics
- Use decimals for precision (e.g., 3.2, 4.5)

OUTPUT FORMAT:
Respond with a JSON object where each key is the metric name and the value is an object with "score" and "justification":
```json
{{
  "Metric Name 1": {{
    "score": 3.5,
    "justification": "Brief explanation of the score."
  }},
  "Metric Name 2": {{
    "score": 4.0,
    "justification": "Brief explanation of the score."
  }}
}}
```

Provide ONLY the JSON object, no additional text."""

        return eval_prompt

    def _extract_json(self, response_text: str) -> dict:
        """
        Locate and decode the JSON object inside a model response.

        Raises:
            ValueError: If no JSON object is present or it cannot be decoded
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        # Prefer a fenced ```json block, then fall back to raw braces.
        fenced = self._FENCED_JSON_RE.search(response_text)
        if fenced:
            json_str = fenced.group(1)
        else:
            raw = self._RAW_JSON_RE.search(response_text)
            if not raw:
                raise ValueError("No JSON found in response")
            json_str = raw.group(0)

        data = json.loads(json_str)
        if not isinstance(data, dict):
            raise ValueError("Evaluation response is not a JSON object")
        return data

    def _parse_evaluation_response(self, response_text: str, metric_names: List[str],
                                   llm_metrics: Dict) -> Dict[str, Dict]:
        """
        Parse Claude's evaluation response.

        Returns one result per requested metric. A response that cannot be
        decoded yields default (zero) scores for all of them. A metric that is
        missing, malformed, or carries a non-numeric/non-finite score is
        scored 0 individually so one bad entry does not discard the rest.
        Scores are clamped to the range ``[0, max_points]``.
        """
        # Extract JSON from response
        try:
            evaluation_data = self._extract_json(response_text)
        except (ValueError, TypeError) as e:
            print(f"Error parsing JSON response: {e}")
            print(f"Response was: {str(response_text)[:500]}")
            return self._default_results(
                metric_names, llm_metrics, "Failed to parse evaluation response"
            )

        # Build results
        results = {}
        for name in metric_names:
            metric = llm_metrics[name]
            entry = evaluation_data.get(name)

            if name not in evaluation_data:
                score = 0
                justification = "Metric not found in evaluation response"
            elif not isinstance(entry, dict):
                score = 0
                justification = f"Malformed evaluation entry: {entry!r}"
            else:
                raw_score = entry.get('score', 0)
                justification = str(entry.get('justification') or 'No justification provided')
                try:
                    score = float(raw_score)
                except (TypeError, ValueError):
                    score = math.nan
                if not math.isfinite(score):
                    # The model returned something that is not a usable number.
                    score = 0
                    justification = f"Invalid score in evaluation response: {raw_score!r}"

            # Clamp score into [0, max_points]
            score = min(max(score, 0), metric['max_points'])

            results[name] = {
                'score': round(score, 2),
                'max_points': metric['max_points'],
                'threshold': metric['threshold'],
                'below_threshold': score < metric['threshold'],
                'justification': justification,
                'type': 'llm_judge'
            }

        return results


# Announce the evaluator and summarise how many API calls a full run would
# take, based on the METRIC_CONFIG / CONFIG objects defined elsewhere.
print("✅ LLM Judge Evaluator Created!")
print("\nLLM Judge Metrics (27 total):")

# Names of every metric that is scored by the LLM judge.
llm_metrics = [
    metric_name
    for metric_name, metric_cfg in METRIC_CONFIG.items()
    if metric_cfg['type'] == 'llm_judge'
]
_judge_count = len(llm_metrics)
print(f"Total: {_judge_count} metrics")

_judge_batch_size = CONFIG['llm_batch_size']
print(f"Batch size: {_judge_batch_size} metrics per API call")

# Ceiling division: one extra call when the last batch is only partly full.
_full_batches, _leftover = divmod(_judge_count, _judge_batch_size)
_estimated_calls = _full_batches + (1 if _leftover else 0)
print(f"Estimated API calls for full evaluation: {_estimated_calls}")

In [None]:
# Test setup: two sample prompts to feed to the LLM judge.

# Minimal one-sentence prompt with no role, structure, or constraints.
test_prompt_simple = "Write a blog post about AI."

# Structured prompt with a role, task, requirements, output format, and
# constraints sections; this is the prompt evaluated by test_llm_judge.
test_prompt_complex = """
You are an expert technical writer with 10 years of experience in AI/ML documentation.

**Task**: Create a comprehensive guide comparing transformer architectures for a technical audience.

**Requirements**:
1. Start with foundational concepts
2. Compare at least 3 architectures (BERT, GPT, T5)
3. Include code examples where relevant
4. Cite recent research (2023-2024)

**Output Format**:
- Use clear section headers
- Include diagrams descriptions
- Provide a summary table

**Constraints**:
- Keep explanations accessible but technically accurate
- Validate claims against established literature
- If uncertain about recent developments, clearly state limitations

Please think step-by-step and validate your technical claims.
"""

# Function to test with API key
def test_llm_judge(api_key: Optional[str] = None) -> Optional[Dict[str, Dict]]:
    """
    Test the LLM Judge Evaluator.

    Runs ``LLMJudgeEvaluator`` on the module-level ``test_prompt_complex``
    using the module-level ``METRIC_CONFIG`` and ``CONFIG``, then prints the
    total score and a per-group breakdown.

    Args:
        api_key: Anthropic API key. When missing or empty, usage instructions
            are printed and nothing is evaluated.

    Returns:
        The evaluator's results (metric name -> result dict), or ``None``
        when no API key was supplied.
    """
    if not api_key:
        print("⚠️  API Key Required")
        print("To test the LLM Judge, provide your Anthropic API key:")
        print("  evaluator = LLMJudgeEvaluator(METRIC_CONFIG, CONFIG, api_key='your-key-here')")
        print("  results = evaluator.evaluate_all(test_prompt_complex)")
        return None

    # Initialize with API key
    evaluator = LLMJudgeEvaluator(METRIC_CONFIG, CONFIG, api_key=api_key)

    print("Testing LLM Judge Evaluator...")
    print(f"\nPrompt: {test_prompt_complex[:100]}...\n")

    # Evaluate
    results = evaluator.evaluate_all(test_prompt_complex)

    # Display results
    total_score = sum(r['score'] for r in results.values())
    total_possible = sum(r['max_points'] for r in results.values())
    # Guard against an empty result set (no LLM judge metrics configured),
    # which would otherwise raise ZeroDivisionError.
    percentage = total_score / total_possible * 100 if total_possible else 0.0

    print(f"\n{'='*70}")
    print("LLM JUDGE RESULTS")
    print(f"{'='*70}")
    print(f"Total Score: {total_score:.2f} / {total_possible:.2f} ({percentage:.1f}%)")
    print("\nMetric Breakdown:\n")

    # Group by category, preserving the order in which groups first appear
    by_group = {}
    for metric_name, result in results.items():
        group = METRIC_CONFIG[metric_name]['group']
        by_group.setdefault(group, []).append((metric_name, result))

    # Display by group
    for group_name, metrics in by_group.items():
        group_score = sum(r['score'] for _, r in metrics)
        group_max = sum(r['max_points'] for _, r in metrics)

        print(f"\n{group_name.upper().replace('_', ' ')}: {group_score:.2f}/{group_max:.2f}")
        print("-" * 70)

        for metric_name, result in metrics:
            status = "❌" if result['below_threshold'] else "✅"
            print(f"{status} {metric_name}: {result['score']:.2f}/{result['max_points']:.2f}")

            # Only mark the justification as truncated when it really is.
            justification = str(result['justification'])
            if len(justification) > 100:
                justification = justification[:100] + "..."
            print(f"   {justification}")

    return results

# Show test instructions: a ruled banner followed by the usage text, emitted
# with a single print call (one argument per output line group).
print(
    "\n" + "=" * 70,
    "TEST INSTRUCTIONS",
    "=" * 70,
    """
To test the LLM Judge Evaluator, you need an Anthropic API key.

Get your API key from: https://console.anthropic.com/

Then run:
    api_key = "sk-ant-..."  # Your API key
    results = test_llm_judge(api_key)

Or initialize directly:
    evaluator = LLMJudgeEvaluator(METRIC_CONFIG, CONFIG, api_key=api_key)
    results = evaluator.evaluate_all(test_prompt_complex)
""",
    sep="\n",
)