In [None]:
import re
from typing import Dict, Tuple

class CustomMetricEvaluator:
    """
    Custom function-based metrics using pattern matching and keyword detection.

    ``metric_config`` maps a metric name to its settings dict. ``evaluate_all``
    reads the ``'type'``, ``'max_points'`` and ``'threshold'`` keys of each
    entry and only scores entries whose type is ``'custom_function'``.

    Every ``_eval_*`` method takes ``(prompt, max_points)`` and returns a
    ``(score, evidence)`` tuple: ``score`` is 0 or a fraction (up to all) of
    ``max_points`` and ``evidence`` is a short human-readable explanation.
    """

    # Metric name (as used in the metric config) -> name of the evaluator
    # method. Kept at class level so the table is built once, not per call.
    _EVALUATOR_METHODS = {
        "Structured / Numbered Instructions": "_eval_structured_instructions",
        "Examples or Demonstrations": "_eval_examples",
        "Use of Role or Persona": "_eval_role_persona",
        "Audience Specification": "_eval_audience",
        "Output Validation Hooks": "_eval_validation_hooks",
        "Time/Effort Estimation Request": "_eval_time_estimation",
        "Self-Repair Loops": "_eval_self_repair",
        "Memory Anchoring": "_eval_memory_anchoring",
        "Calibration Requests": "_eval_calibration",
        "Comparison Requests": "_eval_comparison",
    }

    def __init__(self, metric_config: dict):
        self.metric_config = metric_config

    def evaluate_all(self, prompt: str) -> Dict[str, Dict]:
        """
        Evaluate all custom function metrics against ``prompt``.

        Returns a dict keyed by metric name. Each value holds the score, the
        configured ``max_points`` and ``threshold``, a ``below_threshold``
        flag (``score < threshold``), the evidence string and the metric type.
        """
        results = {}

        for metric_name, config in self.metric_config.items():
            if config['type'] != 'custom_function':
                continue

            score, evidence = self._evaluate_metric(metric_name, prompt, config)
            results[metric_name] = {
                'score': score,
                'max_points': config['max_points'],
                'threshold': config['threshold'],
                'below_threshold': score < config['threshold'],
                'evidence': evidence,
                'type': 'custom_function'
            }

        return results

    def _evaluate_metric(self, metric_name: str, prompt: str, config: dict) -> Tuple[float, str]:
        """
        Route to the evaluation function registered for ``metric_name``.

        Returns ``(0, "Evaluator not found")`` when no evaluator is registered
        under that name.
        """
        method_name = self._EVALUATOR_METHODS.get(metric_name)
        if method_name is None:
            return 0, "Evaluator not found"
        return getattr(self, method_name)(prompt, config['max_points'])

    # ===========================================
    # SHARED HELPERS
    # ===========================================

    @staticmethod
    def _count_unique_matches(patterns: list, prompt: str) -> int:
        """
        Count distinct matched texts (case-insensitive) across ``patterns``.

        The patterns must not contain capturing groups, so that
        ``re.findall`` yields the whole matched text for each hit.
        """
        found = set()
        for pattern in patterns:
            found.update(hit.lower() for hit in re.findall(pattern, prompt, re.IGNORECASE))
        return len(found)

    @staticmethod
    def _format_evidence(evidence_parts: list, fallback: str) -> str:
        """Join evidence fragments with '; ', or return ``fallback`` if there are none."""
        return "; ".join(evidence_parts) if evidence_parts else fallback

    # ===========================================
    # INDIVIDUAL METRIC EVALUATORS
    # ===========================================

    def _eval_structured_instructions(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect structured formatting (bullets, numbers, sections).

        Awards 40% for at least 3 numbered items, 30% for at least 3 bullet
        items and 30% for at least 2 markdown headers.
        """
        evidence_parts = []
        score = 0

        # Check for numbered lists (1. 2. or 1) 2) etc.)
        numbered_pattern = r'(?:^|\n)\s*\d+[\.)]\s+'
        numbered_matches = len(re.findall(numbered_pattern, prompt))
        if numbered_matches >= 3:
            score += max_points * 0.4
            evidence_parts.append(f"Numbered list with {numbered_matches} items")

        # Check for bullet points (-, *, •)
        bullet_pattern = r'(?:^|\n)\s*[-*•]\s+'
        bullet_matches = len(re.findall(bullet_pattern, prompt))
        if bullet_matches >= 3:
            score += max_points * 0.3
            evidence_parts.append(f"Bullet points with {bullet_matches} items")

        # Check for markdown headers (# ## ###)
        header_pattern = r'(?:^|\n)#{1,6}\s+.+$'
        header_matches = len(re.findall(header_pattern, prompt, re.MULTILINE))
        if header_matches >= 2:
            score += max_points * 0.3
            evidence_parts.append(f"{header_matches} section headers")

        # Cap at max_points
        score = min(score, max_points)

        return score, self._format_evidence(evidence_parts, "No structured formatting detected")

    def _eval_examples(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect examples or demonstrations.

        Awards 30% for any example keyword, 40% for any code block and 30%
        for any quoted passage of at least 20 characters.
        """
        evidence_parts = []
        score = 0

        # Keywords indicating examples.
        # The "e.g." pattern deliberately has no trailing \b: a word boundary
        # after the final dot would require a word character right after it,
        # so the usual "e.g. X" / "e.g., X" forms would never match.
        example_keywords = [
            r'\bexample[s]?\b', r'\bfor instance\b', r'\bsuch as\b',
            r'\be\.g\.', r'\blike\b.*\bthis\b', r'\bdemonstration\b',
            r'\bsample\b', r'\billustration\b'
        ]

        keyword_matches = sum(
            1 for pattern in example_keywords
            if re.search(pattern, prompt, re.IGNORECASE)
        )

        if keyword_matches > 0:
            score += max_points * 0.3
            evidence_parts.append(f"{keyword_matches} example indicator keywords")

        # Code blocks (``` or indented code)
        code_block_pattern = r'```[\s\S]*?```|(?:^|\n)    .+(?:\n    .+)*'
        code_blocks = len(re.findall(code_block_pattern, prompt))
        if code_blocks > 0:
            score += max_points * 0.4
            evidence_parts.append(f"{code_blocks} code block(s)")

        # Quoted examples ("..." or '...').
        # NOTE: either quote character may open or close a match, so
        # apostrophes in contractions can be picked up as quotes.
        quote_pattern = r'["\'](?:[^"\']{20,})["\']'
        quotes = len(re.findall(quote_pattern, prompt))
        if quotes > 0:
            score += max_points * 0.3
            evidence_parts.append(f"{quotes} quoted example(s)")

        # Cap at max_points
        score = min(score, max_points)

        return score, self._format_evidence(evidence_parts, "No examples detected")

    def _eval_role_persona(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect role or persona assignment.

        An explicit assignment ("you are a ...", "act as ...") earns full
        points; otherwise any role-like noun earns half.
        """
        evidence_parts = []
        score = 0

        # Strong role indicators; each captures the first word of the role
        strong_patterns = [
            r'\byou are (?:a |an |the )?(\w+)',
            r'\bact as (?:a |an |the )?(\w+)',
            r'\bassume the role of (?:a |an |the )?(\w+)',
            r'\bpretend (?:to be|you are) (?:a |an |the )?(\w+)',
            r'\btake on the persona of (?:a |an |the )?(\w+)',
            r'\bas (?:a |an |the )?(\w+), you'
        ]

        for pattern in strong_patterns:
            # One capturing group per pattern, so findall returns plain strings
            roles = re.findall(pattern, prompt, re.IGNORECASE)
            if roles:
                score = max_points  # Full points for explicit role
                evidence_parts.append(f"Explicit role: {', '.join(roles[:3])}")
                break

        # Weaker role indicators
        if score == 0:
            weak_patterns = [
                r'\bexpert\b', r'\bprofessional\b', r'\bspecialist\b',
                r'\bconsultant\b', r'\badviser\b', r'\bassistant\b'
            ]
            weak_matches = sum(1 for p in weak_patterns if re.search(p, prompt, re.IGNORECASE))
            if weak_matches > 0:
                score = max_points * 0.5
                evidence_parts.append(f"Implicit role indicators: {weak_matches} found")

        return score, self._format_evidence(evidence_parts, "No role or persona detected")

    def _eval_audience(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect audience specification.

        An explicit audience phrase earns full points; otherwise any
        audience-like word (beginners, experts, ...) earns 60%.
        """
        evidence_parts = []
        score = 0

        # Explicit audience patterns; each captures the audience text
        audience_patterns = [
            r'\bfor (?:a |an |the )?(\w+(?:\s+\w+)?)\s+audience\b',
            r'\btarget audience\s*:?\s*(\w+)',
            r'\bexplain to (?:a |an |the )?(\w+)',
            r'\bsuitable for (\w+(?:\s+\w+)?)',
            r'\bwrite for (\w+(?:\s+\w+)?)',
            r'\baddressed to (\w+(?:\s+\w+)?)'
        ]

        for pattern in audience_patterns:
            audiences = re.findall(pattern, prompt, re.IGNORECASE)
            if audiences:
                score = max_points  # Full points for explicit audience
                evidence_parts.append(f"Target audience: {', '.join(audiences[:2])}")
                break

        # Implicit audience indicators
        if score == 0:
            implicit_patterns = [
                r'\bbeginners?\b', r'\bexperts?\b', r'\bstudents?\b',
                r'\bchildren\b', r'\bprofessionals?\b', r'\btechnical\b',
                r'\bnon-technical\b', r'\bgeneral public\b', r'\blaymen\b'
            ]
            implicit_matches = []
            for pattern in implicit_patterns:
                match = re.search(pattern, prompt, re.IGNORECASE)
                if match:
                    implicit_matches.append(match.group(0))

            if implicit_matches:
                score = max_points * 0.6
                evidence_parts.append(f"Implicit audience: {', '.join(implicit_matches[:2])}")

        return score, self._format_evidence(evidence_parts, "No audience specification detected")

    def _eval_validation_hooks(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect validation or verification requests.

        Scores by the number of distinct validation keywords found:
        3 or more -> full points, 2 -> 70%, 1 -> 40%.
        """
        evidence_parts = []
        score = 0

        # The bare "check" pattern excludes the tail of "double-check" and
        # "cross-check"; those compounds have their own patterns, and without
        # the lookbehinds a single compound would count as two keywords.
        validation_keywords = [
            r'\bvalidate\b', r'\bverify\b',
            r'(?<!double-)(?<!cross-)\bcheck\b', r'\bensure\b',
            r'\bconfirm\b', r'\breview\b', r'\bdouble-check\b',
            r'\bcross-check\b', r'\btest\b', r'\bproof\b'
        ]

        unique_matches = self._count_unique_matches(validation_keywords, prompt)

        if unique_matches >= 3:
            score = max_points
            evidence_parts.append(f"{unique_matches} validation keywords")
        elif unique_matches == 2:
            score = max_points * 0.7
            evidence_parts.append(f"{unique_matches} validation keywords")
        elif unique_matches == 1:
            score = max_points * 0.4
            evidence_parts.append(f"{unique_matches} validation keyword")

        return score, self._format_evidence(evidence_parts, "No validation hooks detected")

    def _eval_time_estimation(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect time or effort estimation requests.

        All-or-nothing: full points if any time/effort phrase is present.
        """
        evidence_parts = []
        score = 0

        time_patterns = [
            r'\bestimate (?:the )?time\b', r'\bhow long\b',
            r'\btime required\b', r'\btime estimate\b',
            r'\beffort required\b', r'\bestimate (?:the )?effort\b',
            r'\bduration\b', r'\btimeline\b'
        ]

        if any(re.search(pattern, prompt, re.IGNORECASE) for pattern in time_patterns):
            score = max_points
            evidence_parts.append("Time/effort estimation requested")

        return score, self._format_evidence(evidence_parts, "No time/effort estimation requested")

    def _eval_self_repair(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect self-correction or refinement loops.

        Scores by the number of distinct refinement keywords found:
        3 or more -> full points, 2 -> 70%, 1 -> 40%.
        """
        evidence_parts = []
        score = 0

        repair_keywords = [
            r'\biterate\b', r'\brefine\b', r'\bimprove\b', r'\brevise\b',
            r'\badjust\b', r'\boptimize\b', r'\benhance\b',
            r'\bself-correct\b', r'\bfeedback loop\b', r'\biterative\b'
        ]

        unique_matches = self._count_unique_matches(repair_keywords, prompt)

        if unique_matches >= 3:
            score = max_points
            evidence_parts.append(f"{unique_matches} self-repair indicators")
        elif unique_matches == 2:
            score = max_points * 0.7
            evidence_parts.append(f"{unique_matches} self-repair indicators")
        elif unique_matches == 1:
            score = max_points * 0.4
            evidence_parts.append(f"{unique_matches} self-repair indicator")

        return score, self._format_evidence(evidence_parts, "No self-repair loops detected")

    def _eval_memory_anchoring(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect memory or context anchoring.

        Scores by the number of distinct anchoring phrases found:
        2 or more -> full points, 1 -> 60%.
        """
        evidence_parts = []
        score = 0

        memory_keywords = [
            r'\bremember\b', r'\brecall\b', r'\bpreviously\b',
            r'\bcontext from\b', r'\bas we discussed\b',
            r'\bearlier conversation\b', r'\bfrom before\b',
            r'\bkeep in mind\b', r'\bdon\'t forget\b'
        ]

        unique_matches = self._count_unique_matches(memory_keywords, prompt)

        if unique_matches >= 2:
            score = max_points
            evidence_parts.append(f"{unique_matches} memory anchoring indicators")
        elif unique_matches == 1:
            score = max_points * 0.6
            evidence_parts.append(f"{unique_matches} memory anchoring indicator")

        return score, self._format_evidence(evidence_parts, "No memory anchoring detected")

    def _eval_calibration(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect calibration or confidence requests.

        All-or-nothing: full points if any calibration phrase is present.
        """
        evidence_parts = []
        score = 0

        calibration_patterns = [
            r'\bcalibrate\b', r'\bconfidence level\b',
            r'\bcertainty\b', r'\buncertainty\b',
            r'\bhow confident\b', r'\btune\b', r'\badjust\b.*\bconfidence\b'
        ]

        if any(re.search(pattern, prompt, re.IGNORECASE) for pattern in calibration_patterns):
            score = max_points
            evidence_parts.append("Calibration/confidence request detected")

        return score, self._format_evidence(evidence_parts, "No calibration requests detected")

    def _eval_comparison(self, prompt: str, max_points: float) -> Tuple[float, str]:
        """
        Detect comparison requests.

        Scores by the number of distinct comparison phrases found:
        2 or more -> full points, 1 -> 60%.
        """
        evidence_parts = []
        score = 0

        comparison_keywords = [
            r'\bcompare\b', r'\bvs\.?\b', r'\bversus\b',
            r'\bcontrast\b', r'\bdifference(?:s)? between\b',
            r'\bsimilarit(?:y|ies)\b', r'\bbetter than\b',
            r'\bpros and cons\b', r'\badvantages? (?:and|vs) disadvantages?\b'
        ]

        unique_matches = self._count_unique_matches(comparison_keywords, prompt)

        if unique_matches >= 2:
            score = max_points
            evidence_parts.append(f"{unique_matches} comparison indicators")
        elif unique_matches == 1:
            score = max_points * 0.6
            evidence_parts.append(f"{unique_matches} comparison indicator")

        return score, self._format_evidence(evidence_parts, "No comparison requests detected")


# Smoke check: list the metrics that are scored by custom functions.
# METRIC_CONFIG is not defined in this cell; it must already exist in the session.
print("✅ Custom Metric Evaluator Created!")
custom_metrics = [name for name, config in METRIC_CONFIG.items() if config['type'] == 'custom_function']
# Report the actual number of custom-function metrics rather than a
# hard-coded "10", so the header stays correct if the config changes.
print(f"\nCustom Function Metrics ({len(custom_metrics)} total):")
for i, metric in enumerate(custom_metrics, 1):
    points = METRIC_CONFIG[metric]['max_points']
    print(f"  {i}. {metric} ({points} pts)")

In [None]:
# Build an evaluator over the shared metric configuration
evaluator = CustomMetricEvaluator(METRIC_CONFIG)

# Sample prompts keyed by a short label: a bare one-liner, one with visible
# structure, and one that touches several of the custom metrics at once
test_prompts = {
    "Basic": "Write a blog post about AI.",

    "Structured": """
# Task: Write a Blog Post

Please complete the following:
1. Research AI trends
2. Write introduction
3. Develop main points
4. Conclude with insights

- Use clear language
- Include examples
- Keep it under 1000 words
""",

    "Rich": """
You are an expert technical writer targeting software engineers.

Task: Compare Python vs JavaScript for beginners.

Requirements:
1. Explain core differences
2. Provide code examples for each
3. Validate your claims with recent data
4. Estimate time to learn each language

Remember our previous discussion about syntax simplicity.
Please iterate on your draft and refine based on clarity.
"""
}

# Score each sample prompt and print a report: banner, prompt preview,
# aggregate score, then one entry per metric
for name in test_prompts:
    prompt = test_prompts[name]

    print(f"\n{'='*60}")
    print(f"TEST: {name} Prompt")
    print(f"{'='*60}")
    # Preview only the first 100 characters of the prompt
    print(f"Prompt: {prompt[:100]}...")
    print()

    results = evaluator.evaluate_all(prompt)

    # Totals across every custom-function metric that was evaluated
    total_score = sum([entry['score'] for entry in results.values()])
    total_possible = sum([entry['max_points'] for entry in results.values()])

    print(f"Custom Function Score: {total_score:.2f} / {total_possible:.2f}")
    print("\nMetric Breakdown:")

    for metric_name in results:
        result = results[metric_name]
        # A cross marks metrics that scored under their configured threshold
        if result['below_threshold']:
            status = "❌"
        else:
            status = "✅"
        print(f"  {status} {metric_name}: {result['score']:.2f}/{result['max_points']:.2f}")
        print(f"     Evidence: {result['evidence']}")