# Example 2: Co-Evolving Validation Rubrics

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Javihaus/agents_observability_bootcamp/blob/main/chapter_04_production_hybrid_systems/examples/example_02_validation_rubrics.ipynb)

**Instructor demonstration** - Students follow along

---

## Objective

Demonstrate validation rubrics that improve over time based on production feedback.

**Key lesson**: Static validation becomes outdated. Co-evolving rubrics adapt to changing patterns.

## Setup

In [None]:
!pip install -q langchain==0.1.0 langchain-anthropic==0.1.1
from collections import Counter
from datetime import datetime
import json

print("Setup complete!")

## Co-Evolving Validator Implementation

In [None]:
class Rule:
    """Common interface shared by every validation rule.

    Subclasses supply the actual test by overriding ``check``.
    """

    def __init__(self, name, description):
        # Identifier and human-readable summary; the summary is what a
        # validator reports when this rule rejects an output.
        self.name, self.description = name, description
        # Score assigned later by whoever evaluates the rule; starts at zero.
        self.effectiveness = 0.0

    def check(self, output):
        """Tell whether ``output`` satisfies this rule (True means it passes).

        The base class has no test of its own and always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()

class KeywordRule(Rule):
    """Rule that requires, or forbids, a keyword inside the output.

    Matching is a case-insensitive substring test, so the keyword also
    matches when it appears inside a longer word.
    """

    def __init__(self, keyword, should_contain=True):
        verb = 'Contains' if should_contain else 'Avoids'
        # The name and description keep the keyword exactly as given; only
        # the copy stored for matching is lower-cased.
        super().__init__(f"keyword_{keyword}", f"{verb} '{keyword}'")
        self.should_contain = should_contain
        self.keyword = keyword.lower()

    def check(self, output):
        """Pass when the keyword's presence agrees with ``should_contain``."""
        haystack = output.lower()
        found = self.keyword in haystack
        return found == self.should_contain

class LengthRule(Rule):
    """Rule that passes when ``len(output)`` lies within optional bounds.

    Args:
        min_length: Smallest accepted length, or ``None`` for no lower bound.
        max_length: Largest accepted length, or ``None`` for no upper bound.
    """

    def __init__(self, min_length=None, max_length=None):
        super().__init__(
            "length_check",
            f"Length between {min_length} and {max_length}"
        )
        self.min_length = min_length
        self.max_length = max_length

    def check(self, output):
        """Return True if the length of ``output`` is within both bounds.

        Bounds are inclusive. A bound left as ``None`` is not enforced.
        """
        length = len(output)
        # Compare against None explicitly: 0 is a legitimate bound but is
        # falsy, so a plain truthiness test would skip ``max_length=0`` and
        # let every output through.
        if self.min_length is not None and length < self.min_length:
            return False
        if self.max_length is not None and length > self.max_length:
            return False
        return True

print("Rule classes defined")

In [None]:
class CoEvolvingValidator:
    """Validator whose rule set is updated from recorded production outcomes.

    Typical use: seed ``rules``, call ``add_production_feedback`` as outcomes
    arrive, and call ``evolve`` periodically to learn new keyword rules and
    drop rules that catch too few recorded failures.
    """

    def __init__(self):
        # Active rules, applied in order by validate(). Each needs
        # ``check(output)``, ``name``, ``description`` and ``effectiveness``.
        self.rules = []
        # One dict per recorded outcome: 'output', 'successful', 'timestamp'.
        self.production_feedback = []
        # One dict per evolve() run that got past the "no failures" check.
        self.evolution_history = []

    def validate(self, output):
        """Validate ``output`` against the current rules.

        Returns:
            ``(True, "All validations passed")`` when every rule passes,
            otherwise ``(False, message)`` naming the first rule that failed.
        """
        for rule in self.rules:
            if not rule.check(output):
                return False, f"Failed: {rule.description}"
        return True, "All validations passed"

    def add_production_feedback(self, output, was_successful):
        """Record one production outcome for use by later ``evolve`` calls."""
        self.production_feedback.append({
            'output': output,
            'successful': was_successful,
            'timestamp': datetime.now()
        })

    def evolve(self, min_confidence=0.7):
        """Update the rule set from all feedback recorded so far.

        Candidate rules extracted from the feedback are added when their
        confidence is at least ``min_confidence`` and no active rule already
        has the same name; afterwards ineffective rules are pruned and the
        run is appended to ``evolution_history``.

        Returns None. When the feedback contains no failures nothing is
        changed and no history entry is written.
        """
        print(f"\nEvolving validator with {len(self.production_feedback)} feedback entries...")

        # Split the feedback by outcome.
        failures = [f for f in self.production_feedback if not f['successful']]
        successes = [f for f in self.production_feedback if f['successful']]

        print(f"Analyzing {len(failures)} failures and {len(successes)} successes")

        if not failures:
            print("No failures to learn from")
            return

        # Extract patterns from failures
        new_rules = self._extract_patterns(failures, successes)

        # Every run re-reads the full feedback history, so the same candidate
        # comes back on each call; skip names that are already active so
        # repeated runs do not pile up duplicate rules.
        active_names = {getattr(rule, 'name', None) for rule in self.rules}

        # Add high-confidence rules
        rules_added = 0
        for rule, confidence in new_rules:
            if confidence < min_confidence or rule.name in active_names:
                continue
            self.rules.append(rule)
            active_names.add(rule.name)
            rules_added += 1
            print(f"  + Added rule: {rule.description} (confidence: {confidence:.2f})")

        # Prune ineffective rules
        rules_removed = self._prune_ineffective_rules()

        # Record evolution event
        self.evolution_history.append({
            'timestamp': datetime.now(),
            'rules_added': rules_added,
            'rules_removed': rules_removed,
            'total_rules': len(self.rules)
        })

        print(f"\nEvolution complete: +{rules_added} rules, -{rules_removed} rules")
        print(f"Total active rules: {len(self.rules)}")

    def _extract_patterns(self, failures, successes):
        """Propose "avoid keyword" rules from words typical of failures.

        A word is a whitespace-separated, lower-cased token (punctuation is
        kept). Only the 10 most frequent failure words are considered. A
        word qualifies when it occurs in more than 30% of failed outputs and
        in fewer than 10% of successful ones.

        Returns:
            List of ``(KeywordRule, confidence)`` pairs, where confidence is
            the failure rate minus the success rate.
        """
        patterns = []

        def count_outputs_per_word(entries):
            # Count each word once per output, so the rates below are true
            # fractions of outputs and one output repeating a word cannot
            # inflate them. dict.fromkeys de-duplicates while keeping
            # first-seen order, which keeps tie-breaking deterministic.
            counts = Counter()
            for entry in entries:
                words = entry['output'].lower().split()
                counts.update(list(dict.fromkeys(words)))
            return counts

        failure_keywords = count_outputs_per_word(failures)
        success_keywords = count_outputs_per_word(successes)

        # Find keywords that appear frequently in failures but not successes
        for keyword, fail_count in failure_keywords.most_common(10):
            if len(keyword) < 3:  # Skip short words
                continue

            fail_rate = fail_count / len(failures)
            # max(..., 1) avoids dividing by zero when there are no successes.
            success_rate = success_keywords.get(keyword, 0) / max(len(successes), 1)

            # High in failures, low in successes = good indicator
            if fail_rate > 0.3 and success_rate < 0.1:
                rule = KeywordRule(keyword, should_contain=False)
                confidence = fail_rate - success_rate
                patterns.append((rule, confidence))

        return patterns

    def _prune_ineffective_rules(self, min_effectiveness=0.05):
        """Drop rules that reject too few of the recorded failures.

        A rule's effectiveness is the number of failed outputs it rejects
        divided by the total number of feedback entries (failures and
        successes alike); the value is stored on ``rule.effectiveness``.

        NOTE: this applies to every active rule, including hand-seeded ones,
        so a seeded rule that rejects no recorded failure is removed.

        Returns:
            Number of rules removed.
        """
        if not self.production_feedback:
            return 0

        effective_rules = []
        removed = 0

        for rule in self.rules:
            # Count how many failures this rule would have caught
            would_catch = sum(
                1 for f in self.production_feedback
                if not f['successful'] and not rule.check(f['output'])
            )

            effectiveness = would_catch / len(self.production_feedback)
            rule.effectiveness = effectiveness

            if effectiveness >= min_effectiveness:
                effective_rules.append(rule)
            else:
                removed += 1
                print(f"  - Removed rule: {rule.description} (effectiveness: {effectiveness:.2%})")

        self.rules = effective_rules
        return removed

    def get_stats(self):
        """Return a summary dict: rule count, feedback count, number of
        evolution events, and name/description/effectiveness per rule."""
        return {
            'total_rules': len(self.rules),
            'production_feedback_count': len(self.production_feedback),
            'evolution_events': len(self.evolution_history),
            'rules': [{'name': r.name, 'description': r.description, 'effectiveness': r.effectiveness}
                     for r in self.rules]
        }

print("CoEvolvingValidator defined")

## Demonstration: Validator Evolution

In [None]:
# Demo: seed a validator with one length rule, feed it labelled production
# outcomes, let it evolve, then try the evolved rules on unseen text.
banner = "=" * 80

validator = CoEvolvingValidator()
seed_rule = LengthRule(min_length=10, max_length=500)
validator.rules.append(seed_rule)

print("Initial validator state:")
print(f"  Rules: {len(validator.rules)}")

# (output text, was it successful in production?) - failures are spam
production_data = [
    ("Great product! Highly recommend.", True),
    ("Buy now! Limited offer expires soon!", False),
    ("This helped me solve my problem.", True),
    ("Click here for amazing deals!", False),
    ("Thank you for the support.", True),
    ("Urgent! Act fast to claim your prize!", False),
    ("The documentation was clear and helpful.", True),
    ("Limited time offer! Buy now!", False),
]

# Record every outcome, echoing a short preview of each one
print("\nCollecting production feedback...")
for text, ok in production_data:
    validator.add_production_feedback(text, ok)
    mark = '✓' if ok else '✗'
    print(f"  {mark} {text[:50]}...")

# Learn new rules from the recorded outcomes
validator.evolve(min_confidence=0.5)

# Report the resulting rule set
print("\n" + banner)
print("FINAL VALIDATOR STATE")
print(banner)
stats = validator.get_stats()
print(json.dumps(stats, indent=2, default=str))

# Run the evolved validator on examples it has not seen
print("\n" + banner)
print("TESTING ON NEW EXAMPLES")
print(banner)

test_examples = [
    "This product works well.",
    "Buy now for limited time!",
    "I appreciate your help."
]

for candidate in test_examples:
    passed, detail = validator.validate(candidate)
    verdict = '✓ PASS' if passed else '✗ FAIL'
    print(f"\n{candidate}")
    print(f"  Result: {verdict}")
    print(f"  Message: {detail}")

## Key Takeaways

### What We Demonstrated

1. **Co-evolution process**: Validator automatically learns from production failures
2. **Pattern extraction**: Identifies keywords that are common in failures but rare in successes
3. **Rule pruning**: Removes ineffective rules over time
4. **Continuous improvement**: System adapts without manual updates

### Production Considerations

- Run evolution periodically (daily or weekly)
- Require minimum confidence (0.7-0.9) for new rules (the demo above lowers it to 0.5 so that rules can be learned from only eight feedback entries)
- Monitor rule effectiveness continuously
- Version validators for rollback capability
- A/B test new rules before full deployment

### Next Steps

- Integrate with hybrid system from Example 1
- Implement more sophisticated pattern extraction (ML-based)
- Add rule versioning and audit trail
- Build monitoring dashboard for rule effectiveness