In [None]:
class PromptImprover:
    """
    Generates improved prompts based on evaluation results.
    Focuses on boosting below-threshold metrics.

    The improver describes every below-threshold metric to Claude together
    with the original prompt, then parses the reply into the rewritten prompt
    and an explanation of the changes.
    """

    # Section headers requested in the OUTPUT FORMAT part of the improvement
    # prompt; the response parser keys off the same strings.
    _IMPROVED_HEADER = "IMPROVED PROMPT:"
    _CHANGES_HEADER = "CHANGES MADE:"

    def __init__(self, metric_config: dict, config: dict, api_key: str = None):
        """
        Store the configuration and create the API client.

        Args:
            metric_config: Mapping of metric name to its settings; the
                'group' and 'description' entries of each metric are read
                when the improvement prompt is built.
            config: General configuration (stored, not read by this class).
            api_key: Anthropic API key. When omitted, no client is created
                and improve_prompt() returns the original prompt together
                with an error explanation.
        """
        self.metric_config = metric_config
        self.config = config
        self.client = anthropic.Anthropic(api_key=api_key) if api_key else None

    def improve_prompt(self, original_prompt: str, evaluation_result: Dict) -> Dict[str, str]:
        """
        Generate an improved version of the prompt.

        Args:
            original_prompt: The original prompt text
            evaluation_result: Full evaluation results from PromptEvaluator

        Returns:
            Dictionary with 'improved_prompt' and 'change_explanations'.
            If nothing is below threshold, no client is configured, or the
            API call fails, 'improved_prompt' is the unchanged original and
            'change_explanations' says why.
        """
        print(f"\n{'='*70}")
        print(f"GENERATING IMPROVED PROMPT")
        print(f"{'='*70}")

        below_threshold = evaluation_result['below_threshold_metrics']

        if not below_threshold:
            print("‚úÖ All metrics above threshold - no improvements needed!")
            return {
                'improved_prompt': original_prompt,
                'change_explanations': "All metrics are above threshold. No changes made."
            }

        print(f"Targeting {len(below_threshold)} below-threshold metrics...")

        # Without an api_key there is no client; report that directly instead
        # of letting the call fail with an opaque AttributeError on None.
        if self.client is None:
            reason = "No API client configured; pass api_key when constructing PromptImprover."
            print(f"‚ùå Error calling Claude API: {reason}")
            return {
                'improved_prompt': original_prompt,
                'change_explanations': f"Error during improvement: {reason}"
            }

        # Build improvement prompt
        improvement_prompt = self._build_improvement_prompt(
            original_prompt,
            evaluation_result
        )

        # Call Claude API. This is a deliberate best-effort boundary: any
        # failure falls back to the original prompt rather than raising.
        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.3,  # Slightly creative but controlled
                messages=[
                    {"role": "user", "content": improvement_prompt}
                ]
            )

            response_text = response.content[0].text

            # Parse response
            result = self._parse_improvement_response(response_text)

            print("‚úÖ Improved prompt generated!")
            return result

        except Exception as e:
            print(f"‚ùå Error calling Claude API: {e}")
            return {
                'improved_prompt': original_prompt,
                'change_explanations': f"Error during improvement: {str(e)}"
            }

    def _build_improvement_prompt(self, original_prompt: str,
                                  evaluation_result: Dict) -> str:
        """
        Build the prompt for Claude to improve the original prompt.

        Args:
            original_prompt: The prompt text to be rewritten.
            evaluation_result: Evaluation results; reads
                'below_threshold_metrics', 'metrics', 'total_score',
                'total_max' and 'percentage'. Each entry of 'metrics' must
                provide 'score', 'max_points' and 'threshold', and may
                provide 'justification' or 'evidence'.

        Returns:
            The full instruction text sent to the model.
        """
        below_threshold = evaluation_result['below_threshold_metrics']
        metrics_details = evaluation_result['metrics']

        # Group below-threshold metrics by category (insertion order is kept,
        # so groups appear in the order their first metric is listed).
        by_group = {}
        for metric_name in below_threshold:
            metric_data = metrics_details[metric_name]
            metric_settings = self.metric_config[metric_name]

            by_group.setdefault(metric_settings['group'], []).append({
                'name': metric_name,
                'score': metric_data['score'],
                'max': metric_data['max_points'],
                'threshold': metric_data['threshold'],
                # Points still missing to reach the threshold.
                'gap': metric_data['threshold'] - metric_data['score'],
                'description': metric_settings['description'],
                # Prefer the evaluator's justification; fall back to evidence.
                'evidence': metric_data.get('justification') or metric_data.get('evidence')
            })

        # Build detailed metric information
        metrics_info_parts = []
        for group, metrics in by_group.items():
            metrics_info_parts.append(f"\n**{group.upper().replace('_', ' ')}:**")
            for m in metrics:
                metrics_info_parts.append(
                    f"\n‚Ä¢ **{m['name']}** - Score: {m['score']:.2f}/{m['max']:.2f} "
                    f"(Need: {m['threshold']:.2f}, Gap: {m['gap']:.2f})"
                )
                metrics_info_parts.append(f"  - Description: {m['description']}")
                metrics_info_parts.append(f"  - Current assessment: {m['evidence']}")

        metrics_info = "\n".join(metrics_info_parts)

        # Create improvement prompt
        # NOTE(review): the "70% threshold" wording below is fixed text, while
        # the per-metric thresholds come from the evaluation data - confirm
        # the two always agree.
        improvement_prompt = f"""You are an expert prompt engineer. Your task is to improve a prompt by addressing specific weaknesses identified in an evaluation.

ORIGINAL PROMPT:
```
{original_prompt}
```

EVALUATION SUMMARY:
- Total Score: {evaluation_result['total_score']:.2f} / {evaluation_result['total_max']} ({evaluation_result['percentage']:.1f}%)
- Metrics Below Threshold: {len(below_threshold)}
- Target: Bring all metrics above their 70% threshold

METRICS REQUIRING IMPROVEMENT:
{metrics_info}

YOUR TASK:
Rewrite the prompt to address ALL the metrics listed above that are below threshold. For each metric, implement specific improvements to bring it above the threshold.

IMPROVEMENT STRATEGIES:

1. **Structure & Clarity**: Add numbered steps, clear sections, explicit task definitions
2. **Context & Information**: Provide background, specify output format, include examples
3. **Reasoning & Cognition**: Request step-by-step thinking, enable iteration, progressive complexity
4. **Safety & Alignment**: Address uncertainty handling, minimize hallucinations, define safe failures
5. **Format & Style**: Assign roles/personas, specify audience, calibrate tone
6. **Output Quality**: Ensure feasibility, add validation hooks, enable self-correction
7. **Advanced Features**: Add memory anchoring, calibration requests, comparisons

CRITICAL REQUIREMENTS:
- Preserve the core intent and purpose of the original prompt
- Add substance, not just keywords (make genuine improvements)
- Be specific and actionable in your additions
- Don't make the prompt unnecessarily verbose - be concise but complete
- Ensure all improvements are relevant to the original task

OUTPUT FORMAT:
Provide your response in two sections:
```
IMPROVED PROMPT:
[Write the complete improved prompt here]

---

CHANGES MADE:
[For each metric you addressed, explain what specific changes you made and how they improve that metric. Format as a numbered or bulleted list with the metric name and explanation.]
```

Generate the improved prompt now."""

        return improvement_prompt

    def _parse_improvement_response(self, response_text: str) -> Dict[str, str]:
        """
        Parse Claude's improvement response.

        The "CHANGES MADE:" header is the primary split point. A bare "---"
        is only used when that header is missing, because improved prompts
        frequently contain "---" themselves (markdown rules, front matter)
        and splitting on the first one would cut the prompt in half.

        Args:
            response_text: Raw text returned by the model.

        Returns:
            Dictionary with 'improved_prompt' and 'change_explanations'. When
            neither split point is present, the whole response is returned as
            the improved prompt.
        """
        changes_at = response_text.find(self._CHANGES_HEADER)

        if changes_at != -1:
            prompt_section = self._drop_trailing_separator(response_text[:changes_at])
            changes_section = response_text[changes_at + len(self._CHANGES_HEADER):]
        elif "---" in response_text:
            prompt_section, changes_section = response_text.split("---", 1)
        else:
            # Fallback: treat entire response as improved prompt
            return {
                'improved_prompt': response_text.strip(),
                'change_explanations': "Changes section not found in response."
            }

        return {
            'improved_prompt': self._clean_section(prompt_section, self._IMPROVED_HEADER),
            'change_explanations': self._clean_section(changes_section)
        }

    @staticmethod
    def _drop_trailing_separator(section: str) -> str:
        """Remove a final line made only of 3+ hyphens (the section separator)."""
        trimmed = section.rstrip()
        head, _, last_line = trimmed.rpartition("\n")
        marker = last_line.strip()
        if len(marker) >= 3 and set(marker) == {"-"}:
            return head
        return trimmed

    @staticmethod
    def _clean_section(section: str, header: str = None) -> str:
        """Cut everything up to and including the first `header`, then trim whitespace and code-fence backticks."""
        if header:
            header_at = section.find(header)
            if header_at != -1:
                # Only the first header is removed, so identical text inside
                # the section body is left untouched.
                section = section[header_at + len(header):]
        return section.strip().strip('`').strip()


# Cell-completion marker: printed once the PromptImprover class above has been defined.
print("‚úÖ Prompt Improver Created!")

In [None]:
def test_prompt_improver(api_key: str = None, use_mock: bool = True):
    """
    Run the prompt improver on the module-level ``result`` and print the outcome.

    Args:
        api_key: Anthropic API key; required when ``use_mock`` is False.
        use_mock: When True, use _mock_improve_prompt instead of calling the
            API through PromptImprover.

    Returns:
        The improvement dictionary ('improved_prompt', 'change_explanations'),
        or None when no evaluation ``result`` exists yet or a real run was
        requested without an API key.
    """
    # Use the result from previous evaluation
    if 'result' not in globals():
        print("‚ö†Ô∏è  Need to run evaluation first!")
        print("Run: result, report = test_full_evaluator(use_mock=True)")
        return None

    evaluation = globals()['result']
    test_prompt = evaluation['prompt']

    # Use the maximum reported by the evaluation itself, so the denominator
    # stays right if the metric set changes; 150 is the previous fixed value.
    total_max = evaluation.get('total_max', 150)

    print(f"Testing Prompt Improver")
    print(f"Using {'MOCK' if use_mock else 'REAL'} improvement")
    print(f"\nOriginal Score: {evaluation['total_score']:.2f}/{total_max:g} ({evaluation['percentage']:.1f}%)")
    print(f"Below Threshold: {evaluation['below_threshold_count']} metrics")
    print()

    if use_mock:
        # Create mock improvement
        improvement_result = _mock_improve_prompt(test_prompt, evaluation)
    else:
        if not api_key:
            print("‚ö†Ô∏è  API key required for real improvement")
            return None
        improver = PromptImprover(METRIC_CONFIG, CONFIG, api_key=api_key)
        improvement_result = improver.improve_prompt(test_prompt, evaluation)

    # Display results
    print("\n" + "="*80)
    print("IMPROVED PROMPT")
    print("="*80)
    print(improvement_result['improved_prompt'])

    print("\n" + "="*80)
    print("CHANGES MADE")
    print("="*80)
    print(improvement_result['change_explanations'])

    return improvement_result


def _mock_improve_prompt(original_prompt: str, evaluation_result: Dict) -> Dict[str, str]:
    """Mock improvement for testing without API key."""
    
    below_threshold = evaluation_result['below_threshold_metrics']
    
    # Create mock improved prompt
    improvements = []
    
    # Check what's missing and add it
    prompt_lower = original_prompt.lower()
    
    if any('structure' in m.lower() for m in below_threshold):
        improvements.append("structured format with numbered steps")
    if any('example' in m.lower() for m in below_threshold):
        improvements.append("concrete examples")
    if any('role' in m.lower() or 'persona' in m.lower() for m in below_threshold):
        improvements.append("role assignment")
    if any('reasoning' in m.lower() for m in below_threshold):
        improvements.append("step-by-step reasoning request")
    if any('validation' in m.lower() for m in below_threshold):
        improvements.append("validation requirements")
    if any('audience' in m.lower() for m in below_threshold):
        improvements.append("audience specification")
    
    improved_prompt = f"""# Enhanced Task

You are an expert in the domain with deep knowledge.

## Original Request
{original_prompt}

## Additional Requirements
1. Think through this step-by-step
2. Provide specific examples
3. Validate your approach
4. Target audience: professionals with intermediate knowledge

## Process
- Break down the task into clear stages
- Verify each stage before proceeding
- Iterate on your output for quality

Please ensure all criteria are met before providing your final response."""
    
    change_explanations = f"""**MOCK IMPROVEMENTS APPLIED:**

The following enhancements were made to address {len(below_threshold)} below-threshold metrics:

"""
    
    for i, improvement in enumerate(improvements, 1):
        change_explanations += f"{i}. Added {improvement}\n"
    
    change_explanations += "\n*Note: This is a mock improvement for testing. Use real API for production.*"
    
    return {
        'improved_prompt': improved_prompt,
        'change_explanations': change_explanations
    }


# Demo driver: announce the run, then exercise the improver in mock mode when
# an evaluation `result` already exists; otherwise tell the user what to run.
print("="*80, "RUNNING PROMPT IMPROVER TEST", "="*80, sep="\n")

if 'result' not in globals():
    print(
        "‚ö†Ô∏è  Please run evaluation first:",
        "   result, report = test_full_evaluator(use_mock=True)",
        sep="\n",
    )
else:
    improvement_result = test_prompt_improver(use_mock=True)
    if improvement_result:
        print(
            "\n‚úÖ Improvement complete!",
            "   Result stored in 'improvement_result' variable",
            sep="\n",
        )

In [None]:
def test_prompt_improver(api_key: str = None, use_mock: bool = True):
    """
    Run the prompt improver on the module-level ``result`` and print the outcome.

    Args:
        api_key: Anthropic API key; required when ``use_mock`` is False.
        use_mock: When True, use _mock_improve_prompt instead of calling the
            API through PromptImprover.

    Returns:
        The improvement dictionary ('improved_prompt', 'change_explanations'),
        or None when no evaluation ``result`` exists yet or a real run was
        requested without an API key.
    """
    # Use the result from previous evaluation
    if 'result' not in globals():
        print("‚ö†Ô∏è  Need to run evaluation first!")
        print("Run: result, report = test_full_evaluator(use_mock=True)")
        return None

    evaluation = globals()['result']
    test_prompt = evaluation['prompt']

    # Use the maximum reported by the evaluation itself, so the denominator
    # stays right if the metric set changes; 150 is the previous fixed value.
    total_max = evaluation.get('total_max', 150)

    print(f"Testing Prompt Improver")
    print(f"Using {'MOCK' if use_mock else 'REAL'} improvement")
    print(f"\nOriginal Score: {evaluation['total_score']:.2f}/{total_max:g} ({evaluation['percentage']:.1f}%)")
    print(f"Below Threshold: {evaluation['below_threshold_count']} metrics")
    print()

    if use_mock:
        # Create mock improvement
        improvement_result = _mock_improve_prompt(test_prompt, evaluation)
    else:
        if not api_key:
            print("‚ö†Ô∏è  API key required for real improvement")
            return None
        improver = PromptImprover(METRIC_CONFIG, CONFIG, api_key=api_key)
        improvement_result = improver.improve_prompt(test_prompt, evaluation)

    # Display results
    print("\n" + "="*80)
    print("IMPROVED PROMPT")
    print("="*80)
    print(improvement_result['improved_prompt'])

    print("\n" + "="*80)
    print("CHANGES MADE")
    print("="*80)
    print(improvement_result['change_explanations'])

    return improvement_result


def _mock_improve_prompt(original_prompt: str, evaluation_result: Dict) -> Dict[str, str]:
    """Mock improvement for testing without API key."""
    
    below_threshold = evaluation_result['below_threshold_metrics']
    
    # Create mock improved prompt
    improvements = []
    
    # Check what's missing and add it
    prompt_lower = original_prompt.lower()
    
    if any('structure' in m.lower() for m in below_threshold):
        improvements.append("structured format with numbered steps")
    if any('example' in m.lower() for m in below_threshold):
        improvements.append("concrete examples")
    if any('role' in m.lower() or 'persona' in m.lower() for m in below_threshold):
        improvements.append("role assignment")
    if any('reasoning' in m.lower() for m in below_threshold):
        improvements.append("step-by-step reasoning request")
    if any('validation' in m.lower() for m in below_threshold):
        improvements.append("validation requirements")
    if any('audience' in m.lower() for m in below_threshold):
        improvements.append("audience specification")
    
    improved_prompt = f"""# Enhanced Task

You are an expert in the domain with deep knowledge.

## Original Request
{original_prompt}

## Additional Requirements
1. Think through this step-by-step
2. Provide specific examples
3. Validate your approach
4. Target audience: professionals with intermediate knowledge

## Process
- Break down the task into clear stages
- Verify each stage before proceeding
- Iterate on your output for quality

Please ensure all criteria are met before providing your final response."""
    
    change_explanations = f"""**MOCK IMPROVEMENTS APPLIED:**

The following enhancements were made to address {len(below_threshold)} below-threshold metrics:

"""
    
    for i, improvement in enumerate(improvements, 1):
        change_explanations += f"{i}. Added {improvement}\n"
    
    change_explanations += "\n*Note: This is a mock improvement for testing. Use real API for production.*"
    
    return {
        'improved_prompt': improved_prompt,
        'change_explanations': change_explanations
    }


# Demo driver: announce the run, then exercise the improver in mock mode when
# an evaluation `result` already exists; otherwise tell the user what to run.
print("="*80, "RUNNING PROMPT IMPROVER TEST", "="*80, sep="\n")

if 'result' not in globals():
    print(
        "‚ö†Ô∏è  Please run evaluation first:",
        "   result, report = test_full_evaluator(use_mock=True)",
        sep="\n",
    )
else:
    improvement_result = test_prompt_improver(use_mock=True)
    if improvement_result:
        print(
            "\n‚úÖ Improvement complete!",
            "   Result stored in 'improvement_result' variable",
            sep="\n",
        )

In [None]:
def compare_prompts(original: str, improved: str, changes: str):
    """
    Print a statistics table comparing two prompts, followed by the change notes.

    Numeric rows (characters, words, lines) show the signed difference.
    Boolean feature rows show "Added", "Removed" or "Same", so a feature the
    rewrite dropped is reported rather than shown as unchanged.

    Args:
        original: The original prompt text.
        improved: The improved prompt text.
        changes: Explanation of the changes, printed verbatim at the end.
    """
    print("\n" + "="*100)
    print("PROMPT COMPARISON")
    print("="*100)

    # Stats comparison
    print("\nüìä Statistics:")
    print(f"{'Metric':<30} {'Original':<20} {'Improved':<20} {'Change':<20}")
    print("-" * 100)

    original_lines = original.split('\n')
    improved_lines = improved.split('\n')

    # Each row: (label, value for the original, value for the improved prompt).
    # The boolean rows are simple text heuristics, not a structural parse.
    stats = [
        ("Length (characters)", len(original), len(improved)),
        ("Length (words)", len(original.split()), len(improved.split())),
        ("Lines", len(original_lines), len(improved_lines)),
        ("Has numbered list", bool(re.search(r'\d+\.', original)), bool(re.search(r'\d+\.', improved))),
        ("Has headers", bool(re.search(r'^#+', original, re.MULTILINE)), bool(re.search(r'^#+', improved, re.MULTILINE))),
        ("Has examples", 'example' in original.lower(), 'example' in improved.lower()),
        ("Has role/persona", bool(re.search(r'you are', original.lower())), bool(re.search(r'you are', improved.lower()))),
    ]

    for metric, orig_val, impr_val in stats:
        if isinstance(orig_val, bool):
            orig_str = "‚úÖ" if orig_val else "‚ùå"
            impr_str = "‚úÖ" if impr_val else "‚ùå"
            if orig_val == impr_val:
                change_str = "Same"
            elif impr_val:
                change_str = "Added"
            else:
                # Feature was present in the original but is gone now.
                change_str = "Removed"
        else:
            orig_str = str(orig_val)
            impr_str = str(impr_val)
            change = impr_val - orig_val
            change_str = f"+{change}" if change > 0 else str(change)

        print(f"{metric:<30} {orig_str:<20} {impr_str:<20} {change_str:<20}")

    print("\n" + "="*100)
    print("CHANGES EXPLANATION")
    print("="*100)
    print(changes)


# Show the comparison only once both the evaluation `result` and the
# `improvement_result` from the improver run exist at module level.
if globals().keys() >= {'result', 'improvement_result'}:
    compare_prompts(
        original=result['prompt'],
        improved=improvement_result['improved_prompt'],
        changes=improvement_result['change_explanations'],
    )