In [None]:
# Placeholder banner cell.
# NOTE(review): this runs before the install/import cells below, so the
# message does not reflect any setup actually having been performed.
print('Setup complete.')

# Prompt Diagnosis and Patch Loop Demo

## Learning Objectives
- Learn systematic approaches to diagnosing prompt failures
- Practice iterative prompt improvement techniques
- Understand common failure patterns and their solutions
- Build a feedback loop for prompt optimization

## The Problem: When Good Prompts Go Bad

Even well-crafted prompts can fail in unexpected ways. This demo shows how to:
1. **Identify** what went wrong
2. **Diagnose** the root cause  
3. **Patch** the prompt systematically
4. **Validate** the improvement
5. **Iterate** until satisfactory

In [None]:
# Install required packages.
# `difflib` is part of the Python standard library and is not an installable
# distribution, so listing it makes the whole install command fail; it is
# intentionally omitted. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel.
%pip install -q asksageclient pip_system_certs rich pandas tiktoken

In [None]:
# ================================
# 🔐 Cell 1 — Load secrets (Colab) + pricing + token utils
# ================================
import os, time, csv
from typing import Optional, Dict
import tiktoken

from google.colab import userdata

def _read_secret(name: str, required: bool = True) -> Optional[str]:
    """Fetch one secret from the Colab secrets store.

    ``userdata.get`` raises ``SecretNotFoundError`` when the secret was never
    created, so a bare call cannot express an optional secret. This wrapper
    maps "not found" to ``None`` and only fails for required secrets.

    Args:
        name: Name of the secret as entered in the Colab Secrets panel.
        required: When True, a missing or empty secret is an error.

    Returns:
        The secret value, or ``None`` for a missing optional secret.

    Raises:
        RuntimeError: A required secret is missing or empty.
    """
    try:
        value = userdata.get(name)
    except userdata.SecretNotFoundError:
        value = None
    if required and not value:
        raise RuntimeError(
            f"{name} not provided. Add it in the Colab Secrets panel and "
            "grant this notebook access."
        )
    return value


ASKSAGE_API_KEY = _read_secret("ASKSAGE_API_KEY")
# The base URL is optional: None is reported below as "(default)".
ASKSAGE_BASE_URL = _read_secret("ASKSAGE_BASE_URL", required=False)
ASKSAGE_EMAIL = _read_secret("ASKSAGE_EMAIL")

print("✓ Secrets loaded")
print("  • EMAIL:", ASKSAGE_EMAIL)
print("  • BASE URL:", ASKSAGE_BASE_URL or "(default)")

# Pricing (USD per 1,000,000 tokens)
# Keyed by model name; each entry holds separate input and output rates.
# cost_usd() looks models up here and rejects names that are not listed.
# NOTE(review): rates are hard-coded — verify against current provider pricing.
PRICES_PER_M = {
    "gpt-5": {"input_per_m": 1.25, "output_per_m": 10.00},
    "gpt-5-mini": {"input_per_m": 0.25, "output_per_m": 2.00},
}

# Tokenizer
# Shared tiktoken encoder used by count_tokens().
# NOTE(review): presumably "o200k_base" matches the models priced above — confirm.
enc = tiktoken.get_encoding("o200k_base")

def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level encoder `enc` yields for `text`.

    Falsy input (``None`` or an empty string) is treated as the empty string.
    """
    if not text:
        text = ""
    token_ids = enc.encode(text)
    return len(token_ids)

def cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
    """Price a request in USD from its input and output token counts.

    Args:
        model: Key into ``PRICES_PER_M``.
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.

    Returns:
        Input cost plus output cost, using the per-million-token rates.

    Raises:
        ValueError: ``model`` has no entry in ``PRICES_PER_M``.
    """
    rates = PRICES_PER_M.get(model)
    if rates is None:
        raise ValueError(f"Unknown model: {model}")

    input_cost = (input_tokens / 1_000_000) * rates["input_per_m"]
    output_cost = (output_tokens / 1_000_000) * rates["output_per_m"]
    return input_cost + output_cost

In [None]:
# ================================
# 🔧 Cell 2 — Import bootcamp_common and setup AskSage client
# ================================
import sys
sys.path.append('../../../')  # Adjust path to reach bootcamp_common

from bootcamp_common.ask_sage import AskSageClient
import json, difflib
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass

import pandas as pd
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.syntax import Syntax

# Initialize AskSage client
# NOTE(review): ASKSAGE_EMAIL is loaded in Cell 1 but not passed here —
# confirm whether AskSageClient needs it.
# NOTE(review): in the cells visible in this notebook, `client` is not used
# afterwards; PromptDiagnosticTool builds its own client in setup_client().
client = AskSageClient(
    api_key=ASKSAGE_API_KEY,
    base_url=ASKSAGE_BASE_URL
)

# Shared rich console; the later cells route all formatted output through it.
console = Console()
print("✓ AskSage client initialized")
print("✅ Libraries loaded successfully")

## Scenario: Code Documentation Generator

We'll build a prompt to generate documentation for Python functions, then systematically improve it when we encounter failures.

In [None]:
@dataclass
class PromptIteration:
    """Record of one pass through the diagnostic loop.

    Instances are created by PromptDiagnosticTool.run_diagnostic_loop and read
    back by show_improvement_summary.
    """
    # Iteration number of the loop that produced this record (starts at 1).
    version: int
    # Prompt template that was tested in this iteration.
    prompt: str
    # Source code that was substituted into the prompt.
    test_input: str
    # Free-text description of what a good output should contain.
    expected_output: str
    # Text returned by the model (or the mock) for this prompt.
    actual_output: str
    # True when the diagnosis found no issues.
    success: bool
    # Issue descriptions reported by diagnose_issues.
    issues_found: List[str]
    # Descriptions of prompt patches made in response to the issues.
    fixes_applied: List[str]
    # ISO-8601 string taken from datetime.now() when the record was created.
    timestamp: str

class PromptDiagnosticTool:
    """Run a test -> diagnose -> patch loop over a documentation prompt.

    A prompt template containing a ``{code}`` placeholder is sent to an OpenAI
    chat model when a client can be created, or answered from canned mock
    responses otherwise. The output is checked with keyword heuristics and the
    prompt is extended with one requirement per detected problem.

    Attributes:
        client: OpenAI client, or ``None`` when mock responses are used.
        has_api: True only when a live client was created.
        iterations: ``PromptIteration`` records of the most recent loop run.
        test_cases: Dicts with ``'input'`` (source code to document) and
            ``'expected_elements'`` (sections the documentation should have).
    """

    def __init__(self):
        self.setup_client()
        self.iterations = []

        # Test cases for our documentation generator
        self.test_cases = [
            {
                'input': '''def calculate_fibonacci(n):
    if n <= 1:
        return n
    return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)''',
                'expected_elements': ['parameters', 'returns', 'description', 'example']
            },
            {
                'input': '''def process_data(data, filter_func=None, transform=True):
    if filter_func:
        data = [x for x in data if filter_func(x)]
    if transform:
        data = [str(x).upper() for x in data]
    return data''',
                'expected_elements': ['parameters', 'returns', 'description', 'example']
            }
        ]

    def setup_client(self):
        """Setup API client with fallback to mock.

        Sets ``self.client`` and ``self.has_api``. The live client is used only
        when ``OPENAI_API_KEY`` is set and the ``openai`` package can be
        imported and instantiated; any failure falls back to mock responses.
        """
        self.client = None
        self.has_api = False

        if not os.getenv('OPENAI_API_KEY'):
            console.print("💡 No API key found, using mock responses")
            return

        try:
            # Imported here because the notebook never imports `openai` at the
            # top level. Without this the name lookup raised NameError, which
            # the handler below swallowed, so the live path was unreachable.
            import openai
            self.client = openai.OpenAI()
        except Exception as e:
            # Covers a missing package as well as client construction errors.
            console.print(f"⚠️ Using mock responses: {e}")
            return

        self.has_api = True
        console.print("✅ OpenAI client configured")

    def test_prompt(self, prompt: str, test_input: str, version: int) -> Dict:
        """Test a prompt with given input.

        Args:
            prompt: Template containing a ``{code}`` placeholder.
            test_input: Source code substituted for ``{code}``.
            version: Iteration number; selects the canned reply in mock mode
                (unknown versions get the version-3 reply).

        Returns:
            ``{'output': str, 'success': bool}``. ``success`` is False only
            when the live API call raised; ``output`` then holds the error
            text prefixed with ``"Error: "``.
        """
        full_prompt = prompt.format(code=test_input)

        if self.has_api:
            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": full_prompt}],
                    max_tokens=300,
                    temperature=0.3
                )
                return {
                    'output': response.choices[0].message.content,
                    'success': True
                }
            except Exception as e:
                return {
                    'output': f"Error: {str(e)}",
                    'success': False
                }
        else:
            # Mock responses that demonstrate different failure patterns
            mock_responses = {
                1: "This function calculates Fibonacci numbers.",  # Too brief
                2: """This function calculates Fibonacci numbers recursively.
                
Args:
    n: A number
    
Returns:
    The nth Fibonacci number""",  # Better but missing example
                3: """Calculate the nth Fibonacci number using recursion.
                
Args:
    n (int): The position in the Fibonacci sequence (non-negative integer)
    
Returns:
    int: The nth Fibonacci number
    
Example:
    >>> calculate_fibonacci(5)
    5
    >>> calculate_fibonacci(10)
    55
    
Note:
    This implementation has exponential time complexity. Consider using
    dynamic programming for large values of n."""
            }

            return {
                'output': mock_responses.get(version, mock_responses[3]),
                'success': True
            }

    def diagnose_issues(self, output: str, expected_elements: List[str]) -> List[str]:
        """Diagnose issues with the generated output.

        The checks are substring heuristics, not a parser.

        Args:
            output: Documentation text produced for a test case.
            expected_elements: Section names that must be present; only
                ``'parameters'``/``'args'``, ``'returns'`` and ``'example'``
                are checked.

        Returns:
            Human-readable issue descriptions; empty when every check passes.
        """
        issues = []
        output_lower = output.lower()

        # Check for missing elements
        if 'parameters' in expected_elements or 'args' in expected_elements:
            if not any(word in output_lower for word in ['args:', 'parameters:', 'param']):
                issues.append("Missing parameter documentation")

        if 'returns' in expected_elements:
            if not any(word in output_lower for word in ['returns:', 'return']):
                issues.append("Missing return value documentation")

        if 'example' in expected_elements:
            if not any(word in output_lower for word in ['example', '>>>', 'usage']):
                issues.append("Missing usage example")

        # Check for quality issues
        if len(output.split()) < 10:
            issues.append("Documentation too brief")

        if not any(char in output for char in ['.', '!', '?']):
            issues.append("Missing proper sentence structure")

        # Check for type hints; only applied when the text looks structured
        # (contains both ':' and '(').
        if ':' in output and '(' in output:
            if not any(word in output for word in ['int', 'str', 'list', 'dict', 'bool']):
                issues.append("Missing or vague type information")

        return issues

    def generate_fixes(self, issues: List[str], current_prompt: str) -> Tuple[str, List[str]]:
        """Generate fixes for identified issues.

        Each known issue adds one requirement to the prompt unless the prompt
        already mentions it. A fix is reported only when the prompt actually
        changed.

        Args:
            issues: Issue descriptions as returned by ``diagnose_issues``.
            current_prompt: Prompt template to patch.

        Returns:
            ``(new_prompt, fixes_applied)``.
        """
        fixes_applied = []
        new_prompt = current_prompt

        if "Missing parameter documentation" in issues:
            if "Args:" not in new_prompt:
                new_prompt += "\n- Include detailed parameter descriptions with types"
                fixes_applied.append("Added parameter documentation requirement")

        if "Missing return value documentation" in issues:
            if "Returns:" not in new_prompt:
                new_prompt += "\n- Document the return value with its type"
                fixes_applied.append("Added return value documentation requirement")

        if "Missing usage example" in issues:
            if "example" not in new_prompt.lower():
                new_prompt += "\n- Provide a practical usage example with expected output"
                fixes_applied.append("Added usage example requirement")

        if "Documentation too brief" in issues:
            if "comprehensive" not in new_prompt.lower():
                # Strengthen the leading verb when it is there; otherwise
                # append a requirement so the reported fix is never a no-op.
                if "Generate" in new_prompt:
                    new_prompt = new_prompt.replace("Generate", "Generate comprehensive", 1)
                else:
                    new_prompt += "\n- Write comprehensive documentation"
                fixes_applied.append("Added comprehensiveness requirement")

        if "Missing or vague type information" in issues:
            if "type" not in new_prompt.lower():
                new_prompt += "\n- Include specific Python types for all parameters and return values"
                fixes_applied.append("Added specific type requirement")

        return new_prompt, fixes_applied

    def run_diagnostic_loop(self, max_iterations: int = 3):
        """Run the full diagnostic and patch loop.

        Each iteration tests the current prompt against the first test case,
        records the result in ``self.iterations`` (including the final,
        successful iteration) and, if issues remain, patches the prompt for
        the next round. ``self.iterations`` is reset at the start so that
        re-running the loop does not mix records from earlier runs.

        Args:
            max_iterations: Upper bound on test/diagnose/patch rounds.

        Returns:
            The last prompt template produced by the loop.
        """
        # Initial prompt (deliberately flawed)
        current_prompt = "Generate documentation for this Python function:\n\n{code}\n\nDocumentation:"

        self.iterations = []

        console.print("🔬 Starting Prompt Diagnostic Loop\n")

        for iteration in range(1, max_iterations + 1):
            console.print(f"🔄 [bold blue]Iteration {iteration}[/bold blue]")
            console.print("─" * 50)

            # Show current prompt
            console.print(f"[yellow]Current Prompt:[/yellow]")
            console.print(Panel(current_prompt, border_style="yellow"))

            # Test with first test case
            test_case = self.test_cases[0]
            result = self.test_prompt(current_prompt, test_case['input'], iteration)

            console.print(f"[green]Generated Output:[/green]")
            console.print(Panel(result['output'], border_style="green"))

            if not result['success']:
                # The output is an API error message, not documentation;
                # diagnosing it would patch the prompt for the wrong reason.
                console.print("[red]Model call failed; stopping without patching the prompt.[/red]")
                break

            # Diagnose issues
            issues = self.diagnose_issues(result['output'], test_case['expected_elements'])

            # Record every iteration, including the one that passes, so the
            # summary can show the successful version.
            self.iterations.append(PromptIteration(
                version=iteration,
                prompt=current_prompt,
                test_input=test_case['input'],
                expected_output="Complete documentation with all elements",
                actual_output=result['output'],
                success=not issues,
                issues_found=issues,
                fixes_applied=[],
                timestamp=datetime.now().isoformat()
            ))

            if not issues:
                console.print("✅ [bold green]No issues found! Prompt is working well.[/bold green]")
                break

            # Show issues
            console.print(f"[red]Issues Identified:[/red]")
            for issue in issues:
                console.print(f"  • {issue}")

            if iteration < max_iterations:
                # Generate fixes
                new_prompt, fixes_applied = self.generate_fixes(issues, current_prompt)

                console.print(f"[cyan]Fixes Applied:[/cyan]")
                for fix in fixes_applied:
                    console.print(f"  • {fix}")

                # Update iteration record with fixes
                self.iterations[-1].fixes_applied = fixes_applied
                current_prompt = new_prompt

                console.print("\n")
                time.sleep(1)  # Brief pause for readability

        return current_prompt

    def show_improvement_summary(self):
        """Show summary of improvements across iterations.

        Prints one table row per recorded iteration, listing at most two
        issues and two fixes (with a "+N more" suffix for the rest).
        """
        if not self.iterations:
            console.print("No iterations recorded")
            return

        # Create summary table
        table = Table(title="Prompt Evolution Summary")
        table.add_column("Version")
        table.add_column("Issues Found")
        table.add_column("Fixes Applied")
        table.add_column("Success")

        for iteration in self.iterations:
            issues_str = ", ".join(iteration.issues_found[:2])  # First 2 issues
            if len(iteration.issues_found) > 2:
                issues_str += f" (+{len(iteration.issues_found)-2} more)"

            fixes_str = ", ".join(iteration.fixes_applied[:2])  # First 2 fixes
            if len(iteration.fixes_applied) > 2:
                fixes_str += f" (+{len(iteration.fixes_applied)-2} more)"

            success_icon = "✅" if iteration.success else "❌"

            table.add_row(
                str(iteration.version),
                issues_str,
                fixes_str,
                success_icon
            )

        console.print(table)

# Initialize the diagnostic tool
# Construction calls setup_client(), which prints whether a live client was
# configured or mock responses will be used.
diagnostic_tool = PromptDiagnosticTool()
print("🔧 Prompt diagnostic tool ready!")

## Running the Diagnostic Loop

Let's run the full diagnosis and patch cycle to see how we can systematically improve a flawed prompt.

In [None]:
# Run the diagnostic loop
# Performs up to 3 test -> diagnose -> patch rounds against the first test
# case and returns the last prompt template the loop produced.
final_prompt = diagnostic_tool.run_diagnostic_loop(max_iterations=3)

# Show the evolution summary
# One table row per iteration recorded by the loop.
console.print("\n" + "="*60)
diagnostic_tool.show_improvement_summary()

# Show final optimized prompt
console.print("\n" + "="*60)
console.print("[bold green]🎯 Final Optimized Prompt:[/bold green]")
console.print(Panel(final_prompt, border_style="green", title="Optimized Prompt"))

## Advanced Diagnostic Techniques

Let's explore additional diagnostic methods for different types of prompt failures.

In [None]:
class AdvancedPromptDiagnostics:
    """Heuristic classifier mapping a prompt/output pair to known failure patterns."""

    def __init__(self):
        # Catalogue of failure patterns: what each looks like and how to fix it.
        # Only three of them are scored by analyze_failure_pattern; the others
        # always receive a score of 0.
        self.failure_patterns = {
            'inconsistent_format': {
                'indicators': ['varying structure', 'different layouts', 'inconsistent sections'],
                'solutions': ['Add format template', 'Use structured output', 'Provide clear format example'],
            },
            'hallucination': {
                'indicators': ['factual errors', 'made up information', 'impossible details'],
                'solutions': ['Add "only use provided info"', 'Request citations', 'Use retrieval augmentation'],
            },
            'instruction_following': {
                'indicators': ['ignores constraints', 'wrong task performed', 'missing requirements'],
                'solutions': ['Emphasize requirements', 'Use step-by-step format', 'Add negative examples'],
            },
            'context_length': {
                'indicators': ['truncated responses', 'incomplete information', 'abrupt endings'],
                'solutions': ['Reduce prompt length', 'Split into subtasks', 'Increase max tokens'],
            },
            'ambiguity': {
                'indicators': ['multiple interpretations', 'unclear responses', 'asks for clarification'],
                'solutions': ['Add specific examples', 'Define key terms', 'Provide context'],
            },
        }

    @staticmethod
    def _format_score(prompt_text: str, output: str, expected: str):
        """Score 'inconsistent_format': line-count mismatch and no format hint in the prompt."""
        total = 0
        # Same number of newlines means same number of lines.
        if output.count('\n') != expected.count('\n'):
            total += 0.3
        if not ('format' in prompt_text or 'structure' in prompt_text):
            total += 0.4
        return total

    @staticmethod
    def _instruction_score(prompt_text: str, output: str, expected: str):
        """Score 'instruction_following': output far shorter than expected, weak wording."""
        total = 0
        if len(output.split()) < len(expected.split()) * 0.5:
            total += 0.5
        if not any(word in prompt_text for word in ('must', 'should')):
            total += 0.3
        return total

    @staticmethod
    def _ambiguity_score(prompt_text: str, output: str, expected: str):
        """Score 'ambiguity': the output asks a question, or the prompt is very short."""
        total = 0
        if '?' in output:
            total += 0.4
        if len(prompt_text.split()) < 20:  # Very short prompts often ambiguous
            total += 0.3
        return total

    def analyze_failure_pattern(self, prompt: str, output: str, expected: str) -> Dict:
        """Analyze what type of failure occurred.

        Args:
            prompt: Prompt that produced the output.
            output: Output actually received.
            expected: Description or sample of the desired output.

        Returns:
            Dict with ``'likely_patterns'`` (names scoring above 0.3),
            ``'confidence_scores'`` (score for every catalogued pattern) and
            ``'recommended_solutions'`` (solutions of the likely patterns).
        """
        # Simple pattern matching (in production, this would be more sophisticated)
        prompt_text = prompt.lower()
        scorers = {
            'inconsistent_format': self._format_score,
            'instruction_following': self._instruction_score,
            'ambiguity': self._ambiguity_score,
        }

        likely = []
        confidence = {}
        solutions = []
        for name, info in self.failure_patterns.items():
            scorer = scorers.get(name)
            value = scorer(prompt_text, output, expected) if scorer is not None else 0
            confidence[name] = value
            if value > 0.3:
                likely.append(name)
                solutions.extend(info['solutions'])

        return {
            'likely_patterns': likely,
            'confidence_scores': confidence,
            'recommended_solutions': solutions,
        }

    def suggest_prompt_improvements(self, analysis: Dict, current_prompt: str) -> List[str]:
        """Suggest specific prompt improvements based on analysis.

        Args:
            analysis: Result of ``analyze_failure_pattern``.
            current_prompt: Accepted for interface symmetry; not consulted.

        Returns:
            Up to three distinct suggestions, highest-confidence pattern first.
        """
        advice_by_pattern = {
            'inconsistent_format':
                "Add explicit format template: 'Use this format: [example]'",
            'instruction_following':
                "Strengthen instructions: 'You MUST include...' instead of 'Please include...'",
            'ambiguity':
                "Add specific examples and define key terms clearly",
        }

        # Prioritize suggestions by confidence scores
        ranked = sorted(
            analysis['confidence_scores'].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        picked = [
            advice_by_pattern[name]
            for name, confidence in ranked
            if confidence > 0.3 and name in advice_by_pattern
        ]

        # dict.fromkeys drops duplicates while keeping first-seen order.
        distinct = list(dict.fromkeys(picked))
        return distinct[:3]  # Top 3 suggestions

# Demo the advanced diagnostics
advanced_diagnostics = AdvancedPromptDiagnostics()

# Example failure analysis
# A deliberately vague prompt and a one-line output, compared against a
# description of what good documentation should contain.
problematic_prompt = "Write documentation for the code."
poor_output = "This is a function."
expected_output = "Comprehensive documentation with parameters, returns, examples, and type hints."

# Score the example against each known failure pattern.
analysis = advanced_diagnostics.analyze_failure_pattern(
    problematic_prompt, poor_output, expected_output
)

# Turn the scored patterns into at most three concrete suggestions.
suggestions = advanced_diagnostics.suggest_prompt_improvements(analysis, problematic_prompt)

console.print("\n🔍 [bold blue]Advanced Failure Analysis:[/bold blue]")
console.print(f"Likely failure patterns: {', '.join(analysis['likely_patterns'])}")
console.print("\n💡 [bold yellow]Improvement Suggestions:[/bold yellow]")
for i, suggestion in enumerate(suggestions, 1):
    console.print(f"{i}. {suggestion}")

print("\n🔧 Advanced diagnostics ready!")

## Key Takeaways: The Prompt Patch Loop

### 🔍 **Systematic Diagnosis Process**

1. **Identify Symptoms**: What exactly is wrong with the output?
2. **Categorize Failure Type**: Format, content, instruction-following, etc.
3. **Trace Root Cause**: Is it the prompt structure, examples, or constraints?
4. **Design Targeted Fix**: Address the specific root cause, not just symptoms
5. **Validate Improvement**: Test with multiple examples to ensure the fix works

### 🛠️ **Common Failure Patterns & Solutions**

| **Failure Pattern** | **Indicators** | **Solutions** |
|---------------------|----------------|---------------|
| **Inconsistent Format** | Varying structure, different layouts | Add format templates, structured output |
| **Missing Information** | Incomplete responses, omitted elements | Explicit requirements list, negative examples |
| **Hallucination** | Made-up facts, impossible details | "Use only provided info", request citations |
| **Instruction Ignored** | Wrong task performed, constraints missed | Stronger language (MUST vs should), step-by-step |
| **Ambiguous Output** | Multiple valid interpretations | Specific examples, define key terms |

### 📈 **Iterative Improvement Strategy**

- **Start Simple**: Begin with minimal prompt, add complexity as needed
- **One Fix at a Time**: Change one thing per iteration to isolate effects
- **Test Edge Cases**: Don't just test the happy path
- **Document Changes**: Keep track of what works and what doesn't
- **Version Control**: Maintain prompt versions like code

### 🎯 **Best Practices**

1. **Build a Test Suite**: Create diverse test cases covering edge cases
2. **Measure Consistently**: Use quantitative metrics where possible
3. **Automate Testing**: Run tests after each prompt change
4. **Keep a Prompt Library**: Document successful patterns for reuse
5. **A/B Test**: Compare prompt versions on the same test cases and check that the differences are statistically significant

## Next Steps

In Lab 6, you'll build your own prompt library with examples and guardrails, applying these diagnostic techniques to create robust, reliable prompts!