In [None]:
# NOTE(review): in the visible notebook this cell comes before the install and
# import cells, so the message does not verify any setup — confirm it is intended.
print('Setup complete.')

# Prompt Engineering: Variants Comparison Demo

## Learning Objectives
- Understand different prompt engineering techniques
- Compare zero-shot, few-shot, and chain-of-thought approaches
- Analyze quality vs latency trade-offs
- Learn systematic prompt evaluation methods

## Introduction

Prompt engineering is both an art and a science. The same task can be accomplished with very different prompts, each with its own trade-offs in:
- **Quality**: Accuracy and relevance of responses
- **Latency**: Speed of response generation
- **Consistency**: Reliability across different inputs
- **Cost**: Token usage and associated costs

This demo will systematically compare different prompting approaches.

In [None]:
# Install required packages
!pip install asksageclient pip_system_certs rich pandas matplotlib seaborn tiktoken

In [None]:
# ================================
# 🔐 Cell 1 — Load secrets (Colab) + pricing + token utils
# ================================
import os, time, csv
from typing import Optional, Dict
import tiktoken

from google.colab import userdata

# Read the AskSage credentials from Colab secrets (google.colab.userdata).
ASKSAGE_API_KEY = userdata.get("ASKSAGE_API_KEY")
ASKSAGE_BASE_URL = userdata.get("ASKSAGE_BASE_URL")
ASKSAGE_EMAIL = userdata.get("ASKSAGE_EMAIL")

# Only the API key and the email are required to be non-empty; the base URL is
# optional and is reported as "(default)" below when it is falsy.
assert ASKSAGE_API_KEY, "ASKSAGE_API_KEY not provided."
assert ASKSAGE_EMAIL, "ASKSAGE_EMAIL not provided."

# Echo the non-secret settings so the user can confirm what was loaded
# (the API key itself is deliberately not printed).
print("✓ Secrets loaded")
print("  • EMAIL:", ASKSAGE_EMAIL)
print("  • BASE URL:", ASKSAGE_BASE_URL or "(default)")

# Pricing (USD per 1,000,000 tokens)
# Keys are model names; cost_usd() rejects any model that is not listed here.
PRICES_PER_M = {
    "gpt-5": {"input_per_m": 1.25, "output_per_m": 10.00},
    "gpt-5-mini": {"input_per_m": 0.25, "output_per_m": 2.00},
}

# Tokenizer
# Shared tiktoken encoder used by count_tokens().
# NOTE(review): assumes "o200k_base" matches the tokenizer of the priced models — confirm.
enc = tiktoken.get_encoding("o200k_base")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level encoder ``enc`` produces for ``text``.

    A falsy ``text`` (``None`` or ``""``) is encoded as the empty string.
    """
    if not text:
        text = ""
    token_ids = enc.encode(text)
    return len(token_ids)

def cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
    """Price a request in US dollars using the ``PRICES_PER_M`` table.

    Args:
        model: Model name; must be a key of ``PRICES_PER_M``.
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.

    Returns:
        Input cost plus output cost, each computed from the per-million rate.

    Raises:
        ValueError: If ``model`` has no entry in ``PRICES_PER_M``.
    """
    rates = PRICES_PER_M.get(model)
    if rates is None:
        raise ValueError(f"Unknown model: {model}")

    # Rates are quoted per 1,000,000 tokens, so scale the counts first.
    input_cost = (input_tokens / 1_000_000) * rates["input_per_m"]
    output_cost = (output_tokens / 1_000_000) * rates["output_per_m"]
    return input_cost + output_cost

In [None]:
# ================================
# 🔧 Cell 2 — Import bootcamp_common and setup AskSage client
# ================================
import sys
sys.path.append('../../../')  # Adjust path to reach bootcamp_common

from bootcamp_common.ask_sage import AskSageClient
import json, random
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import track

# Initialize AskSage client
# NOTE(review): nothing in the visible notebook reads `client` afterwards;
# PromptVariantTester builds its own `self.client` in setup_client() — confirm
# which client the demo is meant to use.
client = AskSageClient(
    api_key=ASKSAGE_API_KEY,
    base_url=ASKSAGE_BASE_URL  # not asserted non-empty in Cell 1, so it may be falsy here
)

# Shared rich console used for all formatted output in the cells below.
console = Console()
print("✓ AskSage client initialized")
print("✅ Libraries loaded successfully")

## Task: Email Classification

We'll use email classification as our test task: each email is classified as 'urgent', 'normal', or 'spam'.

In [None]:
@dataclass
class PromptResult:
    """Outcome of running one prompt variant through PromptVariantTester.test_prompt()."""
    # Variant key, e.g. 'few_shot' (the keys produced by create_prompt_variants()).
    prompt_type: str
    # The full prompt text that was sent.
    prompt_text: str
    # Model (or mock) reply; empty string when the call failed.
    response: str
    # Latency in milliseconds; test_prompt() stores 0 for failed calls.
    latency_ms: float
    # Token counts: taken from the API usage data, or estimated from word
    # counts in mock mode; all three are 0 for failed calls.
    input_tokens: int
    output_tokens: int
    total_tokens: int
    # ISO-8601 string from datetime.now().isoformat(), taken when the result is built.
    timestamp: str
    # True when a response was obtained, False when the API call raised.
    success: bool
    # str() of the caught exception on failure; None otherwise.
    error_message: Optional[str] = None

class PromptVariantTester:
    """Test different prompt engineering approaches"""
    
    def __init__(self):
        self.setup_client()
        self.test_emails = [
            "URGENT: Server down! All systems failing. Need immediate action!",
            "Hi team, please review the quarterly report when you have time.",
            "🎉 CONGRATULATIONS! You've WON $1,000,000! Click here NOW!",
            "Meeting moved to 3pm tomorrow. Let me know if that works.",
            "SECURITY ALERT: Suspicious login detected. Verify immediately!"
        ]
    
    def setup_client(self):
        """Setup OpenAI client (fallback to mock if not available)"""
        if os.getenv('OPENAI_API_KEY'):
            try:
                self.client = openai.OpenAI()
                self.has_api = True
                console.print("✅ OpenAI client configured")
            except Exception as e:
                self.has_api = False
                console.print(f"⚠️ OpenAI setup failed, using mock responses: {e}")
        else:
            self.has_api = False
            console.print("💡 No OpenAI API key found, using mock responses")
    
    def create_prompt_variants(self, email_text: str) -> Dict[str, str]:
        """Create different prompt variants for the same task"""
        
        variants = {
            'zero_shot_simple': f"""
Classify this email as urgent, normal, or spam:

{email_text}

Classification:""",
            
            'zero_shot_detailed': f"""
You are an email classification system. Analyze the following email and classify it into one of these categories:
- urgent: Requires immediate attention (security issues, system failures, deadlines)
- normal: Regular business communication that can be handled during normal workflow
- spam: Unwanted promotional content, scams, or irrelevant messages

Email to classify:
{email_text}

Classification (respond with just the category):""",
            
            'few_shot': f"""
Classify emails as urgent, normal, or spam. Here are examples:

Email: "Server outage affecting all customers. Need immediate fix!"
Classification: urgent

Email: "Weekly team meeting scheduled for Friday at 2pm."
Classification: normal

Email: "WIN BIG! Click here for amazing deals!"
Classification: spam

Email: {email_text}
Classification:""",
            
            'chain_of_thought': f"""
Classify this email as urgent, normal, or spam. Think through your reasoning step by step:

Email: {email_text}

Analysis:
1. What is the main topic/purpose of this email?
2. What language patterns indicate urgency or spam?
3. What is the likely sender and context?
4. Based on these factors, what is the appropriate classification?

Final Classification:""",
            
            'system_prompt': f"""
System: You are a professional email classifier with years of experience. You excel at quickly identifying urgent communications and filtering out spam.

User: Classify this email as urgent, normal, or spam:

{email_text}

Assistant:"""
        }
        
        return variants
    
    def test_prompt(self, prompt_type: str, prompt_text: str, model: str = "gpt-3.5-turbo") -> PromptResult:
        """Test a single prompt variant"""
        
        start_time = time.time()
        
        if self.has_api:
            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt_text}],
                    max_tokens=150,
                    temperature=0.1  # Low temperature for consistent classification
                )
                
                latency_ms = (time.time() - start_time) * 1000
                
                return PromptResult(
                    prompt_type=prompt_type,
                    prompt_text=prompt_text,
                    response=response.choices[0].message.content.strip(),
                    latency_ms=round(latency_ms, 2),
                    input_tokens=response.usage.prompt_tokens,
                    output_tokens=response.usage.completion_tokens,
                    total_tokens=response.usage.total_tokens,
                    timestamp=datetime.now().isoformat(),
                    success=True
                )
                
            except Exception as e:
                return PromptResult(
                    prompt_type=prompt_type,
                    prompt_text=prompt_text,
                    response="",
                    latency_ms=0,
                    input_tokens=0,
                    output_tokens=0,
                    total_tokens=0,
                    timestamp=datetime.now().isoformat(),
                    success=False,
                    error_message=str(e)
                )
        else:
            # Mock responses for demo
            mock_responses = {
                'zero_shot_simple': 'urgent',
                'zero_shot_detailed': 'urgent',
                'few_shot': 'urgent', 
                'chain_of_thought': '1. This email mentions server/system issues\n2. Uses urgent language\n3. Requests immediate action\n4. Classification: urgent',
                'system_prompt': 'urgent'
            }
            
            # Simulate varying latencies
            mock_latency = {
                'zero_shot_simple': 850,
                'zero_shot_detailed': 1200,
                'few_shot': 1500,
                'chain_of_thought': 2100,
                'system_prompt': 950
            }
            
            # Simulate token counts
            input_tokens = len(prompt_text.split()) * 1.3
            output_tokens = len(mock_responses[prompt_type].split()) * 1.3
            
            time.sleep(mock_latency[prompt_type] / 1000)  # Simulate latency
            
            return PromptResult(
                prompt_type=prompt_type,
                prompt_text=prompt_text,
                response=mock_responses[prompt_type],
                latency_ms=mock_latency[prompt_type],
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens),
                total_tokens=int(input_tokens + output_tokens),
                timestamp=datetime.now().isoformat(),
                success=True
            )
    
    def run_comparison(self, email_index: int = 0) -> List[PromptResult]:
        """Run comparison across all prompt variants"""
        
        email_text = self.test_emails[email_index]
        console.print(f"\n📧 Testing email: '{email_text}'\n")
        
        variants = self.create_prompt_variants(email_text)
        results = []
        
        for prompt_type, prompt_text in variants.items():
            console.print(f"🔄 Testing {prompt_type.replace('_', ' ').title()}...")
            
            result = self.test_prompt(prompt_type, prompt_text)
            results.append(result)
            
            if result.success:
                console.print(f"  ✅ Response: {result.response[:100]}...")
                console.print(f"  ⏱️ Latency: {result.latency_ms}ms, Tokens: {result.total_tokens}")
            else:
                console.print(f"  ❌ Failed: {result.error_message}")
            
            time.sleep(0.5)  # Brief pause between requests
        
        return results

# Initialize the tester
# The constructor calls setup_client(), which prints whether real API calls or
# mock responses will be used for the rest of the notebook.
tester = PromptVariantTester()
print("🔧 Prompt variant tester ready!")

## Running the Comparison

Let's test different prompt variants on the same email and compare results.

In [None]:
# Run every prompt variant against the first sample email (the urgent one)
results = tester.run_comparison(email_index=0)

# Summarise the successful runs in a rich table, one row per variant
if results:
    table = Table(title="Prompt Variant Comparison")
    for heading in (
        "Prompt Type",
        "Response",
        "Latency (ms)",
        "Input Tokens",
        "Output Tokens",
        "Total Tokens",
    ):
        table.add_column(heading)

    for result in results:
        # Failed runs carry no response or metrics, so they are left out.
        if not result.success:
            continue

        # Keep the response cell short: cut to 50 characters and mark the cut.
        preview = result.response
        if len(preview) > 50:
            preview = preview[:50] + '...'

        metrics = (
            result.latency_ms,
            result.input_tokens,
            result.output_tokens,
            result.total_tokens,
        )
        table.add_row(
            result.prompt_type.replace('_', ' ').title(),
            preview,
            *(str(value) for value in metrics),
        )

    console.print(table)

## Analysis and Visualization

Let's analyze the performance characteristics of each approach.

In [None]:
# Convert results to DataFrame for analysis
# `asdict` is imported here because the notebook's import cell only brings in
# `dataclass`; without this import the cell fails with a NameError.
from dataclasses import asdict

# Only successful runs carry meaningful latency and token numbers.
successful_rows = [asdict(r) for r in results if r.success] if results else []

if successful_rows:
    df = pd.DataFrame(successful_rows)

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Latency comparison
    axes[0, 0].bar(df['prompt_type'], df['latency_ms'], alpha=0.7, color='skyblue')
    axes[0, 0].set_title('Latency by Prompt Type')
    axes[0, 0].set_ylabel('Latency (ms)')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # 2. Token usage comparison
    axes[0, 1].bar(df['prompt_type'], df['total_tokens'], alpha=0.7, color='lightcoral')
    axes[0, 1].set_title('Total Tokens by Prompt Type')
    axes[0, 1].set_ylabel('Total Tokens')
    axes[0, 1].tick_params(axis='x', rotation=45)

    # 3. Input vs Output tokens (stacked: output drawn on top of input)
    x = range(len(df))
    axes[1, 0].bar(x, df['input_tokens'], alpha=0.7, label='Input Tokens', color='lightgreen')
    axes[1, 0].bar(x, df['output_tokens'], bottom=df['input_tokens'], alpha=0.7,
                   label='Output Tokens', color='orange')
    axes[1, 0].set_title('Input vs Output Tokens')
    axes[1, 0].set_ylabel('Tokens')
    axes[1, 0].set_xticks(x)
    axes[1, 0].set_xticklabels(df['prompt_type'], rotation=45)
    axes[1, 0].legend()

    # 4. Efficiency (tokens per ms)
    df['efficiency'] = df['total_tokens'] / df['latency_ms']
    axes[1, 1].bar(df['prompt_type'], df['efficiency'], alpha=0.7, color='gold')
    axes[1, 1].set_title('Efficiency (Tokens per ms)')
    axes[1, 1].set_ylabel('Tokens/ms')
    axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    console.print("\n📊 Summary Statistics:\n")
    summary_table = Table()
    summary_table.add_column("Metric")
    summary_table.add_column("Fastest")
    summary_table.add_column("Most Efficient")
    summary_table.add_column("Most Detailed")

    # Pick the winning row for each criterion.
    fastest = df.loc[df['latency_ms'].idxmin()]
    most_efficient = df.loc[df['efficiency'].idxmax()]
    most_detailed = df.loc[df['output_tokens'].idxmax()]

    summary_table.add_row(
        "Prompt Type",
        fastest['prompt_type'],
        most_efficient['prompt_type'],
        most_detailed['prompt_type']
    )

    summary_table.add_row(
        "Latency (ms)",
        str(fastest['latency_ms']),
        str(most_efficient['latency_ms']),
        str(most_detailed['latency_ms'])
    )

    console.print(summary_table)
elif results:
    # Every variant failed: an empty DataFrame has no columns to plot, so
    # report the situation instead of crashing on the first chart.
    console.print("⚠️ No successful results to analyze - every prompt variant failed.")

## Key Insights from Prompt Variant Testing

### 🚀 **Zero-Shot Simple**
- **Pros**: Fastest, lowest token usage, direct
- **Cons**: Less reliable, minimal context
- **Best for**: Simple, well-defined tasks with clear categories

### 📋 **Zero-Shot Detailed**  
- **Pros**: Clear instructions, better reliability
- **Cons**: Higher token usage, moderate latency
- **Best for**: Tasks requiring specific criteria or edge case handling

### 📚 **Few-Shot**
- **Pros**: Demonstrates desired format, handles ambiguous cases better
- **Cons**: Higher input tokens, requires good examples
- **Best for**: Tasks where format matters, complex classification

### 🧠 **Chain-of-Thought**
- **Pros**: Shows reasoning, more accurate for complex decisions
- **Cons**: Highest latency and token usage
- **Best for**: Complex reasoning, when explanation is needed

### 🤖 **System Prompt**
- **Pros**: Sets clear role and expectations
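- **Cons**: May not work consistently across all models; note that this demo simulates the system prompt inside a single user message rather than sending a separate `system`-role message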
- **Best for**: Conversational interfaces, role-specific tasks

## Choosing the Right Approach

Consider these factors:

1. **Task Complexity**: Simple tasks → Zero-shot; Complex tasks → Chain-of-thought
2. **Latency Requirements**: Real-time apps → Simple prompts; Batch processing → Detailed prompts
3. **Accuracy Needs**: High stakes → Few-shot or CoT; General use → Zero-shot
4. **Cost Constraints**: Budget-sensitive → Simple prompts; Quality-focused → Detailed prompts
5. **Consistency Requirements**: Variable inputs → Few-shot; Stable inputs → Zero-shot

## Next Steps

In Lab 5, you'll implement and test your own prompt variants with a systematic evaluation framework!