In [None]:
# Prints a fixed message; presumably a quick check that the kernel runs cells.
print('Setup complete.')

# Lab 5: Prompt Engineering Styles

## Learning Objectives
- Master different prompt engineering techniques
- Compare zero-shot, few-shot, and chain-of-thought approaches
- Build a systematic prompt testing framework
- Analyze performance trade-offs between different styles

## Lab Overview
You'll implement and test multiple prompt engineering styles on a classification task, then build a comparison framework to analyze their effectiveness.

**Estimated Time:** 60 minutes

## Your Mission
Build a prompt testing lab that compares different approaches for email classification (spam/not spam).

In [None]:
# Setup and imports
!pip install asksageclient pip_system_certs
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import time
import tiktoken
from pathlib import Path
from typing import Dict, List, Any

# Import our AskSage client
from asksageclient import AskSageClient

# Get API credentials from Google Colab secrets
# (both values are read by name from the Colab `userdata` store)
from google.colab import userdata
api_key = userdata.get('ASKSAGE_API_KEY')
email = userdata.get('ASKSAGE_EMAIL')

# Initialize client and tokenizer
# `client` is the module-level handle later passed to PromptStyleTester.
client = AskSageClient(api_key=api_key, email=email)
# NOTE(review): this tokenizer uses the encoding tiktoken maps to "gpt-4",
# while count_tokens() in a later cell uses "o200k_base" — confirm which one
# is intended for token accounting.
tokenizer = tiktoken.encoding_for_model("gpt-4")
print("AskSage client initialized successfully")
print("Ready to showcase AI capabilities...")

In [None]:
import os
import time
import csv
from typing import Optional, Dict

import tiktoken
from google.colab import userdata

# `console` is used throughout the notebook as a rich-style printer
# (console.print("[red]...[/red]"), console.print(table)), so it must be a
# Console *instance*, not a module. Build it from rich when available and fall
# back to plain print() otherwise so the rest of the notebook still runs.
try:
    from rich.console import Console
    console = Console()
except ImportError:
    class _PlainConsole:
        """Minimal stand-in exposing the ``print`` method the notebook calls."""

        def print(self, *args, **kwargs):
            # Styling keyword arguments are ignored; markup is printed verbatim.
            print(*args)

    console = _PlainConsole()

# Credentials and endpoint come from the Colab secrets store.
ASKSAGE_API_KEY = userdata.get("ASKSAGE_API_KEY")
ASKSAGE_BASE_URL = userdata.get("ASKSAGE_BASE_URL")
ASKSAGE_EMAIL = userdata.get("ASKSAGE_EMAIL")

# Fail fast when a required secret is missing; the base URL is optional and
# is reported as "(default)" below when unset.
assert ASKSAGE_API_KEY, "ASKSAGE_API_KEY not provided."
assert ASKSAGE_EMAIL, "ASKSAGE_EMAIL not provided."


print("✓ Secrets loaded")
print("  • EMAIL:", ASKSAGE_EMAIL)
print("  • BASE URL:", ASKSAGE_BASE_URL or "(default)")


# Pricing (USD per 1,000,000 tokens) — exact values requested
# Maps model name -> separate rates for input tokens ("input_per_m") and
# output tokens ("output_per_m"). cost_usd() looks models up in this table.
PRICES_PER_M = {
    "gpt-5":      {"input_per_m": 1.25, "output_per_m": 10.00},
    "gpt-5-mini": {"input_per_m": 0.25, "output_per_m": 2.00},
}


# Tokenizer (OpenAI-style)
# `enc` is the shared encoder used by count_tokens().
# NOTE(review): presumably "o200k_base" was picked to approximate the models
# priced above — confirm it matches the model actually queried.
ENCODING_NAME = "o200k_base"
enc = tiktoken.get_encoding(ENCODING_NAME)


def count_tokens(text: str) -> int:
    """Return how many tokens the module-level encoder ``enc`` yields for *text*.

    Falsy input (``None`` or an empty string) is encoded as an empty string.
    """
    safe_text = text if text else ""
    token_ids = enc.encode(safe_text)
    return len(token_ids)


def cost_usd(
    model: str,
    input_tokens: int,
    output_tokens: int,
    *,
    prices: Optional[Dict[str, Dict[str, float]]] = None,
) -> float:
    """Return the USD cost of one call (input cost + output cost).

    Args:
        model: Key into the price table.
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.
        prices: Optional price table with the same shape as ``PRICES_PER_M``
            (``{model: {"input_per_m": ..., "output_per_m": ...}}``, rates in
            USD per 1,000,000 tokens). When omitted, ``PRICES_PER_M`` is used.

    Raises:
        ValueError: If *model* is not present in the price table.
    """
    # Only touch the module-level table when the caller did not supply one.
    table = PRICES_PER_M if prices is None else prices
    if model not in table:
        raise ValueError(f"Unknown model: {model}")
    rates = table[model]
    input_cost = (input_tokens / 1_000_000) * rates["input_per_m"]
    output_cost = (output_tokens / 1_000_000) * rates["output_per_m"]
    return input_cost + output_cost

## Task 1: Implement Different Prompt Styles

**TODO:** Complete the `PromptStyleTester` class. The four prompt builders are provided; fill in the API call, output-token count, and cost calculation in `test_style()`.

In [None]:
class PromptStyleTester:
    """Test different prompt engineering styles for email classification.

    Each ``*_prompt`` / ``*_style`` method turns a raw email into a prompt
    string for one style. ``test_style`` scores a single (style, email) pair
    and ``run_comparison`` runs every style over a list of labelled emails.
    All result dicts accumulate in ``self.results``.
    """

    def __init__(self, client):
        """Store the API client and start with an empty result list."""
        self.client = client
        self.results = []

    def zero_shot_prompt(self, email: str) -> str:
        """Simple zero-shot classification prompt"""
        return f"""Classify this email as either 'spam' or 'not spam':

Email: {email}

Classification:"""

    def few_shot_prompt(self, email: str) -> str:
        """Few-shot prompt with examples"""
        return f"""Classify emails as 'spam' or 'not spam' based on these examples:

Email: "Congratulations! You've won $1000! Click here now!"
Classification: spam

Email: "Meeting scheduled for tomorrow at 2pm in conference room A"
Classification: not spam

Email: "URGENT: Your account will be suspended unless you verify now!"
Classification: spam

Email: {email}
Classification:"""

    def chain_of_thought_prompt(self, email: str) -> str:
        """Chain-of-thought reasoning prompt"""
        # Written line by line so the trailing space after each numbered
        # heading (part of the original prompt text) stays visible.
        return (
            "Analyze this email step by step to determine if it's spam:\n"
            "\n"
            f"Email: {email}\n"
            "\n"
            "Let me think through this:\n"
            "1. Sender analysis: \n"
            "2. Content analysis: \n"
            "3. Urgency/pressure tactics: \n"
            "4. Suspicious elements: \n"
            "\n"
            "Based on this analysis, classification:"
        )

    def system_prompt_style(self, email: str) -> str:
        """Using system message approach"""
        return f"""You are an expert email security analyst. Classify this email as 'spam' or 'not spam'.

Consider these factors:
- Sender credibility
- Content quality and grammar
- Urgency or pressure tactics
- Suspicious links or requests

Email: {email}

Classification:"""

    @staticmethod
    def _extract_label(text):
        """Return 'spam' or 'not spam' for the last label mentioned in *text*.

        The last mention wins because chain-of-thought style answers put the
        verdict at the end. Returns ``None`` when no label is mentioned.
        This is a heuristic: phrasings such as "isn't spam" are not handled.
        """
        import re  # local import: this cell has no import section of its own

        mentions = re.findall(r"\b(not[\s_-]*spam|spam)\b", (text or "").lower())
        if not mentions:
            return None
        return "not spam" if mentions[-1].startswith("not") else "spam"

    @classmethod
    def _is_correct(cls, expected, response_text):
        """Return True when the label in *response_text* matches *expected*.

        A plain substring test cannot be used here: 'spam' is a substring of
        'not spam', so containment in either direction treats the two labels
        as interchangeable.
        """
        wanted = cls._extract_label(expected)
        if wanted is None:
            # Expected label is outside the spam / not-spam set: fall back to
            # a case- and whitespace-insensitive exact comparison.
            return (response_text or "").strip().lower() == (expected or "").strip().lower()
        return cls._extract_label(response_text) == wanted

    def test_style(self, style_name: str, prompt_func, email: str, expected: str):
        """Test a single prompt style.

        Builds the prompt with *prompt_func*, records timing, token counts and
        cost, and checks the predicted label against *expected*.

        Returns:
            The result dict (also appended to ``self.results``), or ``None``
            if anything raised while testing; the error is printed instead.
        """
        start_time = time.time()

        try:
            prompt = prompt_func(email)
            input_tokens = count_tokens(prompt)

            # TODO: Add the API call to client.query() here


            # TODO: Extract response text and count output tokens

            # Placeholder for now
            response_text = "not spam"  # TODO: Replace with actual API response
            output_tokens = 10  # TODO: Replace with actual token count

            end_time = time.time()
            response_time = end_time - start_time

            # TODO: Calculate cost using cost_usd function
            # cost = cost_usd("gpt-5-mini", input_tokens, output_tokens)
            cost = 0.001  # TODO: Replace with actual cost calculation

            # Determine if prediction is correct (label comparison, not
            # substring containment)
            is_correct = self._is_correct(expected, response_text)

            result = {
                'style': style_name,
                'email': email[:50] + '...',
                'expected': expected,
                'predicted': response_text,
                'correct': is_correct,
                'response_time': response_time,
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                'cost': cost
            }

            self.results.append(result)
            return result

        except Exception as e:
            # Best-effort: one failing call must not abort the whole comparison.
            console.print(f"[red]Error testing {style_name}: {e}[/red]")
            return None

    def run_comparison(self, test_emails):
        """Run all prompt styles on test emails.

        Args:
            test_emails: Iterable of dicts with an 'email' text and its
                expected 'label'.

        Returns:
            ``self.results``, i.e. every result recorded so far (including
            those from earlier calls).
        """
        styles = [
            ('Zero-shot', self.zero_shot_prompt),
            ('Few-shot', self.few_shot_prompt),
            ('Chain-of-thought', self.chain_of_thought_prompt),
            ('System prompt', self.system_prompt_style)
        ]

        console.print("\n🧪 [bold blue]Testing Prompt Styles[/bold blue]")

        for email_data in test_emails:
            email = email_data['email']
            expected = email_data['label']

            console.print(f"\n📧 Testing email: {email[:50]}...")

            for style_name, prompt_func in styles:
                result = self.test_style(style_name, prompt_func, email, expected)
                if result:
                    status = "✅" if result['correct'] else "❌"
                    console.print(f"  {status} {style_name}: {result['predicted']} (${result['cost']:.4f})")

        return self.results

# Test data
# Each entry pairs a raw email body ('email') with its ground-truth 'label';
# PromptStyleTester.run_comparison() reads exactly these two keys.
test_emails = [
    {'email': 'Congratulations! You have won $10,000! Click here to claim your prize now!', 'label': 'spam'},
    {'email': 'Hi John, can we reschedule our meeting to 3pm tomorrow? Thanks, Sarah', 'label': 'not spam'},
    {'email': 'URGENT: Your account will be closed in 24 hours. Verify immediately!', 'label': 'spam'},
    {'email': 'Quarterly report is attached. Please review before Friday\'s board meeting.', 'label': 'not spam'}
]

# Initialize tester
# `client` is the module-level AskSageClient created in the setup cell.
tester = PromptStyleTester(client)
console.print("✅ PromptStyleTester initialized")
console.print("⚠️ TODO: Complete the API integration in test_style() method")

## Task 2: Build Performance Analysis Framework

**TODO:** Complete the analysis functions to compare prompt performance.

In [None]:
# Everything this cell needs is imported here so it runs in notebook order
# (defaultdict and mean were previously only imported by a later cell).
from collections import defaultdict
from dataclasses import dataclass
from statistics import mean
from typing import Any, Dict, List, Optional


@dataclass
class StyleMetrics:
    """Aggregate performance figures for a single prompt style."""

    style: str  # value of the 'style' key shared by the grouped results
    n: int  # number of results recorded for this style
    accuracy: float  # fraction of results whose 'correct' is truthy
    avg_response_time: float  # mean of 'response_time'
    avg_tokens: float  # mean of input_tokens + output_tokens per result
    total_cost: float  # sum of 'cost'
    cost_per_correct: Optional[float]  # total_cost / #correct; None if none correct
    overall_score: float  # user-defined weighted score


class PromptAnalyzer:
    """
    Fill in the blanks where marked. Produces per-style metrics and a simple table.

    Results are grouped by their 'style' key. Each result dict must provide
    'correct', 'response_time', 'input_tokens', 'output_tokens' and 'cost'.
    """
    def __init__(self, results: List[Dict[str, Any]]):
        """Group *results* by style and set the default score weights."""
        self.results = results
        self.by_style: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
        for r in results:
            self.by_style[r['style']].append(r)

        # ---- BEGIN USER-EDITABLE AREA (weights for overall score) ----
        self.weights = {
            "accuracy": 0.55,
            "speed":    0.15,  # inverse of response time
            "tokens":   0.10,  # inverse of token usage
            "cost":     0.20,  # inverse of cost
        }
        # ---- END USER-EDITABLE AREA ----

    def _normalize_inverse(self, values: List[float], eps: float = 1e-9) -> List[float]:
        """
        Larger-is-better normalization for inverse metrics (lower is better raw).

        The smallest raw value maps to 1.0 and the largest to 0.0. When all
        values are equal every entry maps to 0.0 (``eps`` only guards the
        division), so that metric contributes nothing to any style's score.
        """
        if not values:
            return []
        mx = max(values)
        mn = min(values)
        # Convert to inverse by flipping: inv = (max - x) range normalized
        rng = max(mx - mn, eps)
        return [(mx - v) / rng for v in values]

    def compute_metrics(self) -> List[StyleMetrics]:
        """Return one StyleMetrics per style, sorted by overall_score, best first."""
        records: List[StyleMetrics] = []
        styles = list(self.by_style.keys())
        # Raw per-style vectors, kept in `styles` order for normalization
        times, toks, costs = [], [], []

        # Pre-pass to compute raw per-style aggregates
        aggregates = {}
        for s in styles:
            rs = self.by_style[s]
            n = len(rs)
            correct = sum(1 for r in rs if r['correct'])
            accuracy = correct / max(1, n)
            avg_time = mean(r['response_time'] for r in rs) if n else 0.0
            avg_tokens = mean((r['input_tokens'] + r['output_tokens']) for r in rs) if n else 0.0
            total_cost = sum(r['cost'] for r in rs)
            cost_per_correct = (total_cost / correct) if correct > 0 else None

            aggregates[s] = (n, accuracy, avg_time, avg_tokens, total_cost, cost_per_correct)
            times.append(avg_time)
            toks.append(avg_tokens)
            costs.append(total_cost)

        # Normalize inverse metrics (time, tokens, cost)
        norm_speed = self._normalize_inverse(times)
        norm_tokens = self._normalize_inverse(toks)
        norm_costs = self._normalize_inverse(costs)

        # Build final records with weighted score
        for idx, s in enumerate(styles):
            n, accuracy, avg_time, avg_tokens, total_cost, cost_per_correct = aggregates[s]
            score = (
                self.weights["accuracy"] * accuracy
                + self.weights["speed"]  * norm_speed[idx]
                + self.weights["tokens"] * norm_tokens[idx]
                + self.weights["cost"]   * norm_costs[idx]
            )
            records.append(StyleMetrics(
                style=s,
                n=n,
                accuracy=accuracy,
                avg_response_time=avg_time,
                avg_tokens=avg_tokens,
                total_cost=total_cost,
                cost_per_correct=cost_per_correct,
                overall_score=score
            ))
        return sorted(records, key=lambda m: m.overall_score, reverse=True)

    def print_table(self, metrics: List[StyleMetrics]) -> None:
        """Render *metrics* as a rich table on the notebook's ``console``."""
        # Imported here so the cell does not depend on a later cell having
        # brought Table into scope.
        from rich.table import Table

        table = Table(title="Prompt Style Comparison")
        table.add_column("Style")
        table.add_column("N", justify="right")
        table.add_column("Accuracy", justify="right")
        table.add_column("Avg Time (s)", justify="right")
        table.add_column("Avg Tokens", justify="right")
        table.add_column("Total Cost ($)", justify="right")
        table.add_column("Cost/Correct ($)", justify="right")
        table.add_column("Score", justify="right")

        for m in metrics:
            table.add_row(
                m.style,
                str(m.n),
                f"{m.accuracy:.2%}",
                f"{m.avg_response_time:.3f}",
                f"{m.avg_tokens:.1f}",
                f"{m.total_cost:.4f}",
                ("—" if m.cost_per_correct is None else f"{m.cost_per_correct:.4f}"),
                f"{m.overall_score:.3f}",
            )
        console.print(table)

    def recommend(self, metrics: List[StyleMetrics]) -> Dict[str, Any]:
        """
        Return a small recommendation dict.

        *metrics* is expected in best-first order (as returned by
        compute_metrics); every value is None when *metrics* is empty.
        """
        best = metrics[0] if metrics else None
        fastest = min(metrics, key=lambda m: m.avg_response_time) if metrics else None
        cheapest = min(metrics, key=lambda m: m.total_cost) if metrics else None
        most_accurate = max(metrics, key=lambda m: m.accuracy) if metrics else None
        return {
            "best_overall": best.style if best else None,
            "most_accurate": most_accurate.style if most_accurate else None,
            "fastest": fastest.style if fastest else None,
            "cheapest_total": cheapest.style if cheapest else None,
        }


## Task 3: Run the Comparison Experiment

**TODO:** Execute your prompt style comparison and analyze the results.

In [None]:
# ===== Task 3 — PromptAnalyzer (drop-in) =====
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from statistics import mean
from collections import defaultdict

class PlainTextTable:
    """Plain-text stand-in for ``rich.table.Table`` used when rich is unavailable.

    Columns and rows are accumulated and rendered (title, header line, then
    one tab-separated line per row) only when the table is converted to a
    string, so printing the table object shows the whole table at once.
    """

    def __init__(self, title=""):
        self.title = title
        self.columns = []
        self.rows = []

    def add_column(self, header="", *args, **kwargs):
        # Styling options such as justify= are accepted and ignored.
        self.columns.append(str(header))

    def add_row(self, *cells, **kwargs):
        self.rows.append([str(cell) for cell in cells])

    def __str__(self):
        lines = []
        if self.title:
            lines.append(self.title)
        if self.columns:
            lines.append("\t".join(self.columns))
        lines.extend("\t".join(row) for row in self.rows)
        return "\n".join(lines)


class PlainTextConsole:
    """Plain-print stand-in for ``rich.console.Console``."""

    def print(self, *a, **k):
        # Keyword styling arguments are ignored.
        print(*a)


try:
    from rich.console import Console
    from rich.table import Table
    _console = Console()
except Exception:
    # Deliberately broad: any failure to set up rich falls back to plain text.
    Table = PlainTextTable
    _console = PlainTextConsole()

@dataclass
class StyleMetrics:
    """Aggregate performance figures for a single prompt style."""

    style: str  # value of the 'style' key shared by the grouped results
    n: int  # number of results recorded for this style
    accuracy: float  # fraction of results whose 'correct' is truthy
    avg_response_time: float  # mean of 'response_time'
    avg_tokens: float  # mean of input_tokens + output_tokens per result
    total_cost: float  # sum of 'cost'
    cost_per_correct: Optional[float]  # total_cost / #correct; None if none correct
    overall_score: float  # weighted composite


class PromptAnalyzer:
    """
    Analyze a list of PromptStyleTester results (dicts) and compute per-style metrics.
    Usage:
      analyzer = PromptAnalyzer(results)
      metrics = analyzer.compute_metrics()  # List[StyleMetrics], sorted best-first
      analyzer.print_table(metrics)
      rec = analyzer.recommend(metrics)
    """

    # Weight knobs (tune to taste)
    DEFAULT_WEIGHTS = {
        "accuracy": 0.55,  # higher is better
        "speed":    0.15,  # lower time is better (normalized inverse)
        "tokens":   0.10,  # lower tokens is better (normalized inverse)
        "cost":     0.20,  # lower total cost is better (normalized inverse)
    }

    def __init__(self, results: List[Dict[str, Any]],
                 weights: Optional[Dict[str, float]] = None):
        """Group *results* by their 'style' key and resolve the score weights.

        Args:
            results: Result dicts providing 'style', 'correct',
                'response_time', 'input_tokens', 'output_tokens' and 'cost'.
            weights: Optional overrides for any of the "accuracy", "speed",
                "tokens" and "cost" weights; keys left out keep their
                default value.
        """
        self.results = results
        self.by_style: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
        for r in results:
            self.by_style[r['style']].append(r)

        # Merge over the defaults so a partial override cannot cause a
        # KeyError when the score is computed.
        self.weights = {**self.DEFAULT_WEIGHTS, **(weights or {})}

    @staticmethod
    def _normalize_inverse(values: List[float], eps: float = 1e-9) -> List[float]:
        """
        Normalize where lower-is-better into higher-is-better [0..1].

        The smallest raw value maps to 1.0 and the largest to 0.0. When all
        values are equal every entry maps to 0.0 (``eps`` only guards the
        division).
        """
        if not values:
            return []
        mx, mn = max(values), min(values)
        rng = max(mx - mn, eps)
        return [(mx - v) / rng for v in values]

    def compute_metrics(self) -> List[StyleMetrics]:
        """Return one StyleMetrics per style, sorted by overall_score, best first."""
        styles = list(self.by_style.keys())
        aggregates = {}
        # Raw per-style vectors, kept in `styles` order for normalization
        times, toks, costs = [], [], []

        # aggregate raw stats
        for s in styles:
            rs = self.by_style[s]
            n = len(rs) or 1  # guards the division below
            correct = sum(1 for r in rs if r.get('correct'))
            accuracy = correct / n
            avg_time = mean(r['response_time'] for r in rs) if rs else 0.0
            avg_tokens = mean((r['input_tokens'] + r['output_tokens']) for r in rs) if rs else 0.0
            total_cost = sum(r['cost'] for r in rs) if rs else 0.0
            cost_per_correct = (total_cost / correct) if correct else None

            aggregates[s] = (n, accuracy, avg_time, avg_tokens, total_cost, cost_per_correct)
            times.append(avg_time)
            toks.append(avg_tokens)
            costs.append(total_cost)

        # normalize inverse metrics
        norm_speed = self._normalize_inverse(times)
        norm_tokens = self._normalize_inverse(toks)
        norm_costs = self._normalize_inverse(costs)

        # compute weighted score
        metrics: List[StyleMetrics] = []
        for i, s in enumerate(styles):
            n, accuracy, avg_time, avg_tokens, total_cost, cost_per_correct = aggregates[s]
            score = (
                self.weights["accuracy"] * accuracy +
                self.weights["speed"]    * norm_speed[i] +
                self.weights["tokens"]   * norm_tokens[i] +
                self.weights["cost"]     * norm_costs[i]
            )
            metrics.append(StyleMetrics(
                style=s,
                n=n,
                accuracy=accuracy,
                avg_response_time=avg_time,
                avg_tokens=avg_tokens,
                total_cost=total_cost,
                cost_per_correct=cost_per_correct,
                overall_score=score
            ))
        return sorted(metrics, key=lambda m: m.overall_score, reverse=True)

    def print_table(self, metrics: List[StyleMetrics]) -> None:
        """Render *metrics* as a table via the cell's ``_console``."""
        tbl = Table(title="Prompt Style Comparison")
        tbl.add_column("Style")
        tbl.add_column("N", justify="right")
        tbl.add_column("Accuracy", justify="right")
        tbl.add_column("Avg Time (s)", justify="right")
        tbl.add_column("Avg Tokens", justify="right")
        tbl.add_column("Total Cost ($)", justify="right")
        tbl.add_column("Cost/Correct ($)", justify="right")
        tbl.add_column("Score", justify="right")

        for m in metrics:
            tbl.add_row(
                m.style,
                str(m.n),
                f"{m.accuracy:.2%}",
                f"{m.avg_response_time:.3f}",
                f"{m.avg_tokens:.1f}",
                f"{m.total_cost:.4f}",
                "—" if m.cost_per_correct is None else f"{m.cost_per_correct:.4f}",
                f"{m.overall_score:.3f}",
            )
        _console.print(tbl)

    def recommend(self, metrics: List[StyleMetrics]) -> Dict[str, Optional[str]]:
        """Name the best style overall and per single criterion.

        *metrics* is expected in best-first order (as returned by
        compute_metrics); every value is None when *metrics* is empty.
        """
        if not metrics:
            return {"best_overall": None, "most_accurate": None, "fastest": None, "cheapest_total": None}
        best_overall = metrics[0]
        most_accurate = max(metrics, key=lambda m: m.accuracy)
        fastest = min(metrics, key=lambda m: m.avg_response_time)
        cheapest = min(metrics, key=lambda m: m.total_cost)
        return {
            "best_overall": best_overall.style,
            "most_accurate": most_accurate.style,
            "fastest": fastest.style,
            "cheapest_total": cheapest.style,
        }


## Task 4: Design Your Own Prompt Style

**TODO:** Create and test your own innovative prompt engineering approach.

In [None]:
# Setup and imports
!pip install asksageclient pip_system_certs
# NOTE(review): this cell repeats the notebook's first setup cell verbatim
# under the Task 4 heading — confirm whether a custom-prompt scaffold was
# intended here instead.
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import time
import tiktoken
from pathlib import Path
from typing import Dict, List, Any

# Import our AskSage client
from asksageclient import AskSageClient

# Get API credentials from Google Colab secrets
# (both values are read by name from the Colab `userdata` store)
from google.colab import userdata
api_key = userdata.get('ASKSAGE_API_KEY')
email = userdata.get('ASKSAGE_EMAIL')

# Initialize client and tokenizer
# Re-running this rebinds the module-level `client` and `tokenizer`.
client = AskSageClient(api_key=api_key, email=email)
tokenizer = tiktoken.encoding_for_model("gpt-4")
print("AskSage client initialized successfully")
print("Ready to showcase AI capabilities...")

## 🎯 Exit Ticket

Before completing this lab, make sure you can check off every deliverable and answer each knowledge-check question below:

### ✅ Deliverables Checklist

- [ ] **Implemented 4 prompt styles**: Zero-shot, few-shot, chain-of-thought, and system prompt
- [ ] **Built performance analyzer**: Accuracy, response time, token usage, and cost metrics
- [ ] **Generated comparison results**: Table and visualizations showing style performance
- [ ] **Created custom prompt style**: Your own innovative approach with analysis

### 🧠 Knowledge Check

1. **Which prompt style worked best for accuracy?** Why do you think that is?

2. **What trade-offs did you observe** between accuracy, speed, and token usage?

3. **When would you use each style** in a production system?

4. **What made your custom approach unique?** What problem does it solve?

### 🚀 Extensions (Optional)

- **Multi-model comparison**: Test styles across different models (GPT vs Claude)
- **Domain adaptation**: Try styles on different classification tasks
- **Prompt optimization**: A/B test variations of your best-performing style
- **Cost analysis**: Calculate actual costs for different approaches

### 📊 Success Metrics

- Built working prompt comparison framework
- Achieved >80% accuracy on at least one style
- Identified clear trade-offs between approaches
- Designed innovative custom prompt approach

**Time Check:** This lab should take about 60 minutes. If you're running over, focus on getting the basic comparison working first, then add custom styles.

Ready for Lab 6: Prompt Library? Let's build a reusable collection of battle-tested prompts! 🚀