# Session 3: Experimental Design I
## Constructing Diagnostic Scenarios

**Production LLM Deployment: Risk Characterization Before Failure**

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Javihaus/Production_LLM_Deployment/blob/main/sessions/session_03_experimental_design_i/notebook.ipynb)

---

**Learning Objectives:**
1. Design balanced test distributions
2. Control semantic content while varying format
3. Establish deterministic ground truth
4. Create scenario sets for your deployment domain

## Setup

In [None]:
!pip install -q anthropic numpy pandas matplotlib seaborn

import json
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import anthropic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

# Resolve the Anthropic API key: try Colab's secret store first, otherwise
# fall back to the ANTHROPIC_API_KEY environment variable.
try:
    from google.colab import userdata
    api_key = userdata.get('ANTHROPIC_API_KEY')
except Exception:
    # Reached when google.colab cannot be imported or the secret lookup raises.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed.
    import os
    api_key = os.environ.get('ANTHROPIC_API_KEY')

client = anthropic.Anthropic(api_key=api_key)
print("Setup complete!")

## Part 1: Why Balanced Test Sets Matter

Unbalanced test sets can hide severe failures behind high accuracy numbers.

In [None]:
def demonstrate_balance_importance():
    """Contrast a constant-"YES" model's accuracy on skewed vs. even test sets.

    Builds a 9-positive/1-negative set and a 5/5 set, scores a predictor that
    always answers "YES" on each, and prints both accuracies.
    """

    def constant_yes(_case):
        # Stand-in model: ignores its input and always answers "YES".
        return "YES"

    def count_hits(cases):
        # Number of cases where the constant answer matches the ground truth.
        hits = 0
        for case in cases:
            if constant_yes(case) == case["correct"]:
                hits += 1
        return hits

    # Skewed set: nine positives followed by a single negative.
    skewed = [{"scenario": f"Positive case {i}", "correct": "YES"} for i in range(9)]
    skewed.append({"scenario": "Negative case", "correct": "NO"})

    # Even set: five positives followed by five negatives.
    even = []
    for label, truth in (("Positive", "YES"), ("Negative", "NO")):
        even.extend({"scenario": f"{label} case {i}", "correct": truth} for i in range(5))

    skewed_hits = count_hits(skewed)
    even_hits = count_hits(even)
    rule = "=" * 60

    print(rule)
    print("MODEL THAT ALWAYS SAYS 'YES'")
    print(rule)

    print("\nUnbalanced test set (90% positive):")
    print(f"  Accuracy: {skewed_hits}/{len(skewed)} = {skewed_hits / len(skewed):.1%}")
    print("  Looks great! But...")

    print("\nBalanced test set (50% positive):")
    print(f"  Accuracy: {even_hits}/{len(even)} = {even_hits / len(even):.1%}")
    print("  Reveals the true failure!")

    print("\n" + rule)
    print("LESSON: Always use balanced test sets to reveal true performance.")
    print(rule)


demonstrate_balance_importance()

## Part 2: Test Scenario Design Framework

In [None]:
@dataclass
class TestScenario:
    """A single test scenario with a known correct answer.

    The same question can be phrased in several prompt formats; ``formats``
    maps a format name to its prompt text. Omitting ``formats`` (or passing
    ``None``) gives each instance its own empty dict.
    """
    # Identifier for the scenario, e.g. "MED001".
    id: str
    # Human-readable summary of what the scenario tests.
    description: str
    # Ground-truth answer that model replies are compared against.
    correct_answer: str
    category: str  # e.g., "positive", "negative", "edge_case"
    difficulty: str  # e.g., "easy", "medium", "hard"
    # Format name -> prompt text. Optional because None is the "not given"
    # sentinel; __post_init__ replaces it with a fresh dict.
    formats: Optional[Dict[str, str]] = None

    def __post_init__(self):
        # @dataclass rejects a literal {} default (mutable default), so None is
        # used as the sentinel and normalized here, one dict per instance.
        if self.formats is None:
            self.formats = {}


class TestSuiteBuilder:
    """Collect test scenarios for one domain and report their distribution."""

    def __init__(self, domain: str):
        # Label printed in the report header.
        self.domain = domain
        # Scenarios in insertion order.
        self.scenarios: List["TestScenario"] = []

    def add_scenario(self, scenario: "TestScenario"):
        """Append a scenario to the suite."""
        self.scenarios.append(scenario)

    def check_balance(
        self,
        tolerance: int = 1,
        expected_categories: Optional[List[str]] = None,
    ) -> Dict:
        """Summarize how scenarios are spread across categories.

        Args:
            tolerance: Largest allowed difference between the most and least
                populated category for the suite to count as balanced.
            expected_categories: Categories that must be represented. Any that
                have no scenarios are counted as 0, so a suite missing a whole
                category is not reported as balanced by default. When omitted,
                only categories that actually occur are compared.

        Returns:
            Dict with "total" (scenario count), "distribution" (category ->
            "count/total (share)" string) and "is_balanced" (bool). A suite
            with no categories at all is reported as balanced.
        """
        # Seed expected categories with zero so absences show up in the spread.
        counts = {name: 0 for name in (expected_categories or [])}
        for s in self.scenarios:
            counts[s.category] = counts.get(s.category, 0) + 1

        total = len(self.scenarios)
        distribution = {}
        for name, count in counts.items():
            # total is 0 only when expected categories were seeded on an empty suite.
            share = count / total if total else 0.0
            distribution[name] = f"{count}/{total} ({share:.1%})"

        spread = max(counts.values()) - min(counts.values()) if counts else 0
        return {
            "total": total,
            "distribution": distribution,
            "is_balanced": spread <= tolerance,
        }

    def generate_report(
        self,
        tolerance: int = 1,
        expected_categories: Optional[List[str]] = None,
    ):
        """Print the suite's size, category distribution, and format coverage.

        ``tolerance`` and ``expected_categories`` are forwarded to
        ``check_balance``.
        """
        balance = self.check_balance(tolerance, expected_categories)

        print("=" * 60)
        print(f"TEST SUITE: {self.domain}")
        print("=" * 60)
        print(f"\nTotal scenarios: {balance['total']}")
        print("\nDistribution:")
        for cat, dist in balance['distribution'].items():
            print(f"  {cat}: {dist}")
        print(f"\nBalanced: {'YES' if balance['is_balanced'] else 'NO - NEEDS ADJUSTMENT'}")

        # Format coverage: how many scenarios provide a prompt in each format.
        format_counts = {}
        for s in self.scenarios:
            for fmt in s.formats:
                format_counts[fmt] = format_counts.get(fmt, 0) + 1

        if format_counts:
            print("\nFormat coverage:")
            for fmt, count in format_counts.items():
                print(f"  {fmt}: {count}/{balance['total']} scenarios")


# Example: Build a medication timing test suite
builder = TestSuiteBuilder("Medication Timing Constraints")

# Two positive cases (elapsed time meets the required gap) followed by two
# negative cases (elapsed time falls short). Order is preserved in the suite.
for med_case in (
    # Positive: 8AM -> 1PM is 5h, required gap is 4h.
    TestScenario(
        id="MED001",
        description="Aspirin at 8AM, ibuprofen at 1PM, 4-hour requirement",
        correct_answer="YES",
        category="positive",
        difficulty="easy",
        formats={
            "natural": "I took aspirin at 8:00 AM. Can I take ibuprofen at 1:00 PM? (4-hour wait required) YES or NO?",
            "clinical": "Med A: 0800h. Med B requested: 1300h. Interval: 4h. Safe? YES/NO",
            "json": '{"med_a": "8:00 AM", "med_b": "1:00 PM", "min_gap": 4} Safe? YES/NO',
        },
    ),
    # Positive: 6AM -> 2PM is 8h, required gap is 6h.
    TestScenario(
        id="MED002",
        description="Morning dose at 6AM, afternoon at 2PM, 6-hour requirement",
        correct_answer="YES",
        category="positive",
        difficulty="easy",
        formats={
            "natural": "Took morning dose at 6:00 AM. Is 2:00 PM safe for afternoon dose? (6-hour minimum) YES or NO?",
            "clinical": "Dose 1: 0600h. Dose 2 proposed: 1400h. Required gap: 6h. Compliant? YES/NO",
        },
    ),
    # Negative: 8AM -> 11AM is 3h, required gap is 4h.
    TestScenario(
        id="MED003",
        description="Aspirin at 8AM, ibuprofen at 11AM, 4-hour requirement",
        correct_answer="NO",
        category="negative",
        difficulty="easy",
        formats={
            "natural": "I took aspirin at 8:00 AM. Can I take ibuprofen at 11:00 AM? (4-hour wait required) YES or NO?",
            "clinical": "Med A: 0800h. Med B requested: 1100h. Interval: 4h. Safe? YES/NO",
            "json": '{"med_a": "8:00 AM", "med_b": "11:00 AM", "min_gap": 4} Safe? YES/NO',
        },
    ),
    # Negative: 7AM -> 12PM is 5h, required gap is 6h.
    TestScenario(
        id="MED004",
        description="Morning dose at 7AM, noon at 12PM, 6-hour requirement",
        correct_answer="NO",
        category="negative",
        difficulty="easy",
        formats={
            "natural": "Took morning dose at 7:00 AM. Is 12:00 PM safe? (6-hour minimum) YES or NO?",
            "clinical": "Dose 1: 0700h. Dose 2 proposed: 1200h. Required gap: 6h. Compliant? YES/NO",
        },
    ),
):
    builder.add_scenario(med_case)

builder.generate_report()

## Part 3: Running Multi-Format Tests

In [None]:
def run_multiformat_test(
    scenario: "TestScenario",
    client,
    model: str = "claude-sonnet-4-5-20250929",
    max_tokens: int = 10,
    delay_seconds: float = 0.3,
) -> Dict:
    """Ask the model the same scenario in every format and score the replies.

    Args:
        scenario: Provides ``id``, ``correct_answer`` and ``formats``
            (format name -> prompt text).
        client: Object exposing ``messages.create(model=..., max_tokens=...,
            messages=...)`` whose response has a ``content`` list of items
            with a ``text`` attribute.
        model: Model identifier passed to the client.
        max_tokens: Reply length cap passed to the client.
        delay_seconds: Pause after each request; pass 0 to disable.

    Returns:
        Dict with "scenario_id", "correct_answer", "format_results"
        (format name -> {"answer", "correct"}), "consistent" (True when every
        format produced the same parsed answer) and "accuracy" (fraction of
        formats answered correctly). A scenario with no formats makes no
        requests and yields consistent=False, accuracy=0.0.
    """
    format_results = {}

    for format_name, prompt in scenario.formats.items():
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )

        # An empty content list is treated as an unparseable reply rather than
        # raising IndexError.
        reply = response.content[0].text.strip().upper() if response.content else ""

        # Only the first five characters are inspected.
        # NOTE(review): this is a substring match, so replies starting with
        # e.g. "NOT" or "NOW" are also read as "NO" - confirm that is intended.
        if "YES" in reply[:5]:
            answer = "YES"
        elif "NO" in reply[:5]:
            answer = "NO"
        else:
            answer = "UNCLEAR"

        format_results[format_name] = {
            "answer": answer,
            "correct": answer == scenario.correct_answer,
        }

        if delay_seconds > 0:
            time.sleep(delay_seconds)

    distinct_answers = {entry["answer"] for entry in format_results.values()}
    correct_count = sum(entry["correct"] for entry in format_results.values())
    # Scenarios default to an empty formats dict, so guard the division.
    accuracy = correct_count / len(format_results) if format_results else 0.0

    return {
        "scenario_id": scenario.id,
        "correct_answer": scenario.correct_answer,
        "format_results": format_results,
        "consistent": len(distinct_answers) == 1,
        "accuracy": accuracy,
    }


# Evaluate every scenario in the suite, printing a per-format verdict as we go
banner = "=" * 60
print(banner, "MULTI-FORMAT TEST RESULTS", banner, sep="\n")

all_results = []
for scenario in builder.scenarios:
    outcome = run_multiformat_test(scenario, client)
    all_results.append(outcome)

    print(f"\n{scenario.id}: Expected {scenario.correct_answer}")
    for format_name, verdict in outcome["format_results"].items():
        if verdict["correct"]:
            verdict_label = "CORRECT"
        else:
            verdict_label = "WRONG"
        print(f"  {format_name}: {verdict['answer']} - {verdict_label}")

    # Differing answers across formats are flagged as brittle.
    consistency_label = "YES" if outcome["consistent"] else "NO (BRITTLE)"
    print(f"  Consistency: {consistency_label}")
    print(f"  Accuracy: {outcome['accuracy']:.1%}")

## Part 4: Analyzing Results

In [None]:
def analyze_test_results(results: List[Dict], scenarios: List["TestScenario"]):
    """Print overall, per-category, and per-format accuracy for a test run.

    Args:
        results: One dict per scenario, each with "format_results"
            (format name -> {"correct": ...}) and "consistent".
        scenarios: Scenarios paired with ``results`` by position; only
            ``category`` is read. Surplus items on either side are ignored
            for the per-category breakdown.

    Prints a summary and returns None. Empty input prints zeroed metrics
    instead of raising ZeroDivisionError.
    """

    def accuracy(tally: Dict) -> float:
        # An empty tally scores 0.0 rather than dividing by zero.
        return tally["correct"] / tally["total"] if tally["total"] else 0.0

    def record(tallies: Dict, key: str, is_correct) -> None:
        # Add one observation to the tally stored under key.
        tally = tallies.setdefault(key, {"correct": 0, "total": 0})
        tally["total"] += 1
        if is_correct:
            tally["correct"] += 1

    overall = {"correct": 0, "total": 0}
    by_format = {}
    for result in results:
        for fmt, fr in result["format_results"].items():
            overall["total"] += 1
            if fr["correct"]:
                overall["correct"] += 1
            record(by_format, fmt, fr["correct"])

    by_category = {}
    for result, scenario in zip(results, scenarios):
        for fr in result["format_results"].values():
            record(by_category, scenario.category, fr["correct"])

    consistent_count = sum(1 for r in results if r["consistent"])

    print("=" * 60)
    print("ANALYSIS SUMMARY")
    print("=" * 60)

    print(f"\nOverall Accuracy: {overall['correct']}/{overall['total']} = {accuracy(overall):.1%}")

    print("\nBy Category:")
    for cat, data in by_category.items():
        print(f"  {cat}: {data['correct']}/{data['total']} = {accuracy(data):.1%}")

    print("\nBy Format:")
    format_accuracies = []
    for fmt, data in by_format.items():
        acc = accuracy(data)
        format_accuracies.append(acc)
        print(f"  {fmt}: {data['correct']}/{data['total']} = {acc:.1%}")

    if format_accuracies:
        # Gap between the best and worst format, in percentage points.
        brittleness = (max(format_accuracies) - min(format_accuracies)) * 100
        print(f"\nBrittleness (max-min accuracy): {brittleness:.1f} percentage points")

    print(f"\nConsistency: {consistent_count}/{len(results)} scenarios gave same answer across formats")


# Summarize the multi-format run; results are matched to scenarios by position.
analyze_test_results(all_results, builder.scenarios)

## Part 5: Exercise - Build Your Test Suite

In [None]:
# YOUR EXERCISE: Build a test suite for your domain

my_builder = TestSuiteBuilder("YOUR DOMAIN HERE")

# Your positive case: set correct_answer to whatever the right answer is
positive_case = TestScenario(
    id="TEST001",
    description="Describe your positive test case",
    correct_answer="YES",
    category="positive",
    difficulty="easy",
    formats={
        "natural": "Your natural language prompt here",
        "formal": "Your formal prompt here",
        # Add more formats here
    },
)

# Your negative case
negative_case = TestScenario(
    id="TEST002",
    description="Describe your negative test case",
    correct_answer="NO",
    category="negative",
    difficulty="easy",
    formats={
        "natural": "Your natural language prompt here",
        "formal": "Your formal prompt here",
    },
)

# Register the cases in order (extend this tuple as you add more)
for exercise_case in (positive_case, negative_case):
    my_builder.add_scenario(exercise_case)

# Generate report
my_builder.generate_report()

## Key Takeaways

1. **Balance your test sets.** Unbalanced distributions hide failures behind high accuracy.

2. **Test multiple formats.** Presenting the same semantics in different formats reveals brittleness.

3. **Establish ground truth first.** Know the correct answer before testing.

4. **Measure consistency.** Inconsistent answers across formats are a sign of pattern matching.

5. **Include edge cases.** Boundary conditions reveal architectural limitations.

---

**Homework:** Build a complete test suite with 8+ scenarios for your deployment domain.

**Next Session:** Experimental Design II—Statistical Analysis and Causal Testing