In [None]:
# Notebook cell that only prints a confirmation message when it is run.
print('Setup complete.')

# Lab 01: Introduction to LLM Evaluation

## Learning Objectives

- Understand evaluation metrics for LLM applications
- Implement basic evaluation frameworks
- Create evaluation datasets
- Measure model performance systematically

## Setup


In [None]:
# Import required libraries
import json
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

# Evaluation metrics and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

## Part 1: Evaluation Framework Basics

In [None]:
@dataclass
class EvaluationCase:
    """Single evaluation test case.

    Pairs one input text with the output it is expected to produce.
    """
    case_id: str  # identifier copied onto the result produced for this case
    input_text: str  # text passed to the LLM function under evaluation
    expected_output: str  # reference output the actual output is scored against
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form tags, e.g. category or difficulty
    
@dataclass
class EvaluationResult:
    """Result of a single evaluation.

    Records what the LLM function returned for one case, how it scored, and
    how long the evaluation took.
    """
    case_id: str  # identifier of the evaluated case
    actual_output: str  # output returned by the LLM function; "" when evaluation raised
    expected_output: str  # reference output copied from the case
    score: float  # metric score; 0.0 when evaluation raised
    metrics: Dict[str, Any]  # extra per-case values; includes the metric name as a string
    execution_time: float  # elapsed time in seconds
    error: Optional[str] = None  # exception message if evaluation raised, otherwise None

class EvaluationMetric(Enum):
    """Available evaluation metrics.

    The string values are what gets recorded under "metric_type" in each
    result's metrics.
    """
    EXACT_MATCH = "exact_match"  # equality after stripping surrounding whitespace
    CONTAINS = "contains"  # case-insensitive substring check
    SEMANTIC_SIMILARITY = "semantic_similarity"  # word-set overlap ratio (intersection / union)
    REGEX_MATCH = "regex_match"  # no scoring branch in LLMEvaluator._calculate_score; scores 0.0
    CUSTOM = "custom"  # no scoring branch in LLMEvaluator._calculate_score; scores 0.0

In [None]:
class LLMEvaluator:
    """Base evaluation framework for LLM applications.

    Runs an LLM callable against evaluation cases, scores each output with
    the selected metric, and records every result in ``self.results``.

    Attributes:
        name: Label for this evaluator; echoed in the summary.
        results: Every result recorded so far, in evaluation order. The list
            is never cleared, so repeated runs accumulate.
    """

    # Scores strictly above this value count as successes in get_summary().
    SUCCESS_THRESHOLD = 0.5

    def __init__(self, name: str = "default_evaluator"):
        self.name = name
        self.results: List[EvaluationResult] = []

    def evaluate_single(
        self,
        case: EvaluationCase,
        llm_function,
        metric: EvaluationMetric = EvaluationMetric.EXACT_MATCH
    ) -> EvaluationResult:
        """Evaluate a single test case and record the result.

        Args:
            case: The case to evaluate.
            llm_function: Callable invoked as ``llm_function(case.input_text)``;
                its return value is scored as the actual output.
            metric: Metric used to score the output.

        Returns:
            The result for this case. If calling the LLM function or scoring
            its output raises, the exception is captured rather than
            propagated: the result has score 0.0, an empty actual output,
            empty metrics, and ``error`` set to the exception message.
        """
        # perf_counter is monotonic, so the measured duration cannot go
        # negative if the wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        try:
            # Get LLM output
            actual_output = llm_function(case.input_text)

            # Calculate score based on metric
            score = self._calculate_score(
                actual_output,
                case.expected_output,
                metric
            )

            # Additional metrics; max(..., 1) avoids dividing by zero when
            # the expected output is empty.
            metrics = {
                "length_ratio": len(actual_output) / max(len(case.expected_output), 1),
                "metric_type": metric.value
            }

            result = EvaluationResult(
                case_id=case.case_id,
                actual_output=actual_output,
                expected_output=case.expected_output,
                score=score,
                metrics=metrics,
                execution_time=time.perf_counter() - start_time
            )

        except Exception as e:
            # Per-case boundary: one failing case must not abort the run, so
            # the failure is recorded on the result instead of re-raised.
            result = EvaluationResult(
                case_id=case.case_id,
                actual_output="",
                expected_output=case.expected_output,
                score=0.0,
                metrics={},
                execution_time=time.perf_counter() - start_time,
                error=str(e)
            )

        self.results.append(result)
        return result

    def _calculate_score(
        self,
        actual: str,
        expected: str,
        metric: EvaluationMetric
    ) -> float:
        """Calculate a score for ``actual`` against ``expected``.

        Returns:
            EXACT_MATCH: 1.0 if both strings are equal after stripping
            surrounding whitespace, else 0.0.
            CONTAINS: 1.0 if ``expected`` occurs in ``actual`` ignoring case,
            else 0.0.
            SEMANTIC_SIMILARITY: shared words divided by all distinct words
            (lowercased, whitespace-split), in the range 0.0-1.0.
            Any other metric (REGEX_MATCH, CUSTOM): 0.0, because no scoring
            is implemented for it here.
        """
        if metric == EvaluationMetric.EXACT_MATCH:
            return 1.0 if actual.strip() == expected.strip() else 0.0

        elif metric == EvaluationMetric.CONTAINS:
            return 1.0 if expected.lower() in actual.lower() else 0.0

        elif metric == EvaluationMetric.SEMANTIC_SIMILARITY:
            # Simplified - in production use embeddings
            actual_words = set(actual.lower().split())
            expected_words = set(expected.lower().split())
            common_words = actual_words & expected_words
            total_words = actual_words | expected_words
            # max(..., 1) keeps two empty strings from dividing by zero.
            return len(common_words) / max(len(total_words), 1)

        else:
            # REGEX_MATCH and CUSTOM have no implementation yet.
            return 0.0

    def get_summary(self) -> Dict[str, Any]:
        """Summarize every result recorded so far.

        Returns:
            A dict of plain Python values (safe to pass to ``json.dumps``)
            with the keys ``evaluator``, ``total_cases``, ``mean_score``,
            ``min_score``, ``max_score``, ``success_rate`` (share of scores
            above ``SUCCESS_THRESHOLD``), ``error_rate`` (share of results
            with an error) and ``mean_execution_time``. When no results have
            been recorded, every numeric value is 0.
        """
        total = len(self.results)
        if total == 0:
            return {
                "evaluator": self.name,
                "total_cases": 0,
                "mean_score": 0.0,
                "min_score": 0.0,
                "max_score": 0.0,
                "success_rate": 0.0,
                "error_rate": 0.0,
                "mean_execution_time": 0.0,
            }

        # float() normalizes any numeric score type so the summary stays
        # JSON-serializable.
        scores = [float(result.score) for result in self.results]
        successes = sum(1 for score in scores if score > self.SUCCESS_THRESHOLD)
        errors = sum(1 for result in self.results if result.error is not None)
        total_time = sum(float(result.execution_time) for result in self.results)

        return {
            "evaluator": self.name,
            "total_cases": total,
            "mean_score": sum(scores) / total,
            "min_score": min(scores),
            "max_score": max(scores),
            "success_rate": successes / total,
            "error_rate": errors / total,
            "mean_execution_time": total_time / total,
        }

    def evaluate_dataset(
        self,
        cases: List[EvaluationCase],
        llm_function,
        metric: EvaluationMetric = EvaluationMetric.EXACT_MATCH
    ) -> Dict[str, Any]:
        """Evaluate every case in ``cases`` and return a summary.

        Args:
            cases: Cases to evaluate, in order.
            llm_function: Callable passed through to ``evaluate_single``.
            metric: Metric applied to every case.

        Returns:
            The dict produced by ``get_summary()``. It covers all results
            held by this evaluator, including any from earlier calls.
        """
        for case in cases:
            self.evaluate_single(case, llm_function, metric)

        return self.get_summary()

## Part 2: Creating Evaluation Datasets

In [None]:
class EvaluationDatasetBuilder:
    """Build and manage evaluation datasets.

    All methods are static: the class only groups the sample-dataset
    factories with the JSON save/load helpers.
    """

    @staticmethod
    def create_classification_dataset() -> List[EvaluationCase]:
        """Create dataset for classification tasks.

        Returns:
            Three sentiment cases whose expected outputs are "positive",
            "negative" and "neutral".
        """
        return [
            EvaluationCase(
                case_id="cls_001",
                input_text="The movie was fantastic! Best I've seen all year.",
                expected_output="positive",
                metadata={"category": "sentiment", "difficulty": "easy"}
            ),
            EvaluationCase(
                case_id="cls_002",
                input_text="The service was terrible and the food was cold.",
                expected_output="negative",
                metadata={"category": "sentiment", "difficulty": "easy"}
            ),
            EvaluationCase(
                case_id="cls_003",
                input_text="It was okay, nothing special but not bad either.",
                expected_output="neutral",
                metadata={"category": "sentiment", "difficulty": "medium"}
            )
        ]

    @staticmethod
    def create_extraction_dataset() -> List[EvaluationCase]:
        """Create dataset for information extraction.

        Returns:
            Two cases whose expected outputs are JSON-encoded strings
            produced by ``json.dumps`` (not dicts).
        """
        return [
            EvaluationCase(
                case_id="ext_001",
                input_text="John Smith lives at 123 Main St, Boston, MA 02101",
                expected_output=json.dumps({
                    "name": "John Smith",
                    "address": "123 Main St",
                    "city": "Boston",
                    "state": "MA",
                    "zip": "02101"
                }),
                metadata={"category": "address_extraction"}
            ),
            EvaluationCase(
                case_id="ext_002",
                input_text="Contact me at john@example.com or call 555-1234",
                expected_output=json.dumps({
                    "email": "john@example.com",
                    "phone": "555-1234"
                }),
                metadata={"category": "contact_extraction"}
            )
        ]

    @staticmethod
    def save_dataset(cases: List[EvaluationCase], filename: str) -> None:
        """Save dataset to JSON file.

        Args:
            cases: Cases to serialize.
            filename: Path of the file to write; an existing file is
                overwritten.

        Raises:
            OSError: If the file cannot be opened for writing.
            TypeError: If a case's metadata is not JSON-serializable.
        """
        data = [
            {
                "case_id": case.case_id,
                "input_text": case.input_text,
                "expected_output": case.expected_output,
                "metadata": case.metadata
            }
            for case in cases
        ]

        # Explicit UTF-8 so the file does not depend on the platform's
        # default encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    @staticmethod
    def load_dataset(filename: str) -> List[EvaluationCase]:
        """Load dataset from JSON file.

        Args:
            filename: Path of a JSON file holding a list of objects with
                "case_id", "input_text" and "expected_output" keys and an
                optional "metadata" key.

        Returns:
            One case per object, in file order. Missing or null metadata
            becomes an empty dict.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If an object lacks one of the required keys.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return [
            EvaluationCase(
                case_id=item["case_id"],
                input_text=item["input_text"],
                expected_output=item["expected_output"],
                # `or {}` also covers an explicit JSON null.
                metadata=item.get("metadata") or {}
            )
            for item in data
        ]

## Part 3: Evaluation Metrics & Analysis

In [None]:
class EvaluationAnalyzer:
    """Analyze evaluation results.

    Flattens a list of results into a pandas DataFrame (``self.df``) and
    derives summary statistics, plots and a failure listing from it.

    Attributes:
        results: The results passed to the constructor.
        df: One row per result with the columns ``case_id``, ``score``,
            ``execution_time`` and ``has_error``, plus one column per key
            found in any result's ``metrics`` dict.
    """

    def __init__(self, results: List[EvaluationResult]):
        self.results = results
        self.df = self._results_to_dataframe()

    def _results_to_dataframe(self) -> pd.DataFrame:
        """Convert results to DataFrame for analysis.

        Returns:
            One row per result. For an empty result list the frame still
            carries the four base columns (with numeric dtypes), so the
            other methods do not fail on a missing column.
        """
        rows = [
            {
                'case_id': result.case_id,
                'score': result.score,
                'execution_time': result.execution_time,
                'has_error': result.error is not None,
                **result.metrics
            }
            for result in self.results
        ]

        if not rows:
            # pd.DataFrame([]) would have no columns at all; build a typed
            # empty frame instead so aggregations return NaN, not KeyError.
            return pd.DataFrame({
                'case_id': pd.Series(dtype='object'),
                'score': pd.Series(dtype='float64'),
                'execution_time': pd.Series(dtype='float64'),
                'has_error': pd.Series(dtype='bool'),
            })

        return pd.DataFrame(rows)

    def get_summary_statistics(self) -> Dict[str, float]:
        """Get summary statistics.

        Returns:
            Mean, standard deviation, minimum and maximum of the scores,
            the share of scores strictly above 0.5 (``success_rate``), the
            mean execution time and the share of rows with an error.
            Values are NaN when there are no results.
        """
        return {
            'mean_score': self.df['score'].mean(),
            'std_score': self.df['score'].std(),
            'min_score': self.df['score'].min(),
            'max_score': self.df['score'].max(),
            'success_rate': (self.df['score'] > 0.5).mean(),
            'mean_execution_time': self.df['execution_time'].mean(),
            'error_rate': self.df['has_error'].mean()
        }

    def plot_results(self):
        """Visualize evaluation results.

        Draws a 2x2 figure: score histogram, execution time per case, score
        against execution time, and the average score per metric type when
        a ``metric_type`` column is present.
        """
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        # Score distribution
        axes[0, 0].hist(self.df['score'], bins=20, edgecolor='black')
        axes[0, 0].set_title('Score Distribution')
        axes[0, 0].set_xlabel('Score')
        axes[0, 0].set_ylabel('Frequency')

        # Execution time
        axes[0, 1].scatter(range(len(self.df)), self.df['execution_time'])
        axes[0, 1].set_title('Execution Time by Case')
        axes[0, 1].set_xlabel('Case Index')
        axes[0, 1].set_ylabel('Time (seconds)')

        # Score vs execution time
        axes[1, 0].scatter(self.df['execution_time'], self.df['score'])
        axes[1, 0].set_title('Score vs Execution Time')
        axes[1, 0].set_xlabel('Execution Time (s)')
        axes[1, 0].set_ylabel('Score')

        # Success rate by metric type (if available)
        if 'metric_type' in self.df.columns:
            metric_scores = self.df.groupby('metric_type')['score'].mean()
            axes[1, 1].bar(metric_scores.index, metric_scores.values)
            axes[1, 1].set_title('Average Score by Metric Type')
            axes[1, 1].set_xlabel('Metric Type')
            axes[1, 1].set_ylabel('Average Score')
        else:
            # Nothing to draw here; hide the empty frame.
            axes[1, 1].axis('off')

        plt.tight_layout()
        plt.show()

    def get_failure_analysis(self, threshold: float = 0.5) -> pd.DataFrame:
        """Analyze failed cases.

        Args:
            threshold: Rows with a score strictly below this value are
                treated as failures.

        Returns:
            The failing rows, restricted to the ``case_id``, ``score``,
            ``execution_time`` and ``has_error`` columns.
        """
        failures = self.df[self.df['score'] < threshold]
        return failures[['case_id', 'score', 'execution_time', 'has_error']]

## Part 4: Practical Example

In [None]:
# Mock LLM function for testing
def mock_sentiment_classifier(text: str) -> str:
    """Stand-in sentiment classifier used for demonstration.

    Lowercases the text and looks for cue words as substrings. Positive
    cues are checked before negative ones; if no cue is found the label is
    "neutral".
    """
    lowered = text.lower()

    # Order matters: the first label with a matching cue wins.
    cue_table = (
        ("positive", ('fantastic', 'great', 'excellent', 'amazing', 'best')),
        ("negative", ('terrible', 'bad', 'awful', 'worst', 'horrible')),
    )

    for label, cues in cue_table:
        for cue in cues:
            if cue in lowered:
                return label

    return "neutral"

# Create evaluation dataset (the built-in sentiment classification cases)
dataset = EvaluationDatasetBuilder.create_classification_dataset()

# Run evaluation: each case's output is compared to its expected label by exact match
evaluator = LLMEvaluator("sentiment_evaluator")
results = evaluator.evaluate_dataset(
    cases=dataset,
    llm_function=mock_sentiment_classifier,
    metric=EvaluationMetric.EXACT_MATCH
)

# `results` is the summary returned by evaluate_dataset; json.dumps requires it to be JSON-serializable
print("Evaluation Results:")
print(json.dumps(results, indent=2))

In [None]:
# Analyze results (uses the per-case results accumulated on `evaluator` by the previous cell)
analyzer = EvaluationAnalyzer(evaluator.results)

# Get summary statistics
stats = analyzer.get_summary_statistics()
print("Summary Statistics:")
for key, value in stats.items():
    # Every statistic is numeric, so a fixed three-decimal format applies to all of them
    print(f"{key}: {value:.3f}")

# Plot results
analyzer.plot_results()

# Analyze failures (cases scoring below the default threshold of 0.5)
failures = analyzer.get_failure_analysis()
if not failures.empty:
    print("\nFailed Cases:")
    print(failures)

## Part 5: Exercises

Complete these exercises to practice evaluation techniques.

In [None]:
# Exercise 1: Create a custom evaluation metric
# TODO: Implement a metric that measures what share of the required keywords appear in the output
# Example: For input "list fruits", output should contain "apple", "banana", "orange"

def keyword_coverage_metric(actual: str, expected_keywords: List[str]) -> float:
    """Calculate what percentage of expected keywords are in the output.

    Exercise placeholder: the body is not implemented yet, so calling this
    currently returns None rather than a float.
    """
    # Your code here:
    pass


# Exercise 2: Implement A/B testing for prompts
# TODO: Create a function that compares two different prompts on the same dataset

def compare_prompts(prompt_a: str, prompt_b: str, test_cases: List[str]) -> Dict[str, Any]:
    """Compare performance of two prompts.

    Exercise placeholder: the body is not implemented yet, so calling this
    currently returns None rather than a dict.
    """
    # Your code here:
    pass


# Exercise 3: Build an evaluation pipeline
# TODO: Create a complete pipeline that:
# 1. Loads a dataset
# 2. Runs multiple evaluation metrics
# 3. Generates a report
# 4. Saves results to file

class EvaluationPipeline:
    """Complete evaluation pipeline.

    Exercise placeholder: the class has no attributes or methods yet.
    """
    # Your code here:
    pass

## Summary

In this lab, you learned:
- How to build evaluation frameworks for LLM applications
- Different types of evaluation metrics
- How to create and manage evaluation datasets
- Techniques for analyzing evaluation results

Next steps:
- Implement more sophisticated metrics (BLEU, ROUGE, etc.)
- Build automated evaluation pipelines
- Integrate with CI/CD systems