# Week 14: Evaluation & Benchmarking

Techniques for reliable AI evaluation.

## Learning Objectives
1. Compute classical metrics (Precision, Recall, F1)
2. Implement LLM-as-a-Judge
3. Understand Model Drift

In [None]:
import numpy as np
from typing import List

## 1. Classification Metrics

Implementing metrics derived from the confusion matrix.

In [None]:
def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def calculate_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute accuracy, precision, recall and F1 for binary labels.

    Labels equal to 1 are treated as positive and labels equal to 0 as
    negative; any other value falls into none of the four confusion-matrix
    cells.

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_pred: Predicted labels with the same shape as ``y_true``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``, each a plain ``float``. A metric whose denominator is
        zero (e.g. precision when nothing was predicted positive, or every
        metric for empty input) is reported as ``0.0``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # Convert the counts to Python ints so the divisions below are exact
    # Python arithmetic and never emit numpy divide-by-zero warnings.
    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    tn = int(np.sum((y_true == 0) & (y_pred == 0)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))

    # Explicit zero handling instead of an additive epsilon: an epsilon
    # skews every result slightly (a perfect precision would not be 1.0).
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)  # aka sensitivity
    # Harmonic mean of precision and recall, written in terms of counts.
    f1 = _safe_ratio(2 * tp, 2 * tp + fp + fn)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Smoke-test the metrics on a tiny hand-made example (one false negative).
y_t = np.array([1, 0, 1, 1, 0, 1])
y_p = np.array([1, 0, 1, 0, 0, 1])

metrics_report = calculate_metrics(y_t, y_p)
print(metrics_report)

## 2. LLM-as-a-Judge

Using a stronger model to grade another model's outputs against a ground-truth answer.

In [None]:
def llm_judge_prompt(question, answer, ground_truth):
    """Build the grading prompt sent to the judge model.

    The prompt shows the question, the reference answer and the AI's answer,
    then asks for a 1-5 score with a reason in a JSON-like format.
    """
    prompt_lines = [
        "You are an impartial judge. Evaluate the quality of the AI's answer.",
        "",
        f"Question: {question}",
        f"Ground Truth: {ground_truth}",
        f"AI Answer: {answer}",
        "",
        "Score the answer from 1 to 5 based on accuracy and helpfulness.",
        'Output format: { "score": int, "reason": "str" }',
        "",  # the prompt ends with a trailing newline
    ]
    return "\n".join(prompt_lines)

# Two hand-written cases: one correct answer and one wrong answer.
samples = [
    {"q": "What is 2+2?", "a": "4", "gt": "4"},
    {"q": "Capital of France?", "a": "London", "gt": "Paris"},
]

# Print the judge prompt that would be sent for each case.
for sample in samples:
    prompt = llm_judge_prompt(sample["q"], sample["a"], sample["gt"])
    print(f"--- Case ---\n{prompt}")

## 3. Drift Detection (KS Test)

Detecting whether the data distribution has changed.

In [None]:
from scipy import stats

def detect_drift(reference_data, new_data, threshold=0.05):
    """
    Two-sample Kolmogorov-Smirnov test for data drift.

    Both inputs are flattened to 1-D samples before being compared with
    ``scipy.stats.ks_2samp``.

    Args:
        reference_data: Array-like sample from the reference distribution.
        new_data: Array-like sample to compare against the reference.
        threshold: Significance level; drift is flagged when the test's
            p-value is strictly below it.

    Returns:
        A dict with built-in Python types (so it can be serialized as JSON):
            "drift_detected": bool, True when ``p_value < threshold``
            "p_value": float, p-value of the KS test
            "statistic": float, the KS statistic

    Raises:
        ValueError: If either sample is empty.
    """
    reference_sample = np.asarray(reference_data).ravel()
    new_sample = np.asarray(new_data).ravel()
    if reference_sample.size == 0 or new_sample.size == 0:
        raise ValueError("detect_drift requires two non-empty samples")

    statistic, p_value = stats.ks_2samp(reference_sample, new_sample)

    # Convert numpy scalars to built-ins: np.bool_ / np.float64 break
    # `json.dumps` and identity checks such as `result[...] is True`.
    return {
        "drift_detected": bool(p_value < threshold),
        "p_value": float(p_value),
        "statistic": float(statistic),
    }

# Demo: compare a reference sample against an undrifted and a drifted sample.
np.random.seed(42)  # fixed seed so the printed results are reproducible
ref = np.random.normal(loc=0, scale=1, size=1000)
new_same = np.random.normal(loc=0, scale=1, size=1000)
new_drift = np.random.normal(loc=0.5, scale=1, size=1000)  # mean shifted by 0.5

for label, candidate in (("Same Dist:", new_same), ("Drifted Dist:", new_drift)):
    print(label, detect_drift(ref, candidate))