# 04 - Evaluation

Evaluate models on the test set using the game loop.

Supports:
- Local models (base Qwen, fine-tuned Alpha/Beta)
- API models (Claude, GPT-4o, etc.)

In [None]:
import torch
import json
from pathlib import Path
from unsloth import FastLanguageModel
from peft import PeftModel

# Add parent dir for imports
import sys
sys.path.insert(0, '..')
from src.evaluation import run_evaluation, load_puzzles, analyze_one_away_recovery

In [None]:
# Read the test-set puzzles that the evaluation cells below iterate over.
TEST_PUZZLES_PATH = "data/test.jsonl"
test_puzzles = load_puzzles(TEST_PUZZLES_PATH)
print(f"Loaded {len(test_puzzles)} test puzzles")

## Option 1: Evaluate Local Models

In [None]:
# Base Qwen checkpoint with no LoRA adapter attached, loaded in 4-bit.
base_load_options = {
    "model_name": "unsloth/Qwen2.5-14B-Instruct",
    "max_seq_length": 2048,
    "dtype": None,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_load_options)

# Switch the model into unsloth's inference mode before running the game loop.
FastLanguageModel.for_inference(model)
print("Base Qwen loaded!")

In [None]:
# Evaluate the base model on every test puzzle. The "base" name and "data"
# directory match the eval_base_traces.jsonl path read by the analysis cell.
results_base = run_evaluation(model, tokenizer, test_puzzles, model_name="base", output_dir="data")

In [None]:
# Load Alpha (fine-tuned on golden traces)
import gc

# Release whatever model an earlier cell left behind before loading another
# 14B checkpoint; otherwise the old weights stay referenced until `model` is
# rebound below and two models are resident during loading.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-14B-Instruct",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Attach the Alpha LoRA adapter on top of the base weights.
model = PeftModel.from_pretrained(model, "qwen-connections-alpha-r32-ep3")
FastLanguageModel.for_inference(model)

# Verify LoRA loaded
print(f"Adapters: {model.active_adapters}")
print("Alpha loaded!")

In [None]:
# Evaluate the Alpha model on every test puzzle. The "alpha" name and "data"
# directory match the eval_alpha_traces.jsonl path read by the analysis cell.
results_alpha = run_evaluation(model, tokenizer, test_puzzles, model_name="alpha", output_dir="data")

In [None]:
# Load Beta (fine-tuned on golden + game traces)
import gc

# Release whatever model an earlier cell left behind before loading another
# 14B checkpoint; otherwise the old weights stay referenced until `model` is
# rebound below and two models are resident during loading.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-14B-Instruct",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Attach the Beta LoRA adapter on top of the base weights.
model = PeftModel.from_pretrained(model, "qwen-connections-beta-r32-ep2.5")
FastLanguageModel.for_inference(model)

# Verify LoRA loaded (same check the Alpha cell performs)
print(f"Adapters: {model.active_adapters}")
print("Beta loaded!")

In [None]:
# Evaluate the Beta model on every test puzzle. The "beta" name and "data"
# directory match the eval_beta_traces.jsonl path read by the analysis cell.
results_beta = run_evaluation(model, tokenizer, test_puzzles, model_name="beta", output_dir="data")

## Option 2: Evaluate API Models

In [None]:
# For API models, we need a modified game loop
# See src/api_evaluation.py for full implementation

import os

from anthropic import Anthropic
# from openai import OpenAI

# Read the key from the environment instead of pasting it into the notebook,
# so the secret never ends up in saved or committed notebook source.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
if not ANTHROPIC_API_KEY:
    raise RuntimeError(
        "Set the ANTHROPIC_API_KEY environment variable before running this cell"
    )
client = Anthropic(api_key=ANTHROPIC_API_KEY)

In [None]:
# API game loop (simplified)
import re

def extract_json_from_response(text):
    """Extract a JSON object from a model response.

    Two strategies are tried in order:

    1. The last ```json fenced block whose content is a ``{...}`` object.
    2. The last top-level JSON object that appears anywhere in the text.

    The fallback uses the real JSON decoder rather than brace counting, so
    nested objects are returned whole and braces inside string values do
    not confuse the scan.

    Args:
        text: Raw response text. Non-string or empty input yields ``None``.

    Returns:
        The decoded ``dict``, or ``None`` when no valid JSON object is found.
    """
    if not isinstance(text, str) or not text:
        return None

    fenced = re.findall(r'```json\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        try:
            return json.loads(fenced[-1])
        except ValueError:
            # Malformed fenced block: fall through to the free-text scan.
            pass

    decoder = json.JSONDecoder()
    last_obj = None
    pos = text.find('{')
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except ValueError:
            # Not a valid object at this brace; try the next one.
            pos = text.find('{', pos + 1)
        else:
            # Resume after the decoded object so its nested objects are not
            # mistaken for separate top-level candidates.
            last_obj = obj
            pos = text.find('{', end)
    return last_obj

# ... (full implementation in src/api_evaluation.py)

## Analyze Results

In [None]:
# Report ONE AWAY recovery statistics for each locally evaluated model.
print("ONE AWAY Recovery Analysis")
print("=" * 40)

for model_name in ("base", "alpha", "beta"):
    traces_file = f"data/eval_{model_name}_traces.jsonl"
    try:
        stats = analyze_one_away_recovery(traces_file)
    except FileNotFoundError:
        # That model has not been evaluated yet; note it and move on.
        print(f"{model_name}: No traces file found")
        continue
    print(f"{model_name.upper()}:")
    print(f"  ONE AWAY situations: {stats['total']}")
    print(f"  Recovered: {stats['recovered']} ({stats['recovery_rate']:.1f}%)")
    print()

In [None]:
# Load and display a trace
def show_trace(traces_file, puzzle_id):
    """Print a human-readable view of one game trace from a JSONL file.

    Each non-blank line of ``traces_file`` is parsed as a JSON object. The
    first object whose ``"puzzle_id"`` equals ``puzzle_id`` is printed: its
    solved/lost status, ``groups_found``, ``mistakes``, ``words``, the
    ``solution`` mapping, and every entry of ``trace``. If no line matches,
    a "not found" message is printed instead.

    Args:
        traces_file: Path to the JSONL traces file.
        puzzle_id: Value compared against each record's ``"puzzle_id"``.

    Raises:
        FileNotFoundError: If ``traces_file`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If the matching record lacks one of the keys read above.
    """
    rule = "=" * 70
    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(traces_file, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
            if not line.strip():
                continue
            t = json.loads(line)
            if t["puzzle_id"] != puzzle_id:
                continue

            status = "✅ SOLVED" if t["solved"] else "❌ LOST"
            print(rule)
            print(f"PUZZLE {puzzle_id} — {status}")
            print(f"Groups: {t['groups_found']}/4, Mistakes: {t['mistakes']}")
            print(rule)
            print(f"\nWords: {t['words']}")
            print("\nSolution:")
            for name, words in t['solution'].items():
                print(f"  {name}: {words}")
            print(f"\n{rule}")
            print("GAME TRACE:")
            for turn in t["trace"]:
                print(f"\n--- Turn {turn.get('turn', '?')} ---")
                if "guess" in turn:
                    print(f"Guess: {turn['guess']}")
                # Entries without a "result" fall back to their "action".
                print(f"Result: {turn.get('result', turn.get('action', '?'))}")
            return
    print(f"Puzzle {puzzle_id} not found")

# Example: show a specific puzzle
# show_trace("data/eval_beta_traces.jsonl", 812)

## Final Results Summary

In [None]:
# Summary table. These figures are hard-coded here; this cell does not read
# them from the results_* variables or trace files produced above.
results_summary = {
    "Base Qwen 14B": {"solve_rate": 9.3, "avg_groups": 0.75},
    "GPT-4o-mini": {"solve_rate": 10.0, "avg_groups": 0.78},
    "Claude Haiku 3.5": {"solve_rate": 13.3, "avg_groups": 1.07},
    "GPT-4o": {"solve_rate": 22.7, "avg_groups": 1.49},
    "Alpha (fine-tuned)": {"solve_rate": 27.3, "avg_groups": 1.75},
    "Beta (fine-tuned)": {"solve_rate": 30.0, "avg_groups": 1.91},
    "Claude Sonnet 4.5": {"solve_rate": 87.3, "avg_groups": 3.61},
}

# Total width of one table line: 25-char name column plus two 12-char numeric
# columns, each preceded by a single space.
rule_width = 25 + 1 + 12 + 1 + 12

print("Final Results (150 test puzzles)")
print("=" * rule_width)
print(f"{'Model':<25} {'Solve Rate':>12} {'Avg Groups':>12}")
print("-" * rule_width)
# Loop names deliberately avoid `model`, which in this notebook holds the
# loaded language model and must not be overwritten by a display string.
for label, row in results_summary.items():
    # 11 digits + "%" fills the 12-wide "Solve Rate" column.
    print(f"{label:<25} {row['solve_rate']:>11.1f}% {row['avg_groups']:>12.2f}")