# 01 - Data Preparation

This notebook handles:
1. Loading NYT Connections puzzle data
2. Cleaning and validating puzzles
3. Creating temporal train/test split
4. Preparing training datasets (alpha and beta)

In [None]:
import json
import random
from pathlib import Path

# Seed Python's global RNG once so the dataset shuffles further down are repeatable.
SEED = 42
random.seed(SEED)

# All files this notebook reads and writes live under ./data (created if missing).
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

## Load Raw Puzzle Data

Puzzle format:
```json
{
    "game_id": 123,
    "date": "2024-01-15",
    "words": ["WORD1", ..., "WORD16"],
    "groups": [
        {"name": "GROUP NAME", "members": ["W1", "W2", "W3", "W4"]},
        ...
    ]
}
```

In [None]:
def load_raw_puzzles(filepath):
    """Load puzzles from a JSONL file, converting each record to the standard format.

    Every non-blank line must be a JSON object with ``game_id``, ``words`` and
    ``groups`` keys (``date`` is optional). The ``groups`` list is turned into
    a ``solution`` dict mapping group name -> list of member words.

    Args:
        filepath: Path (``str`` or ``Path``) of the JSONL file to read.

    Returns:
        A list of dicts with keys ``game_id``, ``date`` (``None`` when the
        record has no date), ``words`` and ``solution``, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``game_id``, ``words`` or ``groups``, or a
            group lacks ``name`` or ``members``.
    """
    puzzles = []
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # locale encoding, which is not UTF-8 everywhere (e.g. Windows).
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            raw = json.loads(line)

            # Convert the list of groups to a {group name: members} solution dict.
            # Groups sharing a name collapse into one entry here.
            solution = {g["name"]: g["members"] for g in raw["groups"]}

            puzzles.append({
                "game_id": raw["game_id"],
                "date": raw.get("date"),
                "words": raw["words"],
                "solution": solution
            })
    return puzzles

# Read the complete raw puzzle dump from the data directory
raw_puzzles_path = DATA_DIR / "puzzles_raw.jsonl"
puzzles = load_raw_puzzles(raw_puzzles_path)
print(f"Loaded {len(puzzles)} puzzles")

## Validate and Clean Data

In [None]:
def validate_puzzle(puzzle):
    """Check that a puzzle has a valid structure.

    Checks, in order: exactly 16 words; exactly 4 groups; 4 members per
    group; no missing (None / NaN) or non-string entries among the words and
    solution members; every solution member appears in the word list
    (compared case-insensitively).

    Args:
        puzzle: Dict with a ``words`` list and a ``solution`` dict mapping
            group name -> list of member words.

    Returns:
        ``(True, None)`` if the puzzle is valid, otherwise ``(False, reason)``
        where ``reason`` describes the first failed check.
    """
    words = puzzle["words"]
    solution = puzzle["solution"]

    # Must have exactly 16 words
    if len(words) != 16:
        return False, f"Wrong word count: {len(words)}"

    # Must have exactly 4 groups
    if len(solution) != 4:
        return False, f"Wrong group count: {len(solution)}"

    # Each group must have 4 words
    for name, members in solution.items():
        if len(members) != 4:
            return False, f"Group '{name}' has {len(members)} words"

    # Reject missing / non-string entries BEFORE calling .upper() below;
    # otherwise a None or NaN entry raises AttributeError instead of being
    # reported as an invalid puzzle.
    entries = list(words) + [m for members in solution.values() for m in members]
    for w in entries:
        # NaN is the only float that is not equal to itself
        if w is None or (isinstance(w, float) and w != w):
            return False, "Contains NaN values"
        if not isinstance(w, str):
            return False, f"Non-string word: {w!r}"

    # All solution words must be in word list (case-insensitive)
    word_set = {w.upper() for w in words}
    for members in solution.values():
        for m in members:
            if m.upper() not in word_set:
                return False, f"Solution word '{m}' not in word list"

    return True, None

# Keep the puzzles that pass validation; report and count the ones that don't
valid_puzzles = []
invalid_count = 0

for puzzle in puzzles:
    ok, reason = validate_puzzle(puzzle)
    if not ok:
        invalid_count += 1
        print(f"Invalid puzzle {puzzle['game_id']}: {reason}")
        continue
    valid_puzzles.append(puzzle)

print(f"\nValid: {len(valid_puzzles)}, Invalid: {invalid_count}")

## Create Temporal Train/Test Split

We use a **temporal split**: train on older puzzles, test on newer ones.
This simulates real deployment (model trained on past, predicting future).

In [None]:
# Sort by game_id (used as the chronological order; assumes IDs increase
# with publication date -- TODO confirm against the `date` field)
valid_puzzles.sort(key=lambda x: x["game_id"])

# Last 150 puzzles for test
TEST_SIZE = 150

# Fail early with a clear message. With too few puzzles the split would leave
# the training set empty and the summary prints below would die with an
# opaque IndexError.
if not 0 < TEST_SIZE < len(valid_puzzles):
    raise ValueError(
        f"Need more than {TEST_SIZE} valid puzzles (and TEST_SIZE > 0) "
        f"for a temporal split, got {len(valid_puzzles)}"
    )

# Use an explicit positive split index rather than negative slicing, so both
# halves are derived from the same boundary.
split_at = len(valid_puzzles) - TEST_SIZE
train_puzzles = valid_puzzles[:split_at]
test_puzzles = valid_puzzles[split_at:]

# IDs of held-out puzzles; later cells use this to detect/filter leakage
test_ids = set(p["game_id"] for p in test_puzzles)

print(f"Train: {len(train_puzzles)} puzzles (IDs {train_puzzles[0]['game_id']}-{train_puzzles[-1]['game_id']})")
print(f"Test:  {len(test_puzzles)} puzzles (IDs {test_puzzles[0]['game_id']}-{test_puzzles[-1]['game_id']})")

In [None]:
# Save train/test splits
def save_jsonl(data, filepath):
    """Write ``data`` to ``filepath`` as JSONL: one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filepath, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

train_path = DATA_DIR / "train.jsonl"
test_path = DATA_DIR / "test.jsonl"

# Write both splits first, then report what was written
save_jsonl(train_puzzles, train_path)
save_jsonl(test_puzzles, test_path)

print(f"Saved {train_path} ({len(train_puzzles)} puzzles)")
print(f"Saved {test_path} ({len(test_puzzles)} puzzles)")

## Prepare Training Data

We have two types of training data:
1. **Golden traces**: One-shot reasoning from Claude Sonnet
2. **Game traces**: Multi-turn game transcripts with feedback

In [None]:
# Load golden traces (generated separately)
# JSON text is UTF-8; be explicit instead of relying on the platform's locale
# encoding, since the traces hold free-form generated text.
with open(DATA_DIR / "golden_traces.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines; every other line is one JSON-encoded trace
    golden_traces = [json.loads(line) for line in f if line.strip()]

# Check for leakage into test set.
# game_id is cast to int so it compares equal to the integer IDs in test_ids.
golden_ids = set(int(t["game_id"]) for t in golden_traces)
golden_leakage = golden_ids & test_ids

print(f"Golden traces: {len(golden_traces)} total, {len(golden_leakage)} in test set")
if golden_leakage:
    print(f"⚠️  WARNING: {len(golden_leakage)} golden traces leak into test!")

In [None]:
# Load game traces
# JSON text is UTF-8; be explicit instead of relying on the platform's locale
# encoding.
with open(DATA_DIR / "game_traces.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines; every other line is one JSON-encoded trace
    game_traces = [json.loads(line) for line in f if line.strip()]

# Filter to only clean traces (not in test set).
# puzzle_id is parsed to int once per trace and reused for the filter.
game_ids = [int(t["metadata"]["puzzle_id"]) for t in game_traces]
clean_game_traces = [
    t for t, puzzle_id in zip(game_traces, game_ids) if puzzle_id not in test_ids
]

print(f"Game traces: {len(game_traces)} total, {len(clean_game_traces)} clean (not in test)")

In [None]:
# Format golden traces for SFT
SYSTEM_PROMPT = "You are an expert puzzle solver specializing in word associations and pattern recognition."

def format_golden_trace(trace):
    """Build one chat-format SFT example from a golden trace.

    Args:
        trace: Dict with a ``words`` list and a ``reasoning_trace`` string;
            ``game_id`` is optional.

    Returns:
        A dict with ``messages`` (system, user and assistant turns, where the
        assistant turn is the trace's reasoning) and ``metadata`` holding the
        puzzle id (0 when the trace has no ``game_id``) and the source tag.
    """
    word_list = ", ".join(trace["words"])
    user_prompt = f"""Solve this NYT Connections puzzle. Group the 16 words into 4 groups of 4 words each.

Words: {word_list}

Think through this step-by-step, then output your answer as JSON."""

    # Assemble the three chat turns in conversation order
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": user_prompt}
    assistant_msg = {"role": "assistant", "content": trace["reasoning_trace"]}

    metadata = {
        "puzzle_id": trace.get("game_id", 0),
        "source": "golden_trace",
    }
    return {"messages": [system_msg, user_msg, assistant_msg], "metadata": metadata}

# Drop any golden trace whose puzzle is in the held-out test set, so the SFT
# datasets built from `golden_formatted` never train on evaluation puzzles.
# game_id is cast to int so it compares equal to the integer IDs in test_ids.
clean_golden_traces = [t for t in golden_traces if int(t["game_id"]) not in test_ids]

n_excluded = len(golden_traces) - len(clean_golden_traces)
if n_excluded:
    print(f"Excluded {n_excluded} golden traces that overlap the test set")

golden_formatted = [format_golden_trace(t) for t in clean_golden_traces]
print(f"Formatted {len(golden_formatted)} golden traces")

In [None]:
# Alpha dataset: the formatted golden traces only.
# Shuffle a shallow copy so `golden_formatted` keeps its original order.
alpha_data = list(golden_formatted)
random.shuffle(alpha_data)

alpha_path = DATA_DIR / "sft_alpha.jsonl"
save_jsonl(alpha_data, alpha_path)
print(f"Alpha training data: {len(alpha_data)} examples (golden only)")

In [None]:
# Beta dataset: formatted golden traces plus game traces.
# Only the first 150 clean game traces (in file order) are included.
game_traces_to_use = clean_game_traces[:150]

# Build a new combined list, then shuffle it in place
beta_data = [*golden_formatted, *game_traces_to_use]
random.shuffle(beta_data)

beta_path = DATA_DIR / "sft_beta.jsonl"
save_jsonl(beta_data, beta_path)
print(f"Beta training data: {len(beta_data)} examples ({len(golden_formatted)} golden + {len(game_traces_to_use)} game)")

## Summary

Created files:
- `data/train.jsonl` - Training puzzles (for reference)
- `data/test.jsonl` - Test puzzles (150, temporal split)
- `data/sft_alpha.jsonl` - Alpha training data (golden traces only)
- `data/sft_beta.jsonl` - Beta training data (golden + game traces)