# Day 4: ScoreImportance Compilation with GEPA

**Goal**: Compile ScoreImportance module to improve from 77.5% ±2 accuracy to 85%+  
**Optimizer**: GEPA (primary choice) with budget=40 rollouts  
**Expected Runtime**: 4-6 hours  

## Baseline Performance
- ±2 Accuracy: 77.5% (31/40 seeds)
- MAE: 1.45
- Weakest category: Mundane (17%)
- Target: ±2 Accuracy > 85%

## Setup: Install Dependencies

In [None]:
# Install required packages
!pip install -q dspy-ai sentence-transformers accelerate

## Mount Google Drive (for file persistence)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Create directory structure if needed
!mkdir -p /content/drive/MyDrive/mini-town/compiled
!mkdir -p /content/drive/MyDrive/mini-town/checkpoints

## Upload Files

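**Required inputs**:
1. `scorer_v1.json` - 40 training seeds from Day 3
2. Your `GROQ_API_KEY` (for example, from your `.env` file) - the cell below prompts for it; the `.env` file itself is not read by this notebook

Upload `scorer_v1.json` to `/content/`, or copy it from Google Drive.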

In [None]:
# Option 1: Upload files directly
# files.upload() opens Colab's upload widget; the return value is kept in
# `uploaded` but is not used again in this cell.
from google.colab import files
print("Upload scorer_v1.json:")
uploaded = files.upload()

# Option 2: Copy from Google Drive (uncomment if files are in Drive)
# !cp /content/drive/MyDrive/mini-town/seeds/scorer_v1.json /content/

# Set API key
# getpass prompts without echoing the key; the value is placed in the process
# environment under GROQ_API_KEY so later cells can read it with os.getenv.
import os
from getpass import getpass
os.environ['GROQ_API_KEY'] = getpass('Enter your GROQ_API_KEY: ')

## Configure DSPy with Groq LLM

In [None]:
import dspy
import os

# Configure Groq LLM
# The key is read from the GROQ_API_KEY environment variable. os.getenv returns
# None when it is unset, so run the API-key cell before this one.
lm = dspy.LM(
    model="groq/llama-3.1-8b-instant",
    api_key=os.getenv('GROQ_API_KEY'),
    temperature=0.3,  # NOTE(review): non-zero, so repeated runs on the same seed may differ — confirm
    max_tokens=512
)

# Make `lm` the language model DSPy uses for the modules built in later cells.
dspy.settings.configure(lm=lm)
print("✅ DSPy configured with Groq LLM (llama-3.1-8b-instant)")

## Define ScoreImportance Signature

In [None]:
# DSPy signature for the importance scorer: three string inputs, and two
# outputs (a short rationale and an integer score).
# NOTE(review): the class docstring and the `desc=` strings are runtime text
# handed to DSPy (presumably rendered into the prompt), so rewording them
# changes what the model sees, not just the documentation — confirm before editing.
class ScoreImportance(dspy.Signature):
    """Rate how important this observation is for the agent's goals.

    Score 1-10 where:
    - 1-2: Trivial, background noise (e.g., "grass is green")
    - 3-4: Mildly interesting but not actionable
    - 5-6: Relevant to goals, worth remembering
    - 7-8: Directly impacts current plans or goals
    - 9-10: Life-changing, urgent, critical to goals
    """

    # Inputs: the observation being rated plus the agent context it is rated against.
    observation: str = dspy.InputField(desc="What the agent observed")
    agent_goal: str = dspy.InputField(desc="Agent's current high-level goal")
    agent_personality: str = dspy.InputField(desc="Agent's personality traits")

    # Outputs: `reasoning` is declared before `score`.
    reasoning: str = dspy.OutputField(desc="Brief explanation of score")
    score: int = dspy.OutputField(desc="Importance score (1-10)")

print("✅ ScoreImportance signature defined")

## Load and Prepare Seeds

In [None]:
import json

# Load seeds
with open('scorer_v1.json', 'r') as f:
    seeds_data = json.load(f)

print(f"Loaded {len(seeds_data['seeds'])} seeds")
print(f"Categories: {seeds_data['categories']}")

# Convert to DSPy examples
# One dspy.Example per seed. Only the three fields named in with_inputs() are
# marked as inputs; the gold score, category and seed id ride along as
# label/metadata fields.
trainset = [
    dspy.Example(
        observation=seed['observation'],
        agent_goal=seed['agent_goal'],
        agent_personality=seed['agent_personality'],
        score=seed['gold_score'],
        category=seed['category'],  # For analysis
        seed_id=seed['id'],  # For tracking
    ).with_inputs("observation", "agent_goal", "agent_personality")
    for seed in seeds_data['seeds']
]

print(f"✅ Created {len(trainset)} training examples")

# Show sample
print("\nSample example:")
print(f"Observation: {trainset[0].observation}")
print(f"Goal: {trainset[0].agent_goal}")
print(f"Personality: {trainset[0].agent_personality}")
print(f"Gold score: {trainset[0].score}")

## Define Importance Metric

In [None]:
def importance_metric(example, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Metric for ScoreImportance compilation.

    Graded credit by absolute distance between predicted and gold score:
    - Exact match: 1.0
    - Within ±1: 0.8
    - Within ±2: 0.5
    - Within ±3: 0.2
    - Else: 0.0

    Args:
        example: Object whose ``score`` attribute is the gold score.
        pred: Object whose ``score`` attribute is the predicted score
            (an int or anything ``int()`` accepts).
        trace: Accepted for optimizer compatibility; not used.
        pred_name: Accepted so the function can be called with the
            five-argument form GEPA uses; not used.
        pred_trace: Same as ``pred_name``; not used.

    Returns:
        float: One of 1.0, 0.8, 0.5, 0.2 or 0.0. A prediction whose score is
        missing, ``None`` or not convertible to ``int`` scores 0.0.
    """
    try:
        pred_score = int(pred.score)
    except (ValueError, TypeError, AttributeError):
        # TypeError covers score=None (int(None)), which the model can produce
        # when it fails to fill the field; treat it like any unparseable score.
        return 0.0

    # Clamp to 1-10 so out-of-range predictions are judged as the nearest valid score.
    pred_score = max(1, min(10, pred_score))
    gold_score = int(example.score)

    error = abs(pred_score - gold_score)

    # (maximum error, credit) tiers, checked from tightest to loosest.
    for max_error, credit in ((0, 1.0), (1, 0.8), (2, 0.5), (3, 0.2)):
        if error <= max_error:
            return credit
    return 0.0

print("✅ Importance metric defined")

# Test metric
class DummyPred:
    """Minimal stand-in for a prediction: an object exposing only ``score``."""

    def __init__(self, score):
        # Stored exactly as given; no conversion or validation happens here.
        self.score = score

# Smoke-test the metric against a gold score of 7: each pair is
# (predicted score, expected credit as printed).
example = dspy.Example(score=7)
for pred_value, expected in ((7, "1.0"), (8, "0.8"), (9, "0.5"), (1, "0.0")):
    print(f"Test: gold=7, pred={pred_value} → {importance_metric(example, DummyPred(pred_value)):.2f} (expect {expected})")

## Create Uncompiled Baseline

In [None]:
# Uncompiled baseline (ChainOfThought)
# Wraps the ScoreImportance signature in dspy.ChainOfThought. This object is
# evaluated as the baseline and is also the program passed to the optimizer.
uncompiled_scorer = dspy.ChainOfThought(ScoreImportance)

print("✅ Uncompiled baseline created")
print("Module type:", type(uncompiled_scorer).__name__)

## Evaluate Uncompiled Baseline

Quick check to verify baseline matches Day 3 results (77.5% ±2 accuracy)

In [None]:
def evaluate_module(module, testset, verbose=False):
    """Score ``module`` on every example in ``testset`` and summarise the errors.

    Args:
        module: Callable accepting ``observation``, ``agent_goal`` and
            ``agent_personality`` keyword arguments and returning an object
            with a ``score`` attribute.
        testset: Iterable of examples exposing those three attributes plus the
            gold ``score``.
        verbose: When true, print the exception for each example whose
            prediction failed.

    Returns:
        dict with:
        - ``exact`` / ``within_1`` / ``within_2``: counts of examples with
          absolute error 0, <= 1 and <= 2.
        - ``failed``: number of examples where calling the module or parsing
          its score raised; those examples are scored with the default 5.
        - ``errors`` / ``predictions``: per-example absolute error and
          (clamped) predicted score, in testset order.
        - ``accuracy_exact`` / ``accuracy_within_1`` / ``accuracy_within_2``:
          the counts as percentages.
        - ``mean_error`` / ``max_error``: mean and maximum absolute error.
        For an empty testset the percentages and ``mean_error`` are 0.0 and
        ``max_error`` is 0.
    """
    results = {
        'exact': 0,
        'within_1': 0,
        'within_2': 0,
        'failed': 0,
        'errors': [],
        'predictions': []
    }

    for i, example in enumerate(testset):
        try:
            pred = module(
                observation=example.observation,
                agent_goal=example.agent_goal,
                agent_personality=example.agent_personality
            )
            pred_score = max(1, min(10, int(pred.score)))  # Clamp to 1-10
        except Exception as e:
            # Deliberately broad: one failed call or unparseable score should
            # not abort the whole evaluation. The failure is counted so it
            # stays visible even when verbose is off.
            results['failed'] += 1
            if verbose:
                print(f"Error on example {i}: {e}")
            pred_score = 5  # Default (middle of the scale)

        gold_score = int(example.score)
        error = abs(pred_score - gold_score)

        results['errors'].append(error)
        results['predictions'].append(pred_score)

        # The buckets are cumulative: an exact match also counts as within ±1 and ±2.
        if error == 0:
            results['exact'] += 1
        if error <= 1:
            results['within_1'] += 1
        if error <= 2:
            results['within_2'] += 1

    # Count what was actually evaluated so any iterable works and an empty
    # testset does not divide by zero.
    n = len(results['errors'])
    if n == 0:
        results['accuracy_exact'] = 0.0
        results['accuracy_within_1'] = 0.0
        results['accuracy_within_2'] = 0.0
        results['mean_error'] = 0.0
        results['max_error'] = 0
        return results

    results['accuracy_exact'] = results['exact'] / n * 100
    results['accuracy_within_1'] = results['within_1'] / n * 100
    results['accuracy_within_2'] = results['within_2'] / n * 100
    results['mean_error'] = sum(results['errors']) / n
    results['max_error'] = max(results['errors'])

    return results

print("Evaluating uncompiled baseline (this may take 2-3 minutes)...\n")
uncompiled_results = evaluate_module(uncompiled_scorer, trainset, verbose=True)

# Take the denominator from the data instead of hard-coding 40, so the report
# stays correct if the seed file grows or shrinks.
n_seeds = len(trainset)

print("=" * 70)
print("UNCOMPILED BASELINE PERFORMANCE")
print("=" * 70)
print(f"Exact matches:      {uncompiled_results['exact']:2d}/{n_seeds} ({uncompiled_results['accuracy_exact']:.1f}%)")
print(f"Within ±1:          {uncompiled_results['within_1']:2d}/{n_seeds} ({uncompiled_results['accuracy_within_1']:.1f}%)")
print(f"Within ±2:          {uncompiled_results['within_2']:2d}/{n_seeds} ({uncompiled_results['accuracy_within_2']:.1f}%)")
print(f"Mean Absolute Error: {uncompiled_results['mean_error']:.2f}")
print(f"Max Error:          {uncompiled_results['max_error']}")
print("=" * 70)

## Initialize GEPA Optimizer

In [None]:
import time

from dspy import GEPA

# GEPA writes its logs and candidate programs here; reusing the same directory
# lets an interrupted run resume. The directory is created in the Drive-mount cell.
GEPA_LOG_DIR = '/content/drive/MyDrive/mini-town/checkpoints'

# NOTE(review): GEPA has no `budget` argument; exactly one of `auto`,
# `max_full_evals` or `max_metric_calls` must be given. The original "40" is
# mapped to full evaluations because 40 single metric calls would be used up
# by one pass over the 40 seeds. Confirm this matches the intended budget.
GEPA_MAX_FULL_EVALS = 40


def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Adapt importance_metric to the five-argument call GEPA makes.

    ``pred_name`` and ``pred_trace`` are accepted and ignored; the score is
    whatever importance_metric returns for (gold, pred, trace).
    """
    return importance_metric(gold, pred, trace)


# GEPA configuration
optimizer = GEPA(
    metric=gepa_metric,
    max_full_evals=GEPA_MAX_FULL_EVALS,
    # GEPA requires a reflection LM. The task LM is reused to avoid assuming
    # access to another model; a stronger model here is generally recommended.
    reflection_lm=lm,
    log_dir=GEPA_LOG_DIR,
)

print("✅ GEPA optimizer initialized")
print(f"Budget: {GEPA_MAX_FULL_EVALS} full evaluations")
print(f"Expected runtime: 4-6 hours")
print(f"Checkpoints will be saved to Google Drive")

## Run GEPA Compilation

⚠️ **This cell will take 4-6 hours to run**  
✅ You can leave the tab open or use a keep-alive script  
💾 Checkpoints saved every 10 iterations to Google Drive

In [None]:
# Banner with the run parameters, printed before the long-running call starts.
print("=" * 70)
print("STARTING GEPA COMPILATION")
print("=" * 70)
print(f"Training set size: {len(trainset)}")
print(f"Budget: 40 rollouts")
print(f"Expected runtime: 4-6 hours")
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 70)
print("\n⏳ Compilation running... (this will take a while)\n")

# Wall-clock start; `elapsed` below is derived from it and is reused by the
# save cell for the results summary.
start_time = time.time()

# Run compilation
try:
    # Optimizes the uncompiled ChainOfThought scorer against the training seeds.
    compiled_scorer = optimizer.compile(
        uncompiled_scorer,
        trainset=trainset
    )
    
    elapsed = time.time() - start_time
    print("\n" + "=" * 70)
    print("✅ COMPILATION COMPLETE!")
    print("=" * 70)
    print(f"Time elapsed: {elapsed/3600:.2f} hours ({elapsed/60:.1f} minutes)")
    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 70)
    
except Exception as e:
    # Print troubleshooting hints, then re-raise so the cell still fails and
    # later cells do not run against a missing `compiled_scorer`.
    print(f"\n❌ Compilation failed: {e}")
    print("\nTroubleshooting steps:")
    print("1. Check Groq API key is valid")
    print("2. Check internet connection")
    print("3. Try reducing budget to 30")
    print("4. Try MIPROv2 optimizer as fallback")
    raise

## Evaluate Compiled Module

In [None]:
print("Evaluating compiled module (this may take 2-3 minutes)...\n")
compiled_results = evaluate_module(compiled_scorer, trainset, verbose=True)

# Take the denominator from the data instead of hard-coding 40, so the report
# stays correct if the seed file grows or shrinks.
n_seeds = len(trainset)

print("=" * 70)
print("COMPILED MODULE PERFORMANCE")
print("=" * 70)
print(f"Exact matches:      {compiled_results['exact']:2d}/{n_seeds} ({compiled_results['accuracy_exact']:.1f}%)")
print(f"Within ±1:          {compiled_results['within_1']:2d}/{n_seeds} ({compiled_results['accuracy_within_1']:.1f}%)")
print(f"Within ±2:          {compiled_results['within_2']:2d}/{n_seeds} ({compiled_results['accuracy_within_2']:.1f}%)")
print(f"Mean Absolute Error: {compiled_results['mean_error']:.2f}")
print(f"Max Error:          {compiled_results['max_error']}")
print("=" * 70)

## Comparison: Uncompiled vs Compiled

In [None]:
# ±2 accuracy target stated at the top of the notebook (85%); the success
# messages previously printed 80%, which contradicted it.
TARGET_WITHIN_2_PCT = 85

print("\n" + "=" * 70)
print("📊 PERFORMANCE COMPARISON")
print("=" * 70)

# Both deltas are signed so that a positive number means "compiled is better":
# accuracy should go up, mean absolute error should go down.
improvement = compiled_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']
mae_improvement = uncompiled_results['mean_error'] - compiled_results['mean_error']

print("\n| Metric | Uncompiled | Compiled | Improvement |")
print("|--------|------------|----------|-------------|")
print(f"| Exact  | {uncompiled_results['accuracy_exact']:5.1f}%   | {compiled_results['accuracy_exact']:5.1f}% | {compiled_results['accuracy_exact'] - uncompiled_results['accuracy_exact']:+6.1f}% |")
print(f"| ±1     | {uncompiled_results['accuracy_within_1']:5.1f}%   | {compiled_results['accuracy_within_1']:5.1f}% | {compiled_results['accuracy_within_1'] - uncompiled_results['accuracy_within_1']:+6.1f}% |")
print(f"| **±2** | **{uncompiled_results['accuracy_within_2']:5.1f}%** | **{compiled_results['accuracy_within_2']:5.1f}%** | **{improvement:+6.1f}%** |")
print(f"| MAE    | {uncompiled_results['mean_error']:5.2f}    | {compiled_results['mean_error']:5.2f}  | {mae_improvement:+6.2f}   |")

print("\n" + "=" * 70)

# Success criteria check (thresholds are percentage points of ±2 accuracy)
if improvement >= 10:
    print("\n✅ SUCCESS! Improvement ≥10%, proceed to Day 5")
    print(f"   Target: {TARGET_WITHIN_2_PCT}% → Achieved: {compiled_results['accuracy_within_2']:.1f}%")
elif improvement >= 5:
    print("\n⚠️  PARTIAL SUCCESS. Improvement 5-10%, consider iteration")
    print(f"   Target: {TARGET_WITHIN_2_PCT}% → Achieved: {compiled_results['accuracy_within_2']:.1f}%")
else:
    print("\n❌ INSUFFICIENT IMPROVEMENT (<5%)")
    print("\n   Options:")
    print("   1. Try MIPROv2 optimizer")
    print("   2. Add more mundane category seeds")
    print("   3. Adjust metric tolerances")
    print("   4. Review worst-performing seeds")

## Analyze by Category

In [None]:
from collections import defaultdict

def evaluate_by_category(module, testset):
    """Run ``module`` over ``testset`` and group the outcomes by ``example.category``.

    Returns a defaultdict mapping each category to a dict of three parallel
    lists: ``errors`` (absolute error), ``gold_scores`` and ``pred_scores``.
    A prediction that raises for any reason is recorded as a score of 5.
    """
    def _new_bucket():
        return {'errors': [], 'gold_scores': [], 'pred_scores': []}

    per_category = defaultdict(_new_bucket)

    for item in testset:
        try:
            raw = module(
                observation=item.observation,
                agent_goal=item.agent_goal,
                agent_personality=item.agent_personality
            )
            # Clamp into the 1-10 scale.
            predicted = min(10, max(1, int(raw.score)))
        except Exception:
            predicted = 5

        expected = int(item.score)

        bucket = per_category[item.category]
        bucket['errors'].append(abs(predicted - expected))
        bucket['gold_scores'].append(expected)
        bucket['pred_scores'].append(predicted)

    return per_category

def _category_summary(data):
    """Return (±2 accuracy in percent, MAE, number of seeds) for one category's results."""
    errors = data['errors']
    hits = sum(1 for e in errors if e <= 2)
    return hits / len(errors) * 100, sum(errors) / len(errors), len(errors)


print("\n" + "=" * 70)
print("📊 UNCOMPILED MODULE BY CATEGORY")
print("=" * 70)

uncompiled_by_cat = evaluate_by_category(uncompiled_scorer, trainset)
for cat in sorted(uncompiled_by_cat.keys()):
    accuracy, mae, n_cat = _category_summary(uncompiled_by_cat[cat])
    print(f"{cat:20s}: {accuracy:5.1f}% ±2 accuracy, MAE={mae:.2f} ({n_cat} seeds)")

print("\n" + "=" * 70)
print("📊 COMPILED MODULE BY CATEGORY")
print("=" * 70)

compiled_by_cat = evaluate_by_category(compiled_scorer, trainset)
for cat in sorted(compiled_by_cat.keys()):
    accuracy, mae, n_cat = _category_summary(compiled_by_cat[cat])
    uncompiled_accuracy, _, _ = _category_summary(uncompiled_by_cat[cat])

    # Uses its own name on purpose: assigning to `improvement` here would
    # overwrite the overall ±2 delta computed in the comparison cell, which
    # the save cell later writes out as `accuracy_delta`.
    cat_improvement = accuracy - uncompiled_accuracy

    print(f"{cat:20s}: {accuracy:5.1f}% ±2 accuracy, MAE={mae:.2f} ({n_cat} seeds) [{cat_improvement:+5.1f}%]")

## Top Errors Analysis

In [None]:
# Find worst predictions
# Pair every training seed with the error and prediction recorded for it during
# the compiled-module evaluation, then order the rows from largest error down.
error_details = sorted(
    (
        {
            'seed_id': seed.seed_id,
            'category': seed.category,
            'observation': seed.observation[:60] + '...',
            'gold': int(seed.score),
            'pred': predicted,
            'error': abs_error
        }
        for seed, abs_error, predicted in zip(
            trainset, compiled_results['errors'], compiled_results['predictions']
        )
    ),
    key=lambda row: row['error'],
    reverse=True,
)

print("\n" + "=" * 70)
print("🔍 TOP 10 LARGEST ERRORS (COMPILED)")
print("=" * 70)

# Ranks are 1-based for display.
for i, err in enumerate(error_details[:10], start=1):
    print(f"\n{i}. Seed #{err['seed_id']}: Error = {err['error']}")
    print(f"   Observation: \"{err['observation']}\"")
    print(f"   Gold: {err['gold']}, Predicted: {err['pred']}")
    print(f"   Category: {err['category']}")

## Save Compiled Program

In [None]:
# Save to Google Drive
save_path = '/content/drive/MyDrive/mini-town/compiled/compiled_scorer.json'
compiled_scorer.save(save_path)
print(f"✅ Compiled scorer saved to: {save_path}")

# Extract and save prompts for inspection
prompt_text = str(compiled_scorer.dump_state())
prompt_path = '/content/drive/MyDrive/mini-town/compiled/prompt_scorer.txt'
# Explicit UTF-8 so non-ASCII characters in the dumped state cannot fail on a
# platform whose default encoding is narrower.
with open(prompt_path, 'w', encoding='utf-8') as f:
    f.write(prompt_text)
print(f"✅ Prompts saved to: {prompt_path}")

# Recompute the overall deltas from the result dicts instead of reading the
# notebook-level `improvement`, which the per-category cell reassigns inside
# its loop (leaving the last category's delta, not the overall one).
accuracy_delta = compiled_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']
mae_delta = uncompiled_results['mean_error'] - compiled_results['mean_error']

# Save results summary
results_summary = {
    'compilation_time_hours': elapsed / 3600,
    'uncompiled': {
        'accuracy_within_2': uncompiled_results['accuracy_within_2'],
        'mean_error': uncompiled_results['mean_error'],
        'exact_matches': uncompiled_results['exact']
    },
    'compiled': {
        'accuracy_within_2': compiled_results['accuracy_within_2'],
        'mean_error': compiled_results['mean_error'],
        'exact_matches': compiled_results['exact']
    },
    'improvement': {
        'accuracy_delta': accuracy_delta,
        'mae_delta': mae_delta
    }
}

results_path = '/content/drive/MyDrive/mini-town/compiled/compilation_results.json'
with open(results_path, 'w') as f:
    json.dump(results_summary, f, indent=2)
print(f"✅ Results summary saved to: {results_path}")

print("\n" + "=" * 70)
print("🎉 COMPILATION COMPLETE!")
print("=" * 70)
print("\nNext steps:")
print("1. Download compiled_scorer.json to local project")
print("2. Review prompt_scorer.txt to understand optimizations")
print("3. Proceed to Day 5: A/B testing in full simulation")

## Fallback: MIPROv2 (if GEPA doesn't work)

Uncomment and run this cell if GEPA has issues

In [None]:
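# from dspy.teleprompt import MIPROv2
# 
# print("Falling back to MIPROv2 optimizer...")
# 
# # NOTE: `num_trials=10` was removed from this call. In recent DSPy releases
# # `num_trials` is an argument of compile(), not of the constructor, and it
# # cannot be combined with `auto=...`. To use 10 trials, set auto=None here and
# # pass num_trials=10 to compile() (check the MIPROv2 docs for your version).
# mipro_optimizer = MIPROv2(
#     metric=importance_metric,
#     auto="medium",
#     max_bootstrapped_demos=4,
#     max_labeled_demos=5
# )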
# 
# print("Running MIPROv2 compilation (expect 6-8 hours)...")
# start_time = time.time()
# 
# compiled_scorer_mipro = mipro_optimizer.compile(
#     uncompiled_scorer,
#     trainset=trainset
# )
# 
# elapsed = time.time() - start_time
# print(f"\n✅ MIPROv2 compilation complete! Time: {elapsed/3600:.2f} hours")
# 
# # Evaluate MIPROv2 results
# mipro_results = evaluate_module(compiled_scorer_mipro, trainset)
# print(f"MIPROv2 ±2 accuracy: {mipro_results['accuracy_within_2']:.1f}%")
# print(f"Improvement: +{mipro_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']:.1f}%")