# Day 4: ScoreImportance Compilation with GEPA

**Goal**: Compile ScoreImportance module to improve from 77.5% ±2 accuracy to 85%+  
**Optimizer**: GEPA (primary choice) with budget=40 rollouts  
**Expected Runtime**: 4-6 hours  

## Baseline Performance
- ±2 Accuracy: 77.5% (31/40 seeds)
- MAE: 1.45
- Weakest category: Mundane (17%)
- Target: ±2 Accuracy > 85%

## Setup: Install Dependencies

In [None]:
# Install required packages
# `gepa` is pinned to an exact version; the other packages are installed
# unpinned, so their versions depend on when the cell is run.
!pip install -q dspy-ai sentence-transformers accelerate
!pip install --quiet gepa==0.0.7  # run once per Colab session


## Mount Google Drive (for file persistence)

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



## Upload Files

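**Required files**:
1. `town_agent_train.jsonl`, `town_agent_dev.jsonl`, `town_agent_test.jsonl` - the dataset splits loaded by the next cell (this notebook does not read `scorer_v1.json`, the 40 training seeds from Day 3, directly)
2. A Together.ai API key (`TOGETHER_API_KEY`); the next cell prompts for it if it is not already set in the environment

Put the dataset files in `/content/drive/MyDrive/mini-town/datasets` on Google Drive, or adjust `DATA_DIR` in the next cell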

In [None]:
# Mount/upload your latest JSONL splits first (e.g. via Drive or files tab).
from pathlib import Path
import json

# Folder holding the three JSONL splits; adjust if you put them elsewhere.
DATA_DIR = Path("/content/drive/MyDrive/mini-town/datasets")
train_jsonl = DATA_DIR.joinpath("town_agent_train.jsonl")
dev_jsonl = DATA_DIR.joinpath("town_agent_dev.jsonl")
test_jsonl = DATA_DIR.joinpath("town_agent_test.jsonl")

# Fail fast on the first split (in train/dev/test order) that is not on disk.
first_missing = next(
    (candidate for candidate in (train_jsonl, dev_jsonl, test_jsonl) if not candidate.exists()),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"Missing dataset: {first_missing}")

print("✅ Dataset files located")
for report_line in (
    f"  Train: {train_jsonl}",
    f"  Dev:   {dev_jsonl}",
    f"  Test:  {test_jsonl}",
):
    print(report_line)

# Set API key (if TOGETHER_API_KEY already set, can skip)
import os
from getpass import getpass
# Prompt for the Together.ai key only when the environment does not already
# provide a non-empty one.
if not os.getenv("TOGETHER_API_KEY"):
    # Strip stray whitespace from pasted keys and refuse an empty entry, so a
    # mis-paste fails here instead of as an authentication error mid-run.
    together_key = getpass("Enter your Together.ai API key: ").strip()
    if not together_key:
        raise ValueError("Together.ai API key must not be empty")
    os.environ["TOGETHER_API_KEY"] = together_key
    print("✅ Together.ai API key set")
else:
    print("✅ Together.ai API key already set")


In [None]:
# No-op confirmation cell: the Together.ai key is read from the environment or
# prompted for in the dataset/API-key cell above; nothing is configured here.
print("✅ API key configuration complete")


## Configure DSPy with Together.ai LLM

In [None]:
# Configure Together.ai LM
import os
import dspy

# Settings shared by both LM handles: same Together.ai model, same key source,
# same completion-length cap.
_shared_lm_kwargs = {
    "model": "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "api_key": os.getenv("TOGETHER_API_KEY"),
    "max_tokens": 512,
}

# Task LM (temperature 0.3) and a second handle on the same model at a higher
# temperature (0.5).
lm = dspy.LM(temperature=0.3, **_shared_lm_kwargs)
reflection_lm = dspy.LM(temperature=0.5, **_shared_lm_kwargs)

# Re-configure DSPy with Together.ai: only `lm` becomes the default LM.
dspy.settings.configure(lm=lm)

for status_line in (
    "✅ DSPy configured with Together.ai",
    "   Model: Meta-Llama-3.1-8B-Instruct-Turbo",
):
    print(status_line)

# Import GEPA standalone components
from gepa.api import optimize as gepa_optimize
from gepa.adapters.dspy_adapter.dspy_adapter import DspyAdapter, ScoreWithFeedback
from gepa.logging.logger import StdOutLogger

print("✅ GEPA standalone components imported")


## Define ScoreImportance Signature

In [None]:
# Signature for the importance-scoring task: three string inputs describing
# the observation and the agent, and two outputs (a free-text rationale and an
# integer score).
# NOTE(review): the class docstring presumably doubles as the prompt
# instructions (the optimizer setup reads `signature.instructions`) -- treat
# its wording as behavior, not as documentation, and confirm before editing.
class ScoreImportance(dspy.Signature):
    """Rate how important this observation is for the agent's goals.

    Score 1-10 where:
    - 1-2: Trivial, background noise (e.g., "grass is green")
    - 3-4: Mildly interesting but not actionable
    - 5-6: Relevant to goals, worth remembering
    - 7-8: Directly impacts current plans or goals
    - 9-10: Life-changing, urgent, critical to goals
    """

    # Inputs supplied by the caller for every example.
    observation: str = dspy.InputField(desc="What the agent observed")
    agent_goal: str = dspy.InputField(desc="Agent's current high-level goal")
    agent_personality: str = dspy.InputField(desc="Agent's personality traits")

    # Outputs produced by the model; `score` is declared as an int.
    reasoning: str = dspy.OutputField(desc="Brief explanation of score")
    score: int = dspy.OutputField(desc="Importance score (1-10)")

print("✅ ScoreImportance signature defined")


## Load and Prepare Seeds

In [None]:
def load_examples(jsonl_path):
    """Load DSPy examples from a JSONL dataset.

    Each non-blank line is parsed as one JSON object. Missing keys fall back
    to alternates or defaults: `observation` falls back to
    `recent_observations`, `score` to `importance_score` (then 5), and
    `seed_id` is taken from `agent_id`, then `id`, then 0.

    Args:
        jsonl_path: Path (str or Path) of the JSONL file to read.

    Returns:
        A list of `dspy.Example` objects whose inputs are `observation`,
        `agent_goal` and `agent_personality`.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with Path(jsonl_path).open("r", encoding="utf-8") as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; skipping them avoids a spurious JSONDecodeError.
            if not line.strip():
                continue
            record = json.loads(line)
            examples.append(
                dspy.Example(
                    observation=record.get("observation", record.get("recent_observations", "")),
                    agent_goal=record.get("agent_goal", ""),
                    agent_personality=record.get("agent_personality", ""),
                    score=record.get("score", record.get("importance_score", 5)),
                    category=record.get("category", "unknown"),
                    seed_id=record.get("agent_id", record.get("id", 0)),
                ).with_inputs("observation", "agent_goal", "agent_personality")
            )
    return examples

# Load datasets from JSONL files (train, dev, test -- in that order)
trainset, devset, testset = (
    load_examples(split_path) for split_path in (train_jsonl, dev_jsonl, test_jsonl)
)

for count_line in (
    f"✅ Train examples: {len(trainset)}",
    f"✅ Dev examples:   {len(devset)}",
    f"✅ Test examples:  {len(testset)}",
):
    print(count_line)

# Show sample from training set
if not trainset:
    print("⚠️  No training examples loaded!")
else:
    first_example = trainset[0]
    print("\nSample train example:")
    print(f"  Observation: {first_example.observation[:80]}…")
    print(f"  Goal: {first_example.agent_goal}")
    print(f"  Personality: {first_example.agent_personality}")
    print(f"  Score: {first_example.score}")
    print(f"  Category: {first_example.category}")


## Define Importance Metric

In [None]:
def importance_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Grade a predicted importance score against the gold score.

    Takes the five-argument metric form; only `gold` and `pred` are used.

    Args:
    - gold: Object with a `.score` attribute, or a bare int-convertible value
    - pred: Object with a `.score` attribute, or a bare int-convertible value
    - trace, pred_name, pred_trace: Accepted for compatibility and ignored

    Returns:
    - float: 1.0 for an exact match, 0.8 / 0.5 / 0.2 when the prediction is
      off by at most 1 / 2 / 3, and 0.0 otherwise -- including when either
      score cannot be converted to an int.
    """
    try:
        # A value without a `.score` attribute is treated as the score itself.
        expected = int(gold.score) if hasattr(gold, 'score') else int(gold)
        guessed = int(pred.score) if hasattr(pred, 'score') else int(pred)
    except (ValueError, AttributeError, TypeError):
        return 0.0

    # Only the prediction is clamped to the 1-10 scale; gold is used as given.
    guessed = min(10, max(1, guessed))
    gap = abs(guessed - expected)

    # Credit tiers, checked from the tightest tolerance outward.
    for tolerance, credit in ((0, 1.0), (1, 0.8), (2, 0.5), (3, 0.2)):
        if gap <= tolerance:
            return credit
    return 0.0


print("✅ GEPA-compatible importance metric defined")


# Dummy test classes
class DummyExample:
    """Minimal stand-in for a gold example: carries only a `score` attribute."""

    def __init__(self, score):
        self.score = score


class DummyPred:
    """Minimal stand-in for a prediction: carries only a `score` attribute."""

    def __init__(self, score):
        self.score = score


# Test cases: one fixed gold score of 7 against predictions at increasing
# distance, printed next to the value the metric is expected to return.
test_gold = DummyExample(7)
for candidate, expected_text in ((7, "1.0"), (8, "0.8"), (9, "0.5"), (1, "0.0")):
    observed = importance_metric(test_gold, DummyPred(candidate))
    print(f"Test: gold=7, pred={candidate} → {observed:.2f} (expect {expected_text})")


## Create Uncompiled Baseline

In [None]:
# Uncompiled baseline: the ScoreImportance signature wrapped in ChainOfThought
uncompiled_scorer = dspy.ChainOfThought(ScoreImportance)

print("✅ Uncompiled baseline created")
print(f"Module type: {type(uncompiled_scorer).__name__}")


## Evaluate Uncompiled Baseline

Quick check to verify baseline matches Day 3 results (77.5% ±2 accuracy)

In [None]:
def evaluate_module(module, examples, verbose=False):
    """Run `module` over `examples` and summarise how close its scores are.

    Each example must expose `observation`, `agent_goal`, `agent_personality`
    and a gold `score`. Predicted scores are clamped to 1-10. A call that
    raises, or whose score cannot be converted to an int, is counted as a
    neutral 5 (the same fallback `evaluate_by_category` uses) so that one bad
    completion does not abort a long evaluation.

    Args:
        module: Callable accepting `observation`, `agent_goal` and
            `agent_personality` keyword arguments and returning an object
            with a `.score` attribute.
        examples: Sequence of examples to evaluate.
        verbose: When True, print one line per example (and per failure).

    Returns:
        dict with keys `exact`, `within_1`, `within_2` (counts),
        `accuracy_exact`, `accuracy_within_1`, `accuracy_within_2`
        (percentages, 0-100), `mean_error`, `max_error`, plus the
        per-example `errors` and `predictions` lists in input order. All
        aggregates are 0 for an empty `examples`.
    """
    predictions = []
    errors = []
    total = len(examples)

    for position, example in enumerate(examples, 1):
        try:
            pred = module(
                observation=example.observation,
                agent_goal=example.agent_goal,
                agent_personality=example.agent_personality,
            )
            pred_score = max(1, min(10, int(pred.score)))
        except Exception as exc:  # best-effort: keep evaluating the remaining examples
            pred_score = 5
            if verbose:
                print(f"  [{position}/{total}] prediction failed ({type(exc).__name__}: {exc}); using 5")

        gold_score = int(example.score)
        error = abs(pred_score - gold_score)
        predictions.append(pred_score)
        errors.append(error)
        if verbose:
            print(f"  [{position}/{total}] gold={gold_score} pred={pred_score} error={error}")

    def _count_within(limit):
        # Number of examples whose absolute error is at most `limit`.
        return sum(1 for e in errors if e <= limit)

    def _percent(count):
        # Share of all examples as a percentage; 0.0 when nothing was evaluated.
        return 100.0 * count / total if total else 0.0

    exact = _count_within(0)
    within_1 = _count_within(1)
    within_2 = _count_within(2)
    return {
        'exact': exact,
        'within_1': within_1,
        'within_2': within_2,
        'accuracy_exact': _percent(exact),
        'accuracy_within_1': _percent(within_1),
        'accuracy_within_2': _percent(within_2),
        'mean_error': sum(errors) / total if total else 0.0,
        'max_error': max(errors, default=0),
        'errors': errors,
        'predictions': predictions,
    }


print("Evaluating uncompiled baseline (this may take 2-3 minutes)...\n")
uncompiled_results = evaluate_module(uncompiled_scorer, trainset, verbose=True)

print("=" * 70)
print("UNCOMPILED BASELINE PERFORMANCE (TRAIN SET)")
print("=" * 70)
print(f"Exact matches:      {uncompiled_results['exact']:2d}/{len(trainset)} ({uncompiled_results['accuracy_exact']:.1f}%)")
print(f"Within ±1:          {uncompiled_results['within_1']:2d}/{len(trainset)} ({uncompiled_results['accuracy_within_1']:.1f}%)")
print(f"Within ±2:          {uncompiled_results['within_2']:2d}/{len(trainset)} ({uncompiled_results['accuracy_within_2']:.1f}%)")
print(f"Mean Absolute Error: {uncompiled_results['mean_error']:.2f}")
print(f"Max Error:          {uncompiled_results['max_error']}")
print("=" * 70)

# Optional: Evaluate on dev set if available
if devset:
    print("\nEvaluating on dev set...\n")
    dev_results = evaluate_module(uncompiled_scorer, devset, verbose=False)
    print(f"Dev set ±2 accuracy: {dev_results['accuracy_within_2']:.1f}%")


## Initialize GEPA Optimizer

In [None]:
# Build the GEPA seed candidate: predictor name -> current instruction text.
def _instructions_of(predictor):
    # Instruction string on the predictor's signature, or None when the
    # predictor has no signature or the signature has no `instructions`.
    return getattr(getattr(predictor, "signature", None), "instructions", None)


named_instructions = [
    (predictor_name, _instructions_of(module_predictor))
    for predictor_name, module_predictor in uncompiled_scorer.named_predictors()
]
# Keep only predictors whose instructions are an actual string.
seed_candidate = {
    predictor_name: text
    for predictor_name, text in named_instructions
    if isinstance(text, str)
}

if not seed_candidate:
    raise RuntimeError("No predictor instructions found for GEPA seed candidate.")

def score_feedback(component_name: str):
    """Create the per-predictor feedback callback handed to the GEPA adapter.

    The returned function grades one rollout with `importance_metric` and
    describes the miss in task terms (expected score, predicted score, and
    whether the prediction was too high or too low), so the reflection step
    receives feedback about importance scoring itself.

    Args:
        component_name: Predictor name; used to tag the feedback text.

    Returns:
        A callable with the
        `(predictor_output, predictor_inputs, module_inputs, module_outputs,
        captured_trace)` signature that returns a `ScoreWithFeedback`.
    """
    def _as_int(value):
        # int(value), or None when the value is missing or not int-convertible.
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def _fn(predictor_output, predictor_inputs, module_inputs, module_outputs, captured_trace):
        score = importance_metric(module_inputs, module_outputs)
        feedback_bits = [f"[{component_name}] current score: {score:.3f}."]

        raw_predicted = getattr(module_outputs, "score", None)
        expected = _as_int(getattr(module_inputs, "score", None))
        predicted = _as_int(raw_predicted)

        if predicted is None:
            feedback_bits.append(
                f"The predicted score {raw_predicted!r} is not an integer; "
                "answer with a single integer from 1 to 10."
            )
            if expected is not None:
                feedback_bits.append(f"The expected importance was {expected}.")
        elif expected is not None:
            gap = predicted - expected
            if gap == 0:
                feedback_bits.append(
                    f"Predicted importance {predicted} matches the expected score."
                )
            else:
                direction = "too high" if gap > 0 else "too low"
                feedback_bits.append(
                    f"Predicted importance {predicted} but the expected importance is "
                    f"{expected} ({direction} by {abs(gap)}). "
                    "Aim to land within 1 point of the expected score."
                )

        return ScoreWithFeedback(score=score, feedback=" ".join(feedback_bits))
    return _fn

# One feedback callback per predictor that has seed instructions.
feedback_map = {name: score_feedback(name) for name in seed_candidate}

# Adapter options collected in one place: the module to optimize, the metric
# and per-predictor feedback functions, and how failures are scored/reported.
adapter_options = {
    "student_module": uncompiled_scorer,
    "metric_fn": importance_metric,
    "feedback_map": feedback_map,
    "failure_score": 0.0,
    "num_threads": None,
    "add_format_failure_as_feedback": True,
}
adapter = DspyAdapter(**adapter_options)

print("✅ GEPA adapter and feedback map configured")
predictor_count = len(seed_candidate)
print(f"Found {predictor_count} predictor(s) to optimize")


## Run GEPA Compilation

**This cell will take 4-6 hours to run**  
You can leave the tab open or use a keep-alive script  
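Note: the cell below does not pass a checkpoint directory to GEPA, so do not rely on intermediate checkpoints being written to Google Drive; an interrupted run may have to be restarted from the beginning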

In [None]:
# Reuse existing LM as reflection model
import time

# Fetch the LM that DSPy is currently configured with; the reflection wrapper
# below calls it through this module-level name.
lm = dspy.settings.lm
if lm is None:
    # Point at the notebook cell that configures the LM rather than at a
    # helper function that this notebook never defines.
    raise RuntimeError("DSPy LM not configured; run the DSPy configuration cell first.")

def reflection_lm_fn(prompt: str) -> str:
    """Send `prompt` to the module-level `lm` and return the reply as text.

    Normalises the response shapes an LM client may return: a list/tuple of
    completions (the first one is used), a plain string, a dict or object
    carrying `text`, or an object with OpenAI-style `choices`.

    NOTE(review): this wraps the task LM bound to `lm`; the separately
    configured `reflection_lm` is not used here.

    Args:
        prompt: Prompt text to send to the LM.

    Returns:
        The reply text; "" for an empty completion list, and `str(response)`
        as a last resort for unrecognised shapes.
    """
    response = lm(prompt)

    # A list of completions must be unwrapped first; otherwise the final
    # str() fallback would return the list's repr (brackets, quotes, escaped
    # newlines) instead of the completion text.
    if isinstance(response, (list, tuple)):
        if not response:
            return ""
        response = response[0]

    if isinstance(response, str):
        return response
    if isinstance(response, dict) and "text" in response:
        return str(response["text"])
    if hasattr(response, "text"):
        return response.text
    if getattr(response, "choices", None):
        choice = response.choices[0]
        text = getattr(choice, "text", None)
        if text:
            return text
        # The first choice may be a dict or an object; read `message` and
        # `content` with whichever access style applies.
        if isinstance(choice, dict):
            message = choice.get("message", {})
        else:
            message = getattr(choice, "message", None)
        if isinstance(message, dict):
            return message.get("content", "") or ""
        return getattr(message, "content", "") or ""
    return str(response)

# Calculate budget: 40 metric calls per training example (the 40 "rollouts");
# an empty trainset still yields a budget of 40.
max_metric_calls = 40 * max(1, len(trainset))

_rule = "=" * 70
for banner_line in (
    _rule,
    "STARTING GEPA COMPILATION",
    _rule,
    f"Training set size: {len(trainset)}",
    f"Budget: {max_metric_calls} metric calls",
    f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}",
    _rule,
    "\n Compilation running... (this will take a while)\n",
):
    print(banner_line)

# Wall-clock reference used to report elapsed time after compilation.
start_time = time.time()

# Run GEPA compilation
# The optimizer starts from the seed instructions and is capped at
# `max_metric_calls` metric evaluations. Any failure prints troubleshooting
# hints and is then re-raised, so later cells never run against a missing
# result.
try:
    gepar = gepa_optimize(
        seed_candidate=seed_candidate,
        trainset=trainset,
        valset=trainset,  # NOTE(review): validation reuses the training examples, not devset
        adapter=adapter,
        reflection_lm=reflection_lm_fn,  # plain prompt -> text callable defined above
        max_metric_calls=max_metric_calls,
        logger=StdOutLogger(),
        display_progress_bar=True,
        reflection_minibatch_size=5,
        skip_perfect_score=True,
    )

    # Report wall-clock duration and how much of the budget was used.
    elapsed = time.time() - start_time
    print("\n" + "=" * 70)
    print("COMPILATION COMPLETE!")
    print("=" * 70)
    print(f"Time elapsed: {elapsed/3600:.2f} hours ({elapsed/60:.1f} minutes)")
    print(f"Total metric calls: {gepar.total_metric_calls}")
    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 70)

except Exception as e:
    # Top-level boundary: show hints for the operator, then propagate the
    # original exception unchanged.
    print(f"\n Compilation failed: {e}")
    print("\nTroubleshooting steps:")
    print("1. Check GEPA package installation")
    print("2. Check internet connection")
    print("3. Try reducing budget to 30")
    print("4. Check seed_candidate structure")
    raise

## Materialize Compiled Module

# Materialize the compiled module: copy the baseline and swap in the
# instruction text from GEPA's best candidate for each predictor it names.
best_candidate = gepar.best_candidate
compiled_scorer = uncompiled_scorer.deepcopy()

for predictor_name, target in compiled_scorer.named_predictors():
    if predictor_name not in best_candidate:
        continue  # predictor not part of the candidate; keep its instructions
    new_instructions = best_candidate[predictor_name]
    target.signature = target.signature.with_instructions(new_instructions)

print("✅ Compiled module materialized from GEPA best candidate")
print(f"Updated {len(best_candidate)} predictor(s)")

# Show the optimized instructions (first 100 characters of each)
print("\nOptimized instructions:")
for predictor_name, text in best_candidate.items():
    ellipsis = '...' if len(text) > 100 else ''
    print(f"\n{predictor_name}:")
    print(f"  {text[:100]}{ellipsis}")

In [None]:
## Evaluate Compiled Module

print("Evaluating compiled module (this may take 2-3 minutes)...\n")
compiled_results = evaluate_module(compiled_scorer, trainset, verbose=True)

# Train-set report: counts, percentages and error statistics.
train_total = len(trainset)
divider = "=" * 70
for report_line in (
    divider,
    "COMPILED MODULE PERFORMANCE (TRAIN SET)",
    divider,
    f"Exact matches:      {compiled_results['exact']:2d}/{train_total} ({compiled_results['accuracy_exact']:.1f}%)",
    f"Within ±1:          {compiled_results['within_1']:2d}/{train_total} ({compiled_results['accuracy_within_1']:.1f}%)",
    f"Within ±2:          {compiled_results['within_2']:2d}/{train_total} ({compiled_results['accuracy_within_2']:.1f}%)",
    f"Mean Absolute Error: {compiled_results['mean_error']:.2f}",
    f"Max Error:          {compiled_results['max_error']}",
    divider,
):
    print(report_line)

# Optional: Evaluate on dev and test sets if available (non-empty)
if devset:
    print("\nEvaluating on dev set...\n")
    dev_compiled_results = evaluate_module(compiled_scorer, devset, verbose=False)
    dev_within_2 = dev_compiled_results['accuracy_within_2']
    print(f"Dev set ±2 accuracy: {dev_within_2:.1f}%")

if testset:
    print("\nEvaluating on test set...\n")
    test_compiled_results = evaluate_module(compiled_scorer, testset, verbose=False)
    test_within_2 = test_compiled_results['accuracy_within_2']
    print(f"Test set ±2 accuracy: {test_within_2:.1f}%")


## Comparison: Uncompiled vs Compiled

In [None]:
print("\n" + "=" * 70)
print("PERFORMANCE COMPARISON")
print("=" * 70)

# ±2 accuracy target stated at the top of this notebook (85%); the messages
# below report against this value.
TARGET_WITHIN_2 = 85.0

# Positive values mean the compiled module is better: accuracy went up,
# mean absolute error went down.
improvement = compiled_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']
mae_improvement = uncompiled_results['mean_error'] - compiled_results['mean_error']

print("\n| Metric | Uncompiled | Compiled | Improvement |")
print("|--------|------------|----------|-------------|")
print(f"| Exact  | {uncompiled_results['accuracy_exact']:5.1f}%   | {compiled_results['accuracy_exact']:5.1f}% | {compiled_results['accuracy_exact'] - uncompiled_results['accuracy_exact']:+6.1f}% |")
print(f"| ±1     | {uncompiled_results['accuracy_within_1']:5.1f}%   | {compiled_results['accuracy_within_1']:5.1f}% | {compiled_results['accuracy_within_1'] - uncompiled_results['accuracy_within_1']:+6.1f}% |")
print(f"| **±2** | **{uncompiled_results['accuracy_within_2']:5.1f}%** | **{compiled_results['accuracy_within_2']:5.1f}%** | **{improvement:+6.1f}%** |")
print(f"| MAE    | {uncompiled_results['mean_error']:5.2f}    | {compiled_results['mean_error']:5.2f}  | {mae_improvement:+6.2f}   |")

print("\n" + "=" * 70)

# Success criteria check: tiers are based on the gain in ±2 accuracy
# (percentage points) over the uncompiled baseline.
achieved = compiled_results['accuracy_within_2']
if improvement >= 10:
    print("\n SUCCESS! Improvement ≥10%, proceed to Day 5")
    print(f"   Target: {TARGET_WITHIN_2:.0f}% → Achieved: {achieved:.1f}%")
elif improvement >= 5:
    print("\n  PARTIAL SUCCESS. Improvement 5-10%, consider iteration")
    print(f"   Target: {TARGET_WITHIN_2:.0f}% → Achieved: {achieved:.1f}%")
else:
    print("\n INSUFFICIENT IMPROVEMENT (<5%)")
    print("\n   Options:")
    print("   1. Try MIPROv2 optimizer")
    print("   2. Add more mundane category seeds")
    print("   3. Adjust metric tolerances")
    print("   4. Review worst-performing seeds")


## Analyze by Category

In [None]:
from collections import defaultdict

def evaluate_by_category(module, testset):
    """Group per-example scoring results by each example's `category`.

    For every example the module is called with the example's observation,
    goal and personality. The predicted score is clamped to 1-10; if the call
    or the int conversion raises, a neutral 5 is used instead.

    Returns a defaultdict mapping category -> dict with parallel lists
    `errors` (absolute differences), `gold_scores` and `pred_scores`, in the
    order the examples were seen.
    """
    def _fresh_bucket():
        return {'errors': [], 'gold_scores': [], 'pred_scores': []}

    def _predicted_score(item):
        # Best-effort prediction: any failure falls back to the midpoint 5.
        try:
            outcome = module(
                observation=item.observation,
                agent_goal=item.agent_goal,
                agent_personality=item.agent_personality
            )
            return max(1, min(10, int(outcome.score)))
        except Exception:
            return 5

    category_results = defaultdict(_fresh_bucket)

    for item in testset:
        guessed = _predicted_score(item)
        expected = int(item.score)

        bucket = category_results[item.category]
        bucket['errors'].append(abs(guessed - expected))
        bucket['gold_scores'].append(expected)
        bucket['pred_scores'].append(guessed)

    return category_results

def _within_2_accuracy(errors):
    # Percentage of absolute errors that are at most 2; callers pass a
    # non-empty list.
    return sum(1 for e in errors if e <= 2) / len(errors) * 100


print("\n" + "=" * 70)
print(" UNCOMPILED MODULE BY CATEGORY")
print("=" * 70)

uncompiled_by_cat = evaluate_by_category(uncompiled_scorer, trainset)
for cat in sorted(uncompiled_by_cat.keys()):
    data = uncompiled_by_cat[cat]
    accuracy = _within_2_accuracy(data['errors'])
    mae = sum(data['errors']) / len(data['errors'])
    print(f"{cat:20s}: {accuracy:5.1f}% ±2 accuracy, MAE={mae:.2f} ({len(data['errors'])} examples)")

print("\n" + "=" * 70)
print(" COMPILED MODULE BY CATEGORY")
print("=" * 70)

compiled_by_cat = evaluate_by_category(compiled_scorer, trainset)
for cat in sorted(compiled_by_cat.keys()):
    data = compiled_by_cat[cat]
    accuracy = _within_2_accuracy(data['errors'])
    mae = sum(data['errors']) / len(data['errors'])

    # Per-category change versus the uncompiled run. This uses its own
    # variable name so the notebook-level `improvement` (overall ±2 accuracy
    # delta) is not overwritten, and `.get` so a category absent from the
    # uncompiled results neither inserts an empty entry nor divides by zero.
    uncompiled_data = uncompiled_by_cat.get(cat)
    if uncompiled_data and uncompiled_data['errors']:
        category_delta = accuracy - _within_2_accuracy(uncompiled_data['errors'])
        delta_text = f"{category_delta:+5.1f}%"
    else:
        delta_text = "n/a"

    print(f"{cat:20s}: {accuracy:5.1f}% ±2 accuracy, MAE={mae:.2f} ({len(data['errors'])} examples) [{delta_text}]")


## Top Errors Analysis

In [None]:
# Find worst predictions: pair every training example with the compiled
# module's prediction and error at the same index.
error_details = [
    {
        'seed_id': example.seed_id,
        'category': example.category,
        'observation': example.observation[:60] + '...',
        'gold': int(example.score),
        'pred': compiled_results['predictions'][i],
        'error': compiled_results['errors'][i],
    }
    for i, example in enumerate(trainset)
]

# Largest error first; ties keep their original (dataset) order.
error_details = sorted(error_details, key=lambda row: row['error'], reverse=True)

print("\n" + "=" * 70)
print(" TOP 10 LARGEST ERRORS (COMPILED)")
print("=" * 70)

for rank, row in enumerate(error_details[:10], 1):
    print(f"\n{rank}. Example #{row['seed_id']}: Error = {row['error']}")
    print(f"   Category: {row['category']}")
    print(f"   Observation: \"{row['observation']}\"")
    print(f"   Gold: {row['gold']}, Predicted: {row['pred']}")


## Save Compiled Program

In [None]:
# Save to Google Drive
# Create the output folder first: none of the writes below create missing
# parent directories.
Path('/content/drive/MyDrive/mini-town/compiled').mkdir(parents=True, exist_ok=True)

save_path = '/content/drive/MyDrive/mini-town/compiled/compiled_scorer.json'
compiled_scorer.save(save_path)
print(f" Compiled scorer saved to: {save_path}")

# Extract and save prompts for inspection
prompt_text = str(compiled_scorer.dump_state())
prompt_path = '/content/drive/MyDrive/mini-town/compiled/prompt_scorer.txt'
# Explicit UTF-8 so non-ASCII characters in the prompts do not depend on the
# platform's default encoding.
with open(prompt_path, 'w', encoding='utf-8') as f:
    f.write(prompt_text)
print(f" Prompts saved to: {prompt_path}")

# Derive the deltas from the result dicts directly, so the summary does not
# depend on whichever value a same-named notebook global last held.
accuracy_delta = compiled_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']
mae_delta = uncompiled_results['mean_error'] - compiled_results['mean_error']

# Save results summary
results_summary = {
    'compilation_time_hours': elapsed / 3600,
    'uncompiled': {
        'accuracy_within_2': uncompiled_results['accuracy_within_2'],
        'mean_error': uncompiled_results['mean_error'],
        'exact_matches': uncompiled_results['exact']
    },
    'compiled': {
        'accuracy_within_2': compiled_results['accuracy_within_2'],
        'mean_error': compiled_results['mean_error'],
        'exact_matches': compiled_results['exact']
    },
    'improvement': {
        'accuracy_delta': accuracy_delta,
        'mae_delta': mae_delta
    }
}

results_path = '/content/drive/MyDrive/mini-town/compiled/compilation_results.json'
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2)
print(f" Results summary saved to: {results_path}")

print("\n" + "=" * 70)
print(" COMPILATION COMPLETE!")
print("=" * 70)
print("\nNext steps:")
print("1. Download compiled_scorer.json to local project")
print("2. Review prompt_scorer.txt to understand optimizations")
print("3. Proceed to Day 5: A/B testing in full simulation")


## Fallback: MIPROv2 (if GEPA doesn't work)

Uncomment and run this cell if GEPA has issues

In [None]:
# from dspy.teleprompt import MIPROv2
#
# print("Falling back to MIPROv2 optimizer...")
#
# mipro_optimizer = MIPROv2(
#     metric=importance_metric,
#     auto="medium",
#     num_trials=10,
#     max_bootstrapped_demos=4,
#     max_labeled_demos=5
# )
#
# print("Running MIPROv2 compilation (expect 6-8 hours)...")
# start_time = time.time()
#
# compiled_scorer_mipro = mipro_optimizer.compile(
#     uncompiled_scorer,
#     trainset=trainset
# )
#
# elapsed = time.time() - start_time
# print(f"\n MIPROv2 compilation complete! Time: {elapsed/3600:.2f} hours")
#
# # Evaluate MIPROv2 results
# mipro_results = evaluate_module(compiled_scorer_mipro, trainset)
# print(f"MIPROv2 ±2 accuracy: {mipro_results['accuracy_within_2']:.1f}%")
# print(f"Improvement: +{mipro_results['accuracy_within_2'] - uncompiled_results['accuracy_within_2']:.1f}%")
