# Automatic System Prompt Optimization (DSPy) — with **gpt-4.1-mini**

This notebook uses DSPy's MIPROv2 optimizer to tune the *system prompt* of a concise question-answering program, targeting OpenAI's **gpt-4.1-mini**.

## 1) Setup

In [None]:

%pip install -U dspy openai tiktoken

import os, re
import dspy

# Fail fast with a readable message when the key is missing: assigning None
# into os.environ raises an opaque "TypeError: str expected, not NoneType".
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = api_key

# Model whose system prompt is being optimized.
BASE_MODEL = "openai/gpt-4.1-mini"
# Larger model name; not referenced by any of the cells shown in this notebook.
JUDGE_MODEL = "openai/gpt-4.1"

# Register the base model as DSPy's default LM and report the library version.
dspy.configure(lm=dspy.LM(BASE_MODEL))
print("DSPy:", dspy.__version__)


## 2) Data

In [None]:
import dspy
import json

def _as_examples(records):
    """Wrap raw records (dicts with "prompt" and "generation") in dspy.Example."""
    return [
        dspy.Example(prompt=record["prompt"], generation=record["generation"])
        for record in records
    ]


# Read the whole dataset; it is expected to carry "train" and "dev" lists.
with open("qna_dataset.json", "r", encoding="utf-8") as fh:
    dataset = json.load(fh)

# Raw examples first, in the same order as the file.
train_examples = _as_examples(dataset["train"])
dev_examples = _as_examples(dataset["dev"])

# Mark "prompt" as the only input field of every example.
trainset = [example.with_inputs("prompt") for example in train_examples]
devset = [example.with_inputs("prompt") for example in dev_examples]

print(f"Loaded {len(trainset)} training examples and {len(devset)} dev examples")
# Trailing expression so the notebook cell displays the two sizes.
(len(trainset), len(devset))

## 3) Metrics

In [None]:

def token_f1(pred, ref):
    """Return the token-level F1 score between two strings.

    Both strings are lower-cased and split on whitespace. Tokens are matched
    as a multiset, so a repeated word only counts as often as it occurs in
    both strings.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either string has no tokens or when
        the two share no token.
    """
    from collections import Counter

    pred_tokens = pred.lower().split()
    ref_tokens = ref.lower().split()
    if not (pred_tokens and ref_tokens):
        return 0.0

    # Multiset intersection keeps the minimum count of every shared token.
    shared = Counter(pred_tokens) & Counter(ref_tokens)
    overlap = sum(shared.values())

    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    if (precision + recall) == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def concise_qna_metric(example, prediction, trace=None):
    """Score an answer by token F1 against the reference, penalising length.

    Both ``example`` and ``prediction`` are read through ``.get("generation")``.
    Answers longer than two sentences lose 0.2 per extra sentence (capped at
    1.0); the final score is clamped to [0.0, 1.0]. An empty answer scores 0.0.
    ``trace`` is accepted but not used.
    """
    import re as _re

    answer = (prediction.get("generation") or "").strip()
    reference = (example.get("generation") or "").strip()
    if not answer:
        return 0.0

    # Count non-blank chunks between runs of sentence terminators.
    sentence_count = sum(
        1 for chunk in _re.split(r"[.!?]+", answer) if chunk.strip()
    )
    if sentence_count <= 2:
        penalty = 0.0
    else:
        penalty = min(1.0, 0.2 * (sentence_count - 2))

    raw_score = token_f1(answer, reference) - penalty
    return max(0.0, min(1.0, raw_score))


## 4) Minimal program with custom adapter

In [None]:
import sys  # NOTE: not referenced by the adapter code shown in this notebook; kept in case other cells rely on it

# Task signature: one input field (`prompt`) mapped to one output field
# (`generation`).
# NOTE: documented with comments rather than a class docstring on purpose:
# DSPy uses a Signature subclass's docstring as its instructions, so adding a
# docstring here would change the prompt text.
class signature(dspy.Signature):
    prompt = dspy.InputField()  # text passed to the program as `prompt=...`
    generation = dspy.OutputField()  # text the metric reads as "generation"

def format_demos(demos):
    """Render few-shot demos as text blocks for the system prompt.

    Each demo may be either:

    * an object whose ``inputs`` / ``outputs`` attributes are mappings, or
    * a mapping-like object exposing ``get`` (for example a plain ``dict``;
      presumably also ``dspy.Example`` -- confirm against the installed DSPy).

    Missing fields render as empty strings.

    Args:
        demos: Iterable of demos, or ``None``.

    Returns:
        One "# Example / User / Assistant" block per demo, joined by newlines;
        ``""`` when there are no demos.
    """

    def _field(demo, group, key):
        # Prefer demo.<group>[key] when that attribute is mapping-like. A
        # missing attribute or a bound method has no ``get`` and falls through
        # instead of raising AttributeError.
        nested_get = getattr(getattr(demo, group, None), "get", None)
        if callable(nested_get):
            return nested_get(key, "")
        direct_get = getattr(demo, "get", None)
        if callable(direct_get):
            return direct_get(key, "")
        return getattr(demo, key, "")

    return "\n".join(
        f"\n# Example\nUser: {_field(demo, 'inputs', 'prompt')}\n"
        f"Assistant: {_field(demo, 'outputs', 'generation')}"
        for demo in (demos or [])
    )

class SimplestAdapter(dspy.Adapter):
    """Adapter that sends the signature's instructions verbatim as the system message.

    The user message is the raw ``prompt`` input; any demos are appended to the
    system message via ``format_demos``.
    """

    def __call__(self, lm, lm_kwargs, signature, demos, inputs):
        """Call ``lm`` with a two-message chat and wrap its first output.

        Returns:
            A one-element list holding ``{"generation": <first LM output>}``.
        """
        system_prompt = signature.instructions or ""
        if demos:
            system_prompt = system_prompt + "\n" + format_demos(demos)

        system_turn = {"role": "system", "content": system_prompt}
        user_turn = {"role": "user", "content": inputs["prompt"]}

        completions = lm(messages=[system_turn, user_turn], **lm_kwargs)
        # Only the first completion is kept.
        first_completion = completions[0]
        return [{"generation": first_completion}]

class MyPredict(dspy.Predict):
    """Predict module that stores a ``SimplestAdapter`` instance on itself."""

    def __init__(self, signature, **kw):
        # Let dspy.Predict do its normal setup with the given signature/kwargs.
        super().__init__(signature, **kw)
        # NOTE(review): this only stores the adapter as an instance attribute.
        # DSPy's Predict appears to select its adapter from global settings
        # (e.g. dspy.configure(adapter=...)) -- confirm this attribute is
        # actually consulted by the installed DSPy version.
        self.adapter = SimplestAdapter()

# Seed system prompt: the starting instructions for the program below.
INITIAL_SYSTEM_PROMPT = "You are concise. Answer correctly in <= 2 sentences."
# Baseline (un-optimized) program built from the task signature.
my_program = MyPredict(signature)
# NOTE(review): this assigns on the signature object held by the program;
# presumably that is the same `signature` class defined above, so the change
# would be shared with any other user of that class -- confirm.
my_program.signature.instructions = INITIAL_SYSTEM_PROMPT
# Smoke test: one call with the seed prompt; the result is printed as-is.
print(my_program(prompt="Who painted the Mona Lisa?"))

## 5) Optimize (MIPROv2)

In [None]:
# Initialize the optimizer. Candidates are scored with concise_qna_metric, and
# both demo budgets are zero, so no few-shot demos are requested.
teleprompter = dspy.MIPROv2(
    metric=concise_qna_metric,
    auto="medium",  # Can choose between light, medium, and heavy optimization runs
    max_bootstrapped_demos=0,  # No few-shot examples (focusing on instruction optimization)
    max_labeled_demos=0,
)

# Optimize program.
# NOTE(review): only `trainset` is passed (no explicit validation set);
# presumably MIPROv2 derives its own validation split -- confirm.
print("Optimizing program with MIPROv2...")
my_program_optimized = teleprompter.compile(
    my_program,
    trainset=trainset,
    requires_permission_to_run=False
)

# Test the optimized program on a single ad-hoc question.
print("\nOptimized result:")
print(my_program_optimized(prompt="What is the capital of Germany?"))

# NOTE(review): inspect_history presumably prints recent LM calls rather than
# the optimizer's trial log -- confirm what this shows.
my_program_optimized.inspect_history()

## 6) Eval

In [None]:
def evaluate(program, dataset, metric):
    """Return the mean ``metric`` score of ``program`` over ``dataset``.

    Args:
        program: Callable invoked as ``program(prompt=<str>)``.
        dataset: Iterable of examples. Each example may expose ``prompt`` /
            ``generation`` as attributes, inside ``inputs`` / ``outputs``
            dict attributes, or as mapping keys (for example a plain dict).
        metric: Callable ``metric(example_dict, prediction_dict)`` returning a
            number. It receives plain dicts: ``{"prompt", "generation"}`` for
            the example and ``{"generation"}`` for the prediction.

    Returns:
        The arithmetic mean of the scores, or ``0.0`` for an empty dataset.
    """
    scores = []
    for ex in dataset:
        user_prompt = _example_field(ex, "prompt", "inputs")
        ref_answer = _example_field(ex, "generation", "outputs")

        pred = program(prompt=user_prompt)

        # The metric always sees plain dicts, whatever shapes came in.
        ex_dict = {"prompt": user_prompt, "generation": ref_answer}
        pred_dict = {"generation": _prediction_text(pred)}
        scores.append(metric(ex_dict, pred_dict))

    return sum(scores) / len(scores) if scores else 0.0


def _example_field(example, name, container):
    """Read ``name`` from an example given as an object or a mapping; "" if absent."""
    value = getattr(example, name, None)
    if value:
        return value

    # Only treat the container as a lookup table when it really is a dict; a
    # method of the same name must not be called through ``.get``.
    nested = getattr(example, container, None)
    if isinstance(nested, dict) and nested.get(name):
        return nested[name]

    # Mapping-like examples (plain dicts) keep their fields as keys, which
    # attribute access does not see.
    getter = getattr(example, "get", None)
    if callable(getter):
        return getter(name) or ""
    return ""


def _prediction_text(pred):
    """Extract the "generation" text from a prediction of unknown shape."""
    gen = getattr(pred, "generation", None)
    if gen is None and hasattr(pred, "as_dict"):
        gen = pred.as_dict().get("generation", "")
    if gen is None and hasattr(pred, "toDict"):
        gen = pred.toDict().get("generation", "")
    if gen is None and isinstance(getattr(pred, "outputs", None), dict):
        gen = pred.outputs.get("generation", "")
    if gen is None:
        try:
            gen = pred["generation"]  # mapping-style predictions
        except (KeyError, IndexError, TypeError):
            # Not subscriptable or no such key: fall back to the printable form.
            gen = str(pred)
    return gen

# Score both programs on the dev set with the same metric, baseline first.
base, opt = (
    evaluate(candidate, devset, concise_qna_metric)
    for candidate in (my_program, my_program_optimized)
)
print("Base:", base, "Optimized:", opt)


## 7) Export learned system prompt

In [None]:
import json

# Persist the optimized program so it can be reloaded later.
my_program_optimized.save("optimized_qna_program.json")
print("Optimized program saved to optimized_qna_program.json")

# Side-by-side record of the seed prompt, the learned prompt, and both scores.
prompts_comparison = {
    "original_prompt": INITIAL_SYSTEM_PROMPT,
    "optimized_prompt": my_program_optimized.signature.instructions,
    "base_score": base,
    "optimized_score": opt,
    "improvement": opt - base,
}

# Serialize once; the same text goes to the file and to the cell output.
comparison_json = json.dumps(prompts_comparison, indent=2)
with open("prompt_comparison.json", "w", encoding="utf-8") as out_file:
    out_file.write(comparison_json)

print("\nPrompt comparison saved to prompt_comparison.json:")
print(comparison_json)