# Metadata extraction using DSPy and a local LLM, with evaluation metrics

To run this, you first need to start a local vLLM server in the backround with a command like this:

    vllm serve $MODEL_ID --port 7987 --max-model-len 32768 --gpu-memory-utilization 0.9

where MODEL_ID is e.g. `meta-llama/Llama-3.1-8B-Instruct` and the port has to match the PORT setting below.

In [1]:
import dspy

MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"  # should match the model vLLM is running (does it matter??)
PORT = 7987  # should match the port where vLLM is running
MAX_TOKENS = 8192  # limit on how many new tokens to generate (default: 4000)
TEMPERATURE = 0.7

lm = dspy.LM("openai/" + MODEL_ID,
             api_base=f"http://localhost:{PORT}/v1",  # ensure this points to your port
             api_key="local", model_type="chat", max_tokens=MAX_TOKENS, temperature=TEMPERATURE)
dspy.configure(lm=lm)

# test the connection to the LLM
lm("Say this is a test!", temperature=0.0)  # => ['This is a test!']

["Alright, let's proceed with the test. What would you like to test? Here are a few options:\n\n1. **Trivia**: I can ask you questions on a topic of your choice.\n2. **Language**: I can help you practice a foreign language.\n3. **Math**: I can provide math problems to solve.\n4. **General Knowledge**: I can ask you questions on a wide range of topics.\n\nPlease choose one, or let me know if there's something specific you'd like to test."]

In [2]:
# Load and prepare dataset

import json
import glob
import random

random.seed(42)  # for deterministic sampling of validation set

train_files = glob.glob("../../llm-dataset/*-train.jsonl")
test_files = glob.glob("../../llm-dataset/*-test.jsonl")

VAL_SIZE = 64  # how many documents to validate on during optimization

def preprocess_sample(sample):
    # fix some bad field names
    ground_truth = { fld.replace('-', '_'): val for fld, val in sample["ground_truth"].items() }
    output = json.dumps(ground_truth)
    input_ = json.dumps(sample["content"])
    return dspy.Example({"content": input_, "metadata": output}).with_inputs("content")

def dataset_to_records(files):
    records = []
    for filename in files:
        with open(filename) as infile:
            for line in infile:
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records


train_val_set = dataset_to_records(train_files)
random.shuffle(train_val_set)

train_set = train_val_set[VAL_SIZE:]
val_set = train_val_set[:VAL_SIZE]

test_set = dataset_to_records(test_files)

len(train_set), len(val_set), len(test_set)

(576, 64, 182)

In [3]:
print("Input Message:")
print(train_set[-1]['content'])

print("\n\nGold Answer:")
for k, v in json.loads(train_set[-1]['metadata']).items():
    print(f"{k}: {v}")

Input Message:
{"pdfinfo": {"creationDate": "D:20201214215341+01'00'", "modDate": "D:20201214215418+01'00'"}, "pages": [{"page": 1, "text": "# ANTAA TAITEEN OPETTAA\n\n\n"}, {"page": 3, "text": "ANTA A TAITEEN OPETTA A GERT BIESTA\n\n\n"}, {"page": 4, "text": "00:00:08.18\n\n\n"}, {"page": 5, "text": "00:00:36.03 00:00:52.19 00:00:54.19\n\n\n"}, {"page": 6, "text": "00:00:58.16 00:01:00.17 00:01:0\n\n\n"}, {"page": 65, "text": "\u2018Opastan sinua kaikessa, n\u00e4yt\u00e4n sinulle kaiken ja nime\u00e4n kaiken.\u2019\n\u2014 COMENIUS\nT\u00e4ss\u00e4 kirjassa Gert Biesta esitt\u00e4\u00e4 uuden n\u00e4kemyksen nykyaikaisesta taidekasvatuksesta\n\nosoittamalla, ett\u00e4 taide tarjoaa ainutlaatuisia v\u00e4lineit\u00e4 olla dialogissa maailman kanssa. N\u00e4kemys\n\nperustuu ajatukseen, ett\u00e4 opettaminen on n\u00e4ytt\u00e4mist\u00e4. Opettaja n\u00e4ytt\u00e4\u00e4 oppilaalle millaisiin\n\nhyviin, t\u00e4rkeisiin tai merkitt\u00e4viin asioihin maailmassa voisi kiinnitt\u00e4\u00e4

In [4]:
from typing import Optional

class ExtractInfo(dspy.Signature):
    """Extract structured metadata from text extracted from a PDF."""

    content: str = dspy.InputField()
    language: str = dspy.OutputField(desc="The language of the resource expressed as a BCP47 language tag.")
    title: str = dspy.OutputField(desc="The main title of the publication.")
    alt_title: list[str] = dspy.OutputField(desc="Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.")
    creator: list[str] = dspy.OutputField(desc="The primary author(s) of the resource.")
    year: Optional[str] = dspy.OutputField(desc="The year on which the resource was issued or made available.")
    publisher: list[str] = dspy.OutputField(desc="The entity/entities responsible for making the resource available.")
    doi: Optional[str] = dspy.OutputField(desc="The Digital Object Identifier (DOI) associated with the resource.")
    e_isbn: list[str] = dspy.OutputField(desc="The ISBN associated with the electronic resource.")
    p_isbn: list[str] = dspy.OutputField(desc="The ISBN of the printed version of this document.")
    e_issn: Optional[str] = dspy.OutputField(desc="The ISSN associated with the electronic resource.")
    p_issn: Optional[str] = dspy.OutputField(desc="The ISSN of the printed version of this document.")
    type_coar: str = dspy.OutputField(desc="The type of the resource according to the COAR Resource Types classification.")

module = dspy.ChainOfThought(ExtractInfo)

text = "Apple Inc. announced its latest iPhone 14 today." \
    "The CEO, Tim Cook, highlighted its new features in a press release."
response = module(content=text)

print(response)


Prediction(
    reasoning='This text is a short news announcement about Apple Inc.\'s latest iPhone 14. The content is in English and does not provide specific metadata fields such as DOIs, ISBNs, or ISSNs, which are typically associated with scholarly or formal publications. The text mentions the CEO, Tim Cook, and the company Apple Inc., which can be used as the creator and publisher respectively. The type of resource can be classified as "Text" according to the COAR Resource Types classification, given it is a textual announcement.',
    language='en',
    title='Apple Inc. announces latest iPhone 14',
    alt_title=['Apple Inc. kündigt neues iPhone 14 an{de}', 'Apple Inc. annonce le dernier iPhone 14{fr}'],
    creator=['Tim Cook'],
    year=None,
    publisher=['Apple Inc.'],
    doi=None,
    e_isbn=[],
    p_isbn=[],
    e_issn=None,
    p_issn=None,
    type_coar='Text'
)


In [5]:
import Levenshtein

ALMOST_THRESHOLD = 0.9  # Adjust as needed

def feedback_simple_string(field, true_val, pred_val):
    score = 1.0 if true_val == pred_val else 0.0
    if score == 1.0:
        feedback = f"✅ `{field}` is correct: `{true_val}`."
    else:
        feedback = f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."
    return score, feedback

def feedback_fuzzy_string(field, true_val, pred_val):
    base_score = 1.0 if true_val == pred_val else 0.0
    if base_score == 1.0 or (true_val and pred_val and Levenshtein.ratio(true_val.lower(), pred_val.lower()) >= ALMOST_THRESHOLD):
        score = 1.0
        feedback = f"✅ `{field}` is approximately correct: `{pred_val}` matches `{true_val}` closely."
    else:
        score = 0.0
        feedback = f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."
    return score, feedback

def feedback_set(field, true_val, pred_val):
    true_set = set(true_val or [])
    pred_set = set(pred_val or [])

    if not true_set and not pred_set:
        return 1.0, f"✅ `{field}` is empty as expected."
    elif not true_set or not pred_set:
        return 0.0, f"❌ `{field}` is incorrect. Expected `{true_set}`, but got `{pred_set}`."

    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    feedback = f"🔍 `{field}` partial match."
    feedback += f"- Correctly included: `{list(true_set & pred_set)}`\n"
    if fp:
        feedback += f"- Incorrectly included: `{list(pred_set - true_set)}`\n"
    if fn:
        feedback += f"- Missed: `{list(true_set - pred_set)}`"

    return f1, feedback.strip()

def feedback_e_issn(field, true_val, pred_val, p_issn_val):
    if true_val == pred_val:
        return 1.0, f"✅ `{field}` is correct: `{true_val}`."
    elif p_issn_val and pred_val == p_issn_val and true_val is None:
        return 1.0, f"✅ `{field}` is correctly inferred from `p_issn`: `{pred_val}`."
    else:
        return 0.0, f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."

def metadata_metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    fields = [
        'language', 'title', 'creator', 'year', 'publisher',
        'doi', 'e_isbn', 'p_isbn', 'e_issn', 'p_issn', 'type_coar'
    ]

    scores = []
    feedback_parts = []

    metadata = json.loads(example.get("metadata", "{}"))
    ground_truth = example.get("ground_truth", {})

    for field in fields:
        true_val = metadata.get(field)
        pred_val = pred.get(field) or None

        if field in ['language', 'year', 'doi', 'p_issn', 'type_coar']:
            score, feedback = feedback_simple_string(field, true_val, pred_val)
        elif field == 'title':
            score, feedback = feedback_fuzzy_string(field, true_val, pred_val)
        elif field in ['creator', 'publisher', 'e_isbn', 'p_isbn']:
            score, feedback = feedback_set(field, true_val, pred_val)
        elif field == 'e_issn':
            p_issn_val = ground_truth.get("p_issn")
            score, feedback = feedback_e_issn(field, true_val, pred_val, p_issn_val)
        else:
            score, feedback = feedback_simple_string(field, true_val, pred_val)

        scores.append(score)
        feedback_parts.append(feedback)

    overall_score = sum(scores) / len(scores) if scores else 0
    full_feedback = "\n".join(feedback_parts)

    return dspy.Prediction(score=overall_score, feedback=full_feedback)


In [6]:
from dspy import GEPA

optimizer = GEPA(
    metric=metadata_metric_with_feedback,
    auto="medium",
    num_threads=32,
    track_stats=False,
    use_merge=True,
    reflection_lm=lm
)

In [7]:
%%time

optimized_program = optimizer.compile(
    module,
    trainset=train_set,
    valset=val_set,
)

2025/09/26 17:00:46 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1010 metric calls of the program. This amounts to 1.58 full evals on the train+val set.
2025/09/26 17:00:46 INFO dspy.teleprompt.gepa.gepa: Using 64 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/1010 [00:00<?, ?rollouts/s]2025/09/26 17:00:47 INFO dspy.evaluate.evaluate: Average Metric: 40.03030303030302 / 64 (62.5%)
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.6254734848484849
GEPA Optimization:   6%|▋         | 64/1010 [00:00<00:06, 146.47rollouts/s]2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.6254734848484849


Average Metric: 1.97 / 3 (65.7%): 100%|██████████| 3/3 [00:00<00:00, 161.43it/s]

2025/09/26 17:00:47 INFO dspy.evaluate.evaluate: Average Metric: 1.9696969696969697 / 3 (65.7%)
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs).
9. **p_isbn**: Print International Standard Book Numbers (ISBNs).
10. **e_issn**: Electronic Interna




2025/09/26 17:00:47 INFO dspy.evaluate.evaluate: Average Metric: 49.86442345533254 / 64 (77.9%)
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New program is on the linear pareto front
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full valset score for new program: 0.7791316164895711
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full train_val score for new program: 0.7791316164895711
2025/09/26 17:00:47 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Individual valset scores for new program: [1.0, 0.9090909090909091, 0.9090909090909091, 0.7272727272727273, 1.0, 0.5454545454545454, 1.0, 0.7272727272727273, 0.7272727272727273, 0.8051948051948052, 0.7272727272727273, 0.7272727272727273, 1.0, 0.9090909090909091, 0.7878787878787878, 0.36363636363636365, 0.9090909090909091, 0.7878787878787878, 0.8181818181818182, 0.5454545454545454, 0.9696969696969696, 0.9090909090909091, 0.9696969696969696, 1.0, 0.7272727272727273, 1.0, 0.545454545454545

Average Metric: 2.59 / 3 (86.4%): 100%|██████████| 3/3 [00:00<00:00, 185.38it/s]

2025/09/26 17:00:47 INFO dspy.evaluate.evaluate: Average Metric: 2.590909090909091 / 3 (86.4%)





2025/09/26 17:03:56 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphens

Average Metric: 2.52 / 3 (83.8%): 100%|██████████| 3/3 [00:16<00:00,  5.45s/it]

2025/09/26 17:06:11 INFO dspy.evaluate.evaluate: Average Metric: 2.515151515151515 / 3 (83.8%)





2025/09/26 17:10:34 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphens

Average Metric: 2.45 / 3 (81.8%): 100%|██████████| 3/3 [00:25<00:00,  8.38s/it]

2025/09/26 17:13:11 INFO dspy.evaluate.evaluate: Average Metric: 2.4545454545454546 / 3 (81.8%)





2025/09/26 17:17:46 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphens

Average Metric: 2.45 / 3 (81.8%): 100%|██████████| 3/3 [00:30<00:00, 10.13s/it]

2025/09/26 17:20:23 INFO dspy.evaluate.evaluate: Average Metric: 2.4545454545454546 / 3 (81.8%)





2025/09/26 17:24:38 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyp

Average Metric: 2.24 / 3 (74.7%): 100%|██████████| 3/3 [00:31<00:00, 10.66s/it]

2025/09/26 17:27:31 INFO dspy.evaluate.evaluate: Average Metric: 2.242424242424242 / 3 (74.7%)





2025/09/26 17:32:07 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphens

Average Metric: 2.27 / 3 (75.8%): 100%|██████████| 3/3 [00:22<00:00,  7.65s/it]

2025/09/26 17:35:59 INFO dspy.evaluate.evaluate: Average Metric: 2.272727272727273 / 3 (75.8%)





2025/09/26 17:40:30 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyp

Average Metric: 2.18 / 3 (72.7%): 100%|██████████| 3/3 [00:24<00:00,  8.30s/it]

2025/09/26 17:43:03 INFO dspy.evaluate.evaluate: Average Metric: 2.181818181818182 / 3 (72.7%)





2025/09/26 17:46:35 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: markdown
# Instructions for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hy

Average Metric: 2.64 / 3 (87.9%): 100%|██████████| 3/3 [00:21<00:00,  7.02s/it]

2025/09/26 17:48:55 INFO dspy.evaluate.evaluate: Average Metric: 2.6363636363636367 / 3 (87.9%)





2025/09/26 17:53:29 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: markdown
# Instructions for the Assistant

## Task Description

You are tasked with extracting structured metadata from the text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hy

Average Metric: 2.55 / 3 (84.8%): 100%|██████████| 3/3 [00:18<00:00,  6.22s/it] 

2025/09/26 17:54:15 INFO dspy.evaluate.evaluate: Average Metric: 2.5454545454545454 / 3 (84.8%)





2025/09/26 17:56:07 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: markdown
# Detailed Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted w

Average Metric: 2.50 / 3 (83.3%): 100%|██████████| 3/3 [00:18<00:00,  6.19s/it] 

2025/09/26 17:58:19 INFO dspy.evaluate.evaluate: Average Metric: 2.5 / 3 (83.3%)





2025/09/26 18:02:53 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hy

Average Metric: 2.36 / 3 (78.8%): 100%|██████████| 3/3 [00:17<00:00,  5.83s/it]

2025/09/26 18:03:26 INFO dspy.evaluate.evaluate: Average Metric: 2.3636363636363638 / 3 (78.8%)





2025/09/26 18:08:00 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphen

Average Metric: 2.55 / 3 (84.8%): 100%|██████████| 3/3 [00:16<00:00,  5.48s/it]

2025/09/26 18:08:35 INFO dspy.evaluate.evaluate: Average Metric: 2.5454545454545454 / 3 (84.8%)





2025/09/26 18:13:09 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hy

Average Metric: 2.07 / 3 (69.1%): 100%|██████████| 3/3 [00:17<00:00,  5.93s/it]

2025/09/26 18:13:42 INFO dspy.evaluate.evaluate: Average Metric: 2.0727272727272728 / 3 (69.1%)





2025/09/26 18:18:12 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: markdown
# Detailed Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted w

Average Metric: 2.45 / 3 (81.8%): 100%|██████████| 3/3 [00:24<00:00,  8.06s/it]

2025/09/26 18:20:36 INFO dspy.evaluate.evaluate: Average Metric: 2.4545454545454546 / 3 (81.8%)





2025/09/26 18:22:49 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hy

Average Metric: 2.27 / 3 (75.8%): 100%|██████████| 3/3 [00:16<00:00,  5.42s/it]

2025/09/26 18:24:56 INFO dspy.evaluate.evaluate: Average Metric: 2.2727272727272725 / 3 (75.8%)





2025/09/26 18:29:29 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document, determined by the text content.
2. **title**: The main title of the publication, including the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication, inferred from the creation date in the pdfinfo metadata and the content of the document.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DO

Average Metric: 2.18 / 3 (72.7%): 100%|██████████| 3/3 [00:19<00:00,  6.40s/it]

2025/09/26 18:30:07 INFO dspy.evaluate.evaluate: Average Metric: 2.181818181818182 / 3 (72.7%)





2025/09/26 18:33:33 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for predict: markdown
# Instructions for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without h

Average Metric: 2.36 / 3 (78.8%): 100%|██████████| 3/3 [00:22<00:00,  7.66s/it]

2025/09/26 18:35:59 INFO dspy.evaluate.evaluate: Average Metric: 2.3636363636363638 / 3 (78.8%)





2025/09/26 18:40:29 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for predict: markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document. Use the text content to identify the language, and if there is any language information given in the text, use that information to determine the language.
2. **title**: The main title of the publication. It should include the subtitle if present. The title should be formatted as it appears in the text or as the metadata provided in the PDF information, if that metadata matches the title in the text.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PD

Average Metric: 2.27 / 3 (75.8%): 100%|██████████| 3/3 [00:17<00:00,  5.74s/it]

2025/09/26 18:43:10 INFO dspy.evaluate.evaluate: Average Metric: 2.2727272727272725 / 3 (75.8%)





2025/09/26 18:45:39 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for predict: markdown
# Instructions for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include the last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without h

CPU times: user 16.2 s, sys: 3.44 s, total: 19.7 s
Wall time: 1h 48min 24s





In [8]:
for name, pred in optimized_program.named_predictors():
    print("================================")
    print(f"Predictor: {name}")
    print("================================")
    print("Prompt:")
    print(pred.signature.instructions)
    print("*********************************")

Predictor: predict
Prompt:
markdown
# Instruction for the Assistant

## Task Description

You are tasked with extracting structured metadata from text extracted from PDF documents. The metadata should include the following fields:

1. **language**: The language of the document.
2. **title**: The main title of the publication. It should include the subtitle if present.
3. **alt_title**: Alternative titles, if any, formatted with the BCP47 language tag.
4. **creator**: The name(s) of the author(s) or creator(s), formatted to include last name first. This should match the name(s) mentioned in the text of the PDF.
5. **year**: The year of publication.
6. **publisher**: The publisher(s) of the document. This should match the name(s) mentioned in the text of the PDF.
7. **doi**: The Digital Object Identifier (DOI), if available.
8. **e_isbn**: Electronic International Standard Book Numbers (ISBNs), formatted without hyphens or spaces.
9. **p_isbn**: Print International Standard Book Numbers 

In [9]:
%%time

evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metadata_metric_with_feedback,
    num_threads=32,
    display_table=True,
    display_progress=True,
    provide_traceback=True
)

eval_result = evaluate(optimized_program)

Average Metric: 149.17 / 182 (82.0%): 100%|██████████| 182/182 [05:30<00:00,  1.82s/it]

2025/09/26 18:54:42 INFO dspy.evaluate.evaluate: Average Metric: 149.1715898942757 / 182 (82.0%)



CPU times: user 3.11 s, sys: 547 ms, total: 3.66 s
Wall time: 5min 30s


In [10]:
lm.inspect_history()





[34m[2025-09-26T18:54:42.542180][0m

[31mSystem message:[0m

Your input fields are:
1. `content` (str):
Your output fields are:
1. `reasoning` (str): 
2. `language` (str): The language of the resource expressed as a BCP47 language tag.
3. `title` (str): The main title of the publication.
4. `alt_title` (list[str]): Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.
5. `creator` (list[str]): The primary author(s) of the resource.
6. `year` (Union[str, NoneType]): The year on which the resource was issued or made available.
7. `publisher` (list[str]): The entity/entities responsible for making the resource available.
8. `doi` (Union[str, NoneType]): The Digital Object Identifier (DOI) associated with the resource.
9. `e_isbn` (list[str]): The ISBN associated with the electronic resource.
10. `p_isbn` (list[str]): The ISBN of the printed version of this document.
11. `e_issn` (Union[str, NoneType]): The ISSN associated with the

In [11]:
# save the optimized program for later use (many formats, just in case)
optimized_program.save("gepa-optimized-module.json", save_program=False)
optimized_program.save("gepa-optimized-module.pkl", save_program=False)
# save just the prompt(s)
for name, pred in optimized_program.named_predictors():
    with open(f"gepa-optimized-prompt-{name}.txt", "w") as outfile:
        outfile.write(pred.signature.instructions)
