# Metadata extraction using DSPy and a local LLM using GEPA optimization

To run this, you first need to start two local vLLM servers in the backround.

For the main extractor model:

    vllm serve google/gemma-3-4b-it --port 7987 --max-model-len 16384 --gpu-memory-utilization 0.25

For the reflection model:
    
    vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --port 7988 --max-model-len 24576 --gpu-memory-utilization 0.65


In [1]:
import dspy

MODEL_ID = "google/gemma-3-4b-it"  # should match the model vLLM is running (does it matter??)
PORT = 7987  # should match the port where vLLM is running
MAX_TOKENS = 1024  # limit on how many new tokens to generate (default: 4000)
TEMPERATURE = 0.7

lm = dspy.LM("openai/" + MODEL_ID,
             api_base=f"http://localhost:{PORT}/v1",  # ensure this points to your port
             api_key="local", model_type="chat", max_tokens=MAX_TOKENS, temperature=TEMPERATURE)
dspy.configure(lm=lm)

# test the connection to the LLM
lm("Who are you?", temperature=0.0)

["I'm Gemma, a large language model created by the Gemma team at Google DeepMind. I’m an open-weights model, which means I’m widely available for public use! \n\nI can take text and images as inputs and generate text-based responses. \n\nYou can learn more about me on the Gemma project page: [https://ai.google.dev/gemma](https://ai.google.dev/gemma)"]

In [2]:

REFLECTION_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
REFLECTION_PORT = PORT + 1
REFLECTION_MAX_TOKENS = 8192

reflection_lm = dspy.LM("openai/" + REFLECTION_MODEL_ID,
             api_base=f"http://localhost:{REFLECTION_PORT}/v1",  # ensure this points to your port
             api_key="local", model_type="chat", max_tokens=REFLECTION_MAX_TOKENS, temperature=TEMPERATURE)

# test the connection to the LLM
reflection_lm("Who are you?", temperature=0.0)

["I am Mistral Small 3, a Large Language Model created by Mistral AI, a French startup headquartered in Paris. I help answer questions to the best of my ability using the data I've been trained on up until 2023-10-01."]

In [3]:
# Load and prepare dataset

import json
import glob
import random

random.seed(42)  # for deterministic sampling of validation set

train_files = glob.glob("../../llm-dataset/*-train.jsonl")
test_files = glob.glob("../../llm-dataset/*-test.jsonl")

VAL_SIZE = 64  # how many documents to validate on during optimization

def preprocess_sample(sample):
    # fix some bad field names
    ground_truth = { fld.replace('-', '_'): val for fld, val in sample["ground_truth"].items() }
    output = json.dumps(ground_truth)
    input_ = json.dumps(sample["content"])
    return dspy.Example({"content": input_, "metadata": output}).with_inputs("content")

def dataset_to_records(files):
    records = []
    for filename in files:
        with open(filename) as infile:
            for line in infile:
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records


train_val_set = dataset_to_records(train_files)
random.shuffle(train_val_set)

train_set = train_val_set[VAL_SIZE:]
val_set = train_val_set[:VAL_SIZE]

test_set = dataset_to_records(test_files)

len(train_set), len(val_set), len(test_set)

(576, 64, 182)

In [4]:
print("Input Message:")
print(train_set[-1]['content'])

print("\n\nGold Answer:")
for k, v in json.loads(train_set[-1]['metadata']).items():
    print(f"{k}: {v}")

Input Message:
{"pdfinfo": {"creationDate": "D:20201214215341+01'00'", "modDate": "D:20201214215418+01'00'"}, "pages": [{"page": 1, "text": "# ANTAA TAITEEN OPETTAA\n\n\n"}, {"page": 3, "text": "ANTA A TAITEEN OPETTA A GERT BIESTA\n\n\n"}, {"page": 4, "text": "00:00:08.18\n\n\n"}, {"page": 5, "text": "00:00:36.03 00:00:52.19 00:00:54.19\n\n\n"}, {"page": 6, "text": "00:00:58.16 00:01:00.17 00:01:0\n\n\n"}, {"page": 65, "text": "\u2018Opastan sinua kaikessa, n\u00e4yt\u00e4n sinulle kaiken ja nime\u00e4n kaiken.\u2019\n\u2014 COMENIUS\nT\u00e4ss\u00e4 kirjassa Gert Biesta esitt\u00e4\u00e4 uuden n\u00e4kemyksen nykyaikaisesta taidekasvatuksesta\n\nosoittamalla, ett\u00e4 taide tarjoaa ainutlaatuisia v\u00e4lineit\u00e4 olla dialogissa maailman kanssa. N\u00e4kemys\n\nperustuu ajatukseen, ett\u00e4 opettaminen on n\u00e4ytt\u00e4mist\u00e4. Opettaja n\u00e4ytt\u00e4\u00e4 oppilaalle millaisiin\n\nhyviin, t\u00e4rkeisiin tai merkitt\u00e4viin asioihin maailmassa voisi kiinnitt\u00e4\u00e4

In [5]:
from typing import Optional

class ExtractInfo(dspy.Signature):
    """Extract structured metadata from text extracted from a PDF."""

    content: str = dspy.InputField()
    language: str = dspy.OutputField(desc="The language of the resource expressed as a BCP47 language tag.")
    title: str = dspy.OutputField(desc="The main title of the publication.")
    alt_title: list[str] = dspy.OutputField(desc="Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.")
    creator: list[str] = dspy.OutputField(desc="The primary author(s) of the resource (order: Last Name, First Names).")
    year: Optional[str] = dspy.OutputField(desc="The year on which the resource was issued or made available.")
    publisher: list[str] = dspy.OutputField(desc="The entity/entities responsible for making the resource available.")
    doi: Optional[str] = dspy.OutputField(desc="The Digital Object Identifier (DOI) associated with the resource.")
    e_isbn: list[str] = dspy.OutputField(desc="The ISBN associated with the electronic resource.")
    p_isbn: list[str] = dspy.OutputField(desc="The ISBN of the printed version of this document.")
    e_issn: Optional[str] = dspy.OutputField(desc="The ISSN associated with the electronic resource.")
    p_issn: Optional[str] = dspy.OutputField(desc="The ISSN of the printed version of this document.")
    type_coar: str = dspy.OutputField(desc="The type of the resource according to the COAR Resource Types classification.")

module = dspy.ChainOfThought(ExtractInfo)

text = "Apple Inc. announced its latest iPhone 14 today." \
    "The CEO, Tim Cook, highlighted its new features in a press release."
response = module(content=text)

print(response)


Prediction(
    reasoning='The text describes an announcement by Apple Inc. regarding the iPhone 14. The CEO, Tim Cook, is mentioned, indicating a press release. The information provided is sufficient to identify the main entities and the type of resource.',
    language='en',
    title='Apple Inc. Announces iPhone 14',
    alt_title=[],
    creator=['Apple Inc.', 'Tim Cook'],
    year=None,
    publisher=['Apple Inc.'],
    doi=None,
    e_isbn=[],
    p_isbn=[],
    e_issn=None,
    p_issn=None,
    type_coar='News Article'
)


In [6]:
import Levenshtein

ALMOST_THRESHOLD = 0.9  # Adjust as needed

def feedback_simple_string(field, true_val, pred_val):
    score = 1.0 if true_val == pred_val else 0.0
    if score == 1.0:
        feedback = f"✅ `{field}` is correct: `{true_val}`."
    else:
        feedback = f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."
    return score, feedback

def feedback_fuzzy_string(field, true_val, pred_val):
    base_score = 1.0 if true_val == pred_val else 0.0
    if base_score == 1.0 or (true_val and pred_val and Levenshtein.ratio(true_val.lower(), pred_val.lower()) >= ALMOST_THRESHOLD):
        score = 1.0
        feedback = f"✅ `{field}` is approximately correct: `{pred_val}` matches `{true_val}` closely."
    else:
        score = 0.0
        feedback = f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."
    return score, feedback

def feedback_set(field, true_val, pred_val):
    true_set = set(true_val or [])
    pred_set = set(pred_val or [])

    if not true_set and not pred_set:
        return 1.0, f"✅ `{field}` is empty as expected."
    elif not true_set or not pred_set:
        return 0.0, f"❌ `{field}` is incorrect. Expected `{true_set}`, but got `{pred_set}`."

    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    feedback = f"🔍 `{field}` partial match."
    feedback += f"- Correctly included: `{list(true_set & pred_set)}`\n"
    if fp:
        feedback += f"- Incorrectly included: `{list(pred_set - true_set)}`\n"
    if fn:
        feedback += f"- Missed: `{list(true_set - pred_set)}`"

    return f1, feedback.strip()

def feedback_e_issn(field, true_val, pred_val, p_issn_val):
    if true_val == pred_val:
        return 1.0, f"✅ `{field}` is correct: `{true_val}`."
    elif p_issn_val and pred_val == p_issn_val and true_val is None:
        return 1.0, f"✅ `{field}` is correctly inferred from `p_issn`: `{pred_val}`."
    else:
        return 0.0, f"❌ `{field}` is incorrect. You predicted `{pred_val}`, but the correct value is `{true_val}`."

def metadata_metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    fields = [
        'language', 'title', 'creator', 'year', 'publisher',
        'doi', 'e_isbn', 'p_isbn', 'e_issn', 'p_issn', 'type_coar'
    ]

    scores = []
    feedback_parts = []

    metadata = json.loads(example.get("metadata", "{}"))
    ground_truth = example.get("ground_truth", {})

    for field in fields:
        true_val = metadata.get(field)
        pred_val = pred.get(field) or None

        if field in ['language', 'year', 'doi', 'p_issn', 'type_coar']:
            score, feedback = feedback_simple_string(field, true_val, pred_val)
        elif field == 'title':
            score, feedback = feedback_fuzzy_string(field, true_val, pred_val)
        elif field in ['creator', 'publisher', 'e_isbn', 'p_isbn']:
            score, feedback = feedback_set(field, true_val, pred_val)
        elif field == 'e_issn':
            p_issn_val = ground_truth.get("p_issn")
            score, feedback = feedback_e_issn(field, true_val, pred_val, p_issn_val)
        else:
            score, feedback = feedback_simple_string(field, true_val, pred_val)

        scores.append(score)
        feedback_parts.append(feedback)

    overall_score = sum(scores) / len(scores) if scores else 0
    full_feedback = "\n".join(feedback_parts)

    return dspy.Prediction(score=overall_score, feedback=full_feedback)


In [7]:
from dspy import GEPA

optimizer = GEPA(
    metric=metadata_metric_with_feedback,
    auto="heavy",
    num_threads=64,
    track_stats=False,
    use_merge=True,
    reflection_lm=reflection_lm
)

In [8]:
%%time

optimized_program = optimizer.compile(
    module,
    trainset=train_set,
    valset=val_set,
)

2025/09/27 10:31:48 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1483 metric calls of the program. This amounts to 2.32 full evals on the train+val set.
2025/09/27 10:31:48 INFO dspy.teleprompt.gepa.gepa: Using 64 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/1483 [00:00<?, ?rollouts/s]2025/09/27 10:32:25 INFO dspy.evaluate.evaluate: Average Metric: 38.251731601731585 / 64 (59.8%)
2025/09/27 10:32:25 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.5976833062770562
GEPA Optimization:   4%|▍         | 64/1483 [00:36<13:35,  1.74rollouts/s]2025/09/27 10:32:25 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.5976833062770562


Average Metric: 1.91 / 3 (63.6%): 100%|██████████| 3/3 [00:07<00:00,  2.46s/it]

2025/09/27 10:32:32 INFO dspy.evaluate.evaluate: Average Metric: 1.9090909090909092 / 3 (63.6%)





2025/09/27 10:33:06 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:07<00:00,  2.64s/it]

2025/09/27 10:34:03 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/27 10:34:44 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 1.95 / 3 (65.2%): 100%|██████████| 3/3 [00:07<00:00,  2.58s/it]

2025/09/27 10:35:35 INFO dspy.evaluate.evaluate: Average Metric: 1.9545454545454546 / 3 (65.2%)





2025/09/27 10:36:25 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 2.06 / 3 (68.7%): 100%|██████████| 3/3 [00:07<00:00,  2.43s/it]

2025/09/27 10:37:18 INFO dspy.evaluate.evaluate: Average Metric: 2.0606060606060606 / 3 (68.7%)





2025/09/27 10:38:15 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 1.73 / 3 (57.6%): 100%|██████████| 3/3 [00:06<00:00,  2.13s/it]

2025/09/27 10:38:28 INFO dspy.evaluate.evaluate: Average Metric: 1.727272727272727 / 3 (57.6%)





2025/09/27 10:39:18 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 1.73 / 3 (57.6%): 100%|██████████| 3/3 [00:07<00:00,  2.62s/it]

2025/09/27 10:40:14 INFO dspy.evaluate.evaluate: Average Metric: 1.727272727272727 / 3 (57.6%)





2025/09/27 10:41:17 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:08<00:00,  2.99s/it]

2025/09/27 10:42:11 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/27 10:43:07 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document using the language code (e.g., `fi` for Finnish, `en` for English

Average Metric: 1.45 / 3 (48.5%): 100%|██████████| 3/3 [00:07<00:00,  2.54s/it]

2025/09/27 10:43:58 INFO dspy.evaluate.evaluate: Average Metric: 1.4545454545454546 / 3 (48.5%)





2025/09/27 10:45:04 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource, ensuring accurate and contextually appropriate values.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Titl

Average Metric: 1.36 / 3 (45.5%): 100%|██████████| 3/3 [00:09<00:00,  3.18s/it]

2025/09/27 10:46:03 INFO dspy.evaluate.evaluate: Average Metric: 1.3636363636363638 / 3 (45.5%)





2025/09/27 10:46:52 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.

Average Metric: 1.64 / 3 (54.5%): 100%|██████████| 3/3 [00:08<00:00,  2.94s/it]

2025/09/27 10:47:44 INFO dspy.evaluate.evaluate: Average Metric: 1.6363636363636362 / 3 (54.5%)





2025/09/27 10:48:40 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description

1. **Language Detection**: Identify the language of the document using codes such as `fi` for Finnish, `en` for English, and `sv

Average Metric: 1.73 / 3 (57.6%): 100%|██████████| 3/3 [00:09<00:00,  3.25s/it]

2025/09/27 10:49:34 INFO dspy.evaluate.evaluate: Average Metric: 1.7272727272727273 / 3 (57.6%)





2025/09/27 10:50:27 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 1.79 / 3 (59.6%): 100%|██████████| 3/3 [00:08<00:00,  3.00s/it]

2025/09/27 10:51:19 INFO dspy.evaluate.evaluate: Average Metric: 1.7878787878787878 / 3 (59.6%)





2025/09/27 10:52:17 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 1.97 / 3 (65.7%): 100%|██████████| 3/3 [00:06<00:00,  2.28s/it]

2025/09/27 10:53:08 INFO dspy.evaluate.evaluate: Average Metric: 1.9696969696969697 / 3 (65.7%)





2025/09/27 10:54:07 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 1.58 / 3 (52.7%): 100%|██████████| 3/3 [00:09<00:00,  3.07s/it]

2025/09/27 10:54:59 INFO dspy.evaluate.evaluate: Average Metric: 1.5818181818181818 / 3 (52.7%)





2025/09/27 10:55:53 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 1.73 / 3 (57.6%): 100%|██████████| 3/3 [00:06<00:00,  2.29s/it]

2025/09/27 10:56:47 INFO dspy.evaluate.evaluate: Average Metric: 1.727272727272727 / 3 (57.6%)





2025/09/27 10:57:33 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document using specific language codes.
2. **Title Extraction**: Extract 

Average Metric: 1.82 / 3 (60.6%): 100%|██████████| 3/3 [00:05<00:00,  1.95s/it]

2025/09/27 10:58:22 INFO dspy.evaluate.evaluate: Average Metric: 1.8181818181818181 / 3 (60.6%)





2025/09/27 10:59:17 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:07<00:00,  2.53s/it]

2025/09/27 11:00:07 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/27 11:01:00 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to accurately extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document using specific language codes.
2. **Title Extraction*

Average Metric: 2.09 / 3 (69.7%): 100%|██████████| 3/3 [00:08<00:00,  2.89s/it]

2025/09/27 11:01:17 INFO dspy.evaluate.evaluate: Average Metric: 2.090909090909091 / 3 (69.7%)





2025/09/27 11:02:17 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

Average Metric: 2.09 / 3 (69.7%): 100%|██████████| 3/3 [00:09<00:00,  3.21s/it]

2025/09/27 11:03:13 INFO dspy.evaluate.evaluate: Average Metric: 2.090909090909091 / 3 (69.7%)





2025/09/27 11:07:03 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for predict: ## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document using the language code (e.g., `fi` for Finnish, `en` for English).
2. **

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:11<00:00,  3.97s/it]

2025/09/27 11:07:22 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/27 11:08:11 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document. Ensure 

Average Metric: 1.94 / 3 (64.5%): 100%|██████████| 3/3 [00:06<00:00,  2.08s/it]

2025/09/27 11:09:04 INFO dspy.evaluate.evaluate: Average Metric: 1.9350649350649352 / 3 (64.5%)





2025/09/27 11:10:10 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document. Use language codes such as `fi` for Finnish, `en` for English, 

Average Metric: 1.64 / 3 (54.5%): 100%|██████████| 3/3 [00:08<00:00,  2.77s/it]

2025/09/27 11:11:03 INFO dspy.evaluate.evaluate: Average Metric: 1.6363636363636362 / 3 (54.5%)





2025/09/27 11:12:07 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document. Use the language code (e.g., `fi` for Finnish, `en` for English

Average Metric: 1.43 / 3 (47.8%): 100%|██████████| 3/3 [00:07<00:00,  2.61s/it]

2025/09/27 11:12:23 INFO dspy.evaluate.evaluate: Average Metric: 1.4335664335664338 / 3 (47.8%)





2025/09/27 11:13:26 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description

1. **Language Detection**: Identify the language of the document using the language code (e.g., `fi` for Finnish, `en` for Engli

Average Metric: 2.45 / 3 (81.8%): 100%|██████████| 3/3 [00:07<00:00,  2.56s/it]

2025/09/27 11:14:22 INFO dspy.evaluate.evaluate: Average Metric: 2.4545454545454546 / 3 (81.8%)





2025/09/27 11:15:08 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Proposed new text for predict: markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document

CPU times: user 31.1 s, sys: 7.69 s, total: 38.8 s
Wall time: 44min 3s





In [9]:
for name, pred in optimized_program.named_predictors():
    print("================================")
    print(f"Predictor: {name}")
    print("================================")
    print("Prompt:")
    print(pred.signature.instructions)
    print("*********************************")

Predictor: predict
Prompt:
markdown
## Instructions for Extracting Structured Metadata from PDF Text

### Task Overview
The goal is to extract structured metadata from the text content extracted from a PDF. The metadata should be formatted according to a predefined JSON schema. The task involves identifying key elements such as language, title, alternative titles, creators, year of publication, publisher, DOI, ISBNs, ISSNs, and type of resource.

### Input Format
The input will be a JSON object containing the following fields:
- `pdfinfo`: A dictionary with metadata about the PDF, including `title`, `author`, `creationDate`, and `modDate`.
- `pages`: A list of dictionaries, each containing a `page` number and the corresponding `text` extracted from that page.

### Detailed Task Description
1. **Language Detection**: Identify the language of the document.
2. **Title Extraction**: Extract the main title of the document.
3. **Alternative Titles**: Identify any alternative titles or subtit

In [10]:
%%time

evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metadata_metric_with_feedback,
    num_threads=64,
    display_table=True,
    display_progress=True,
    provide_traceback=True
)

eval_result = evaluate(optimized_program)

Average Metric: 40.35 / 63 (64.1%):  35%|███▍      | 63/182 [00:46<02:49,  1.42s/it]



Average Metric: 118.67 / 181 (65.6%):  99%|█████████▉| 181/182 [01:46<00:00,  1.59it/s]



Average Metric: 119.40 / 182 (65.6%): 100%|██████████| 182/182 [01:50<00:00,  1.64it/s]

2025/09/27 11:17:43 INFO dspy.evaluate.evaluate: Average Metric: 119.40087050630528 / 182 (65.6%)



CPU times: user 3.98 s, sys: 941 ms, total: 4.92 s
Wall time: 1min 50s


In [11]:
lm.inspect_history()





[34m[2025-09-27T11:17:43.188412][0m

[31mSystem message:[0m

Your input fields are:
1. `content` (str):
Your output fields are:
1. `reasoning` (str): 
2. `language` (str): The language of the resource expressed as a BCP47 language tag.
3. `title` (str): The main title of the publication.
4. `alt_title` (list[str]): Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.
5. `creator` (list[str]): The primary author(s) of the resource (order: Last Name, First Names).
6. `year` (Union[str, NoneType]): The year on which the resource was issued or made available.
7. `publisher` (list[str]): The entity/entities responsible for making the resource available.
8. `doi` (Union[str, NoneType]): The Digital Object Identifier (DOI) associated with the resource.
9. `e_isbn` (list[str]): The ISBN associated with the electronic resource.
10. `p_isbn` (list[str]): The ISBN of the printed version of this document.
11. `e_issn` (Union[str, NoneType

In [12]:
# save the optimized program for later use (many formats, just in case)
optimized_program.save("gepa-optimized-module.json", save_program=False)
optimized_program.save("gepa-optimized-module.pkl", save_program=False)
# save just the prompt(s)
for name, pred in optimized_program.named_predictors():
    with open(f"gepa-optimized-prompt-{name}.txt", "w") as outfile:
        outfile.write(pred.signature.instructions)
