# Metadata extraction using DSPy and a local LLM, with evaluation metrics

To run this, you first need to start a local vLLM server in the backround with a command like this:

    vllm serve $MODEL_ID --port 7987 --max-model-len 32768 --gpu-memory-utilization 0.9

where MODEL_ID is e.g. `meta-llama/Llama-3.1-8B-Instruct` and the port has to match the PORT setting below.

In [1]:
import dspy

MODEL_ID = "google/gemma-3-4b-it"  # should match the model vLLM is running (does it matter??)
PORT = 7987  # should match the port where vLLM is running
MAX_TOKENS = 1024  # limit on how many new tokens to generate (default: 4000)
TEMPERATURE = 0.7

lm = dspy.LM("openai/" + MODEL_ID,
             api_base=f"http://localhost:{PORT}/v1",  # ensure this points to your port
             api_key="local", model_type="chat", max_tokens=MAX_TOKENS, temperature=TEMPERATURE)
dspy.configure(lm=lm)

# test the connection to the LLM
lm("Say this is a test!", temperature=0.0)  # => ['This is a test!']

['Okay! This is a test! 😊 \n\nLet me know if you need anything from me.']

In [2]:
# Load and prepare dataset

import json
import glob
import random

random.seed(42)  # for deterministic sampling of validation set

train_files = glob.glob("../../llm-dataset/*-train.jsonl")
test_files = glob.glob("../../llm-dataset/*-test.jsonl")

VAL_SIZE = 128  # how many documents to validate on during optimization

def preprocess_sample(sample):
    # fix some bad field names
    ground_truth = { fld.replace('-', '_'): val for fld, val in sample["ground_truth"].items() }
    output = json.dumps(ground_truth)
    input_ = json.dumps(sample["content"])
    return dspy.Example({"content": input_, "metadata": output}).with_inputs("content")

def dataset_to_records(files):
    records = []
    for filename in files:
        with open(filename) as infile:
            for line in infile:
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records


train_val_set = dataset_to_records(train_files)
random.shuffle(train_val_set)

train_set = train_val_set[VAL_SIZE:]
val_set = train_val_set[:VAL_SIZE]

test_set = dataset_to_records(test_files)

len(train_set), len(val_set), len(test_set)

(512, 128, 182)

In [3]:
print("Input Message:")
print(train_set[-1]['content'])

print("\n\nGold Answer:")
for k, v in json.loads(train_set[-1]['metadata']).items():
    print(f"{k}: {v}")

Input Message:
{"pdfinfo": {"creationDate": "D:20201214215341+01'00'", "modDate": "D:20201214215418+01'00'"}, "pages": [{"page": 1, "text": "# ANTAA TAITEEN OPETTAA\n\n\n"}, {"page": 3, "text": "ANTA A TAITEEN OPETTA A GERT BIESTA\n\n\n"}, {"page": 4, "text": "00:00:08.18\n\n\n"}, {"page": 5, "text": "00:00:36.03 00:00:52.19 00:00:54.19\n\n\n"}, {"page": 6, "text": "00:00:58.16 00:01:00.17 00:01:0\n\n\n"}, {"page": 65, "text": "\u2018Opastan sinua kaikessa, n\u00e4yt\u00e4n sinulle kaiken ja nime\u00e4n kaiken.\u2019\n\u2014 COMENIUS\nT\u00e4ss\u00e4 kirjassa Gert Biesta esitt\u00e4\u00e4 uuden n\u00e4kemyksen nykyaikaisesta taidekasvatuksesta\n\nosoittamalla, ett\u00e4 taide tarjoaa ainutlaatuisia v\u00e4lineit\u00e4 olla dialogissa maailman kanssa. N\u00e4kemys\n\nperustuu ajatukseen, ett\u00e4 opettaminen on n\u00e4ytt\u00e4mist\u00e4. Opettaja n\u00e4ytt\u00e4\u00e4 oppilaalle millaisiin\n\nhyviin, t\u00e4rkeisiin tai merkitt\u00e4viin asioihin maailmassa voisi kiinnitt\u00e4\u00e4

In [4]:
from typing import Optional

class ExtractInfo(dspy.Signature):
    """Extract structured metadata from text extracted from a PDF."""

    content: str = dspy.InputField()
    language: str = dspy.OutputField(desc="The language of the resource expressed as a BCP47 language tag.")
    title: str = dspy.OutputField(desc="The main title of the publication.")
    alt_title: list[str] = dspy.OutputField(desc="Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.")
    creator: list[str] = dspy.OutputField(desc="The primary author(s) of the resource.")
    year: Optional[str] = dspy.OutputField(desc="The year on which the resource was issued or made available.")
    publisher: list[str] = dspy.OutputField(desc="The entity/entities responsible for making the resource available.")
    doi: Optional[str] = dspy.OutputField(desc="The Digital Object Identifier (DOI) associated with the resource.")
    e_isbn: list[str] = dspy.OutputField(desc="The ISBN associated with the electronic resource.")
    p_isbn: list[str] = dspy.OutputField(desc="The ISBN of the printed version of this document.")
    e_issn: Optional[str] = dspy.OutputField(desc="The ISSN associated with the electronic resource.")
    p_issn: Optional[str] = dspy.OutputField(desc="The ISSN of the printed version of this document.")
    type_coar: str = dspy.OutputField(desc="The type of the resource according to the COAR Resource Types classification.")

module = dspy.ChainOfThought(ExtractInfo)

text = "Apple Inc. announced its latest iPhone 14 today." \
    "The CEO, Tim Cook, highlighted its new features in a press release."
response = module(content=text)

print(response)


Prediction(
    reasoning='The text describes an announcement by Apple Inc. regarding the iPhone 14. I can identify the organization (Apple Inc.), the product (iPhone 14), the CEO (Tim Cook), and the type of announcement (press release). I will extract the relevant information and format it into the required metadata fields.',
    language='en',
    title='iPhone 14 Announcement',
    alt_title=[],
    creator=['Apple Inc.', 'Tim Cook'],
    year=None,
    publisher=['Apple Inc.'],
    doi=None,
    e_isbn=[],
    p_isbn=[],
    e_issn=None,
    p_issn=None,
    type_coar='News Article'
)


In [5]:
import Levenshtein

ALMOST_THRESHOLD = 0.9  # Adjust as needed

def compare_simple_string(true_val, pred_val):
    if pred_val is None and true_val is None:
        return 1
    elif true_val is None:
        return 0
    elif pred_val is None:
        return 0
    elif true_val == str(pred_val):
        return 1
    else:
        return 0

def compare_fuzzy_string(true_val, pred_val):
    base_score = compare_simple_string(true_val, pred_val)
    if base_score == 1:
        return 1
    if true_val.lower() == pred_val.lower():
        return 1
    elif Levenshtein.ratio(true_val, pred_val) >= ALMOST_THRESHOLD:
        return 1
    elif Levenshtein.ratio(true_val.lower(), pred_val.lower()) >= ALMOST_THRESHOLD:
        return 1
    else:
        return 0

def compare_set(true_val, pred_val):
    true_set = set(true_val) if true_val else set()
    pred_set = set(pred_val) if pred_val else set()

    if not true_set and not pred_set:
        return 1
    elif not true_set or not pred_set:
        return 0
    elif true_set == pred_set:
        return 1
    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def compare_e_issn(true_val, pred_val, p_issn_val):
    base_score = compare_simple_string(true_val, pred_val)
    if base_score == 1:
        return 1
    if p_issn_val and pred_val == p_issn_val:
        return 1 if true_val is None else 0
    return 0

def metadata_metric(example, pred, trace=None):
    fields = [
        'language', 'title', 'creator', 'year', 'publisher',
        'doi', 'e_isbn', 'p_isbn', 'e_issn', 'p_issn', 'type_coar'
    ]

    scores = []
    for field in fields:
        true_val = json.loads(example.get("metadata", "{}")).get(field)
        pred_val = pred.get(field) or None  # represent empty predictions as None

        if field in ['language', 'year', 'doi', 'p_issn', 'type_coar']:
            score = compare_simple_string(true_val, pred_val)
        elif field == 'title':
            score = compare_fuzzy_string(true_val, pred_val)
        elif field in ['creator', 'publisher', 'e_isbn', 'p_isbn']:
            score = compare_set(true_val, pred_val)
        elif field == 'e_issn':
            p_issn_val = example.get("ground_truth", {}).get("p_issn")
            score = compare_e_issn(true_val, pred_val, p_issn_val)
        else:
            score = compare_simple_string(true_val, pred_val)

        #print(field, repr(true_val), repr(pred_val), score)
        scores.append(score)

    return sum(scores) / len(scores) if scores else 0


In [6]:
%%time

evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metadata_metric,
    num_threads=64, # default 32
    display_table=True,
    display_progress=True
)

eval_result = evaluate(module)

Average Metric: 107.20 / 182 (58.9%): 100%|██████████| 182/182 [01:48<00:00,  1.68it/s]

2025/09/27 10:27:46 INFO dspy.evaluate.evaluate: Average Metric: 107.19936006022962 / 182 (58.9%)



CPU times: user 3.51 s, sys: 727 ms, total: 4.23 s
Wall time: 1min 48s


In [7]:
lm.inspect_history()





[34m[2025-09-27T10:27:46.125228][0m

[31mSystem message:[0m

Your input fields are:
1. `content` (str):
Your output fields are:
1. `reasoning` (str): 
2. `language` (str): The language of the resource expressed as a BCP47 language tag.
3. `title` (str): The main title of the publication.
4. `alt_title` (list[str]): Alternative or parallel titles of the publication, suffixed with a BCP47 language tag in curly brackets.
5. `creator` (list[str]): The primary author(s) of the resource.
6. `year` (Union[str, NoneType]): The year on which the resource was issued or made available.
7. `publisher` (list[str]): The entity/entities responsible for making the resource available.
8. `doi` (Union[str, NoneType]): The Digital Object Identifier (DOI) associated with the resource.
9. `e_isbn` (list[str]): The ISBN associated with the electronic resource.
10. `p_isbn` (list[str]): The ISBN of the printed version of this document.
11. `e_issn` (Union[str, NoneType]): The ISSN associated with the