# Convert metadata and PDFs to LLM dataset

This notebook processes the already downloaded PDF files and converts them to a dataset suitable for fine-tuning and evaluating LLMs. Each record gets two new fields, "text" and "metadata": "text" contains the text extracted from the first few pages and the last page of the PDF file, and "metadata" is a string that represents the metadata of the document in a very simple textual key/value format. In addition, the "id" and "url" fields (containing the document ID/URL and the PDF URL, respectively) are retained in the new record, and the "ground_truth" field contains the original JSON structured metadata.

In [1]:
import os.path
import glob
import json
import re

import fitz
import regex  # has better Unicode support than standard library re module

# Page indices to analyze: the first 8 pages plus the last page (-1).
# extract_text() skips any listed index that is at or beyond the document's
# last page, so in short PDFs the last page is only extracted once (via -1).
PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -1]
# Paragraphs shorter than this many characters (after whitespace is collapsed)
# are kept, unless they look like a table-of-contents entry.
THRESHOLD = 100
# Page indices on which some paragraphs of THRESHOLD characters or more are
# also accepted.
LONG_PARAGRAPH_PAGES = [0, 1]
# How many such long paragraphs to keep; the counter is reset for every page,
# so this is a per-page limit.
LONG_PARAGRAPH_MAX = 2

KV_SKIP_FIELDS = {'id', 'url', 'rowid', 'repository', 'doctype', 'subset'}  # record fields to not include in key-value metadata

PDF_METADATA_SKIP = {'format', 'creator', 'producer'}  # embedded PDF metadata keys not to include in extracted text

# Maps each metadata field name to the human-readable label used for it in the
# key/value text produced by metadata_to_kvtext(). A value of None means the
# field is left out of the key/value text. Every record field that is not in
# KV_SKIP_FIELDS must have an entry here: the lookup is a plain dict index, so
# an unknown field raises KeyError.
# NOTE: "dc.contributor.degreeSupervisor" and "dc.contributor.supervisor"
# share the label "Supervisor".
METADATA_FIELD_NAMES = {
    "dc.contributor": "Contributor",
    "dc.contributor.author": "Author",
    "dc.contributor.degreeSupervisor": "Supervisor",
    "dc.contributor.department": "Department",
    "dc.contributor.editor": "Editor",
    "dc.contributor.faculty": "Faculty",
    "dc.contributor.opponent": "Opponent",
    "dc.contributor.organization": "Organization",
    "dc.contributor.orgunit": "Org. unit",
    "dc.contributor.reviewer": "Reviewer",
    "dc.contributor.studysubject": "Study subject",
    "dc.contributor.supervisor": "Supervisor",
    "dc.date.issued": "Issued",
    "dc.format.extent": None,  # number of pages - hard to extract
    "dc.format.pagerange": "Page range",
    "dc.identifier.isbn": "ISBN (online)",
    "dc.identifier.urn": "URN",
    "dc.language.iso": "Language",
    "dc.publisher": "Publisher",
    "dc.relation.contractor": "Contractor",
    "dc.relation.doi": "DOI",
    "dc.relation.eissn": "ISSN (online)",
    "dc.relation.isbn": "ISBN (printed)",
    "dc.relation.ispartofjournal": "Journal name",
    "dc.relation.ispartofseries": "Series name",
    "dc.relation.issue": "Issue",
    "dc.relation.numberinseries": "Number in series",
    "dc.relation.pissn": "ISSN (printed)",
    "dc.relation.volume": "Volume",
    "dc.series.year": "Series year",
    "dc.source.identifier": None,  # rare + hard to extract
    "dc.subject.degreeprogram": "Degree program",
    "dc.subject.discipline": "Discipline",
    "dc.title": "Title",
    "dc.title.alternative": "Alternative title",
    "dc.type.coar": "COAR type",
    "dc.type.okm": "OKM type",
    "dc.type.ontasot": "Thesis level",
}

# Maps the three-letter "dc.language.iso" value of a record to the two-letter
# code that is compared against the " {xx}" suffix of language-tagged values.
LANG_MAP = {
    'fin': 'fi',
    'swe': 'sv',
    'eng': 'en'
}

# Fields for which metadata_to_kvtext() emits a single value, chosen with
# choose_value_by_lang() according to the language of the document.
LANG_AWARE_FIELDS = (
    'dc.contributor.department',
    'dc.contributor.faculty',
    'dc.contributor.organization',
    'dc.subject.degreeprogram',
    'dc.subject.discipline',
    'dc.title.alternative',
)

# Input files: one JSON record per line (each line is parsed with json.loads below).
metadata_files = glob.glob("../metadata/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ../pdfs/.

    Every 'https://' occurrence is removed and each '/' becomes '_', so the
    result is a flat file name with a '.pdf' suffix.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../pdfs/{flat_name}.pdf"

def extract_text(fn):
    """Extract a condensed text sample from the PDF file at path `fn`.

    The result starts with one "key: value" line for each non-empty embedded
    PDF metadata entry (except the keys in PDF_METADATA_SKIP), followed by
    selected paragraphs from the pages listed in PAGES. Everything is joined
    with newlines and returned as a single string.
    """
    collected = []

    with fitz.open(fn) as pdf:
        # embedded PDF metadata comes first
        for name, value in pdf.metadata.items():
            if value and name not in PDF_METADATA_SKIP:
                collected.append(f"{name}: {value}")

        page_total = len(pdf)
        for page_no in PAGES:
            # Indices at or past the last page are skipped; the last page
            # itself is reached through the -1 entry, so it is not duplicated.
            if page_no >= page_total - 1:
                continue

            page_text = pdf[page_no].get_text(sort=True)
            long_allowed = page_no in LONG_PARAGRAPH_PAGES
            kept_long = 0  # long paragraphs accepted so far on this page

            # A paragraph boundary is one or more newlines followed by an
            # upper case character.
            for chunk in regex.split(r'\n+(?=\p{Lu})', page_text, flags=re.UNICODE):
                normalized = " ".join(chunk.split())  # collapse all whitespace runs

                # dot leaders: looks like a ToC entry, drop it
                if '.....' in normalized or '. . . . .' in normalized:
                    continue

                # short paragraphs are kept
                if len(normalized) < THRESHOLD:
                    collected.append(normalized)
                    continue

                # a limited number of long paragraphs is accepted on selected
                # pages; any other long paragraph is dropped
                if long_allowed and kept_long < LONG_PARAGRAPH_MAX:
                    kept_long += 1
                    collected.append(normalized)

    return '\n'.join(collected)

def choose_value_by_lang(lang, vals):
    """Pick the value from `vals` that best matches the language code `lang`.

    Values may end in a two-letter language tag such as "Name {en}". If a
    value tagged with `lang` exists, its text without the tag is returned
    (when several values share a tag, the last one wins). Otherwise the last
    value without a recognized tag is returned, and if there is none, the
    first value cut at its first " {".

    Raises IndexError if `vals` is empty.
    """
    by_lang = {}
    # default: the first value, without its language tag (if it has one)
    default = vals[0].partition(' {')[0]

    for candidate in vals:
        match = re.match(r"(.*) {(\w\w)}", candidate)
        if match is None:
            # untagged value: preferred over the default chosen so far
            default = candidate
            continue
        text, code = match.groups()
        by_lang[code] = text

    return by_lang.get(lang, default)

def metadata_to_kvtext(rec):
    """Render the metadata record `rec` as simple "Label: value" text lines.

    Fields are processed in sorted order. Fields in KV_SKIP_FIELDS, and fields
    whose label in METADATA_FIELD_NAMES is None, are left out. For fields in
    LANG_AWARE_FIELDS only one value is emitted, chosen by the language of the
    document; for all other fields every value is emitted, in sorted order.

    Raises KeyError if the record has no "dc.language.iso" field, if its
    language is not in LANG_MAP, or if a field has no entry in
    METADATA_FIELD_NAMES.
    """
    doc_lang = LANG_MAP[rec["dc.language.iso"]]
    output = []

    for field in sorted(rec):
        if field in KV_SKIP_FIELDS:
            continue

        label = METADATA_FIELD_NAMES[field]
        if not label:
            continue

        # a field holds either a single value or a list of values
        raw = rec[field]
        values = raw if isinstance(raw, list) else [raw]

        if field in LANG_AWARE_FIELDS:
            values = [choose_value_by_lang(doc_lang, values)]

        output.extend(f"{label}: {value}" for value in sorted(values))

    return "\n".join(output)

# Convert every metadata JSONL file into an LLM dataset JSONL file of the same
# name in the sibling "llm-dataset" directory. Each output record keeps "id"
# and "url" and adds "text" (extracted from the PDF), "metadata" (key/value
# text) and "ground_truth" (the original "dc." fields).
for mdfile in sorted(metadata_files):
    # Swap only the directory. A plain str.replace('metadata', ...) would also
    # rewrite a file name (or another path component) containing "metadata".
    out_dir = os.path.join(os.path.dirname(os.path.dirname(mdfile)), 'llm-dataset')
    out_path = os.path.join(out_dir, os.path.basename(mdfile))
    print(f"converting {mdfile} to {out_path}")
    os.makedirs(out_dir, exist_ok=True)  # do not fail on a fresh checkout
    # Explicit UTF-8 so that reading does not depend on the platform default
    # encoding. The output stays pure ASCII because json.dump escapes
    # non-ASCII characters by default.
    with open(mdfile, encoding="utf-8") as infile, \
            open(out_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            rec = json.loads(line)
            pdf_path = id_to_fn(rec["id"])
            pdf_text = extract_text(pdf_path)
            metadata = metadata_to_kvtext(rec)
            ground_truth = {fld: val for fld, val in rec.items() if fld.startswith('dc.')}
            outrec = {"id": rec["id"], "url": rec["url"], "text": pdf_text, "metadata": metadata, "ground_truth": ground_truth}
            json.dump(outrec, outfile)
            outfile.write("\n")

converting ../metadata/docthes-eng-test.jsonl to ../llm-dataset/docthes-eng-test.jsonl
converting ../metadata/docthes-eng-train.jsonl to ../llm-dataset/docthes-eng-train.jsonl
converting ../metadata/docthes-fin-test.jsonl to ../llm-dataset/docthes-fin-test.jsonl
converting ../metadata/docthes-fin-train.jsonl to ../llm-dataset/docthes-fin-train.jsonl
converting ../metadata/docthes-swe-test.jsonl to ../llm-dataset/docthes-swe-test.jsonl
converting ../metadata/docthes-swe-train.jsonl to ../llm-dataset/docthes-swe-train.jsonl
converting ../metadata/mono-eng-test.jsonl to ../llm-dataset/mono-eng-test.jsonl
converting ../metadata/mono-eng-train.jsonl to ../llm-dataset/mono-eng-train.jsonl
converting ../metadata/mono-fin-test.jsonl to ../llm-dataset/mono-fin-test.jsonl
converting ../metadata/mono-fin-train.jsonl to ../llm-dataset/mono-fin-train.jsonl
converting ../metadata/mono-swe-test.jsonl to ../llm-dataset/mono-swe-test.jsonl
converting ../metadata/mono-swe-train.jsonl to ../llm-dataset/m