# Convert metadata and PDFs to LLM dataset (NorRep)

This notebook processes the already downloaded PDF files and converts them to a dataset suitable for fine-tuning and evaluating LLMs.

A new field "content" is added to each record. It holds an object with two fields: "pdfinfo", the metadata read from the PDF file, and "pages", the text extracted from selected pages of the PDF file.

In [7]:
%%time

import os.path
import yaml
import json
import re

import fitz
import regex  # has better Unicode support than standard library re module

# Page indices to analyze: the first 8 pages plus the last page (-1).
PAGES = [*range(8), -1]
# Paragraphs shorter than this many characters are always kept.
THRESHOLD = 100
# Page indices (the first two pages) on which some long paragraphs are accepted.
LONG_PARAGRAPH_PAGES = [0, 1]
# How many long paragraphs to keep on each of those pages.
LONG_PARAGRAPH_MAX = 2

# PDF metadata fields that are not included in the extracted "pdfinfo".
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}
# Maps each field name of a source metadata record (key) to the name it gets
# in the output "ground_truth" object (value). The output names must be
# unique: two source fields mapped to the same name would silently overwrite
# each other when the ground truth dict is built.
GT_FIELDS = {
    "title": "title",
    "year": "year",
    "e-isbn": "e-isbn",
    "p-isbn": "p-isbn",
    "e-issn": "e-issn",
    "p-issn": "p-issn",
    "language": "language",
    "authors": "author",
    "publisher": "publisher",
    "doi": "doi",
}

# Input: YAML file holding the metadata records to convert.
mdfile = "../../swib_dataset/dataset.yml"
# Output: JSON Lines file, one converted record per line.
out_path = "../llm-dataset/norrep.jsonl"

def id_to_fn(identifier):
    """Build the PDF file path for a URI identifier.

    Every "https://" is dropped from the identifier and the characters
    "/", ":", "?" and "=" are each replaced by "_"; the result is placed
    in ../pdfs/ with a ".pdf" extension.
    """
    unsafe_chars = str.maketrans('/:?=', '____')
    stem = identifier.replace('https://', '').translate(unsafe_chars)
    return f'../pdfs/{stem}.pdf'

def _select_paragraphs(text, allow_long):
    """Split the text of one page into paragraphs and return those to keep.

    Paragraphs are delimited by newline(s) followed by an upper case
    character. Whitespace inside each paragraph is collapsed to single
    spaces. Empty paragraphs and paragraphs that look like table of contents
    entries are dropped. Paragraphs shorter than THRESHOLD are always kept;
    longer ones are kept only if allow_long is true, and then at most
    LONG_PARAGRAPH_MAX of them.
    """
    kept = []
    long_paragraph_count = 0

    # Delimiter: newline(s) followed by an upper case character
    for paragraph in regex.split(r'\n+(?=\p{Lu})', text, flags=regex.UNICODE):
        paragraph = " ".join(paragraph.split())  # collapse whitespace runs

        if not paragraph:
            # text that starts with the delimiter yields an empty first piece;
            # keeping it would only add a blank line to the output
            continue
        if '.....' in paragraph or '. . . . .' in paragraph:  # looks like a ToC entry, skip it
            continue

        if len(paragraph) < THRESHOLD:  # short paragraph, keep it
            kept.append(paragraph)
        elif allow_long and long_paragraph_count < LONG_PARAGRAPH_MAX:
            # allow some long paragraphs on the first two pages
            long_paragraph_count += 1
            kept.append(paragraph)
        # otherwise: a long paragraph that is not wanted, skip it

    return kept


def extract_content(fn):
    """Extract the pdfinfo metadata and the text of selected pages from a PDF file.

    Parameters:
        fn: path of the PDF file to open.

    Returns:
        A dict with two keys:
        "pdfinfo": the non-empty PDF metadata fields, excluding those listed
            in PDF_METADATA_SKIP.
        "pages": a list of {"page": <1-based page number>, "text": <str>}
            dicts, one for each page selected by PAGES that exists in the
            document and has any text left after paragraph filtering.
    """
    pages = []

    with fitz.open(fn) as pdf:
        metadata = pdf.metadata or {}  # guard against a missing metadata dict
        pdfinfo = {
            key: value
            for key, value in metadata.items()
            if key not in PDF_METADATA_SKIP and value
        }

        # Resolve the configured page indices (negative ones count from the
        # end) to real page numbers, dropping those that do not exist and
        # those that were already selected, so that in a short document the
        # last page is not extracted twice.
        page_count = len(pdf)
        page_numbers = []
        for index in PAGES:
            number = index if index >= 0 else page_count + index
            if 0 <= number < page_count and number not in page_numbers:
                page_numbers.append(number)

        for number in page_numbers:
            page_text = pdf[number].get_text(sort=True)
            # Decide on the real page number, so that the last page of a one
            # or two page document still counts as one of the first pages.
            allow_long = number in LONG_PARAGRAPH_PAGES
            text = '\n'.join(_select_paragraphs(page_text, allow_long))
            if text:
                pages.append({"page": number + 1, "text": text})

    return {"pdfinfo": pdfinfo, "pages": pages}


# Convert every metadata record into one JSON line that combines the content
# extracted from its PDF with the ground truth metadata.
print(f"converting {mdfile} to {out_path}")
# Open both files explicitly as UTF-8 so that non-ASCII characters in the
# metadata are read correctly regardless of the platform default encoding.
with open(mdfile, encoding="utf-8") as infile, \
        open(out_path, "w", encoding="utf-8") as outfile:
    data = yaml.safe_load(infile)
    for rec in data.values():
        # use url_info as the record identifier when it is set, otherwise url_file
        rid = rec["url_info"] or rec["url_file"]
        content = extract_content(id_to_fn(rid))
        # rename the metadata fields to their ground truth names
        ground_truth = {newkey: rec[origkey] for origkey, newkey in GT_FIELDS.items()}
        ground_truth["type_coar"] = "report"  # every record gets the same type
        outrec = {"id": rid, "url": rec["url_file"], "content": content, "ground_truth": ground_truth}
        json.dump(outrec, outfile)
        outfile.write("\n")

converting ../../swib_dataset/dataset.yml to ../llm-dataset/norrep.jsonl
MuPDF error: format error: cmsOpenProfileFromMem failed

CPU times: user 3.29 s, sys: 135 ms, total: 3.42 s
Wall time: 3.5 s
