# Convert metadata and PDFs to LLM dataset

This notebook will process the already downloaded PDF files and convert them to a data set suitable for fine-tuning and evaluating LLMs.

A new field "content" will be added to each record. It holds an object with two fields: "pdfinfo", which contains the metadata read from the PDF file, and "pages", which contains the text extracted from selected pages of the PDF file.

In [1]:
import os.path
import collections
import glob
import json
import re

import pymupdf
import pymupdf4llm
import regex  # has better Unicode support than standard library re module

# Page indices handed to extract_content(). Negative values count from the end
# of the document; indices outside the document are skipped there and
# duplicates (which occur for short documents) are removed.
PAGES = [0, 1, 2, 3, 4, 5, -2, -1]  # pages to analyze: first six pages + last two pages
# Upper bound used by extract_content() when picking the best-scoring chunks.
CHARACTER_BUDGET = 5000  # Limit on how many characters per document to include


# Keys of the PDF metadata dictionary that are left out of the "pdfinfo" output.
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}  # PDF metadata fields not to include in extracted text

# Paths of all JSONL metadata files; convert_metadata() sorts them before use.
metadata_files = glob.glob("../metadata/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ../pdfs/.

    Every 'https://' occurrence is removed and every slash is turned into an
    underscore, which yields a flat file name; '.pdf' is appended.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.pdf"

def comma_proportion(chunk):
    """Return the share of characters in chunk that are commas or semicolons.

    An empty chunk yields 0.
    """
    if chunk:
        separators = sum(chunk.count(mark) for mark in (',', ';'))
        return separators / len(chunk)
    return 0

def emph_proportion(chunk):
    """Return the share of characters in chunk that are '_' or '*'.

    An empty chunk yields 0.
    """
    if chunk:
        markers = sum(chunk.count(mark) for mark in ('_', '*'))
        return markers / len(chunk)
    return 0

def chunk_score(chunk, page_num):
    """Score a text chunk; higher scores are preferred when selecting chunks.

    Returns (None, None) for chunks that should be discarded: blank text, a
    bare '-----' line, text containing a run of five dots, spaced dots or
    spaced underscores, and text made up only of non-word characters.

    Otherwise returns (score, feats). The score starts from a penalty for
    chunk length and page number and receives a bonus for each detected
    feature; feats is the set of names of the features that were found.
    """
    filler_runs = ('.....', '. . . . .', '_ _ _ _ _')
    if chunk == '-----' or not chunk.strip():
        return None, None
    if any(run in chunk for run in filler_runs):
        return None, None
    if re.match(r'^\W+$', chunk):
        return None, None

    # Baseline: shorter chunks and chunks on earlier pages score higher.
    score = -len(chunk) - 1000 * int(page_num/2)
    feats = set()

    # (feature name, pattern, regex flags, bonus) for the regex-detected features.
    pattern_bonuses = (
        ("year", r'(?<!\d)20\d\d(?!\d)', 0, 500),
        ("doi", r'\bdoi\b', re.IGNORECASE, 1000),
        ("isbn", r'\bisbn\b', re.IGNORECASE, 1000),
        ("issn", r'\bissn\b', re.IGNORECASE, 1000),
        ("http", r'\bhttps?\b', re.IGNORECASE, 1000),
    )
    for name, pattern, flags, bonus in pattern_bonuses:
        if re.search(pattern, chunk, flags):
            score += bonus
            feats.add(name)

    if chunk.startswith('#'):
        score += 1000
        feats.add("headline")

    # Density bonuses: proportional to the share of the relevant characters,
    # applied only when that share is above 1 %.
    density_features = (
        ("commas", comma_proportion(chunk)),
        ("emph", emph_proportion(chunk)),
    )
    for name, share in density_features:
        if share > 0.01:
            score += 10000 * share
            feats.add(name)

    return score, feats

def split_text(text):
    """Split text into paragraph-like chunks and return them as a list.

    A split happens at one or more newlines that are directly followed by an
    upper case letter, optionally preceded by Markdown markup characters
    ('#', '_' or '*'). The newlines are consumed; the markup and the letter
    stay at the start of the following chunk.
    """
    # NOTE(review): the flag constant comes from the stdlib re module but is
    # passed to the regex module - presumably the two are numerically
    # compatible; confirm.
    paragraph_break = r'\n+(?=[#_*]*\p{Lu})'
    return regex.split(paragraph_break, text, flags=re.UNICODE)

def extract_content(fn):
    """Extract the pdfinfo metadata and selected chunks of text from a PDF file.

    The pages listed in PAGES are converted to Markdown, split into chunks
    with split_text() and scored with chunk_score(). The best-scoring chunks
    are kept as long as they fit into CHARACTER_BUDGET and are returned
    grouped by page, in their original order within each page.

    Args:
        fn: path of the PDF file to open.

    Returns:
        A dict with two keys:
        "pdfinfo" - the non-empty PDF metadata fields whose keys are not in
        PDF_METADATA_SKIP;
        "pages" - a list of {"page": <page number>, "text": <str>} dicts
        sorted by page number, containing only pages with at least one
        selected chunk.
    """
    with pymupdf.open(fn) as doc:
        pdfinfo = {
            key: value
            for key, value in doc.metadata.items()
            if key not in PDF_METADATA_SKIP and value
        }

        # Resolve negative indices against the page count, skip indices that
        # fall outside the document, and use a set to drop the duplicates
        # that appear when the document is short.
        page_count = len(doc)
        pages_to_extract = sorted({
            idx % page_count for idx in PAGES
            if -page_count <= idx < page_count
        })

        page_texts = pymupdf4llm.to_markdown(doc, pages=pages_to_extract, page_chunks=True, show_progress=False,
                                             ignore_images=True, ignore_graphics=True)

    all_chunks = []
    for page in page_texts:
        page_num = page['metadata']['page']
        for chunk in split_text(page['text']):
            score, feats = chunk_score(chunk, page_num)
            if score is None:
                continue  # chunk_score() rejected this chunk
            all_chunks.append({
                'text': chunk,
                'page': page_num,
                'score': score,
                'feats': feats,  # recorded with the chunk; not used by the selection below
                'index': len(all_chunks),
                'length': len(chunk)
            })

    # Greedily pick the best-scoring chunks that still fit into the budget.
    # Each chunk costs its length plus one character for the newline used to
    # join chunks, and that full cost is added to the running total so the
    # separators count against the budget as well.
    selected_indices = set()
    total_chars = 0
    for chunk in sorted(all_chunks, key=lambda x: x['score'], reverse=True):
        cost = chunk['length'] + 1
        if total_chars + cost <= CHARACTER_BUDGET:
            selected_indices.add(chunk['index'])
            total_chars += cost

    # Re-assemble the selected chunks in document order, grouped by page.
    page_content = collections.defaultdict(list)
    for chunk in all_chunks:
        if chunk['index'] in selected_indices:
            page_content[chunk['page']].append(chunk['text'])

    pages = [
        {"page": pageno, "text": "\n".join(page_content[pageno])}
        for pageno in sorted(page_content)
    ]
    return {"pdfinfo": pdfinfo, "pages": pages}

def convert_metadata(metadata_files):
    """Convert metadata JSONL files into LLM dataset JSONL files.

    The files are processed in sorted order. For every record the matching
    PDF file (located with id_to_fn) is run through extract_content() and the
    result is stored under "content" next to the record's "id", "url" and
    "ground_truth" fields.

    The output path is the input path with every occurrence of 'metadata'
    replaced by 'llm-dataset'; the target directory is not created here and
    has to exist already.

    Args:
        metadata_files: iterable of paths to JSONL files with one JSON record
            per line. Blank lines are skipped.
    """
    for mdfile in sorted(metadata_files):
        out_path = mdfile.replace('metadata', 'llm-dataset')
        print(f"converting {mdfile} to {out_path}")
        # Read and write as UTF-8 explicitly so that non-ASCII metadata does
        # not depend on the platform's default encoding.
        with open(mdfile, encoding="utf-8") as infile, \
                open(out_path, "w", encoding="utf-8") as outfile:
            for line in infile:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end of the file
                rec = json.loads(line)
                pdf_path = id_to_fn(rec["id"])
                content = extract_content(pdf_path)
                outrec = {"id": rec["id"], "url": rec["url"], "content": content, "ground_truth": rec["ground_truth"]}
                json.dump(outrec, outfile)
                outfile.write("\n")

In [2]:
%%time
# Convert every metadata file collected in metadata_files; the %%time cell
# magic reports how long the whole conversion takes.
convert_metadata(metadata_files)

converting ../metadata/article-en-test.jsonl to ../llm-dataset/article-en-test.jsonl
converting ../metadata/article-en-train.jsonl to ../llm-dataset/article-en-train.jsonl
converting ../metadata/article-fi-test.jsonl to ../llm-dataset/article-fi-test.jsonl
converting ../metadata/article-fi-train.jsonl to ../llm-dataset/article-fi-train.jsonl
converting ../metadata/article-se-test.jsonl to ../llm-dataset/article-se-test.jsonl
converting ../metadata/article-se-train.jsonl to ../llm-dataset/article-se-train.jsonl
converting ../metadata/article-sv-test.jsonl to ../llm-dataset/article-sv-test.jsonl
converting ../metadata/article-sv-train.jsonl to ../llm-dataset/article-sv-train.jsonl
converting ../metadata/book-en-test.jsonl to ../llm-dataset/book-en-test.jsonl
converting ../metadata/book-en-train.jsonl to ../llm-dataset/book-en-train.jsonl
converting ../metadata/book-fi-test.jsonl to ../llm-dataset/book-fi-test.jsonl
MuPDF error: unsupported error: cannot create appearance stream for Scree