# Download PDFs and convert to LLM dataset

This notebook downloads the PDF files referenced by the metadata records and converts them to a dataset suitable for fine-tuning and evaluating LLMs. Each record is converted to two fields, "text" and "metadata", where "text" contains the text extracted from the first few pages of the PDF file and "metadata" is a string that represents the metadata of the document in a very simple textual key/value format. In addition, the "id" and "url" fields (containing the document ID/URL and the PDF URL, respectively) are retained in the new record.

In [2]:
import os.path
import glob
import json

import fitz
import requests

# --- Text extraction limits (used by extract_text) ---
# Word counts are computed by splitting the page text on whitespace.
MAXPAGES = 5  # stop once this many pages have been accepted into the extracted text
MARGIN = 2  # extra pages to examine beyond MAXPAGES, in case some of the first pages are skipped
TEXT_MIN = 500  # stop extracting as soon as at least this many words have been collected
TEXT_MAX = 700  # a page is skipped if adding it would bring the total to this many words or more

# Metadata fields left out of the key/value "metadata" text;
# both are copied into the output record as separate fields instead.
KV_SKIP_FIELDS = {'id', 'url'}  # fields to not include in key-value metadata

# Input files: JSON Lines metadata files, one directory level below ../metadata/
metadata_files = glob.glob("../metadata/*/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the local path used for storing its PDF.

    Every occurrence of 'https://' is removed and each '/' is replaced by '_';
    the result is placed under '../pdfs/' with a '.pdf' extension.
    """
    without_scheme = identifier.replace('https://', '')
    flattened = without_scheme.replace('/', '_')
    return f"../pdfs/{flattened}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF file unless a non-empty local copy already exists.

    Args:
        file_url: URL to fetch the PDF from.
        identifier: document identifier; determines the local file name
            (see id_to_fn).
        timeout: seconds to wait for the server before giving up
            (passed to requests.get).

    Returns:
        Path to the local PDF file.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    path = id_to_fn(identifier)
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # make sure the target directory exists before writing into it
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    response = requests.get(file_url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of saving an error page as a "PDF",
    # which would then be treated as a valid cached download on later runs
    response.raise_for_status()

    # write to a temporary name and rename afterwards, so that an interrupted
    # write never leaves a truncated file that passes the cache check above
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

def extract_text(fn):
    """Extract and return text from the first few pages of the given PDF file.

    At most MAXPAGES + MARGIN pages are examined. A page is accepted only if
    adding its words keeps the running total below TEXT_MAX; otherwise it is
    skipped (and reported). Extraction stops once MAXPAGES pages have been
    accepted or at least TEXT_MIN words have been collected.

    Args:
        fn: path to the PDF file.

    Returns:
        The accepted page texts joined with newlines (possibly an empty string).
    """
    texts = []
    extracted_pages = 0
    extracted_length = 0
    with fitz.open(fn) as pdf:
        # only load the pages we may actually look at, instead of
        # materializing every page of a potentially very long document
        pages_to_check = min(pdf.page_count, MAXPAGES + MARGIN)
        for idx in range(pages_to_check):
            text = pdf.load_page(idx).get_text(sort=True)
            text_length = len(text.strip().split())  # number of whitespace-separated words
            if extracted_length + text_length < TEXT_MAX:
                texts.append(text)
                extracted_length += text_length
                extracted_pages += 1
            else:
                print(f"skipping page {idx+1} of {fn}: text would become too long")
            if extracted_pages >= MAXPAGES or extracted_length >= TEXT_MIN:
                break
    return '\n'.join(texts)

def metadata_to_kvtext(rec):
    """Render a metadata record as simple "key: value" text, one pair per line.

    Fields are emitted in sorted key order, skipping those listed in
    KV_SKIP_FIELDS. A list-valued field produces one line per value, with the
    values in sorted order; any other value produces a single line.
    """
    kv_lines = []
    for key in sorted(rec.keys()):
        if key not in KV_SKIP_FIELDS:
            raw = rec[key]
            # treat a scalar as a one-element list so both cases share one path
            values = raw if isinstance(raw, list) else [raw]
            kv_lines.extend(f"{key}: {item}" for item in sorted(values))
    return "\n".join(kv_lines)

# Convert every metadata file into an LLM dataset file with the same relative
# path under the 'llm-dataset' directory.
for mdfile in sorted(metadata_files):
    # only rewrite the first 'metadata' (the directory component), so that a
    # file name that happens to contain 'metadata' is left intact
    out_path = mdfile.replace('metadata', 'llm-dataset', 1)
    print(f"converting {mdfile} to {out_path}")
    # make sure the output directory exists before opening the file for writing
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    # explicit UTF-8: do not depend on the platform's default locale encoding
    with open(mdfile, encoding="utf-8") as infile, open(out_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline) in the JSONL input
            rec = json.loads(line)
            pdf_path = download(rec["url"], rec["id"])
            pdf_text = extract_text(pdf_path)
            outrec = {"id": rec["id"], "url": rec["url"], "text": pdf_text, "metadata": metadata_to_kvtext(rec)}
            json.dump(outrec, outfile)
            outfile.write("\n")
    print()

converting ../metadata/test/docthes-eng.jsonl to ../llm-dataset/test/docthes-eng.jsonl
skipping page 5 of ../pdfs/osuva.uwasa.fi_handle_10024_12300.pdf: text would become too long

converting ../metadata/test/docthes-fin.jsonl to ../llm-dataset/test/docthes-fin.jsonl

converting ../metadata/test/docthes-swe.jsonl to ../llm-dataset/test/docthes-swe.jsonl

converting ../metadata/test/mono-eng.jsonl to ../llm-dataset/test/mono-eng.jsonl
skipping page 3 of ../pdfs/osuva.uwasa.fi_handle_10024_11734.pdf: text would become too long
skipping page 4 of ../pdfs/osuva.uwasa.fi_handle_10024_11734.pdf: text would become too long
skipping page 5 of ../pdfs/osuva.uwasa.fi_handle_10024_11734.pdf: text would become too long
skipping page 6 of ../pdfs/osuva.uwasa.fi_handle_10024_11734.pdf: text would become too long
skipping page 7 of ../pdfs/osuva.uwasa.fi_handle_10024_11734.pdf: text would become too long
skipping page 2 of ../pdfs/taju.uniarts.fi_handle_10024_7481.pdf: text would become too long
skip