# Download PDFs and convert to LLM dataset

This notebook will download the PDF files and convert them to a data set suitable for fine-tuning and evaluating LLMs. Each record will be converted to two fields, "text" and "metadata", where "text" contains the text extracted from the first few pages of the PDF file and "metadata" is a string that represents the metadata of the document in a very simple textual key/value format. In addition, the "id" and "url" fields (containing the document ID/URL and the PDF URL, respectively) will be retained in the new record.

In [14]:
import os.path
import glob
import json
import re

import fitz
import requests

MAXPAGES = 5  # how many pages of text to extract (maximum)
MARGIN = 2  # how many more pages to look at, in case we can't find text from the first ones
TEXT_MIN = 500  # how many words to aim for (minimum)
TEXT_MAX = 700  # upper limit on # of words

KV_SKIP_FIELDS = {'id', 'url'}  # fields to not include in key-value metadata

METADATA_FIELD_NAMES = {
    "dc.contributor": "Contributor",
    "dc.contributor.author": "Author",
    "dc.contributor.degreeSupervisor": "Supervisor",
    "dc.contributor.department": "Department",
    "dc.contributor.editor": "Editor",
    "dc.contributor.faculty": "Faculty",
    "dc.contributor.opponent": "Opponent",
    "dc.contributor.organization": "Organization",
    "dc.contributor.orgunit": "Org. unit",
    "dc.contributor.reviewer": "Reviewer",
    "dc.contributor.studysubject": "Study subject",
    "dc.contributor.supervisor": "Supervisor",
    "dc.date.issued": "Issued",
    "dc.format.content": None,  # either "fulltext" or "compositeOnly"
    "dc.format.extent": None,  # number of pages - hard to extract
    "dc.format.pagerange": "Page range",
    "dc.identifier.isbn": "Electronic ISBN",
    "dc.identifier.issn": "ISSN",
    "dc.identifier.urn": "URN",
    "dc.language.iso": "Language",
    "dc.publisher": "Publisher",
    "dc.relation.contractor": "Contractor",
    "dc.relation.doi": "DOI",
    "dc.relation.eissn": "eISSN",
    "dc.relation.isbn": "Printed ISBN",
    "dc.relation.ispartofjournal": "Journal name",
    "dc.relation.ispartofseries": "Series name",
    "dc.relation.issn": "Related ISSN",
    "dc.relation.issue": "Issue",
    "dc.relation.isversionof": None,  # very rare
    "dc.relation.numberinseries": "Number in series",
    "dc.relation.url": None,  # rare + usually just same as "url" field
    "dc.relation.volume": "Volume",
    "dc.series.sortingnumber": None,  # rare; should use dc.relation.numberinseries instead
    "dc.series.year": "Series year",
    "dc.source.identifier": None,  # rare + hard to extract
    "dc.subject.degreeprogram": "Degree program",
    "dc.subject.discipline": "Discipline",
    "dc.title": "Title",
    "dc.title.alternative": "Alternative title",
    "dc.type.coar": "COAR type",
    "dc.type.okm": "OKM type",
    "dc.type.ontasot": "Thesis level",    
}

LANG_MAP = {
    'fin': 'fi',
    'swe': 'sv',
    'eng': 'en'
}

LANG_AWARE_FIELDS = (
    'dc.contributor.department',
    'dc.contributor.faculty',
    'dc.contributor.organization',
    'dc.publisher',
    'dc.subject.degreeprogram',
    'dc.subject.discipline',
    'dc.title.alternative',
    'dc.type.coar',
    'dc.type.okm',
    'dc.type.ontasot',
)

metadata_files = glob.glob("../metadata/*/*.jsonl")

def id_to_fn(identifier):
    """convert a URI identifier to a simpler string we can use as a filename for the PDF"""
    return '../pdfs/' + identifier.replace('https://', '').replace('/','_') + ".pdf"

def download(file_url, identifier):
    """download a PDF file, with the given identifier, from the given URL (unless this was done already)
    and return a path to the PDF file"""
    path = id_to_fn(identifier)
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    response = requests.get(file_url)
    with open(path, "wb") as f:
        f.write(response.content)
        print(f"wrote {file_url} as {path}")
    return path

def extract_text(fn):
    """extract and return the first few pages of text from the given PDF file"""
    with fitz.open(fn) as pdf:
        texts = []
        extracted_pages = 0
        extracted_length = 0
        for idx, page in enumerate(list(pdf)[:MAXPAGES + MARGIN]):
            text = page.get_text(sort=True)
            text_length = len(text.strip().split())        
            if extracted_length + text_length < TEXT_MAX:
                texts.append(text)
                extracted_length += text_length
                extracted_pages += 1
            else:
                print(f"skipping page {idx+1} of {fn}: text would become too long")
            if extracted_pages >= MAXPAGES or extracted_length >= TEXT_MIN:
                break
    return '\n'.join(texts)

def choose_value_by_lang(lang, vals):
    lang_vals = {}
    fallback_val = vals[0].split(' {')[0]
    for val in vals:
        m = re.match(r"(.*) {(\w\w)}", val)
        if m:
            lang_vals[m.group(2)] = m.group(1)
        else:
            fallback_val = val
    if lang in lang_vals:
        return lang_vals[lang]
    return fallback_val

def metadata_to_kvtext(rec):
    langcode = LANG_MAP[rec["dc.language.iso"]]
    lines = []
    for fld in sorted(rec.keys()):
        if fld in KV_SKIP_FIELDS:
            continue
        fldname = METADATA_FIELD_NAMES[fld]
        if not fldname:
            continue
        vals = rec[fld]
        if not isinstance(vals, list):
            vals = [vals]
        if fld in LANG_AWARE_FIELDS:
            vals = [choose_value_by_lang(langcode, vals)]
        for val in sorted(vals):
            lines.append(f"{fldname}: {val}")
    return "\n".join(lines)

for mdfile in sorted(metadata_files):
    out_path = mdfile.replace('metadata', 'llm-dataset')
    print(f"converting {mdfile} to {out_path}")
    with open(mdfile) as infile, open(out_path, "w") as outfile:
        for line in infile:
            rec = json.loads(line)
            pdf_path = download(rec["url"], rec["id"])
            pdf_text = extract_text(pdf_path)
            metadata = metadata_to_kvtext(rec)
            print(metadata)
            outrec = {"id": rec["id"], "url": rec["url"], "text": pdf_text, "metadata": metadata}
            json.dump(outrec, outfile)
            outfile.write("\n")
            break
    print()



converting ../metadata/test/docthes-eng.jsonl to ../llm-dataset/test/docthes-eng.jsonl
Author: Lähdemäki, Sakari
Supervisor: Professor (emeritus) Matti Viren, University of Turku
Supervisor: Professor Janne Tukiainen, University of Turku
Department: Department of Economics
Faculty: Turku School of Economics
Opponent: Professor Mika Maliranta, University of Jyväskylä
Organization: University of Turku
Reviewer: Professor (emeritus) Pekka Ilmakunnas, Aalto University School of Business
Reviewer: Professor Mika Maliranta, University of Jyväskylä
Issued: 2021-03-05
ISBN: 978-951-29-8349-0
URN: URN:ISBN:978-951-29-8349-0
Language: eng
Publisher: University of Turku, Turku School of Economics
Related ISBN: 978-951-29-8348-3
Series name: Turun yliopiston julkaisuja - Annales Universitatis Turkuensis, Ser E: Oeconomica
Related ISSN: 2343-3167
Number in series: 74
Degree program: Doctoral Programme of Turku School of Economics
Discipline: Economics
Title: Essays on economic productivity
Alternat

skipping page 5 of ../pdfs/osuva.uwasa.fi_handle_10024_13675.pdf: text would become too long
Author: Li, Fang Fang
Faculty: School of Marketing and Communication
Organization: University of Vaasa
Reviewer: Karjaluoto, Heikki
Reviewer: Nijssen, Edwin
Issued: 2022-04-28
ISBN: 978-952-395-020-7
URN: URN:ISBN:978-952-395-020-7
Language: eng
Publisher: Vaasan yliopisto
Related ISBN: 978-952-395-019-1
Series name: Acta Wasaensia
Related ISSN: 0355-2667
Related ISSN: 2323-9123
Number in series: 485
Discipline: International Business
Title: The integration of social media and marketing strategy : tactics, strategy, and culture
Alternative title: Sosiaalisen median ja markkinointistrategian integrointi : taktiikat, strategia ja kulttuuri
COAR type: väitöskirja
OKM type: G5 Doctoral dissertation (article)
Thesis type: Doctoral dissertation (article-based)

converting ../metadata/train/docthes-fin.jsonl to ../llm-dataset/train/docthes-fin.jsonl
Author: Syväsalmi, Emilia
Supervisor: Apulaisprofess