# Download PDFs

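This notebook downloads the PDF files referenced in the metadata files (`../metadata/*.jsonl`) and stores them under `pdfs/` (`../pdfs/` relative to this notebook) so that subsequent processing steps can make use of their content. Files that have already been downloaded are not fetched again.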

In [1]:
import os.path
import glob
import json

import requests

# Paths of the metadata files (*.jsonl) found in ../metadata/, relative to the
# current working directory. glob.glob() gives no ordering guarantee.
metadata_files = glob.glob("../metadata/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ../pdfs/.

    Every occurrence of "https://" is removed and every "/" is turned into
    "_", so the result is a flat file name with a ".pdf" extension.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../pdfs/{flat_name}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF file (unless this was done already) and return its path.

    Args:
        file_url: URL the PDF is fetched from.
        identifier: URI identifier of the document; id_to_fn() turns it into
            the local file path.
        timeout: seconds to wait for the server before giving up; passed to
            requests.get().

    Returns:
        The path of the local PDF file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """
    path = id_to_fn(identifier)
    # A non-empty file is taken as the result of an earlier successful download.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(file_url, timeout=timeout)
    # Fail loudly on 4xx/5xx: otherwise the error page would be saved as a
    # non-empty ".pdf" and be mistaken for a finished download on later runs.
    response.raise_for_status()

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file at the final path.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

# Go through the metadata files in a stable (sorted) order and fetch the PDF
# of every record. Each line of a metadata file is parsed as one JSON object
# from which the "url" and "id" fields are read.
failed_downloads = []
for mdfile in sorted(metadata_files):
    print(f"downloading PDFs in {mdfile}")
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(mdfile, encoding="utf-8") as infile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            rec = json.loads(line)
            try:
                pdf_path = download(rec["url"], rec["id"])
            except requests.RequestException as err:
                # A network problem with one document should not abort the
                # whole batch; report it and carry on with the next record.
                failed_downloads.append(rec["url"])
                print(f"FAILED to download {rec['url']}: {err}")
                continue
            print(pdf_path)

if failed_downloads:
    print(f"{len(failed_downloads)} download(s) failed; re-run this cell to retry them")

downloading PDFs in ../metadata/docthes-eng-test.jsonl
../pdfs/www.utupub.fi_handle_10024_151200.pdf
../pdfs/www.doria.fi_handle_10024_185863.pdf
../pdfs/trepo.tuni.fi_handle_10024_122849.pdf
../pdfs/www.doria.fi_handle_10024_179962.pdf
../pdfs/www.doria.fi_handle_10024_177125.pdf
../pdfs/www.utupub.fi_handle_10024_152821.pdf
../pdfs/trepo.tuni.fi_handle_10024_122350.pdf
../pdfs/www.utupub.fi_handle_10024_152697.pdf
../pdfs/lutpub.lut.fi_handle_10024_163633.pdf
../pdfs/www.doria.fi_handle_10024_178000.pdf
../pdfs/trepo.tuni.fi_handle_10024_140540.pdf
../pdfs/www.utupub.fi_handle_10024_149427.pdf
../pdfs/trepo.tuni.fi_handle_10024_125025.pdf
../pdfs/www.utupub.fi_handle_10024_148953.pdf
../pdfs/osuva.uwasa.fi_handle_10024_12300.pdf
downloading PDFs in ../metadata/docthes-eng-train.jsonl
../pdfs/osuva.uwasa.fi_handle_10024_13675.pdf
../pdfs/lutpub.lut.fi_handle_10024_164752.pdf
../pdfs/trepo.tuni.fi_handle_10024_130697.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11396.pdf
../pdfs/lutpub.lut.