# Download PDFs

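This notebook downloads all the PDF files referenced in the metadata files (`metadata/*.jsonl`) and stores them under `pdfs/` so that subsequent processing steps can make use of their content. Files that have already been downloaded are skipped, so the notebook can safely be re-run.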

In [1]:
import os.path
import glob
import json

import requests

# Collect the paths of all JSONL metadata files; the pattern is relative to
# the current working directory, and glob returns them in arbitrary order.
metadata_files = glob.glob("../metadata/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ``../pdfs/``.

    Every ``https://`` occurrence is removed and each ``/`` is turned into
    ``_`` so that the identifier becomes a single flat file name, to which
    the ``.pdf`` extension is appended.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.pdf"

def download(file_url, identifier):
    """Download a PDF file unless a non-empty copy already exists.

    Args:
        file_url: URL the PDF is fetched from.
        identifier: URI identifier of the document; id_to_fn() converts it
            into the path the PDF is stored at.

    Returns:
        The path to the PDF file.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    path = id_to_fn(identifier)
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # Make sure the target directory is there on a fresh checkout.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Without a timeout an unresponsive server would block the run forever.
    response = requests.get(file_url, timeout=60)
    # Fail loudly instead of saving an HTML error page as a ".pdf", which the
    # existence check above would then treat as a finished download.
    response.raise_for_status()

    # Write to a temporary name and rename afterwards, so that an interrupted
    # write never leaves a truncated file at the final path.
    part_path = path + ".part"
    with open(part_path, "wb") as f:
        f.write(response.content)
    os.replace(part_path, path)
    print(f"wrote {file_url} as {path}")
    return path

# Process the metadata files in sorted order so that runs are reproducible;
# every non-blank line of a file is one JSON record with "url" and "id" keys.
for mdfile in sorted(metadata_files):
    print(f"downloading PDFs in {mdfile}")
    # JSON text is UTF-8; don't depend on the platform's default encoding.
    with open(mdfile, encoding="utf-8") as infile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing one) in the JSONL file.
                continue
            rec = json.loads(line)
            pdf_path = download(rec["url"], rec["id"])
            print(pdf_path)

downloading PDFs in ../metadata/article-eng-test.jsonl
../pdfs/www.doria.fi_handle_10024_182782.pdf
../pdfs/taju.uniarts.fi_handle_10024_6005.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11652.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11231.pdf
../pdfs/www.theseus.fi_handle_10024_780577.pdf
../pdfs/www.theseus.fi_handle_10024_702838.pdf
../pdfs/www.theseus.fi_handle_10024_333694.pdf
../pdfs/www.theseus.fi_handle_10024_267792.pdf
../pdfs/www.theseus.fi_handle_10024_343420.pdf
../pdfs/taju.uniarts.fi_handle_10024_7696.pdf
../pdfs/www.theseus.fi_handle_10024_267924.pdf
../pdfs/www.theseus.fi_handle_10024_333403.pdf
../pdfs/www.theseus.fi_handle_10024_504441.pdf
../pdfs/www.theseus.fi_handle_10024_355895.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11397.pdf
downloading PDFs in ../metadata/article-eng-train.jsonl
wrote https://www.doria.fi/bitstream/handle/10024/185486/salafi 112804-Article Text-238911-1-10-20220611.pdf as ../pdfs/www.doria.fi_handle_10024_185486.pdf
../pdfs/www.doria.fi_handle_10024_