# Download PDFs

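This notebook downloads every PDF file listed in the metadata files (`../metadata/*.jsonl`) and stores it under the `pdfs/` directory (`../pdfs/` relative to this notebook), so that subsequent processing steps can make use of its content. Files that have already been downloaded are not fetched again.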

In [1]:
import os.path
import glob
import json

import requests

# paths of all JSON Lines metadata files, resolved relative to the current working directory
metadata_files = glob.glob("../metadata/*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ../pdfs/.

    Every occurrence of 'https://' is dropped from the identifier, slashes
    are turned into underscores so the result is a single file name, and
    '.pdf' is appended.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = without_scheme.replace('/', '_')
    return f"../pdfs/{flat_name}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF file from the given URL, unless this was done already.

    The target path is derived from the identifier with id_to_fn(). A
    non-empty file at that path counts as already downloaded and is not
    fetched again.

    Args:
        file_url: URL to fetch the PDF from.
        identifier: URI identifier of the document; determines the file name.
        timeout: seconds to wait for the server to respond before giving up
            (passed on to requests.get()).

    Returns:
        Path to the PDF file.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    path = id_to_fn(identifier)
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # without a timeout a stalled server would hang the whole run
    response = requests.get(file_url, timeout=timeout)
    # never store an error page under a .pdf name: it would be non-empty and
    # therefore be mistaken for a finished download on every later run
    response.raise_for_status()

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # write to a temporary name and rename afterwards, so that an interrupted
    # write cannot leave a truncated file at the final path
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

# Walk through the metadata files in a stable (sorted) order and fetch the PDF
# of every record; each record is a JSON object with "url" and "id" keys.
for mdfile in sorted(metadata_files):
    print(f"downloading PDFs in {mdfile}")
    # read as UTF-8 explicitly; the platform default encoding may differ
    with open(mdfile, encoding="utf-8") as infile:
        for line in infile:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            rec = json.loads(line)
            try:
                pdf_path = download(rec["url"], rec["id"])
            except requests.RequestException as err:
                # one unreachable document should not abort the whole batch
                print(f"failed to download {rec['url']}: {err}")
                continue
            print(pdf_path)

downloading PDFs in ../metadata/2025a-en-test.jsonl
../pdfs/www.sitra.fi_wp_wp-content_uploads_2018_03_sitra-annual-report-and-financial-statements-2017.pdf.pdf
../pdfs/kavi.fi_sites_default_files_documents_mil_in_finland.pdf.pdf
../pdfs/d1vzi28wh99zvq.cloudfront.net_pdf_previews_380160-sample.pdf.pdf
../pdfs/www.hexgen.fi_basicrules.pdf.pdf
../pdfs/www.doria.fi_handle_10024_186609.pdf
../pdfs/static.espoo.fi_cdn_ff_19AUFSZOpUg8cWy_i0ryh-fYP43Wh69-v89_u0KxTpA_1632739947_public_2021-09_Englanti.pdf.pdf
../pdfs/www.sitra.fi_wp_wp-content_uploads_2022_04_sitra-annual-report-and-financial-statements-2021.pdf.pdf
../pdfs/www.traficom.fi_sites_default_files_media_publication_Airport%20Charges%20annual%20progress%20report.pdf.pdf
../pdfs/julkaisut.metsa.fi_wp-content_uploads_sites_2_2023_11_beetleslife_laymansreporteng.pdf.pdf
../pdfs/www.asub.ax_sites_default_files_media_document_ALSIFF24ENG.pdf.pdf
../pdfs/efi.int_sites_default_files_files_publication-bank_2023_Recommendations%20on%20Wildfi