# Download PDFs for NorRep

This notebook downloads every PDF file listed in the dataset metadata file (`dataset.yml`) and stores it under `../pdfs/`, so that subsequent processing steps can make use of its content. Files that have already been downloaded are skipped.

In [6]:
import os.path
import yaml

import requests

# Path to the dataset metadata file (YAML). It is parsed further down in this
# cell, and each record's "url_file" / "url_info" entries drive the downloads.
mdfile = "../../swib_dataset/dataset.yml"

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ``../pdfs/``.

    Every ``https://`` occurrence is dropped, then each ``/``, ``:``, ``?``
    and ``=`` is turned into an underscore, and ``.pdf`` is appended.
    """
    # One translation table instead of a chain of single-character replaces.
    to_underscore = str.maketrans({ch: '_' for ch in '/:?='})
    stem = identifier.replace('https://', '').translate(to_underscore)
    return '../pdfs/' + stem + ".pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF file unless a non-empty copy is already on disk.

    Parameters:
        file_url: URL the PDF is fetched from.
        identifier: URI used to derive the local filename (via ``id_to_fn``);
            when empty or None, ``file_url`` is used instead.
        timeout: seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the notebook indefinitely.

    Returns:
        The path of the PDF file on disk.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: for connection problems and timeouts.
    """
    if not identifier:
        identifier = file_url
    path = id_to_fn(identifier)
    # An empty file is treated as a failed earlier attempt and fetched again.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    response = requests.get(file_url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a .pdf name,
    # which would then be picked up as a valid cached file on later runs.
    response.raise_for_status()

    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file that passes the cache check above.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)

    print(f"wrote {file_url} as {path}")
    return path

print(f"downloading PDFs in {mdfile}")

# Read the metadata with an explicit UTF-8 encoding (the encoding YAML files
# normally use) rather than the platform's locale default, and close the file
# before the downloads start.
with open(mdfile, encoding="utf-8") as infile:
    data = yaml.safe_load(infile)

# An empty YAML document loads as None; treat it as "nothing to download".
for rec in (data or {}).values():
    # download() falls back to the file URL when no identifier is given, so a
    # record without "url_info" is tolerated here instead of raising KeyError.
    pdf_path = download(rec["url_file"], rec.get("url_info"))
    print(pdf_path)

downloading PDFs in ../../swib_dataset/dataset.yml
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_3154107.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_3057213.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2994684.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2979994.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2753374.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2650257.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2444538.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2423615.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2453860.pdf
../pdfs/nibio.brage.unit.no_nibio-xmlui_handle_11250_2445634.pdf
../pdfs/www.ssb.no_nasjonalregnskap-og-konjunkturer_finansregnskap_artikler_husholdningenes-finansregnskap-for-2023.pdf
../pdfs/www.ssb.no_nasjonalregnskap-og-konjunkturer_nasjonalregnskap_artikler_ressursrenten-i-naturressursnaeringene-i-norge-19842022.pdf
../pdfs/www.ssb.no_inntekt-og-forbruk_inn