In [None]:
import os
import shutil
from math import floor

import requests
import zipfile
import io

from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.primitives import JobSpec
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.primitives.tasks import SplitTask
from nv_ingest_client.util.file_processing.extract import extract_file_content
import logging, time

# Download the pdfs if they aren't already present
# Local directory that the downloaded PDFs are stored in.
bo_20_path = "./data/bo20/"

# Ids of the 20 PDFs to fetch. The first four digits of each id name the
# remote zip archive that the PDF is pulled from.
pdf_ids = [
    "1016445",
    "1177640",
    "1479052",
    "1690009",
    "2132230",
    "1037700",
    "1238975",
    "1598224",
    "2049749",
    "2151932",
    "1043219",
    "1375277",
    "1620834",
    "2062555",
    "2189929",
    "1061225",
    "1422550",
    "1666072",
    "2127440",
    "2399488",
]

# exist_ok=True creates the directory only when it is missing, without the
# race window of a separate os.path.exists() check followed by makedirs().
os.makedirs(bo_20_path, exist_ok=True)

def get_zip_range(zip_id):
    """Return the ``"LOWER-UPPER/"`` path segment for the bucket holding ``zip_id``.

    Zip ids are grouped into buckets of 1000: ``LOWER`` is ``zip_id`` rounded
    down to a multiple of 1000 and ``UPPER`` is ``LOWER + 999``. Both bounds
    are zero-padded to at least four digits.

    Args:
        zip_id: The zip archive id, as a string of digits (anything ``int()``
            accepts works).

    Returns:
        The bucket segment with a trailing slash, e.g. ``"1000-1999/"`` for
        ``"1016"``.

    Raises:
        ValueError: If ``zip_id`` cannot be parsed as an integer.
    """
    # Integer floor division keeps the arithmetic exact; going through float
    # division and math.floor is unnecessary and loses precision for very
    # large ids.
    lower = int(zip_id) // 1000 * 1000
    upper = lower + 999
    return f"{lower:04d}-{upper:04d}/"


# Fetch every PDF that is not already on disk. Each PDF is taken from a remote
# zip archive, which is downloaded in full and unpacked into a scratch
# directory; only the wanted PDF is kept.
for pdf_id in pdf_ids:
    pdf_path = os.path.join(bo_20_path, pdf_id + ".pdf")
    if os.path.exists(pdf_path):
        # Already downloaded on a previous run; nothing to do.
        continue

    print(f"Downloading pdf: {pdf_id}.pdf. Note: this requires downloading a large zipfile so it may take a while")
    base_url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/"
    # The first four digits of the pdf id name the zip archive; archives are
    # grouped on the server into "LOWER-UPPER/" range folders.
    zip_id = pdf_id[:4]
    zip_range = get_zip_range(zip_id)
    full_url = base_url + zip_range + zip_id + ".zip"

    temp_dir = os.path.join(bo_20_path, "temp")
    os.makedirs(temp_dir, exist_ok=True)
    try:
        # (connect, read) timeouts: the read timeout bounds the wait between
        # received chunks, not the whole download, so a large archive is fine
        # while a stalled connection no longer hangs forever.
        r = requests.get(full_url, timeout=(10, 300))
        # Fail with a clear HTTP error instead of handing an error page to
        # ZipFile, which would surface as a confusing BadZipFile.
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(temp_dir)

        # Move desired file to bo_20 folder
        os.rename(os.path.join(temp_dir, pdf_id + ".pdf"), pdf_path)
    finally:
        # Delete excess pdfs; in a finally block so the scratch directory is
        # removed even when the download, extraction or move fails.
        shutil.rmtree(temp_dir, ignore_errors=True)