# Download Bo20 and Bo767

In [None]:
import os
import shutil
from math import floor

import pickle

import requests
import zipfile
import io

In [None]:
def get_zip_range(zip_id):
    """Return the server sub-directory name covering ``zip_id``.

    The directory spans the block of 1000 consecutive IDs that contains
    ``zip_id``; both bounds are zero-padded to at least four digits and the
    result carries a trailing slash, e.g. ``"1016"`` -> ``"1000-1999/"``.
    """
    range_start = (int(zip_id) // 1000) * 1000
    range_end = range_start + 999
    return f"{range_start:04d}-{range_end:04d}/"

def download_pdfs(download_path, pdf_ids):
    """Download the requested PDFs from Digital Corpora into ``download_path``.

    PDFs that already exist in ``download_path`` are skipped. The remaining
    IDs are grouped by the zip archive that contains them (named after the
    first four characters of the ID), each archive is downloaded once, and
    only the requested members are extracted.

    Args:
        download_path: Directory that receives ``<pdf_id>.pdf`` files. It is
            created if necessary when something has to be downloaded.
        pdf_ids: Iterable of PDF ID strings. Duplicates are ignored.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        FileNotFoundError: If a requested PDF is not present in its archive.
    """
    dc_base_url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/"
    temp_dir = os.path.join(download_path, "temp")

    # Group the missing PDFs by archive. dict.fromkeys drops duplicate IDs
    # (keeping order) so the same file is never moved twice.
    to_download = {}
    for pdf_id in dict.fromkeys(pdf_ids):
        pdf_path = os.path.join(download_path, pdf_id + ".pdf")
        if not os.path.exists(pdf_path):
            to_download.setdefault(pdf_id[:4], []).append(pdf_id)

    for zip_id, sub_ids in to_download.items():
        print(f"Downloading: {zip_id}.zip. Note: this is a large zipfile so it may take a while")
        # Plain concatenation rather than os.path.join: URLs always use "/",
        # while os.path.join would insert "\\" on Windows.
        full_url = f"{dc_base_url}{get_zip_range(zip_id)}{zip_id}.zip"

        r = requests.get(full_url, timeout=60)
        # Fail with a clear HTTP error instead of an opaque BadZipFile later.
        r.raise_for_status()

        os.makedirs(temp_dir, exist_ok=True)
        try:
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                # Extract only the requested members instead of the whole archive.
                for pdf_id in sub_ids:
                    member = pdf_id + ".pdf"
                    try:
                        z.extract(member, temp_dir)
                    except KeyError as err:
                        raise FileNotFoundError(
                            f"{member} not found in {zip_id}.zip"
                        ) from err
                    # Stage in temp_dir, then move into place, so a partially
                    # written file never appears under its final name.
                    os.replace(
                        os.path.join(temp_dir, member),
                        os.path.join(download_path, member),
                    )
        finally:
            # Always remove the staging directory, even when extraction fails.
            shutil.rmtree(temp_dir, ignore_errors=True)

## Bo20

In [None]:
# Destination directory for the 20 PDFs listed below.
bo20_path = "../data/bo20/"

# PDF IDs to fetch. download_pdfs uses the first four characters of each ID
# to pick the zip archive to download, so IDs sharing a prefix are served by
# a single archive download.
bo20_ids = [
    "1016445",
    "1177640",
    "1479052",
    "1690009",
    "2132230",
    "1037700",
    "1238975",
    "1598224",
    "2049749",
    "2151932",
    "1043219",
    "1375277",
    "1620834",
    "2062555",
    "2189929",
    "1061225",
    "1422550",
    "1666072",
    "2127440",
    "2399488",
]

# Fetch only the PDFs that are not already present in bo20_path.
download_pdfs(bo20_path, bo20_ids)

## Bo767

In [None]:
# Destination directory for the PDFs listed in bo767_ids.txt.
bo767_path = "../data/bo767/"

# The ID file holds one PDF ID per line. Strip surrounding whitespace rather
# than slicing off the last character: slicing truncates the final ID when the
# file has no trailing newline and leaves "\r" behind on CRLF files. Blank
# lines are skipped so they do not become empty IDs.
with open('bo767_ids.txt', 'r') as file:
    bo767_ids = [line.strip() for line in file if line.strip()]

# Fetch only the PDFs that are not already present in bo767_path.
download_pdfs(bo767_path, bo767_ids)