In [None]:
# default_exp core

# hmd_newspaper_dl
> Download Heritage Made Digital (HMD) newspapers from the British Library (BL) repository

In [None]:
#export
import concurrent
import itertools
import json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import random
import sys
import time
from collections import namedtuple
from functools import lru_cache
from operator import itemgetter
#from os import umask
import os
from pathlib import Path
from typing import List, Optional, Union

import requests
from bs4 import BeautifulSoup
from fastcore.script import *
from fastcore.test import *
from fastcore.net import urlvalid
from loguru import logger
from nbdev.showdoc import *
from tqdm import tqdm

In [None]:
#export
def _get_link(x: str):
    end = x.split('/')[-1]
    return "https://bl.iro.bl.uk/concern/datasets/" + end

In [None]:
#export
@lru_cache(256)
def get_newspaper_links():
    """Scrape the HMD collection listing and return `(title, dataset_url)` tuples.

    Pages 1 and 2 of the collection listing are requested. Every
    `<p class="media-heading">` element containing an anchor with a non-empty
    `href` contributes one tuple.

    Returns:
        list of `(link text, dataset page url)` tuples in page order. The
        result is memoised by `lru_cache`, so repeated calls return the same
        list object; callers should not mutate it.

    Raises:
        requests.exceptions.HTTPError: if a listing page answers with an error status.
        requests.exceptions.RequestException: on connection problems or timeout.
    """
    page_urls = [f"https://bl.iro.bl.uk/collections/353c908d-b495-4413-b047-87236d2573e3?locale=en&page={page}" for page in range(1, 3)]
    link_tuples = []
    for page_url in page_urls:
        # A timeout keeps a stalled server from hanging the scrape indefinitely.
        r = requests.get(page_url, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        for heading in soup.find_all("p", class_="media-heading"):
            anchor = heading.find("a")
            # Headings without a usable link have no dataset page to follow.
            href = anchor.get('href') if anchor is not None else None
            if href:
                link_tuples.append((anchor.text, _get_link(href)))
    # Return only once every page has been visited; returning from inside the
    # loop would stop after the first page.
    return link_tuples

In [None]:
# Fetch the (title, dataset url) tuples from the collection listing (network call) and display them.
links = get_newspaper_links()
links

[('The Express',
  'https://bl.iro.bl.uk/concern/datasets/93ec8ab4-3348-409c-bf6d-a9537156f654?locale=en'),
 ('The Press.',
  'https://bl.iro.bl.uk/concern/datasets/2f70fbcd-9530-496a-903f-dfa4e7b20d3b?locale=en'),
 ('The Star',
  'https://bl.iro.bl.uk/concern/datasets/dd9873cf-cba1-4160-b1f9-ccdab8eb6312?locale=en'),
 ('National Register.',
  'https://bl.iro.bl.uk/concern/datasets/f3ecea7f-7efa-4191-94ab-e4523384c182?locale=en'),
 ('The Statesman',
  'https://bl.iro.bl.uk/concern/datasets/551cdd7b-580d-472d-8efb-b7f05cf64a11?locale=en'),
 ('The British Press; or, Morning Literary Advertiser',
  'https://bl.iro.bl.uk/concern/datasets/aef16a3c-53b6-4203-ac08-d102cb54f8fa?locale=en'),
 ('The Sun',
  'https://bl.iro.bl.uk/concern/datasets/b9a877b8-db7a-4e5f-afe6-28dc7d3ec988?locale=en'),
 ('The Liverpool Standard etc',
  'https://bl.iro.bl.uk/concern/datasets/fb5e24e3-0ac9-4180-a1f4-268fc7d019c1?locale=en'),
 ('Colored News',
  'https://bl.iro.bl.uk/concern/datasets/bacd53d6-86b7-4f8a-af3

In [None]:
assert len(links[0]) == 2 #test tuple len
# Check every second item, not just one arbitrary member of the set of results.
assert all(map(urlvalid, map(itemgetter(1), links))) #check second item valid url

In [None]:
# The listing is live data, so check a lower bound rather than an exact title count.
assert len(links) >= 10
assert isinstance(links[0], tuple)
assert links[-1][1].startswith("https://")

In [None]:
#export
@lru_cache(256)
def get_download_urls(url: str) -> list:
    """Given a dataset page on the IRO repo return all download links for that page.

    Every `<a id="file_download">` element with an `href` is turned into an
    absolute `https://bl.iro.bl.uk...` URL.

    Args:
        url: address of the dataset page.

    Returns:
        De-duplicated list of download URLs in the order they appear on the
        page; an empty list when `url` has no scheme. Results are memoised by
        `lru_cache`.

    Raises:
        requests.exceptions.RequestException: for request failures other than
            a missing URL scheme (connection error, timeout, ...).
    """
    try:
        r = requests.get(url, timeout=30)
    except requests.exceptions.MissingSchema as E:
        # No response exists to parse, so report the problem and stop here
        # instead of falling through to an unbound `r`.
        logger.error(E)
        return []

    soup = BeautifulSoup(r.text, "lxml")
    link_ends = soup.find_all('a', id='file_download')
    urls = ["https://bl.iro.bl.uk" + link['href'] for link in link_ends if link.get('href')]
    # dict.fromkeys removes duplicates while keeping page order, so the result is deterministic.
    return list(dict.fromkeys(urls))

In [None]:
# Example: list the download links found on one dataset page (network call).
get_download_urls("https://bl.iro.bl.uk/concern/datasets/93ec8ab4-3348-409c-bf6d-a9537156f654")

['https://bl.iro.bl.uk/downloads/50ebdb11-9186-4c24-90e5-27caf73d3f11?locale=en',
 'https://bl.iro.bl.uk/downloads/5072df1a-75f3-4379-961a-59ac3566bc2f?locale=en',
 'https://bl.iro.bl.uk/downloads/3fd6b687-feb0-4d92-b8d7-4ea0acc5346c?locale=en',
 'https://bl.iro.bl.uk/downloads/9c4f2fd6-d58c-4a57-8fac-a5dd273f8ed3?locale=en',
 'https://bl.iro.bl.uk/downloads/17b6e110-8ed0-46cb-8030-6cc7f387ade5?locale=en',
 'https://bl.iro.bl.uk/downloads/e89ca9c4-b101-44bf-b1de-15052eb63d5e?locale=en',
 'https://bl.iro.bl.uk/downloads/54d974ba-fcb2-4566-a5ac-b66d85954963?locale=en',
 'https://bl.iro.bl.uk/downloads/b40aabab-b366-4148-975e-4481d30ba182?locale=en',
 'https://bl.iro.bl.uk/downloads/319d5656-94b0-4cbf-8f0d-d3ce0aa3ab40?locale=en',
 'https://bl.iro.bl.uk/downloads/0fd85a65-bfa3-4db8-8b92-7fc305cab4d4?locale=en',
 'https://bl.iro.bl.uk/downloads/ebd5d9eb-e0ec-40b0-ae10-132cdfbaa4e1?locale=en',
 'https://bl.iro.bl.uk/downloads/0ea7aa1f-3b4f-4972-bc12-b7559769471f?locale=en',
 'https://bl.iro

In [None]:
#export
def create_session() -> requests.sessions.Session:
    """Build a `requests` session whose http/https adapters retry failed requests."""
    session = requests.Session()
    # A single adapter, configured with Retry(total=60), is shared by both schemes.
    retrying_adapter = HTTPAdapter(max_retries=Retry(total=60))
    for scheme in ("https://", "http://"):
        session.mount(scheme, retrying_adapter)
    return session

In [None]:
#export
def _download(url: str, dir: Union[str, Path]):
    """Stream the file at `url` into `dir` and return the name it was saved under.

    The filename is the quoted value in the response's `Content-Disposition`
    header. Only its final path component is used, so a header value such as
    `"../x.zip"` cannot place the file outside `dir`.

    Args:
        url: download link to fetch.
        dir: existing directory the file is written into.

    Returns:
        The saved filename, or `None` when the request failed, the header held
        no usable filename, or the transfer broke off part-way (the partial
        file is removed in that case).

    Raises:
        OSError: if the target file cannot be opened or written, for example
            when `dir` does not exist.
    """
    fname = None
    target = None
    try:
        # Context managers release the session and the streamed connection.
        with create_session() as s, s.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            # Expected header shape: attachment; filename="name.zip"
            quoted = r.headers.get("Content-Disposition", "").split('"')
            fname = Path(quoted[1]).name if len(quoted) > 1 else ""
            if fname in ("", ".."):
                logger.warning(f"No usable filename in Content-Disposition header for {url}")
                return None
            target = Path(dir) / fname
            with open(target, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.exceptions.RequestException as request_exception:
        logger.error(request_exception)
        # A failure after the file was opened leaves a truncated archive
        # behind; drop it and report the download as failed.
        if target is not None and target.exists():
            target.unlink()
        return None
    return fname

In [None]:
#slow
# Download one known archive into a fresh scratch directory; the cell output is the saved filename.
test_dir = Path("test_dir")
test_dir.mkdir()
test_url = "https://bl.iro.bl.uk/downloads/0ea7aa1f-3b4f-4972-bc12-b7559769471f?locale=en"
_download(test_url, test_dir)

'BLNewspapers_0002642_TheExpress_1848_f1c4cb8d-6bd5-401f-831f-a19199d47c0a.zip'

In [None]:
#slow
# Exactly one .zip file should have been written by the previous cell.
downloaded = list(test_dir.iterdir())
assert downloaded[0].suffix == ".zip"
assert len(downloaded) == 1
# tidy up
for leftover in downloaded:
    leftover.unlink()
test_dir.rmdir()

In [None]:
# Basic test: a bad url must be logged by `_download`, not raise an unhandled exception.
bad_link = "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1-3b4f-4972-bc12-b75597694f"
_download(bad_link, "test_dir")

2021-10-11 13:09:32.140 | ERROR    | __main__:_download:16 - HTTPSConnectionPool(host='bl.oar.bl.uk', port=443): Max retries exceeded with url: /fail_uploads/download_file?fileset_id=0ea7aa1-3b4f-4972-bc12-b75597694f (Caused by SSLError(SSLCertVerificationError("hostname 'bl.oar.bl.uk' doesn't match either of '*.oar.notch8.cloud', 'oar.notch8.cloud'")))


In [None]:
#export
def download_from_urls(urls: List[str], save_dir: Union[str, Path], n_threads: int = 8):
    """Download every url in `urls` into `save_dir` using a pool of worker threads.

    While downloads run, log records are routed through `tqdm.write` so they
    do not break the progress bar. Afterwards that sink is removed and a
    `sys.stderr` sink is installed so later log calls remain visible.
    NOTE: handlers the caller configured beforehand are removed and not restored.

    Args:
        urls: download links, each handed to `_download`.
        save_dir: target directory; created (with parents) if missing.
        n_threads: maximum number of worker threads.

    Returns:
        Number of urls for which `_download` returned a filename.
    """
    # Import the submodule explicitly: a bare `import concurrent` does not
    # guarantee that `concurrent.futures` has been loaded.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    download_count = 0
    tic = time.perf_counter()
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    logger.remove()
    tqdm_sink = logger.add(lambda msg: tqdm.write(msg, end=""))
    try:
        with tqdm(total=len(urls)) as progress:
            with ThreadPoolExecutor(max_workers=n_threads) as executor:
                future_to_url = {executor.submit(_download, url, save_dir): url for url in urls}
                for future in as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        data = future.result()
                    except Exception as e:
                        logger.error(f"{url!r} generated an exception: {e}")
                    else:
                        if data:
                            logger.info(f"{url} downloaded to {data}")
                            download_count += 1
                    finally:
                        # Advance the bar from this thread once the result is handled.
                        progress.update(1)
        toc = time.perf_counter()
        # Emit the summary while a sink is still attached, otherwise it is dropped.
        logger.info(f"Downloads completed in {toc - tic:0.4f} seconds")
    finally:
        # Swap the tqdm sink for a plain stderr sink, even if something above raised.
        logger.remove(tqdm_sink)
        logger.add(sys.stderr)
    return download_count

In [None]:
# Two download links used to exercise `download_from_urls` in the cells below.
test_links = [
    "https://bl.iro.bl.uk/downloads/0ea7aa1f-3b4f-4972-bc12-b7559769471f?locale=en",
    "https://bl.iro.bl.uk/downloads/80708825-d96a-4301-9496-9598932520f4?locale=en",
]

In [None]:
# Download both test links (network call); the cell output is the number of successful downloads.
download_from_urls(test_links, "test_dir")

 50%|█████     | 1/2 [00:29<00:29, 29.64s/it]

2021-10-11 13:10:02.071 | INFO     | __main__:download_from_urls:21 - https://bl.iro.bl.uk/downloads/80708825-d96a-4301-9496-9598932520f4?locale=en downloaded to BLNewspapers_0002642_TheExpress_1847_8f13ba53-0e13-4409-a384-830ba2b160db.zip


100%|██████████| 2/2 [00:30<00:00, 15.03s/it]

2021-10-11 13:10:02.475 | INFO     | __main__:download_from_urls:21 - https://bl.iro.bl.uk/downloads/0ea7aa1f-3b4f-4972-bc12-b7559769471f?locale=en downloaded to BLNewspapers_0002642_TheExpress_1848_f1c4cb8d-6bd5-401f-831f-a19199d47c0a.zip





2

In [None]:
#slow
# One file per requested link should be present; then clear the scratch directory.
test_dir = Path("test_dir")
assert len(os.listdir("test_dir")) == len(test_links)
for leftover in test_dir.iterdir():
    leftover.unlink()
test_dir.rmdir()

In [None]:
#slow
# Links that cannot be fetched: failures should be logged and the returned download count should be 0.
test_some_bad_links = [
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1f-3b4f-4972-bc12-b7559769471f",
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b",
]
download_from_urls(test_some_bad_links, "test_dir")

100%|██████████| 2/2 [00:01<00:00,  1.26it/s]

2021-10-11 13:10:04.154 | ERROR    | __main__:_download:16 - HTTPSConnectionPool(host='bl.oar.bl.uk', port=443): Max retries exceeded with url: /fail_uploads/download_file?fileset_id=0ea7aa1f-3b4f-4972-bc12-b7559769471f (Caused by SSLError(SSLCertVerificationError("hostname 'bl.oar.bl.uk' doesn't match either of '*.oar.notch8.cloud', 'oar.notch8.cloud'")))
2021-10-11 13:10:04.156 | ERROR    | __main__:_download:16 - HTTPSConnectionPool(host='bl.oar.bl.uk', port=443): Max retries exceeded with url: /fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b (Caused by SSLError(SSLCertVerificationError("hostname 'bl.oar.bl.uk' doesn't match either of '*.oar.notch8.cloud', 'oar.notch8.cloud'")))





0

In [None]:
#slow
# Empty and remove the scratch directory.
test_dir = Path("test_dir")
for leftover in test_dir.iterdir():
    leftover.unlink()
test_dir.rmdir()

In [None]:
#export
@call_parse
def cli(
    save_dir: Param("Output Directory", str),
    # Explicit `int` converters: without a type the command-line value arrives
    # as a string, and `Optional[int]` cannot be called to convert one.
    n_threads: Param("Number threads to use", int) = 8,
    subset: Param("Download subset of HMD", int) = None
):
    "Download HMD newspaper from iro to `save_dir` using `n_threads`"
    logger.info("Getting title urls")
    title_urls = get_newspaper_links()
    logger.info(f"Found {len(title_urls)} title urls")
    logger.debug(f"Title urls: {title_urls}")
    all_urls = []
    for title_url in title_urls:
        logger.info(f"Getting zip download file urls for {title_url}")
        try:
            # title_url is a (title, dataset page url) tuple; the page url is item 1.
            all_urls.extend(get_download_urls(title_url[1]))
        except Exception as e:
            # Best effort: a title whose page cannot be read is logged and skipped.
            logger.error(e)
    if subset:
        if len(all_urls) < subset:
            raise ValueError(f"Size of requested sample {subset} is larger than total number of urls: {len(all_urls)}")
        all_urls = random.sample(all_urls, subset)
    logger.debug(f"Download urls: {all_urls}")
    download_count = download_from_urls(all_urls, save_dir, n_threads=n_threads)
    request_url_count = len(all_urls)
    # Report how the number of saved files compares with the number requested.
    if request_url_count == download_count:
        logger.info(f"\U0001F600 Requested count of urls: {request_url_count} matches number downloaded: {download_count}")
    elif request_url_count > download_count:
        logger.warning(f"\U0001F622 Requested count of urls: {request_url_count} higher than number downloaded: {download_count}")
    else:
        logger.warning(f"\U0001F937 Requested count of urls: {request_url_count} lower than number downloaded: {download_count}")

In [None]:
# Run the full pipeline on a random sample of two download urls (network call).
cli("test_dir", subset=2)

  0%|          | 0/2 [00:00<?, ?it/s]

[('The Express', 'https://bl.iro.bl.uk/concern/datasets/93ec8ab4-3348-409c-bf6d-a9537156f654?locale=en'), ('The Press.', 'https://bl.iro.bl.uk/concern/datasets/2f70fbcd-9530-496a-903f-dfa4e7b20d3b?locale=en'), ('The Star', 'https://bl.iro.bl.uk/concern/datasets/dd9873cf-cba1-4160-b1f9-ccdab8eb6312?locale=en'), ('National Register.', 'https://bl.iro.bl.uk/concern/datasets/f3ecea7f-7efa-4191-94ab-e4523384c182?locale=en'), ('The Statesman', 'https://bl.iro.bl.uk/concern/datasets/551cdd7b-580d-472d-8efb-b7f05cf64a11?locale=en'), ('The British Press; or, Morning Literary Advertiser', 'https://bl.iro.bl.uk/concern/datasets/aef16a3c-53b6-4203-ac08-d102cb54f8fa?locale=en'), ('The Sun', 'https://bl.iro.bl.uk/concern/datasets/b9a877b8-db7a-4e5f-afe6-28dc7d3ec988?locale=en'), ('The Liverpool Standard etc', 'https://bl.iro.bl.uk/concern/datasets/fb5e24e3-0ac9-4180-a1f4-268fc7d019c1?locale=en'), ('Colored News', 'https://bl.iro.bl.uk/concern/datasets/bacd53d6-86b7-4f8a-af31-0a12e8eaf6ee?locale=en')

 50%|█████     | 1/2 [00:11<00:11, 11.07s/it]

2021-10-11 13:11:21.370 | INFO     | __main__:download_from_urls:21 - https://bl.iro.bl.uk/downloads/f0de9ec5-6a8c-4720-9e7c-420ce967d07a?locale=en downloaded to BLNewspapers_0002646_TheStar_1803_93364dfe-9d6d-4826-9922-78b02de69699.zip


100%|██████████| 2/2 [00:12<00:00,  6.11s/it]

2021-10-11 13:11:22.516 | INFO     | __main__:download_from_urls:21 - https://bl.iro.bl.uk/downloads/fcde5246-5e8e-4bb3-999d-6af2d31f9ae2?locale=en downloaded to BLNewspapers_0002646_TheStar_1829_1d7f3e2b-ec79-4d94-8bd0-5ab5c857de64.zip





In [None]:
# The sampled run should have left exactly two .zip archives in the scratch directory.
assert all(entry.suffix == '.zip' for entry in Path("test_dir").iterdir())
assert len(list(Path("test_dir").iterdir())) == 2

In [None]:
# Export the cells marked `#export` to the package's Python modules via nbdev.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.


In [None]:
# Empty and remove the scratch directory used by the tests above.
test_dir = Path("test_dir")
for leftover in test_dir.iterdir():
    leftover.unlink()
test_dir.rmdir()