In [1]:
# default_exp core

# hmd_dl

> Download Heritage made Digital Newspaper from the BL repository 

In [2]:
#export
import concurrent
import itertools
import json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import random
import sys
import time
from collections import namedtuple
from functools import lru_cache
from operator import itemgetter
#from os import umask
import os
from pathlib import Path
from typing import List, Optional, Union

import requests
from bs4 import BeautifulSoup
from fastcore.script import *
from fastcore.test import *
from fastcore.net import urlvalid
from loguru import logger
from nbdev.showdoc import *
from tqdm import tqdm

In [3]:
#export
def _get_link(x: str):
    end = str(x).split('"')[3]
    return "https://bl.iro.bl.uk" + end

In [4]:
#export
@lru_cache(256)
def get_newspaper_links():
    urls = [f"https://bl.iro.bl.uk/collection/353c908d-b495-4413-b047-87236d2573e3?page={page}" for page in range(1, 3)]
    link_tuples = []
    for url in urls:
        r = requests.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        links = soup.find_all("h3", class_="title_title__2nRVt")
        for link in links:
            t = (link.text, _get_link(link))
            link_tuples.append(t)
        return link_tuples

In [5]:
links = get_newspaper_links()
links

[('The Express',
  'https://bl.iro.bl.uk/work/ns/93ec8ab4-3348-409c-bf6d-a9537156f654'),
 ('The Press.',
  'https://bl.iro.bl.uk/work/ns/2f70fbcd-9530-496a-903f-dfa4e7b20d3b'),
 ('The Star',
  'https://bl.iro.bl.uk/work/ns/dd9873cf-cba1-4160-b1f9-ccdab8eb6312'),
 ('National Register.',
  'https://bl.iro.bl.uk/work/ns/f3ecea7f-7efa-4191-94ab-e4523384c182'),
 ('The Statesman',
  'https://bl.iro.bl.uk/work/ns/551cdd7b-580d-472d-8efb-b7f05cf64a11'),
 ('The British Press; or, Morning Literary Advertiser',
  'https://bl.iro.bl.uk/work/ns/aef16a3c-53b6-4203-ac08-d102cb54f8fa'),
 ('The Sun',
  'https://bl.iro.bl.uk/work/ns/b9a877b8-db7a-4e5f-afe6-28dc7d3ec988'),
 ('The Liverpool Standard etc',
  'https://bl.iro.bl.uk/work/ns/fb5e24e3-0ac9-4180-a1f4-268fc7d019c1'),
 ('Colored News',
  'https://bl.iro.bl.uk/work/ns/bacd53d6-86b7-4f8a-af31-0a12e8eaf6ee'),
 ('The Northern Daily Times etc',
  'https://bl.iro.bl.uk/work/ns/5243dccc-3fad-4a9e-a2c1-d07e750c46a6')]

In [6]:
assert len(links[0]) == 2 #test tuple len
assert next(iter(set(map(urlvalid, map(itemgetter(1), links))))) == True #check second item valid url

In [7]:
#export
@lru_cache(256)
def get_download_urls(url: str) -> list:
    """Given a dataset page on the IRO repo return all download links for that page"""
    try:
        r = requests.get(url, timeout=30)
    except requests.exceptions.MissingSchema as E:
        print(E)
    try:
        soup = BeautifulSoup(r.text, "lxml")
        data = json.loads(soup.find("script", type="application/ld+json").string)
    except AttributeError as E:
        print(E)
    if data:
        data = data["distribution"]
        urls = [item["contentUrl"] for item in data]
    return urls

In [8]:
get_download_urls("https://bl.iro.bl.uk/work/ns/93ec8ab4-3348-409c-bf6d-a9537156f654")[:5]

['https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b128',
 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=80708825-d96a-4301-9496-9598932520f4',
 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1f-3b4f-4972-bc12-b7559769471f',
 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=17b6e110-8ed0-46cb-8030-6cc7f387ade5',
 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=54d974ba-fcb2-4566-a5ac-b66d85954963']

In [9]:
assert len(links) == 10
assert type(links[0]) == tuple
assert (list(map(itemgetter(1), links))[-1]).startswith("https://")

In [10]:
#export
def create_session() -> requests.sessions.Session:
    """returns a requests session"""
    retry_strategy = Retry(total=60)
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

In [11]:
#export
def _download(url: str, dir: Union[str, Path]):
    fname = None
    s = create_session()
    try:
        r = s.get(url, stream=True, timeout=(30))
        r.raise_for_status()
        fname = r.headers["Content-Disposition"].split('"')[1]
        if fname:
            with open(f"{dir}/{fname}", "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except KeyError:
        pass
    except requests.exceptions.RequestException as request_exception:
        logger.error(request_exception)
    return fname

In [12]:
#slow
test_url = "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b128"
Path("test_dir").mkdir()
test_dir = Path("test_dir")
_download(test_url, test_dir)

'BLNewspapers_0002642_TheExpress_1846_5ba8c588-d803-4c39-9c57-65e52712b161.zip'

In [13]:
#slow
assert list(test_dir.iterdir())[0].suffix == ".zip"
assert len(list(test_dir.iterdir())) == 1
# tidy up
[f.unlink() for f in test_dir.iterdir()]
test_dir.rmdir()

In [14]:
#basic test to check bad urls won't raise unhandled exceptions
bad_link = "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1-3b4f-4972-bc12-b75597694f"
_download(bad_link, "test_dir")

2021-02-23 09:45:42.192 | ERROR    | __main__:_download:16 - 500 Server Error: Internal Server Error for url: https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1-3b4f-4972-bc12-b75597694f


In [15]:
#export
def download_from_urls(urls: List[str], save_dir: Union[str, Path], n_threads: int = 8):
    download_count = 0
    tic = time.perf_counter()
    Path(save_dir).mkdir(exist_ok=True)
    logger.remove()
    logger.add(lambda msg: tqdm.write(msg, end=""))
    with tqdm(total=len(urls)) as progress:
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
            future_to_url = {executor.submit(_download, url, save_dir): url for url in urls}
            for future in future_to_url:
                future.add_done_callback(lambda p: progress.update(1))
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as e:
                    logger.error("%r generated an exception: %s" % (url, e))
                else:
                    if data:
                        logger.info(f"{url} downloaded to {data}")
                        download_count+=1
        toc = time.perf_counter()
    logger.remove()
    logger.info(f"Downloads completed in {toc - tic:0.4f} seconds")
    return download_count

In [16]:
test_links = [
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=50ebdb11-9186-4c24-90e5-27caf73d3f11",
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=9c4f2fd6-d58c-4a57-8fac-a5dd273f8ed3",
]

In [17]:
download_from_urls(test_links, "test_dir")

 50%|█████     | 1/2 [04:31<04:31, 271.81s/it]

2021-02-23 09:50:14.041 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=9c4f2fd6-d58c-4a57-8fac-a5dd273f8ed3 downloaded to BLNewspapers_0002642_TheExpress_1864_68176359-717d-4878-a2ac-1228c0fb83f8.zip


100%|██████████| 2/2 [04:34<00:00, 137.07s/it]

2021-02-23 09:50:16.375 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=50ebdb11-9186-4c24-90e5-27caf73d3f11 downloaded to BLNewspapers_0002642_TheExpress_1863_96dbbbb9-40b8-44c2-b350-b9559b092b1e.zip





2

In [18]:
#slow
assert len(test_links) == len(os.listdir("test_dir"))
test_dir = Path("test_dir")
[f.unlink() for f in test_dir.iterdir()]
test_dir.rmdir()

In [19]:
#slow
test_some_bad_links = [
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1f-3b4f-4972-bc12-b7559769471f",
    "https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b",
]
download_from_urls(test_some_bad_links, "test_dir")

 50%|█████     | 1/2 [00:00<00:00,  2.14it/s]

2021-02-23 09:50:16.907 | ERROR    | __main__:_download:16 - 500 Server Error: Internal Server Error for url: https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=7ac7a0cb-29a2-4172-8b79-4952e2c9b


100%|██████████| 2/2 [02:29<00:00, 74.54s/it]

2021-02-23 09:52:45.520 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=0ea7aa1f-3b4f-4972-bc12-b7559769471f downloaded to BLNewspapers_0002642_TheExpress_1848_f1c4cb8d-6bd5-401f-831f-a19199d47c0a.zip





1

In [20]:
#slow
test_dir = Path("test_dir")
[f.unlink() for f in test_dir.iterdir()]
test_dir.rmdir()

In [21]:
#export
@call_parse
def cli(
    save_dir: Param("Output Directory", str),
    n_threads: Param("Number threads to use") = 8,
    subset: Param("Download subset of HMD", Union[int]) = None
):
    "Download HMD newspaper from iro to `save_dir` using `n_threads`"
    logger.info("Getting title urls")
    title_urls = get_newspaper_links()
    logger.info(f"Found {len(title_urls)} title urls")
    all_urls = []
    for url in title_urls:
        logger.info(f"Getting zip download file urls for {url}")
        try:
            zip_urls = get_download_urls(url[1])
            all_urls.append(zip_urls)
        except Exception as e:
            logger.error(e)
    all_urls = list(itertools.chain(*all_urls))
    if subset:
        all_urls = random.sample(all_urls, subset)
    print(all_urls)
    download_count = download_from_urls(all_urls, save_dir, n_threads=n_threads)
    request_url_count = len(all_urls)
    if request_url_count==download_count:
        logger.info(f"\U0001F600 Requested count of urls: {request_url_count} matches number downloaded: {download_count}")
    if request_url_count>download_count:
        logger.warning(f"\U0001F622 Requested count of urls: {request_url_count} higher than number downloaded: {download_count}")
    if request_url_count<download_count:
        logger.warning(f"\U0001F937 Requested count of urls: {request_url_count} lower than number downloaded: {download_count}")

In [22]:
cli("test_dir", subset=3)

  0%|          | 0/3 [00:00<?, ?it/s]

['https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=cd69be4a-2773-4ac6-80b1-de65e6664f16', 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=958ece9a-49c5-4991-82a8-84739a9b2312', 'https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=e98123f6-7138-4b26-8b42-8d1211b97e02']


 33%|███▎      | 1/3 [02:29<04:58, 149.38s/it]

2021-02-23 09:55:42.503 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=cd69be4a-2773-4ac6-80b1-de65e6664f16 downloaded to BLNewspapers_0002088_TheLiverpoolStandardAndGeneralCommercialAdvertiser_1835.zip


 67%|██████▋   | 2/3 [03:18<01:30, 90.68s/it] 

2021-02-23 09:56:32.088 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=958ece9a-49c5-4991-82a8-84739a9b2312 downloaded to BLNewspapers_0002194_TheSun_1821.zip


100%|██████████| 3/3 [07:13<00:00, 144.35s/it]

2021-02-23 10:00:26.169 | INFO     | __main__:download_from_urls:21 - https://bl.oar.bl.uk/fail_uploads/download_file?fileset_id=e98123f6-7138-4b26-8b42-8d1211b97e02 downloaded to BLNewspapers_0002194_TheSun_1862.zip





In [23]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.


In [24]:
test_dir = Path("test_dir")
[f.unlink() for f in test_dir.iterdir()]
test_dir.rmdir()