# Common Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

In [1]:
import os
import sys

import unittest

import urllib.request
import urllib.parse
import re

from bs4 import BeautifulSoup

# Project directory: the resolved parent of the first entry on sys.path
projdir = os.path.realpath(os.path.join(sys.path[0], os.pardir))

## Common Definitions

e.g. Nation and region names

In [2]:
# Area codes mapped to names: the UK and its groupings (Great Britain,
# England and Wales) followed by the 4 individual nations
nations = {
    "K02000001": "United Kingdom",
    "K03000001": "Great Britain",
    "K04000001": "England and Wales",
    "E92000001": "England",
    "W92000004": "Wales",
    "S92000003": "Scotland",
    "N92000002": "Northern Ireland",
}

# Names only, in the same order as the dictionary above
nationNames = list(nations.values())

# Named constants so that other code does not need to repeat the area codes
UNITED_KINGDOM = nations["K02000001"]
GREAT_BRITAIN = nations["K03000001"]
ENGLAND_WALES = nations["K04000001"]
ENGLAND = nations["E92000001"]
WALES = nations["W92000004"]
SCOTLAND = nations["S92000003"]
NORTHERN_IRELAND = nations["N92000002"]

In [3]:
# Area codes and names of the 9 regions in England
regions = {
    "E12000001": "North East",
    "E12000002": "North West",
    "E12000003": "Yorkshire and The Humber",
    "E12000004": "East Midlands",
    "E12000005": "West Midlands",
    "E12000006": "East of England",
    "E12000007": "London",
    "E12000008": "South East",
    "E12000009": "South West",
}

# Region names only, in the same order as the dictionary above
regionNames = list(regions.values())

# Names of the 7 NHS regions in England (no area codes are defined for these here)
nhsRegionNames = [
    "North East and Yorkshire",
    "North West",
    "Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

## Printable Class

Simple class that allows other classes to be printed.

In [4]:
class Printable:
    """Mixin giving subclasses a readable text form: the class followed by its attributes."""

    def __repr__(self):
        """Return "<class ...>: {attribute dict}" for this instance."""
        return f"{self.__class__!s}: {self.__dict__!s}"

    def __str__(self):
        """Return the same text as Printable.__repr__, even if a subclass overrides __repr__."""
        return Printable.__repr__(self)

## Common Functions

Utility functions, e.g. converting area names into a form suitable for use in filenames

In [5]:
def getSafeName(areaName):
    """Convert an area name into a lower-case, underscore-separated filename stem.

    The filler words "of", "and", "the" / "The" and the "+" symbol are dropped
    when they appear between spaces, commas are removed, and every remaining
    space becomes an underscore.
    """

    fillers = ('of', 'and', 'the', 'The', '+')

    safeName = areaName
    for filler in fillers:
        # Splitting on the padded filler and re-joining with one space removes it
        safeName = ' '.join(safeName.split(' ' + filler + ' '))

    # Spaces become underscores and commas are deleted in a single pass
    return safeName.lower().translate(str.maketrans({' ': '_', ',': None}))

In [6]:
class TestGetSafeName(unittest.TestCase):
    '''Unit tests for the getSafeName function'''

    def test_of(self):
        '''The word "of" is removed'''
        expected = 'east_england'
        self.assertEqual(getSafeName('East of England'), expected)

    def test_and(self):
        '''The word "and" is removed'''
        cases = (
            ('England and Wales', 'england_wales'),
            ('North East and Yorkshire', 'north_east_yorkshire'),
        )
        for areaName, expected in cases:
            self.assertEqual(getSafeName(areaName), expected)

    def test_and_the(self):
        '''The words "and" + "the" are removed, whatever the case of "the"'''
        for areaName in ('Yorkshire and the Humber', 'Yorkshire and The Humber'):
            self.assertEqual(getSafeName(areaName), 'yorkshire_humber')

    def test_lists(self):
        '''The separator "," and the word "and" are both removed'''
        expected = 'bournemouth_christchurch_poole'
        self.assertEqual(getSafeName('Bournemouth, Christchurch and Poole'), expected)

    def test_plus(self):
        '''The "+" symbol is removed'''
        expected = 'england_wales'
        self.assertEqual(getSafeName('England + Wales'), expected)

## Download Functions

Download spreadsheets by parsing the HTML for suitable links

In [7]:
# Module-level defaults, captured as the WebDownload constructor defaults
# at the moment the class is defined
skipExisting = True   # do not re-download files that already exist / were already seen
skipHistory = False   # stop scanning a page after its first matching link
verbose = False       # report skipped files as well as downloaded ones

class WebDownload():
    """Download files linked from web pages and remember what has been handled.

    The `downloaded` dictionary maps each file's base name to its path relative
    to the raw directory (sub-directory + base name).
    """

    def __init__(self, skipExisting=skipExisting, skipHistory=skipHistory, verbose=verbose):
        """Initialise the downloader.

        skipExisting -- skip files already on disk or already recorded in `downloaded`
        skipHistory  -- stop processing a page once one anchor has matched
        verbose      -- print a message for skipped files
        """

        self.skipExisting, self.skipHistory, self.verbose = skipExisting, skipHistory, verbose
        self.downloaded = {}


    def downloadFile(self, url, rawDir, subDir):
        """Download a binary file from the URL provided.

        The file is stored as rawDir/subDir/<base name of url>. It is skipped
        when `skipExisting` is set and the file already exists on disk or its
        base name has already been recorded. In both cases the base name ends
        up recorded in `self.downloaded`.

        Raises whatever urllib / the file system raises; a failed download
        leaves no partial file behind.
        """

        baseName = os.path.basename(url)
        partName = os.path.join(subDir, baseName)

        dirName = os.path.join(rawDir, subDir)
        fileName = os.path.join(dirName, baseName)

        alreadyHave = os.path.exists(fileName) or baseName in self.downloaded

        if alreadyHave and self.skipExisting:
            if self.verbose:
                print(f"Skipping {partName}...")
        else:
            print(f"Downloading {partName}...")

            # Ensure raw path exists
            os.makedirs(dirName, exist_ok=True)

            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

            # Write to a temporary name and rename on success, so an interrupted
            # download is never mistaken for a complete file by skipExisting
            tempName = fileName + ".part"

            try:
                with urllib.request.urlopen(req, timeout=60) as response, \
                        open(tempName, "wb") as outfile:
                    chunk = response.read(4096)
                    while chunk:
                        outfile.write(chunk)
                        chunk = response.read(4096)

                os.replace(tempName, fileName)
            except BaseException:
                if os.path.exists(tempName):
                    os.remove(tempName)
                raise

        if baseName not in self.downloaded:
            self.downloaded[baseName] = partName


    def downloadFiles(self, rawDir, url, patterns, category=None):
        """Download every file linked from the page at `url` that matches a pattern.

        rawDir   -- root directory for downloaded files
        url      -- address of the HTML page to scan for anchors
        patterns -- sequence of (sub-directory, regular expression) pairs; an
                    anchor whose href matches the expression is downloaded into
                    that sub-directory
        category -- optional extra directory level beneath the pattern's sub-directory

        Returns the relative paths of all files recorded by this object so far
        (including those from earlier calls).
        """

        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

        # The context manager closes the response even if parsing fails
        with urllib.request.urlopen(req, timeout=15) as response:
            soup = BeautifulSoup(response, "lxml")
            anchors = soup.find_all("a")

        done = False

        for anchor in anchors:
            href = anchor.get("href")

            # Anchors without an href attribute cannot match any pattern
            if href is None:
                continue

            for pattern in patterns:
                if re.search(pattern[1], href):
                    # Always resolve against the page URL; reusing the previous
                    # file's URL as the base would mis-resolve relative links
                    fileUrl = urllib.parse.urljoin(url, href)

                    if category:
                        subDir = os.path.join(pattern[0], category)
                    else:
                        subDir = pattern[0]

                    self.downloadFile(fileUrl, rawDir, subDir)

                    if self.skipHistory:
                        done = True

            if done:
                break

        return list(self.downloaded.values())

## Run Unit Tests

In [8]:
# Run the unit tests defined above when this code is executed directly.
# An explicit argv is supplied so unittest does not parse the process's own
# sys.argv (the first element only stands in for the program name), and
# exit=False stops unittest.main() from calling sys.exit() when the tests finish.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.....
----------------------------------------------------------------------
Ran 5 tests in 0.013s

OK
