# Common Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

In [1]:
import os
import sys

import unittest

import urllib.request
import urllib.parse
import re

from bs4 import BeautifulSoup

import csv
import numpy as np

# The project root is the parent of the first sys.path entry; data files live beneath it
projdir = os.path.realpath(os.path.join(sys.path[0], os.pardir))
dataDir = os.path.join(projdir, "data")

## Common Definitions

e.g. Nation and region names

In [2]:
# Area codes for the 4 UK nations plus the 3 aggregate areas used in the data (7 entries)
nations = {
    "K02000001": "United Kingdom",
    "K03000001": "Great Britain",
    "K04000001": "England and Wales",
    "E92000001": "England",
    "W92000004": "Wales",
    "S92000003": "Scotland",
    "N92000002": "Northern Ireland",
}

# Names in the same order as the codes above
nationNames = list(nations.values())

# Named constants so that other code need not repeat the area codes
UNITED_KINGDOM = nations["K02000001"]
GREAT_BRITAIN = nations["K03000001"]
ENGLAND_WALES = nations["K04000001"]
ENGLAND = nations["E92000001"]
WALES = nations["W92000004"]
SCOTLAND = nations["S92000003"]
NORTHERN_IRELAND = nations["N92000002"]

# Some data sources such as the ONS daily occurrences do not use the standard nation codes
# Maps the non-standard code to the standard one
nationMappings = {"W99999999": "W92000004"}

In [3]:
# The 9 regions of England, keyed by area code
regions = {
    "E12000001": "North East",
    "E12000002": "North West",
    "E12000003": "Yorkshire and The Humber",
    "E12000004": "East Midlands",
    "E12000005": "West Midlands",
    "E12000006": "East of England",
    "E12000007": "London",
    "E12000008": "South East",
    "E12000009": "South West",
}

# Names in the same order as the codes above
regionNames = list(regions.values())

# Alternative names that some sources use for a region, keyed by the standard name
regionAliases = {
    # "East" is used by historical ONS deaths data
    "East of England": ["East"],
}

# The 7 NHS regions of England (these differ from the 9 regions above)
nhsRegionNames = [
    "North East and Yorkshire",
    "North West",
    "Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

In [4]:
# CSV column names that hold dates; loadCsvIntoArray reads these columns as text instead of integers
dateFieldNames = ["week_ended", "date"]

In [5]:
# Default for the "verbose" argument of WebDownload and loadCsvIntoArray (True prints extra progress messages)
verbose = False

## Printable Class

Simple class that allows other classes to be printed.

In [6]:
class Printable:
    """Base class whose text form shows the class followed by the instance attributes"""

    def __repr__(self):
        """Return "<class>: <attribute dict>" for the instance"""
        return f"{self.__class__!s}: {self.__dict__!s}"

    def __str__(self):
        """Return the same text as __repr__"""
        return f"{self.__class__!s}: {self.__dict__!s}"

## Common Functions

Useful functions such as modifying area names for use as filenames

In [7]:
def getSafeName(areaName):
    """Return a lower case version of the area name that is suitable for use in filenames

    Filler words ("of", "and", "the", "The") and the "+" symbol are dropped when they
    stand between spaces, spaces become underscores and commas are removed.
    """

    fillers = ('of', 'and', 'the', 'The', '+')

    safeName = areaName
    for filler in fillers:
        # Only whole words are removed because the match includes the surrounding spaces
        safeName = safeName.replace(f' {filler} ', ' ')

    safeName = safeName.lower()
    safeName = safeName.replace(' ', '_')

    return safeName.replace(',', '')

In [8]:
class TestGetSafeName(unittest.TestCase):
    '''Unit tests for the getSafeName function'''

    def test_of(self):
        '''The word "of" is dropped'''
        safeName = getSafeName('East of England')
        self.assertEqual(safeName, 'east_england')

    def test_and(self):
        '''The word "and" is dropped'''
        cases = [
            ('England and Wales', 'england_wales'),
            ('North East and Yorkshire', 'north_east_yorkshire'),
        ]
        for areaName, expected in cases:
            self.assertEqual(getSafeName(areaName), expected)

    def test_and_the(self):
        '''The words "and" + "the" are both dropped, whatever the case of "the"'''
        for areaName in ('Yorkshire and the Humber', 'Yorkshire and The Humber'):
            self.assertEqual(getSafeName(areaName), 'yorkshire_humber')

    def test_lists(self):
        '''The separator "," is removed along with the word "and"'''
        safeName = getSafeName('Bournemouth, Christchurch and Poole')
        self.assertEqual(safeName, 'bournemouth_christchurch_poole')

    def test_plus(self):
        '''The "+" symbol is dropped'''
        safeName = getSafeName('England + Wales')
        self.assertEqual(safeName, 'england_wales')

## Download Functions

Download spreadsheets by parsing the HTML for suitable links

In [9]:
# Defaults for the WebDownload constructor arguments of the same names
# skipExisting: do not download a file again if it is already on disk or was already handled
skipExisting = True
# skipHistory: when True, downloadFiles stops scanning a page after the first anchor that matches
skipHistory = False

class WebDownload():
    """Download files that are linked from a web page

    Each instance remembers the files it has handled in self.downloaded, a dict
    mapping the base file name to the file's path relative to the raw directory.
    """

    def __init__(self, skipExisting=skipExisting, skipHistory=skipHistory, verbose=verbose):
        """Initialise the download object

        skipExisting -- skip files that already exist on disk or were already handled by this object
        skipHistory  -- stop scanning a page after the first anchor that matches a pattern
        verbose      -- print a message for every download that is skipped
        """

        self.skipExisting, self.skipHistory, self.verbose = skipExisting, skipHistory, verbose
        self.downloaded = {}


    def downloadFile(self, url, rawDir, subDir):
        """Download a binary file from the URL provided

        The file is saved as rawDir/subDir/<base name of url> and recorded in
        self.downloaded. Nothing is fetched when skipExisting is set and the file
        is already on disk or a file of the same base name was already handled.
        Errors raised while opening the URL or writing the file are propagated.
        """

        baseName = os.path.basename(url)
        partName = os.path.join(subDir, baseName)

        dirName = os.path.join(rawDir, subDir)
        fileName = os.path.join(dirName, baseName)

        if (os.path.exists(fileName) or baseName in self.downloaded) and self.skipExisting:
            if self.verbose:
                print(f"Skipping download of {partName}...")
        else:
            print(f"Downloading {partName}...")

            # Ensure raw path exists
            os.makedirs(dirName, exist_ok=True)

            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

            # Write to a temporary name first; a truncated file left behind by a failed
            # transfer would otherwise be mistaken for a complete download by skipExisting
            tempName = fileName + ".part"

            try:
                with urllib.request.urlopen(req, timeout=60) as response, open(tempName, "wb") as outfile:
                    chunk = response.read(4096)
                    while chunk:
                        outfile.write(chunk)
                        chunk = response.read(4096)

                os.replace(tempName, fileName)

            finally:
                # Only present if the transfer did not complete
                if os.path.exists(tempName):
                    os.remove(tempName)

        # Keep the first path recorded for any given base name
        self.downloaded.setdefault(baseName, partName)


    def downloadFiles(self, rawDir, url, patterns, category=None):
        """Download every file linked from the page at url whose link matches a pattern

        rawDir   -- directory beneath which the files are saved
        url      -- address of the HTML page to scan for anchors
        patterns -- sequence of (subDir, regex) pairs; an anchor whose href matches
                    the regex (re.search) is saved beneath subDir
        category -- optional extra directory level appended to subDir

        Returns a list of the relative paths of all files handled by this object so
        far, including those from earlier calls.
        """

        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

        # The page is parsed in full before the connection is released
        with urllib.request.urlopen(req, timeout=15) as response:
            soup = BeautifulSoup(response, "lxml")

        anchors = soup.find_all("a")

        done = False

        for anchor in anchors:
            href = anchor.get("href")

            # Anchors without an href (e.g. named anchors) have nothing to match
            if href is None:
                continue

            for pattern in patterns:
                if re.search(pattern[1], href):
                    # Always resolve against the page URL; reusing the previous file's URL
                    # as the base would mis-resolve later relative links
                    fileUrl = urllib.parse.urljoin(url, href)

                    if category:
                        subDir = os.path.join(pattern[0], category)
                    else:
                        subDir = pattern[0]

                    self.downloadFile(fileUrl, rawDir, subDir)

                    if self.skipHistory:
                        done = True

            if done:
                break

        return list(self.downloaded.values())

## Load CSV into NumPy Array

Simple function to load data from a CSV file into a NumPy ndarray

In [10]:
def loadCsvIntoArray(fileName, verbose = verbose):
    '''Load CSV file into a NumPy structured array

    The first line of the file supplies the field names. Columns named in
    dateFieldNames are read as text of up to 10 characters ("U10"); every other
    column is read as an unsigned 32-bit integer, with empty fields read as zero.

    fileName -- path of the CSV file
    verbose  -- print the file name before loading

    Returns a structured array that always has one dimension, even when the file
    holds a single data row. Any error is reported with a message and re-raised.
    '''

    if verbose:
        print(f"Loading {fileName}...")

    def toInt(value):
        '''Convert a field to an integer, treating an empty field as zero'''
        return int(value or 0)

    try:
        with open(fileName, 'r') as f:
            reader = csv.reader(f, delimiter = ',')

            # The header row is consumed here, leaving only data rows for genfromtxt
            colNames = next(reader)

            dtype = []
            converters = {}

            for i, colName in enumerate(colNames):
                if colName in dateFieldNames:
                    dtype.append((colName, "U10"))
                else:
                    dtype.append((colName, "u4"))
                    converters[i] = toInt

            data = np.genfromtxt(f, dtype=dtype, converters=converters, delimiter=",")

    except Exception:
        print(f"Failed to load {fileName}")
        raise

    # genfromtxt squeezes a single data row to a 0-d array, which breaks len() and iteration
    return np.atleast_1d(data)

## NumPy Helper Functions

Useful functionality such as moving average or rolling sum

In [11]:
def rollingSum(data, window = 7):
    """Calculate rolling sum using linear convolution

    data   -- one-dimensional array or sequence of numbers
    window -- number of trailing values (including the current one) in each sum

    Returns an array of the same length and dtype as the input. The first
    window - 1 results are partial sums because fewer values are available.
    Empty input gives an empty result.
    """

    # Accept plain sequences as well as arrays; an existing ndarray is not copied
    values = np.asarray(data)

    # np.convolve rejects empty input, so there is nothing to sum
    if values.size == 0:
        return values.copy()

    # The mode "full" results in more values than required, hence the len(values)
    # The result should also match the original data, hence the astype()
    result = np.convolve(values, np.ones(window), mode="full")[:len(values)].astype(values.dtype)

    return result

In [12]:
class TestRollingSum(unittest.TestCase):
    '''Unit tests for the rollingSum function'''

    def testShortList(self):
        '''Input shorter than the window size'''

        expected = np.array([0, 1, 3, 6, 10, 15])
        actual = rollingSum(np.arange(6), 7)

        self.assertTrue(np.all(actual == expected))


    def testLongerList(self):
        '''Input longer than the window size'''

        expected = np.array([0, 1, 3, 6, 10, 15, 21, 28, 35, 42, 49, 56, 63, 70])
        actual = rollingSum(np.arange(14), 7)

        self.assertTrue(np.all(actual == expected))

In [13]:
def movingAverage(data, window = 7):
    """Calculate moving average using linear convolution

    data   -- one-dimensional array or sequence of numbers
    window -- number of values in each average

    Returns a float array of the same length as the input. Positions where a
    complete window is not available are zero: window // 2 at the start and the
    remainder of the window - 1 missing positions at the end. The result is all
    zeros if the input is shorter than the window.
    """

    length = len(data)

    # Only use convolution if the input is at least as long as the window size
    if length >= window:
        # The mode "valid" results in window - 1 fewer values than required, hence the np.zeros()
        # Padding window // 2 on both sides would add one value too many for an even window
        leading = window // 2
        trailing = window - 1 - leading

        result = np.concatenate((np.zeros(leading),
                                 np.convolve(data, np.ones(window) / window, mode="valid"),
                                 np.zeros(trailing)))

    else:
        # Result is a simple ndarray of zeros
        result = np.zeros(length)

    return result

In [14]:
class TestMovingAverage(unittest.TestCase):
    '''Unit tests for the movingAverage function'''

    def testShortList(self):
        '''Input shorter than the window size'''

        expected = np.zeros(6)
        actual = movingAverage(np.arange(6), 7)

        self.assertTrue(np.all(actual == expected))


    def testLongerList(self):
        '''Input longer than the window size'''

        expected = np.array([0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0])
        actual = movingAverage(np.arange(14), 7)

        # Floating point values are compared with a tolerance rather than exactly
        difference = np.abs(actual - expected)
        self.assertTrue(np.all(difference < 1e-10))

## Run Unit Tests

In [15]:
# Run every unittest.TestCase defined in this notebook
if __name__ == '__main__':
    # argv is given explicitly so that unittest does not parse the real command line;
    # exit=False stops unittest.main() from calling sys.exit() once the tests finish
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.........
----------------------------------------------------------------------
Ran 9 tests in 0.073s

OK
