# Common Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

In [1]:
import os
import sys

import urllib.request
import urllib.parse
import re

from bs4 import BeautifulSoup

# Project root: the parent directory of the first entry on sys.path
# NOTE(review): presumably sys.path[0] is the folder holding this script/notebook - confirm
projdir = os.path.realpath(os.path.join(sys.path[0], '..'))

## Common Definitions

e.g. Nation and region names

In [2]:
# Nations of the UK (4 entries)
nationNames = [
    "England",
    "Scotland",
    "Wales",
    "Northern Ireland",
]

# Regions of England (9 entries)
regionNames = [
    "North West",
    "North East",
    "Yorkshire and The Humber",
    "West Midlands",
    "East Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

# Alternative names for English regions, keyed by the name used in regionNames
regionAliases = {
    "East of England": ["East"],
}

## Printable Class

Simple base class that gives any class inheriting from it a readable printed form (the class followed by its instance attributes).

In [3]:
class Printable:
    """Base class whose text form is the class followed by the instance attributes."""

    def __repr__(self):
        """Return '<class>: <attribute dict>' for this instance."""
        return f"{self.__class__}: {self.__dict__}"

    def __str__(self):
        """Return the same text as __repr__, built independently of it."""
        return f"{self.__class__}: {self.__dict__}"

## Download Functions

Download spreadsheets by parsing the HTML for suitable links

In [4]:
# Module-wide defaults for the WebDownload options; each can be overridden per instance
skipExisting = True
skipHistory = False
verbose = False


class WebDownload:
    """Download files that are linked from a web page.

    Attributes:
        skipExisting: when true, a file is not fetched again if it is already on
            disk or has already been handled by this instance.
        skipHistory: when true, a link is only downloaded for the first pattern
            that matches it (see the note in downloadFiles).
        verbose: when true, skipped files are reported as well as downloads.
        downloaded: maps the base name of every handled file to its path
            relative to the raw directory.
    """

    def __init__(self, skipExisting=skipExisting, skipHistory=skipHistory, verbose=verbose):
        """Initialise the download options and the record of handled files"""

        self.skipExisting, self.skipHistory, self.verbose = skipExisting, skipHistory, verbose
        self.downloaded = {}

    def downloadFile(self, url, rawDir, subDir):
        """Download a binary file from the URL provided

        The file is stored as rawDir/subDir/<base name of url>. Nothing is
        fetched when skipExisting is set and the file is already on disk or its
        base name has already been recorded. The file is recorded in
        self.downloaded whether or not it was fetched.

        Any error raised by urllib or by the file system is propagated to the
        caller; in that case no file is left at the final location.
        """

        baseName = os.path.basename(url)
        partName = os.path.join(subDir, baseName)

        dirName = os.path.join(rawDir, subDir)
        fileName = os.path.join(dirName, baseName)

        alreadyHandled = os.path.exists(fileName) or baseName in self.downloaded

        if alreadyHandled and self.skipExisting:
            if self.verbose:
                print(f"Skipping {partName}...")
        else:
            print(f"Downloading {partName}...")

            # Ensure raw path exists
            os.makedirs(dirName, exist_ok=True)

            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

            # Stream into a temporary name first so that an interrupted download
            # never leaves a truncated file that skipExisting would later trust
            tempName = fileName + ".part"

            try:
                with urllib.request.urlopen(req, timeout=60) as response, open(tempName, "wb") as outfile:
                    for chunk in iter(lambda: response.read(4096), b""):
                        outfile.write(chunk)

                os.replace(tempName, fileName)
            finally:
                # Only present if the download failed part way through
                if os.path.exists(tempName):
                    os.remove(tempName)

        self.downloaded.setdefault(baseName, partName)

    def downloadFiles(self, rawDir, url, patterns, category=None):
        """Download every file linked from the page at url that matches a pattern

        Each entry of patterns is indexed as (sub-directory, regular expression).
        The expression is searched for in the href of every anchor on the page
        and matching links are saved under rawDir/<sub-directory>, or under
        rawDir/<sub-directory>/<category> when a category is given.

        Returns a view of the relative paths of all files handled so far by this
        instance, including those from earlier calls.
        """

        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

        with urllib.request.urlopen(req, timeout=15) as response:
            soup = BeautifulSoup(response, "lxml")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")

            # Anchors without an href (e.g. named anchors) cannot be links to files
            if href is None:
                continue

            for pattern in patterns:
                if re.search(pattern[1], href):
                    # Always resolve against the page URL, never against a previous file URL
                    fileUrl = urllib.parse.urljoin(url, href)

                    if category:
                        subDir = os.path.join(pattern[0], category)
                    else:
                        subDir = pattern[0]

                    self.downloadFile(fileUrl, rawDir, subDir)

                    # NOTE(review): this only stops testing further patterns for the
                    # current link; later links are still processed - confirm that
                    # this is what skipHistory is meant to do
                    if self.skipHistory:
                        break

        return self.downloaded.values()