# Common Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

In [1]:
import os
import sys

import urllib.request
import re

from bs4 import BeautifulSoup

# Canonical path of the directory one level above sys.path[0].
# NOTE(review): presumably the project root when run from a sub-folder - confirm.
projdir = os.path.realpath(os.path.join(sys.path[0], os.pardir))

## Printable Class

Simple class that allows other classes to be printed.

In [2]:
class Printable:
    """Base class whose instances print as "<class>: <attribute dict>"."""

    def __repr__(self):
        """Return the instance's class followed by its attribute dictionary."""
        return f"{self.__class__}: {self.__dict__}"

    # str() yields exactly the same text as repr()
    __str__ = __repr__

## Download Functions

Download spreadsheets by parsing the HTML for suitable links

In [3]:
class WebDownload:
    """Download files that are linked from a web page.

    ``downloaded`` records every file name handled by ``downloadFile``
    (fetched or skipped) so the same target is not fetched twice by one
    instance.
    """

    def __init__(self, skipExisting=False, skipHistory=False, verbose=False):
        """Initialise the downloader.

        skipExisting -- skip files that already exist on disk
        skipHistory -- in downloadFiles, stop testing further patterns for a
            link as soon as one pattern has matched it
        verbose -- also report the files that are skipped
        """

        self.skipExisting, self.skipHistory, self.verbose = skipExisting, skipHistory, verbose
        self.downloaded = []

    def downloadFile(self, url, fileName):
        """Download a binary file from the URL provided.

        The file is skipped when this instance has already handled
        ``fileName``, or when ``skipExisting`` is set and the file exists.
        Errors from opening the URL or writing the file propagate to the
        caller; in that case ``fileName`` is not recorded as downloaded.
        """

        alreadyHandled = fileName in self.downloaded
        if alreadyHandled or (self.skipExisting and os.path.exists(fileName)):
            if self.verbose:
                print(f"Skipping {os.path.basename(url)}...")
        else:
            print(f"Downloading {os.path.basename(url)}...")

            # Ensure raw path exists; a bare file name has no directory part
            # and os.makedirs("") would raise
            filePath = os.path.dirname(fileName)
            if filePath:
                os.makedirs(filePath, exist_ok=True)

            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})

            # Write to a temporary name first so that an interrupted transfer
            # never leaves a truncated file that skipExisting would trust
            partName = fileName + ".part"
            try:
                with urllib.request.urlopen(req, timeout=60) as response, \
                        open(partName, "wb") as outfile:
                    for chunk in iter(lambda: response.read(4096), b""):
                        outfile.write(chunk)
                os.replace(partName, fileName)
            finally:
                # Only present here if the transfer or the rename failed
                if os.path.exists(partName):
                    os.remove(partName)

        self.downloaded.append(fileName)

    def downloadFiles(self, rawPath, url, patterns, category=None):
        """Download every file linked from the page at ``url`` that matches.

        rawPath -- base directory for the downloaded files
        url -- address of the HTML page to scan for links
        patterns -- sequence of (subfolder, regex) pairs; a link whose href
            matches the regex is saved under rawPath/subfolder
        category -- optional extra folder level below the subfolder

        Links are matched on the href exactly as written in the page;
        relative hrefs are resolved against ``url`` before downloading.
        """

        # Local import: the file-level imports only bring in urllib.request
        from urllib.parse import urljoin

        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})
        with urllib.request.urlopen(req, timeout=15) as response:
            soup = BeautifulSoup(response, "lxml")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")

            # Anchors without an href (e.g. named anchors) cannot be matched
            if not href:
                continue

            for pattern in patterns:
                if not re.search(pattern[1], href):
                    continue

                filePath = os.path.join(rawPath, pattern[0])
                if category:
                    filePath = os.path.join(filePath, category)
                fileName = os.path.join(filePath, os.path.basename(href))
                self.downloadFile(urljoin(url, href), fileName)

                # Leaves the pattern loop only; later anchors are still checked
                if self.skipHistory:
                    break