# ONS Convert

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard Python libraries plus determination of projdir, basic printable class, etc.

In [1]:
import os
import re
from datetime import datetime, timedelta

import csv
from xlrd import open_workbook, xldate_as_tuple

import numpy as np

import common_core
import ons_core
import ons_download

## Constants Relating to ONS Spreadsheets

Text strings to avoid hard-coded values throughout the code; avoids clutter and silent errors.

In [2]:
# Weekly worksheet names (lower case) - compared against sheet.name.lower()
WEEKLY_FIGURES_LOWER = "weekly figures 20"
ESTIMATED_TOTAL_DEATHS_LOWER = "estimated total deaths"
COVID_WEEKLY_REGISTRATIONS_LOWER = "covid-19 - weekly registrations"
COVID_WEEKLY_OCCURRENCES_LOWER = "covid-19 - weekly occurrences"

# Daily worksheet names (lower case)
REGIONAL_LOWER = "regional"

# Text used to find specific lines
WEEK_NUMBER_TEXT = "Week number"
WEEK_ENDED_TEXT = "Week ended"

# Regular expressions used to find specific lines
# Raw strings are used so that backslashes reach the regex engine untouched;
# "\(" in a normal string literal is an invalid escape sequence and triggers a warning
TOTAL_DEATHS_REGEX = r"^Total deaths, all ages"
TOTAL_OCCURRENCES_REGEX = r"^Estimated total death occurrences( \(2020/2021\))?$"
COVID_DEATHS_REGEX = r"^Deaths involving COVID-19, all ages"
RESPIRATORY_REGEX = r".*ICD-10 J00-J99.*"

# Maximum number of columns to search for text / regex
MAX_COLS_WITH_HEADERS = 2

## Facilitate Spreadsheet Parsing

Find specific lines in the spreadsheet, etc

In [3]:
def findRowNo(sheet, heading, aliases=None):
    '''Find the single row whose header cell matches a heading (case-insensitive).

    Only the first MAX_COLS_WITH_HEADERS columns of each row are inspected and
    only string cells are considered.

    Parameters:
        sheet   - worksheet providing nrows, name and cell(rowNo, colNo).value
        heading - precise wording of the row heading
        aliases - optional mapping of heading -> list of alternative wordings

    Returns the zero-based row number, or -1 if the heading was not found.

    Raises RuntimeError if more than one cell matches the heading or an alias.
    '''

    matches = []

    # Search for row headings with precise wording
    headingLower = heading.lower()

    # Aliases are still regarded as precise wording
    # (None is used as the default to avoid a shared mutable default argument)
    if aliases and heading in aliases:
        aliasesLower = [alias.lower() for alias in aliases[heading]]
    else:
        aliasesLower = []

    for rowNo in range(sheet.nrows):
        for colNo in range(MAX_COLS_WITH_HEADERS):
            cellValue = sheet.cell(rowNo, colNo).value

            if isinstance(cellValue, str):
                cellValueLower = cellValue.lower()
                if cellValueLower == headingLower or cellValueLower in aliasesLower:
                    matches.append(rowNo)

    if len(matches) > 1:
        # Report 1-based row numbers, as they appear in the spreadsheet
        raise RuntimeError(f"'{heading}' found in '{sheet.name}' multiple times - rows {[match + 1 for match in matches]}")

    return matches[0] if matches else -1


def regexFindRowNos(sheet, pattern, verbose=common_core.verbose):
    '''Find the first row whose header cell matches a regular expression.

    String cells in the first MAX_COLS_WITH_HEADERS columns are tested with
    re.match (anchored at the start of the cell text). Multiple matches are
    tolerated; a warning is printed when verbose and the first match is used.

    Returns the zero-based row number of the first match, or -1 if none.
    '''

    # Pre-compile regex for minor speedup
    matcher = re.compile(pattern)

    hits = []
    for rowIdx in range(sheet.nrows):
        for colIdx in range(MAX_COLS_WITH_HEADERS):
            text = sheet.cell(rowIdx, colIdx).value
            if isinstance(text, str) and matcher.match(text):
                hits.append(rowIdx)

    if verbose and len(hits) > 1:
        print(f"WARNING: '{pattern}' found in '{sheet.name}' multiple times - rows {[match + 1 for match in hits]}")

    return hits[0] if hits else -1


def getWeekColNos(sheet):
    '''Determine which columns hold week numbers.

    The row is located via the WEEK_NUMBER_TEXT heading; every column in that
    row whose cell value can be converted to an integer is treated as a week.

    Returns a list of zero-based column numbers.
    '''

    rowNo = findRowNo(sheet, WEEK_NUMBER_TEXT)
    colNos = []

    for colNo in range(sheet.ncols):
        cellValue = sheet.cell(rowNo, colNo).value

        # If the cell contains a value that can be converted to an integer then treat it as a week number
        # Only conversion failures are ignored; anything else (e.g. KeyboardInterrupt) propagates
        try:
            int(cellValue)
        except (TypeError, ValueError, OverflowError):
            continue

        colNos.append(colNo)

    return colNos


def getWeekNumbers(sheet, colNos):
    '''Read the week numbers from the "Week number" row.

    Returns a list of integers, one per column in colNos and in the same order.
    '''

    headerRow = findRowNo(sheet, WEEK_NUMBER_TEXT)

    return [int(sheet.cell(headerRow, colNo).value) for colNo in colNos]


def getWeekEndings(sheet, colNos, dateMode):
    '''Read the week ending dates from the "Week ended" row.

    Cells may contain text such as 01-Jan-21 or Excel serial dates (decoded
    using dateMode). Returns a list of YYYY-MM-DD strings, one per column.
    '''

    isoFormat = "%Y-%m-%d"

    # Typos in worksheet "Estimated total deaths 2020"
    # Spotted in publishedweek022021.xlsx and publishedweek082021.xlsx
    knownTypos = ['2020-01-01', '2020-02-26']

    headerRow = findRowNo(sheet, WEEK_ENDED_TEXT)
    endings = []

    for colNo in colNos:
        rawValue = sheet.cell(headerRow, colNo).value

        if not isinstance(rawValue, str):
            # Excel serial date - only the date portion is of interest
            year, month, day = xldate_as_tuple(rawValue, dateMode)[:3]
            weekEnding = f"{year:04}-{month:02}-{day:02}"
        else:
            weekEnding = datetime.strptime(rawValue, '%d-%b-%y').strftime(isoFormat)

        # weekday() == 4 is a Friday; the check runs before any typo is patched
        isFriday = datetime.strptime(weekEnding, isoFormat).weekday() == 4
        if not isFriday:
            print(f"WARNING: {weekEnding} is not a Friday")

        if weekEnding in knownTypos:
            patched = weekEnding.replace('2020', '2021')
            print(f"WARNING: Converting {weekEnding} to {patched}")
            weekEnding = patched

        endings.append(weekEnding)

    return endings


def getWeekOffsets(weekEndings):
    '''Convert week ending dates into whole-week offsets from ons_core.minWeek.

    weekEndings is a list of YYYY-MM-DD strings; the result is a list of
    integers (days since ons_core.minWeek, floor-divided by 7).
    '''

    return [(datetime.strptime(isoDate, "%Y-%m-%d").date() - ons_core.minWeek).days // 7
            for isoDate in weekEndings]


def getCellValue(sheet, rowNo, colNo):
    '''Read a single cell and return its value as an integer.

    Parameters:
        sheet - worksheet providing name and cell(rowNo, colNo).value
        rowNo - zero-based row number
        colNo - zero-based column number

    Returns the integer value of the cell. A ":" placeholder and an empty
    cell both yield 0. Any other value that cannot be converted also yields 0,
    after printing a warning that quotes the 1-based row and column.
    '''

    cellValue = sheet.cell(rowNo, colNo).value

    # 2011 switched from ICD-10 v 2001 to ICD-10 v 2010 (NCHS)
    # 2014 switched from ICD-10 v 2010 (NCHS) to ICD-10 v 2013 (IRIS)
    # ":" marks a figure that is not available; empty cells are treated the same way
    if cellValue == ":" or cellValue == "":
        return 0

    # Allow non-integers to be treated as zero but show a warning
    # Only conversion errors are handled so that unrelated exceptions are not hidden
    try:
        return int(cellValue)
    except (TypeError, ValueError, OverflowError):
        print(f"Warning: Failed to convert '{cellValue}' to integer in '{sheet.name}' (row {rowNo + 1} col {colNo + 1})")
        return 0

## Load Weekly Deaths

Load weekly deaths data into the cache as a means of converting from XLSX to CSV.

In [4]:
def getWorkbookYear(workbook):
    '''Scan the specified workbook to determine the year.

    The year is taken from the last four characters of the worksheet whose
    name starts with WEEKLY_FIGURES_LOWER (case-insensitive). If several
    worksheets match then the last one wins.

    Raises RuntimeError if no matching worksheet exists.
    '''

    year = None

    for sheet in workbook.sheets():
        if sheet.name.lower().startswith(WEEKLY_FIGURES_LOWER):
            # Strip whitespace so that a trailing space cannot corrupt the 4-digit slice
            year = int(sheet.name.strip()[-4:])

    if year is None:
        raise RuntimeError("Year could not be determined for workbook")

    return year


def initCache(cache, areaNames, verbose=common_core.verbose):
    '''Ensure that the cache holds a weekly array for every named area.

    Areas already in the cache are left untouched. New areas get a zeroed
    structured array with one record per week from ons_core.minWeek to
    ons_core.maxWeek (inclusive), pre-populated with week ended / week number.
    '''

    fieldNames = [ons_core.WEEK_ENDED, ons_core.WEEK_NUMBER,
                  ons_core.TOTAL_REGISTRATIONS, ons_core.TOTAL_OCCURRENCES,
                  ons_core.COVID_REGISTRATIONS, ons_core.COVID_OCCURRENCES]
    dtype = {'names': fieldNames,
             'formats': ['U10', 'B', 'I', 'I', 'I', 'I']}

    # Number of records required to span minWeek..maxWeek
    maxWeeks = (ons_core.maxWeek - ons_core.minWeek).days // 7 + 1

    for areaName in areaNames:
        if areaName in cache:
            continue

        if verbose:
            print(f"Initialising {areaName}...")

        cache[areaName] = np.zeros(maxWeeks, dtype=dtype)
        areaData = cache[areaName]

        # Pre-populate week_ended and week_number
        for weekIdx in range(maxWeeks):
            weekEnding = ons_core.minWeek + timedelta(weeks=weekIdx)
            areaData[ons_core.WEEK_ENDED][weekIdx] = weekEnding.strftime("%Y-%m-%d")
            areaData[ons_core.WEEK_NUMBER][weekIdx] = common_core.getOnsWeek(weekEnding)[1]


def processAreas(cache, sheetsInfo, areaNames, verbose=common_core.verbose):
    '''Copy weekly figures for the named areas from the worksheets into the cache.

    sheetsInfo maps a cache field name to the details of one worksheet (the
    sheet itself, its week columns / numbers / endings / offsets and the row
    number of each area under "regions"). Each cell is stored in that field
    at the offset of its week.

    Raises RuntimeError if a week ending in the worksheet disagrees with the
    week ending already held in the cache at the same offset.
    '''

    initCache(cache, areaNames, verbose=verbose)

    for fieldName in sheetsInfo:
        info = sheetsInfo[fieldName]

        sheet = info["sheet"]
        weekColNos = info[ons_core.WEEK_COL_NOS]
        weekNumbers = info[ons_core.WEEK_NUMBERS]
        weekEndings = info[ons_core.WEEK_ENDINGS]
        weekOffsets = info[ons_core.WEEK_OFFSETS]

        for areaName in areaNames:
            # Not every area appears in every worksheet
            if areaName not in info["regions"]:
                continue

            rowNo = info["regions"][areaName]

            for i, colNo in enumerate(weekColNos):
                cellValue = getCellValue(sheet, rowNo, colNo)

                weekNumber = weekNumbers[i]
                weekEnding = weekEndings[i]
                weekOffset = weekOffsets[i]

                areaData = cache[areaName]

                # Sanity check - the offset must land on the expected week
                if areaData[ons_core.WEEK_ENDED][weekOffset] != weekEnding:
                    print(f"Week mismatch - {areaData[ons_core.WEEK_ENDED][weekOffset]} vs {weekEnding}")
                    raise RuntimeError(f"Bug in week offset calculations? week {weekNumber} in {sheet.name}")

                areaData[ons_core.WEEK_NUMBER][weekOffset] = weekNumber
                areaData[fieldName][weekOffset] = cellValue


def processRegions(cache, sheetsInfo, verbose=common_core.verbose):
    '''Locate the row of every known region in each worksheet then load the figures.

    Row numbers are recorded under "regions" in each sheet's info. Regions
    found in at least one worksheet are passed on to processAreas.
    '''

    located = []

    for regionName in common_core.regionNames:
        hits = 0

        # Every worksheet is searched, even after the region has been found once
        for info in sheetsInfo.values():
            rowNo = findRowNo(info["sheet"], regionName, aliases=common_core.regionAliases)

            if rowNo < 0:
                continue

            info["regions"][regionName] = rowNo
            hits += 1

        if hits:
            located.append(regionName)

    processAreas(cache, sheetsInfo, located, verbose=verbose)


def processNations(cache, sheetsInfo, verbose=common_core.verbose):
    '''Locate the row of every known nation in each worksheet then load the figures.

    "England and Wales" has no row of its own so the relevant totals row is
    located by regular expression, chosen according to the worksheet name.
    Other nations are located by name. Nations found in at least one
    worksheet are passed on to processAreas.
    '''

    located = []

    for nationName in common_core.nationNames:
        isCombined = nationName == common_core.ENGLAND_WALES
        hits = 0

        for info in sheetsInfo.values():
            sheet = info["sheet"]

            if not isCombined:
                rowNo = findRowNo(sheet, nationName)
            else:
                # The wording of the totals row depends on the type of worksheet
                sheetNameLower = sheet.name.lower()
                if sheetNameLower.startswith(WEEKLY_FIGURES_LOWER):
                    pattern = TOTAL_DEATHS_REGEX
                elif sheetNameLower.startswith(ESTIMATED_TOTAL_DEATHS_LOWER):
                    pattern = TOTAL_OCCURRENCES_REGEX
                else:
                    pattern = COVID_DEATHS_REGEX
                rowNo = regexFindRowNos(sheet, pattern)

            if rowNo < 0:
                continue

            info["regions"][nationName] = rowNo
            hits += 1

        if hits:
            located.append(nationName)

    processAreas(cache, sheetsInfo, located, verbose=verbose)


def processWeeklySheets(cache, sheetsInfo, dateMode, verbose=common_core.verbose):
    '''Work out the week layout of each worksheet then load nations and regions.

    Each sheet's info is extended with its week columns, week numbers, week
    endings, week offsets and an empty "regions" lookup (area -> row number).
    '''

    for info in sheetsInfo.values():
        sheet = info["sheet"]

        colNos = getWeekColNos(sheet)
        info[ons_core.WEEK_COL_NOS] = colNos

        numbers = getWeekNumbers(sheet, colNos)
        assert len(numbers) == len(colNos), f"Number of week numbers did not match number of weeks in '{sheet.name}'"
        info[ons_core.WEEK_NUMBERS] = numbers

        endings = getWeekEndings(sheet, colNos, dateMode)
        assert len(endings) == len(colNos), f"Number of week endings did not match number of weeks in '{sheet.name}'"
        info[ons_core.WEEK_ENDINGS] = endings

        offsets = getWeekOffsets(endings)
        assert len(offsets) == len(endings), f"Number of week offsets did not match number of weeks in '{sheet.name}'"
        info[ons_core.WEEK_OFFSETS] = offsets

        # Populated later with the row number of each area
        info["regions"] = {}

    # Nations are handled before regions
    processNations(cache, sheetsInfo, verbose=verbose)
    processRegions(cache, sheetsInfo, verbose=verbose)


def processWeeklyWorkbook(cache, workbook, verbose=common_core.verbose):
    '''Pick out the worksheets of interest from a workbook and load them into the cache.

    Worksheets are recognised by name (case-insensitive) and keyed by the
    cache field that they populate. Other worksheets are ignored.
    '''

    sheetsInfo = {}

    for sheet in workbook.sheets():
        nameLower = sheet.name.lower()

        if nameLower.startswith(WEEKLY_FIGURES_LOWER):
            fieldName = ons_core.TOTAL_REGISTRATIONS
        elif nameLower.startswith(ESTIMATED_TOTAL_DEATHS_LOWER):
            fieldName = ons_core.TOTAL_OCCURRENCES
        elif nameLower == COVID_WEEKLY_REGISTRATIONS_LOWER:
            fieldName = ons_core.COVID_REGISTRATIONS
        elif nameLower == COVID_WEEKLY_OCCURRENCES_LOWER:
            fieldName = ons_core.COVID_OCCURRENCES
        else:
            continue

        # A later worksheet of the same type replaces an earlier one
        sheetsInfo[fieldName] = {"sheet": sheet}

    processWeeklySheets(cache, sheetsInfo, workbook.datemode, verbose=verbose)


def loadWeeklyDeaths(partNames, verbose=common_core.verbose):
    '''Load the specified spreadsheets into a new cache and return it.

    partNames are paths relative to common_core.dataDir. Workbooks are keyed
    by year, so a later workbook for the same year replaces an earlier one.
    '''

    # Open every workbook up front so that each can be filed under its year
    workbooksByYear = {}
    for partName in partNames:
        fileName = os.path.join(common_core.dataDir, partName)
        if verbose:
            print(f"Loading {partName}...")

        workbook = open_workbook(fileName)
        workbooksByYear[getWorkbookYear(workbook)] = workbook

    # Iterate through the years in chronological order - required to handle the 2021 hybrid!
    cache = {}
    for year in sorted(workbooksByYear):
        try:
            processWeeklyWorkbook(cache, workbooksByYear[year], verbose=verbose)
        except:
            # Identify the offending workbook then let the exception propagate
            print(f"ERROR: Exception raise whilst processing workbook for {year}")
            raise

    return cache

## Load Daily Deaths

Load daily occurrence data into the cache as a means of converting from XLSX to CSV.

In [5]:
def processDailySheet(cache, sheet, verbose=common_core.verbose):
    '''Append the daily occurrences in a worksheet to the cache.

    Rows are skipped until the header (Year, Month, Day, Region, Deaths) has
    been seen. Subsequent rows are stored as [YYYY-MM-DD, deaths] under their
    area code, stopping at the first row with an empty first cell.
    '''

    expectedHeader = ("Year", "Month", "Day", "Region", "Deaths")
    inData = False

    for rowNo in range(sheet.nrows):
        if not inData:
            # Primitive check to determine presence of header (stops at the first mismatch)
            inData = all(sheet.cell(rowNo, colNo).value == title
                         for colNo, title in enumerate(expectedHeader))
            continue

        # Stop at the first empty row
        if sheet.cell(rowNo, 0).value == "":
            break

        # Extract data from current row
        year = int(sheet.cell(rowNo, 0).value)
        month = int(sheet.cell(rowNo, 1).value)
        day = int(sheet.cell(rowNo, 2).value)
        areaCode = sheet.cell(rowNo, 3).value
        deaths = int(sheet.cell(rowNo, 4).value)

        # Handle non-standard nation codes such as W99999999
        areaCode = common_core.nationMappings.get(areaCode, areaCode)

        # Store row in cache, creating the list for the area on first use
        cache.setdefault(areaCode, []).append([f"{year:04}-{month:02}-{day:02}", deaths])


def validateDailyCache(cache, areaCode, verbose=common_core.verbose):
    '''Sort the daily records of an area in place and insist on consecutive dates.

    Raises RuntimeError at the first record which is not exactly one day
    after its predecessor.
    '''

    if verbose:
        print(f"Validating daily cache for {areaCode}...")

    records = cache[areaCode]

    # Records start with a YYYY-MM-DD string so a plain sort is chronological
    records.sort()

    previous = None
    for record in records:
        current = datetime.strptime(record[0], '%Y-%m-%d')

        if previous is not None and previous + timedelta(days=1) != current:
            raise RuntimeError(f"Non-contiguous dates for {areaCode} - {record[0]}")

        previous = current


def loadDailyDeaths(verbose=common_core.verbose):
    '''Load all of the raw daily spreadsheets into a new cache and return it.

    Every worksheet whose name starts with REGIONAL_LOWER (case-insensitive)
    is processed. Each area is then sorted and validated.
    '''

    rawPath = os.path.join(common_core.dataDir, ons_core.ONS_DEATHS, "raw", "daily")

    cache = {}

    for baseName in os.listdir(rawPath):
        # Ignore temporary files related to spreadsheets that are open
        if baseName.startswith("~$"):
            continue

        fileName = os.path.join(rawPath, baseName)
        if verbose:
            print(f"Loading {baseName}...")

        for sheet in open_workbook(fileName).sheets():
            if sheet.name.lower().startswith(REGIONAL_LOWER):
                processDailySheet(cache, sheet, verbose=verbose)

    for areaCode in cache:
        validateDailyCache(cache, areaCode, verbose=verbose)

    return cache

In [6]:
def processDailyEwmSheet(cache, nationName, sheet, verbose=common_core.verbose):
    '''Append the daily occurrences in an EWM worksheet to the cache.

    Rows are skipped until the header (Date, Number of daily deaths) has been
    seen. Subsequent rows are stored as [YYYY-MM-DD, deaths] under nationName,
    stopping at the first row with an empty first cell.
    '''

    inData = False

    for rowNo in range(sheet.nrows):
        firstCell = sheet.cell(rowNo, 0).value

        if not inData:
            # Primitive check to determine presence of header
            if firstCell == "Date" and sheet.cell(rowNo, 1).value == "Number of daily deaths":
                inData = True
            continue

        # Stop at the first empty row
        if firstCell == "":
            break

        # Date format can vary from year to year in the EWM spreadsheets
        value = firstCell
        try:
            if "/" in value:
                # Ten characters means a four digit year
                dateFormat = '%d/%m/%Y' if len(value) == 10 else '%d/%m/%y'
            elif " " in value:
                dateFormat = '%d %b %y'
            else:
                raise RuntimeError(f"Unsupported date format {value}")

            ymd = datetime.strptime(value, dateFormat)
        except:
            # Show the offending value then let the exception propagate
            print(f"Error trying to parse {value}")
            raise

        deaths = int(sheet.cell(rowNo, 1).value)

        # Store row in cache, creating the list for the nation on first use
        cache.setdefault(nationName, []).append([ymd.strftime("%Y-%m-%d"), deaths])


def validateDailyEwmCache(cache, nationName, verbose=common_core.verbose):
    '''Sort the daily records of a nation in place and insist on consecutive dates.

    Raises RuntimeError at the first record which is not exactly one day
    after its predecessor.
    '''

    if verbose:
        print(f"Validating daily cache for {nationName}...")

    records = cache[nationName]

    # Records start with a YYYY-MM-DD string so a plain sort is chronological
    records.sort()

    previous = None
    for record in records:
        current = datetime.strptime(record[0], '%Y-%m-%d')

        if previous is not None and previous + timedelta(days=1) != current:
            raise RuntimeError(f"Non-contiguous dates for {nationName} - {record[0]}")

        previous = current


def loadDailyEwmDeaths(verbose=common_core.verbose):
    '''Load all of the raw daily EWM spreadsheets into a new cache and return it.

    The nation is inferred from the filename, which must contain the name of
    England or Wales; any other filename raises RuntimeError. Each nation is
    then sorted and validated.
    '''

    rawPath = os.path.join(common_core.dataDir, ons_core.ONS_EWM_DEATHS, "raw", "daily")

    cache = {}

    for baseName in os.listdir(rawPath):
        # Ignore temporary files related to spreadsheets that are open
        if baseName.startswith("~$"):
            continue

        if common_core.ENGLAND in baseName:
            nationName = common_core.ENGLAND
        elif common_core.WALES in baseName:
            nationName = common_core.WALES
        else:
            raise RuntimeError(f"Unrecognised filename {baseName}")

        fileName = os.path.join(rawPath, baseName)
        if verbose:
            print(f"Loading {baseName}...")

        # Every worksheet is scanned; those without the expected header contribute nothing
        for sheet in open_workbook(fileName).sheets():
            processDailyEwmSheet(cache, nationName, sheet, verbose=verbose)

    for nationName in cache:
        validateDailyEwmCache(cache, nationName, verbose=verbose)

    return cache

## Cache Enhancement

Enhance data in the cache

In [7]:
def applyDailyDeaths(cache, verbose=common_core.verbose):
    '''Populate weekly total occurrences using the daily CSV extracts.

    For each area in the daily data, the first cached week ending on or after
    the first daily date is located. If that week ending is present in the
    daily data then 7 day rolling sums, sampled every 7th day from that date,
    are written to total occurrences from that week onwards.
    '''

    dailyDeaths = ons_core.loadCsvFiles(ons_core.ONS_DEATHS, "daily", verbose=verbose)

    for areaName in dailyDeaths:
        areaDaily = dailyDeaths[areaName]

        # Earliest date available in the daily data
        firstDate = areaDaily[0]["date"]

        # Locate the first suitable week ended in the main cache
        areaWeekly = cache[areaName]
        cacheIdx = np.where(areaWeekly[ons_core.WEEK_ENDED] >= firstDate)[0][0]
        weekEnded = areaWeekly[cacheIdx][ons_core.WEEK_ENDED]

        # Processing can only proceed if the date is present in the daily occurrences
        matches = np.where(areaDaily["date"] == weekEnded)[0]
        if len(matches) == 0:
            continue

        dailyIdx = matches[0]

        # Calculate 7 day rolling totals for daily occurrences
        # TODO - Consider optimising this and only calculating for limited rows?
        rollingSums = common_core.rollingSum(areaDaily[ons_core.TOTAL_OCCURRENCES], window = 7)

        # Every 7th value from dailyIdx is aligned with a "week ended"
        fridayTotals = rollingSums[dailyIdx::7]
        areaWeekly[cacheIdx:cacheIdx + len(fridayTotals)][ons_core.TOTAL_OCCURRENCES] = fridayTotals

In [8]:
def createEngland(cache, verbose=common_core.verbose):
    '''Rebuild the England entry in the cache as the sum of the individual regions.

    Any existing England entry is discarded. The first region is copied in
    full (including week ended and week number); the four count fields of
    each subsequent region are then added to it.
    '''

    summedFields = (ons_core.TOTAL_REGISTRATIONS, ons_core.TOTAL_OCCURRENCES,
                    ons_core.COVID_REGISTRATIONS, ons_core.COVID_OCCURRENCES)

    # Remove England from the cache if it already exists
    cache.pop(common_core.ENGLAND, None)

    for regionName in common_core.regionNames:
        regionData = cache[regionName]

        if common_core.ENGLAND not in cache:
            # First region - becomes the starting point for the totals
            cache[common_core.ENGLAND] = np.copy(regionData)
            continue

        england = cache[common_core.ENGLAND]
        for fieldName in summedFields:
            england[fieldName] += regionData[fieldName]

In [9]:
def applyDailyEwmDeaths(cache, verbose=common_core.verbose):
    '''Apply daily deaths from Excess Winter Mortality reports

    For each nation in the EWM daily data, weekly total occurrences are
    written to the cache starting at the first week (on or after the first
    Friday in the daily data) whose total occurrences are still zero.

    The cache is modified in place; nothing is returned.
    '''

    dailyDeaths = ons_core.loadCsvFiles(ons_core.ONS_EWM_DEATHS, "daily", verbose=verbose)

    for nationName in dailyDeaths:

        # Determine the first "week ended" in the daily cache
        # weekday() == 4 is a Friday; if none of the first 7 dates is a Friday
        # then weekEnded is simply left as the 7th date
        for dailyIdx in range(7):
            weekEnded = dailyDeaths[nationName][dailyIdx]["date"]
            if datetime.strptime(weekEnded, "%Y-%m-%d").weekday() == 4:
                break

        # Locate the week ended in the main cache
        # (IndexError if the date is not present in the cache)
        cacheIdx = np.where(cache[nationName][ons_core.WEEK_ENDED] == weekEnded)[0][0]

        # Locate the first week in main cache without total_occurrences
        # The search starts at cacheIdx so the result is converted back to an absolute index
        cacheIdx = np.where(cache[nationName][cacheIdx:][ons_core.TOTAL_OCCURRENCES] == 0)[0][0] + cacheIdx
        weekEnded = cache[nationName][cacheIdx][ons_core.WEEK_ENDED]

        # Locate this date in the daily occurrences
        matches = np.where(dailyDeaths[nationName]["date"] == weekEnded)[0]

        # Processing can only proceed if the date was found
        if len(matches) > 0:
            dailyIdx = matches[0]

            # Calculate 7 day rolling totals for daily occurrences
            # TODO - Consider optimising this and only calculating for limited rows?
            rollingSums = common_core.rollingSum(dailyDeaths[nationName][ons_core.TOTAL_OCCURRENCES], window = 7)

            # Take a copy of the days which are aligned with "week ended"
            # NOTE(review): a basic numpy slice is a view rather than a copy, so the in-place
            # addition below would also modify rollingSums - confirm what rollingSum returns
            fridayTotals = rollingSums[dailyIdx::7]
            # COVID occurrences from the cache are added to the EWM figures
            # NOTE(review): presumably the EWM daily figures exclude COVID deaths - confirm
            fridayTotals += cache[nationName][cacheIdx:cacheIdx + len(fridayTotals)][ons_core.COVID_OCCURRENCES]
            cache[nationName][cacheIdx:cacheIdx + len(fridayTotals)][ons_core.TOTAL_OCCURRENCES] = fridayTotals

In [10]:
def patchEnglandWales(cache, verbose=common_core.verbose):
    '''Back-fill total occurrences for England and Wales from the two nations.

    Weeks before the first non-zero combined figure are set to the sum of
    the England and Wales figures for the same weeks.
    '''

    field = ons_core.TOTAL_OCCURRENCES
    combined = cache[common_core.ENGLAND_WALES]

    # Index of the first week for which a combined figure is already known
    firstKnown = np.where(combined[field] > 0)[0][0]

    englandPart = cache[common_core.ENGLAND][:firstKnown][field]
    walesPart = cache[common_core.WALES][:firstKnown][field]

    # Patch everything before that week using individual nations
    combined[:firstKnown][field] = englandPart + walesPart

In [11]:
def getEstimatedOccurrences(cache, verbose=common_core.verbose):
    """Calculate missing values in cache

    Total occurrences are estimated for every region and nation in the cache
    other than England and Wales (the master area). For each week the estimate
    is the master area's non-COVID occurrences, scaled by the ratio of the
    area's shifted non-COVID registrations to the master area's shifted
    non-COVID registrations, plus the area's own COVID occurrences.

    Only the weeks after the last week with a non-zero total occurrences are
    overwritten in the cache.

    Returns a dict of area name -> estimates for every week (not just the
    weeks that were patched).
    """

    estimates = {}

    # Use the ONS occurrences for England and Wales
    masterArea = common_core.ENGLAND_WALES

    # Non-COVID registrations for the master area, shifted left by ~3.5 days
    # NOTE(review): the shift itself is implemented by ons_core.shiftRegistrations - not visible here
    knownRegistrations = ons_core.shiftRegistrations(cache[masterArea][ons_core.TOTAL_REGISTRATIONS] -
                                            cache[masterArea][ons_core.COVID_REGISTRATIONS])

    # Non-COVID occurrences for the master area without any shift
    knownOccurrences =  cache[masterArea][ons_core.TOTAL_OCCURRENCES] - cache[masterArea][ons_core.COVID_OCCURRENCES]

    # Run this process for all regions and nations other than the master area
    for areaName in common_core.regionNames + common_core.nationNames:
        if areaName != masterArea and areaName in cache:

            # Shift registrations left by half a week
            shiftedRegistrations = ons_core.shiftRegistrations(cache[areaName][ons_core.TOTAL_REGISTRATIONS] -
                                                     cache[areaName][ons_core.COVID_REGISTRATIONS])

            # Estimate occurrences using a simple percentage of the known occurrences
            # The ratio is left at zero wherever the master area has no registrations (avoids division by zero)
            # NOTE(review): "out" takes the dtype of shiftedRegistrations - presumably floating point, confirm
            estimatedOccurrences = knownOccurrences * np.divide(shiftedRegistrations, knownRegistrations,
                                      out=np.zeros_like(shiftedRegistrations), where=knownRegistrations != 0) + \
                                    cache[areaName][ons_core.COVID_OCCURRENCES]

            # Locate the last week in the cache where total_occurrences is populated
            # (IndexError if the area has no non-zero total_occurrences at all)
            cacheIdx = np.where(cache[areaName][ons_core.TOTAL_OCCURRENCES] > 0)[0][-1]

            # Patch subsequent data using the estimates
            cache[areaName][cacheIdx + 1:][ons_core.TOTAL_OCCURRENCES] = estimatedOccurrences[cacheIdx + 1:]

            # Maintain a cache of estimates for testing purposes
            estimates[areaName] = estimatedOccurrences

    return estimates

In [12]:
def calculateErrors(cache, estimates, verbose=common_core.verbose):
    """Calculate estimation errors

    Prints, for every region and nation in the cache other than England and
    Wales, the percentage mean absolute error of the estimates against the
    cached total occurrences for each year from 2010 to 2017, followed by the
    average across those years.

    The report is printed regardless of the verbose setting. Nothing is
    returned and the cache is not modified by this function.
    """

    masterArea = common_core.ENGLAND_WALES

    for areaName in common_core.regionNames + common_core.nationNames:
        if areaName in cache and areaName != masterArea:
            print(f"{areaName}:")

            totalPctMAE = 0

            # 2010 to 2017 inclusive
            years = range(2010, 2018)

            for year in years:
                startDate = f"{year}-01-01"
                stopDate = f"{year}-12-31"

                areaData = cache[areaName]

                # First week ending within the year, and the last week ending before 31st December
                # Note that the slices below exclude the week at stopIdx itself
                startIdx = np.where(areaData[ons_core.WEEK_ENDED] >= startDate)[0][0]
                stopIdx = np.where(areaData[ons_core.WEEK_ENDED] < stopDate)[0][-1]

                # Error of the raw registrations
                # NOTE: this value is overwritten below and does not appear in the report
                pctMAE = 100 * np.average(np.abs(areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES] -
                                            areaData[startIdx:stopIdx][ons_core.TOTAL_REGISTRATIONS].astype(np.float64)) /
                                        areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES])

                # Error of the shifted registrations
                # NOTE: this value is also overwritten below and does not appear in the report
                pctMAE = 100 * np.average(np.abs(areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES] -
                                            ons_core.shiftRegistrations(areaData[startIdx:stopIdx][ons_core.TOTAL_REGISTRATIONS])) /
                                        areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES])

                # Error of the estimates - this is the figure that is reported
                pctMAE = 100 * np.average(np.abs(areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES] -
                                            estimates[areaName][startIdx:stopIdx]) /
                                        areaData[startIdx:stopIdx][ons_core.TOTAL_OCCURRENCES])

                totalPctMAE += pctMAE

                print(f"{year} = {pctMAE:.2f}%")

            print(f"Avg  = {totalPctMAE / len(years):.2f}%")
            print()

In [13]:
def polyfillCache(cache, estimationReport=False, verbose=common_core.verbose):
    '''Fill in missing cache values using additional data sources and derivations.

    Total occurrences are cleared for every area except England and Wales,
    then rebuilt in a fixed sequence: daily extracts, England from its
    regions, EWM daily figures, England and Wales back-fill and finally
    estimation. An error report is printed when estimationReport is set.
    '''

    # Reset total_occurrences to zero so that they can be re-calculated
    for areaName in cache:
        if areaName == common_core.ENGLAND_WALES:
            continue
        cache[areaName][ons_core.TOTAL_OCCURRENCES] = 0

    # Each step builds upon the results of the previous steps
    applyDailyDeaths(cache, verbose=verbose)        # daily deaths from historical extracts
    createEngland(cache, verbose=verbose)           # England from the regions
    applyDailyEwmDeaths(cache, verbose=verbose)     # daily deaths from Excess Winter Mortality reports
    patchEnglandWales(cache, verbose=verbose)       # England and Wales from the two nations

    # Estimate the number of occurrences where necessary
    estimates = getEstimatedOccurrences(cache, verbose=verbose)

    # Evaluation of estimation errors
    if estimationReport:
        calculateErrors(cache, estimates, verbose=verbose)

In [14]:
def trimCache(cache, verbose=common_core.verbose):
    '''Remove rows from cache which don't contain any useful data.

    Each area is cut down to the weeks from 2000-01-07 up to and including
    the last week with a non-zero total registrations or total occurrences.
    '''

    for areaName in cache:
        areaData = cache[areaName]

        # Locate the first row to retain
        firstIdx = np.where(areaData[ons_core.WEEK_ENDED] == '2000-01-07')[0][0]

        # Locate the last row to retain - total_registrations or total_occurrences > 0
        lastRegIdx = np.where(areaData[ons_core.TOTAL_REGISTRATIONS] > 0)[0][-1]
        lastOccIdx = np.where(areaData[ons_core.TOTAL_OCCURRENCES] > 0)[0][-1]
        lastIdx = max(lastRegIdx, lastOccIdx)

        # Trim the area
        cache[areaName] = areaData[firstIdx:lastIdx + 1]

## Save Cache Data

Save data in numpy arrays to CSV files

In [15]:
def saveDailyArea(cache, areaType, areaName, areaCode, verbose=common_core.verbose):
    '''Save the daily data for one area to CSV.

    Parameters:
        cache    - dict of area code -> list of [date, deaths] records
        areaType - sub-directory beneath csv/daily (e.g. "nation" or "region")
        areaName - name of the area, used for the filename
        areaCode - key of the area within the cache

    Nothing is written if the area code is not in the cache.
    '''

    if areaCode not in cache:
        return

    if verbose:
        print(f"Saving {areaName}...")

    # Ensure CSV path exists
    csvPath = os.path.join(common_core.dataDir, ons_core.ONS_DEATHS, "csv", "daily", areaType)
    os.makedirs(csvPath, exist_ok=True)

    # Determine safe filename
    csvFn = os.path.join(csvPath, common_core.getSafeName(areaName) + ".csv")

    # Save data to CSV
    # newline='' lets the csv module control line endings; without it every
    # row is followed by an extra \r on platforms that translate \n to \r\n
    with open(csvFn, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)

        writer.writerow(["date", ons_core.TOTAL_OCCURRENCES])
        writer.writerows(cache[areaCode])


def saveDailyDeaths(cache, verbose=common_core.verbose):
    '''Save the daily data for all known nations and regions to CSV.

    Nations are written first, followed by regions. Areas which are not in
    the cache are skipped by saveDailyArea.
    '''

    for nationCode, nationName in common_core.nations.items():
        saveDailyArea(cache, "nation", nationName, nationCode, verbose=verbose)

    for regionCode, regionName in common_core.regions.items():
        saveDailyArea(cache, "region", regionName, regionCode, verbose=verbose)

In [16]:
def saveDailyEwmNation(cache, nationName, verbose=common_core.verbose):
    '''Save the daily EWM data for one nation to CSV.

    Parameters:
        cache      - dict of nation name -> list of [date, deaths] records
        nationName - key of the nation within the cache, also used for the filename

    Nothing is written if the nation is not in the cache.
    '''

    if nationName not in cache:
        return

    if verbose:
        print(f"Saving {nationName}...")

    # Ensure CSV path exists
    csvPath = os.path.join(common_core.dataDir, ons_core.ONS_EWM_DEATHS, "csv", "daily", "nation")
    os.makedirs(csvPath, exist_ok=True)

    # Determine safe filename
    csvFn = os.path.join(csvPath, common_core.getSafeName(nationName) + ".csv")

    # Save data to CSV
    # newline='' lets the csv module control line endings; without it every
    # row is followed by an extra \r on platforms that translate \n to \r\n
    with open(csvFn, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)

        writer.writerow(["date", ons_core.TOTAL_OCCURRENCES])
        writer.writerows(cache[nationName])


def saveDailyEwmDeaths(cache, verbose=common_core.verbose):
    '''Save the daily EWM data for every known nation to CSV.

    Nations which are not in the cache are skipped by saveDailyEwmNation.
    '''

    for name in common_core.nationNames:
        saveDailyEwmNation(cache, name, verbose=verbose)

In [17]:
def saveWeeklyArea(cache, areaType, areaName, verbose=common_core.verbose):
    '''Save the weekly data for one area to CSV.

    The structured array is written with its field names as the header row,
    beneath csv/weekly/<areaType>. Nothing is written if the area is not in
    the cache.
    '''

    if areaName not in cache:
        return

    areaData = cache[areaName]
    header = ','.join(areaData.dtype.names)

    # Ensure CSV path exists
    csvPath = os.path.join(common_core.dataDir, ons_core.ONS_DEATHS, "csv", "weekly", areaType)
    if not os.path.exists(csvPath):
        os.makedirs(csvPath)

    # Determine safe filename
    csvFn = os.path.join(csvPath, common_core.getSafeName(areaName) + ".csv")

    if verbose:
        print(f"Saving {common_core.getPartName(csvFn)}...")

    # comments='' stops numpy prefixing the header row with "# "
    np.savetxt(csvFn, areaData, fmt='%s', delimiter=',', header=header, comments='')


def saveWeeklyDeaths(cache, verbose=common_core.verbose):
    '''Save the weekly data for all known nations and regions to CSV.

    Nations are written first, followed by regions. Areas which are not in
    the cache are skipped by saveWeeklyArea.
    '''

    areaGroups = (("nation", common_core.nationNames),
                  ("region", common_core.regionNames))

    for areaType, areaNames in areaGroups:
        for areaName in areaNames:
            saveWeeklyArea(cache, areaType, areaName, verbose=verbose)

## Interactive Testing

In [18]:
# Script entry point: download (if necessary) and convert the weekly ONS
# spreadsheets, fill in the gaps, trim, then save the results as CSV files.
if __name__ == '__main__':

    # Always show progress messages when run interactively
    verbose = True

    # One-off conversion of daily occurrences - quite slow so don't run repeatedly!
    # (regional daily spreadsheets; uncomment when the raw files have changed)
    #cache = loadDailyDeaths(verbose=verbose)
    #saveDailyDeaths(cache, verbose=verbose)

    # One-off conversion of daily occurrences - quite slow so don't run repeatedly!
    # (Excess Winter Mortality spreadsheets; uncomment when the raw files have changed)
    #cache = loadDailyEwmDeaths(verbose=verbose)
    #saveDailyEwmDeaths(cache, verbose=verbose)

    # Check website for latest spreadsheets then load into cache
    partNames = ons_download.downloadDeaths(verbose=verbose)
    cache = loadWeeklyDeaths(partNames, verbose=verbose)

    # Alternatively we can use the existing CSV files
    #cache = ons_core.loadCsvFiles(ons_core.ONS_DEATHS, "weekly", verbose=verbose)

    # Tidy up the data, prior to saving as CSV files
    # (polyfillCache reads the daily CSV files produced by the one-off conversions above)
    polyfillCache(cache, estimationReport=False, verbose=verbose)
    trimCache(cache, verbose=verbose)

    # Save cache as CSV files
    saveWeeklyDeaths(cache, verbose=verbose)

Skipping download of ons-deaths/raw/weekly/publishedweek092021.xlsx...
Skipping download of ons-deaths/raw/weekly/publishedweek532020.xlsx...
Skipping download of ons-deaths/raw/weekly/publishedweek522019.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek522018withupdatedrespiratoryrow.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek522017.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek522016.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2015.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2014.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2013.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2012.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2011.xls...
Skipping download of ons-deaths/raw/weekly/publishedweek2010.xls...
Loading ons-deaths/raw/weekly/publishedweek092021.xlsx...
Loading ons-deaths/raw/weekly/publishedweek532020.xlsx...
Loading ons-deaths/raw/weekly