# ONS Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard python libraries plus determination of projdir, basic printable class, etc

In [1]:
import os
import re
from datetime import date, datetime, timedelta

import csv
from xlrd import open_workbook

import numpy as np
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt

import common_core

## Configuration

Data to download from the ONS (Office for National Statistics) website

In [2]:
# The 4 nations in the UK
nationNames = ["England", "Scotland", "Wales", "Northern Ireland"]

# The 9 regions in England (list is maintained in common_core)
regionNames = common_core.regionNames

# The aliases for regions in England
# Maps the canonical region name to alternative row headings; findRowNo()
# accepts any of the aliases as an exact (case-insensitive) match.
regionAliases = {"East of England": ["East"]}

# Combine all of these area types into a single list
# Each entry is a pair of (area type label, list of area names)
areas = [("nation", nationNames), ("region", regionNames)]

In [3]:
# The latest ONS age bands
# 20 labels: under 1, 1-4, then five-year bands up to 85-89, and finally 90+
ageDemographics = [
    '<1', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29',
    '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90+'
]

# The legacy ONS age bands
# 6 coarser labels; note the first band is written '01-14' and the last is open-ended
legacyAgeDemographics = [
    '01-14', '15-44', '45-64', '65-74', '75-84', '85+'
]

In [4]:
# Web page listing the weekly provisional deaths spreadsheets
deathsUrl = "https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales"

# Local folder for the raw spreadsheets
deathsPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

# Pairs of (sub-folder, filename regex) handed to common_core.WebDownload.downloadFiles()
# The pattern is a raw string so that "\." is a regex escape rather than an
# (invalid) string escape; it matches names ending in ".xls" or ".xlsx".
deathsFiles = [
    ("weekly", r".*\.xlsx?$")
]

## Constants

Text strings to avoid hard-coded values throughout the code; avoids clutter and silent errors.

In [5]:
# The epoch for dates in Excel
# getWeekEndings() converts a numeric date cell as: epoch + (cell value - 2) days
epoch = datetime(1900, 1, 1)

# Index from Fri 2 Jan 1970
# Index 0 of each cached array is the week ending on minWeek; later weeks are
# addressed by the number of whole weeks since minWeek (see getWeekOffsets).
minWeek = date(1970, 1, 2)
# Upper bound for the cache: 31 December of the year in which this line is executed
maxWeek = date(datetime.now().year, 12, 31)

In [6]:
# Nation names
# These are used as cache keys; processNations() and saveNations() only handle
# ENGLAND_WALES and WALES.
ENGLAND = "England"
WALES = "Wales"
ENGLAND_WALES = "England + Wales"

# England + Wales (historical occurrences)
ENGLAND_WALES_OCC = "England + Wales Occ"

In [7]:
# Worksheet names (lower case)
# The first two are compared as prefixes of the lower-cased sheet name; the
# two COVID-19 names are compared for equality (see processWorkbook).
WEEKLY_FIGURES_LOWER = "weekly figures 20"
ESTIMATED_TOTAL_DEATHS_LOWER = "estimated total deaths"
COVID_WEEKLY_REGISTRATIONS_LOWER = "covid-19 - weekly registrations"
COVID_WEEKLY_OCCURRENCES_LOWER = "covid-19 - weekly occurrences"

# Text used to find specific lines
# Matched by findRowNo(): the whole cell must equal the text, ignoring case
WEEK_NUMBER_TEXT = "Week number"
WEEK_ENDED_TEXT = "Week ended"

# Regular expressions used to find specific lines
# Applied by regexFindRowNos() with re.match, so they are anchored at the
# start of the cell text and are case-sensitive.
TOTAL_DEATHS_REGEX = "^Total deaths, all ages"
TOTAL_OCCURRENCES_REGEX = "^Estimated total death occurrences$"
COVID_DEATHS_REGEX = "^Deaths involving COVID-19, all ages"
RESPIRATORY_REGEX = ".*ICD-10 J00-J99.*"

# Maximum number of columns to search for text / regex
# Only the leftmost MAX_COLS_WITH_HEADERS columns of each row are inspected
MAX_COLS_WITH_HEADERS = 2

In [8]:
# Weeks have an end date and a number, always ending on Fridays
# Field names in the structured arrays held in the cache (see initCache)
WEEK_ENDED = "week_ended"
WEEK_NUMBER = "week_number"

# Internal names used by the cache
# Keys of the per-sheet info dictionaries populated by processSheets()
WEEK_COL_NOS = "week_col_nos"
WEEK_NUMBERS = "week_numbers"
WEEK_ENDINGS = "week_endings"
WEEK_OFFSETS = "week_offsets"

# Deaths are reported by registration date and occurrence date
# Used both as keys of sheetsInfo (one worksheet per key, see processWorkbook)
# and as field names in the cached arrays (see initCache / processAreas).
TOTAL_REGISTRATIONS = "total_registrations"
TOTAL_OCCURRENCES = "total_occurrences"
COVID_REGISTRATIONS = "covid_registrations"
COVID_OCCURRENCES = "covid_occurrences"

## Download Spreadsheets

Download spreadsheets by parsing the HTML for suitable links

In [9]:
def downloadDeaths(skipExisting=common_core.skipExisting, verbose=common_core.verbose):
    """Fetch the weekly deaths spreadsheets linked from the ONS page.

    Delegates to common_core.WebDownload.downloadFiles() using deathsPath,
    deathsUrl and deathsFiles, then returns the resulting list of part names
    (presumably paths relative to deathsPath - confirm in common_core).
    """

    # The 2020 workbook is always included, even if the page no longer links to it
    # (original note: the page did not have a 2021 section and replaced 2020)
    workbook2020 = "weekly/publishedweek532020.xlsx"

    downloader = common_core.WebDownload(skipExisting=skipExisting, verbose=verbose)
    partNames = downloader.downloadFiles(deathsPath, deathsUrl, deathsFiles)

    if workbook2020 not in partNames:
        partNames += [workbook2020]

    return partNames

## Facilitate Parsing

Find specific lines in the spreadsheet, etc

In [10]:
def findRowNo(sheet, heading, aliases = None):
    '''Find the single row whose heading matches the specified text.

    The first MAX_COLS_WITH_HEADERS columns of every row are searched for a
    string cell equal to the heading (ignoring case). Aliases are regarded as
    equally precise wording.

    sheet   - worksheet exposing nrows, name and cell(rowNo, colNo).value
    heading - text to look for
    aliases - optional dict mapping a heading to a list of alternative headings

    Returns the zero-based row number, or -1 if the heading was not found.
    Raises RuntimeError if more than one cell matches.
    '''

    # Build the set of acceptable wordings (heading plus any aliases), lower case
    wanted = {heading.lower()}
    if aliases and heading in aliases:
        wanted.update(alias.lower() for alias in aliases[heading])

    matches = []

    for rowNo in range(sheet.nrows):
        for colNo in range(MAX_COLS_WITH_HEADERS):
            cellValue = sheet.cell(rowNo, colNo).value

            # Only text cells can be headings
            if isinstance(cellValue, str) and cellValue.lower() in wanted:
                matches.append(rowNo)

    if not matches:
        return -1

    if len(matches) > 1:
        # Reported row numbers are one-based, as displayed in a spreadsheet
        raise RuntimeError(f"'{heading}' found in '{sheet.name}' multiple times - rows {[match + 1 for match in matches]}")

    return matches[0]


def regexFindRowNos(sheet, pattern, verbose = common_core.verbose):
    '''Find the first row whose heading matches the specified regular expression.

    The first MAX_COLS_WITH_HEADERS columns of every row are tested with
    re.match, so the pattern is applied from the start of the cell text.

    Returns the zero-based row number of the first match, or -1 if there is
    none. Multiple matches are tolerated; a warning is printed when verbose.
    '''

    compiled = re.compile(pattern)
    hits = []

    for rowNo in range(sheet.nrows):
        for colNo in range(MAX_COLS_WITH_HEADERS):
            text = sheet.cell(rowNo, colNo).value

            # Non-text cells cannot be headings
            if not isinstance(text, str):
                continue

            if compiled.match(text):
                hits.append(rowNo)

    if verbose and len(hits) > 1:
        print(f"WARNING: '{pattern}' found in '{sheet.name}' multiple times - rows {[match + 1 for match in hits]}")

    return hits[0] if hits else -1


def getWeekColNos(sheet):
    '''Determine which columns hold week numbers.

    Looks up the row headed WEEK_NUMBER_TEXT and returns the list of column
    numbers in that row whose cell can be converted to an integer.
    '''

    # NOTE(review): findRowNo returns -1 when the heading is missing and that
    # value is passed straight to sheet.cell() - confirm this cannot happen.
    rowNo = findRowNo(sheet, WEEK_NUMBER_TEXT)
    colNos = []

    for colNo in range(sheet.ncols):
        cellValue = sheet.cell(rowNo, colNo).value

        # If the cell contains a value that can be converted to an integer then treat it as a week number
        try:
            int(cellValue)
        except (TypeError, ValueError, OverflowError):
            # Headings, blanks and other non-numeric cells are simply not week columns
            continue

        colNos.append(colNo)

    return colNos


def getWeekNumbers(sheet, colNos):
    '''Read the week number (as an integer) from each of the specified columns.

    The values are taken from the row headed WEEK_NUMBER_TEXT and returned in
    the same order as colNos.
    '''

    headerRow = findRowNo(sheet, WEEK_NUMBER_TEXT)

    return [int(sheet.cell(headerRow, colNo).value) for colNo in colNos]


def getWeekEndings(sheet, colNos):
    '''Read the week ending date from each of the specified columns.

    The values are taken from the row headed WEEK_ENDED_TEXT and returned as
    "YYYY-MM-DD" strings, in the same order as colNos.
    '''

    def toIsoDate(rawValue):
        # Text cells look like "03-Jan-20"; anything else is treated as a day count relative to epoch
        if isinstance(rawValue, str):
            parsed = datetime.strptime(rawValue, '%d-%b-%y')
        else:
            parsed = epoch + timedelta(days=rawValue - 2)

        return parsed.strftime("%Y-%m-%d")

    headerRow = findRowNo(sheet, WEEK_ENDED_TEXT)

    return [toIsoDate(sheet.cell(headerRow, colNo).value) for colNo in colNos]


def getWeekOffsets(weekEndings):
    '''Convert week endings into array offsets.

    Each "YYYY-MM-DD" string becomes the number of whole weeks between
    minWeek and that date, which is the index used by the cached arrays.
    '''

    def toOffset(isoDate):
        ended = datetime.strptime(isoDate, "%Y-%m-%d").date()
        return (ended - minWeek).days // 7

    return [toOffset(weekEnding) for weekEnding in weekEndings]


def getCellValue(sheet, rowNo, colNo):
    '''Read a single cell as an integer count.

    Returns 0 for the ":" placeholder and for empty cells. Any other value is
    converted with int(); if that fails a warning is printed and 0 is returned.
    '''

    cellValue = sheet.cell(rowNo, colNo).value

    # 2011 switched from ICD-10 v 2001 to ICD-10 v 2010 (NCHS)
    # 2014 switched from ICD-10 v 2010 (NCHS) to ICD-10 v 2013 (IRIS)
    # ":" is the placeholder found in such cells; blank cells are also counted as zero
    if cellValue == ":" or cellValue == "":
        return 0

    # Allow non-integers to be treated as zero but show a warning
    try:
        return int(cellValue)
    except (TypeError, ValueError, OverflowError):
        # Row and column are reported one-based, as displayed in a spreadsheet
        print(f"Warning: Failed to convert '{cellValue}' to integer in '{sheet.name}' (row {rowNo + 1} col {colNo + 1})")
        return 0

## Process Spreadsheet

Stuff more specific to the ONS spreadsheets

In [11]:
def initCache(cache, regionNames):
    '''Make sure the cache holds an array for every named area.

    Areas already present in the cache are left untouched. Each new area gets
    a zero-filled structured array with one row per week from minWeek to
    maxWeek, with the week_ended column filled in as "YYYY-MM-DD".
    '''

    # One text column, one unsigned byte for the week number, unsigned ints for the counts
    fieldNames = [WEEK_ENDED, WEEK_NUMBER, TOTAL_REGISTRATIONS, TOTAL_OCCURRENCES, COVID_REGISTRATIONS, COVID_OCCURRENCES]
    fieldFormats = ['U10', 'B', 'I', 'I', 'I', 'I']
    dtype = {'names':fieldNames, 'formats':fieldFormats}

    # Number of weekly rows needed to span minWeek .. maxWeek
    maxWeeks = (maxWeek - minWeek).days // 7 + 1

    for regionName in regionNames:
        if regionName in cache:
            continue

        areaData = np.zeros(maxWeeks, dtype=dtype)

        # Pre-populate week_ended
        for weekIndex in range(maxWeeks):
            weekEnded = minWeek + timedelta(weeks=weekIndex)
            areaData[WEEK_ENDED][weekIndex] = weekEnded.strftime("%Y-%m-%d")

        cache[regionName] = areaData


def processAreas(cache, sheetsInfo, regionNames):
    '''Copy the weekly figures for the named areas from the worksheets into the cache.

    sheetsInfo maps a cache field name to the info gathered for one worksheet
    (sheet, week columns / numbers / endings / offsets and the row of each area).
    Areas without a row in a given worksheet are skipped for that worksheet.
    '''

    initCache(cache, regionNames)

    for sheetInfoKey, sheetInfo in sheetsInfo.items():
        sheet = sheetInfo["sheet"]
        weekColNos = sheetInfo[WEEK_COL_NOS]
        weekNumbers = sheetInfo[WEEK_NUMBERS]
        weekEndings = sheetInfo[WEEK_ENDINGS]
        weekOffsets = sheetInfo[WEEK_OFFSETS]
        areaRows = sheetInfo["regions"]

        for regionName in regionNames:
            if regionName not in areaRows:
                continue

            rowNo = areaRows[regionName]

            for i, colNo in enumerate(weekColNos):
                cellValue = getCellValue(sheet, rowNo, colNo)
                weekOffset = weekOffsets[i]

                # The pre-populated week_ended must agree with the date read from the sheet
                assert cache[regionName][WEEK_ENDED][weekOffset] == weekEndings[i], "Bug in week offset calculations!"

                cache[regionName][WEEK_NUMBER][weekOffset] = weekNumbers[i]
                cache[regionName][sheetInfoKey][weekOffset] = cellValue


def processRegions(cache, sheetsInfo):
    '''Locate the row of each English region in every worksheet, then load the figures.

    Rows are found by exact heading (or alias) and recorded in each sheet's
    "regions" dictionary before the data is copied into the cache.
    '''

    for regionName in common_core.regionNames:
        for sheetInfo in sheetsInfo.values():
            rowNo = findRowNo(sheetInfo["sheet"], regionName, regionAliases)

            # -1 means the region is not listed in this worksheet
            if rowNo < 0:
                continue

            sheetInfo["regions"][regionName] = rowNo

    processAreas(cache, sheetsInfo, common_core.regionNames)


def processNations(cache, sheetsInfo):
    '''Locate the rows for England + Wales and for Wales in every worksheet, then load the figures.

    The England + Wales row is found by a regular expression chosen from the
    worksheet name; the Wales row is found by its exact heading.
    '''

    def locateRow(sheet, regionName):
        if regionName != ENGLAND_WALES:
            return findRowNo(sheet, regionName)

        # The combined total has a different heading on each type of worksheet
        sheetNameLower = sheet.name.lower()
        if sheetNameLower.startswith(WEEKLY_FIGURES_LOWER):
            pattern = TOTAL_DEATHS_REGEX
        elif sheetNameLower.startswith(ESTIMATED_TOTAL_DEATHS_LOWER):
            pattern = TOTAL_OCCURRENCES_REGEX
        else:
            pattern = COVID_DEATHS_REGEX

        return regexFindRowNos(sheet, pattern)

    regionNames = [ENGLAND_WALES, WALES]

    for regionName in regionNames:
        for sheetInfo in sheetsInfo.values():
            rowNo = locateRow(sheetInfo["sheet"], regionName)

            # -1 means the nation is not listed in this worksheet
            if rowNo >= 0:
                sheetInfo["regions"][regionName] = rowNo

    processAreas(cache, sheetsInfo, regionNames)


def processSheets(cache, sheetsInfo):
    '''Gather the week layout of every worksheet, then load regions and nations.

    Each entry of sheetsInfo is extended with the week columns, week numbers,
    week endings, week offsets and an empty "regions" dictionary.
    '''

    for sheetInfo in sheetsInfo.values():
        sheet = sheetInfo["sheet"]

        weekColNos = getWeekColNos(sheet)
        sheetInfo[WEEK_COL_NOS] = weekColNos
        expected = len(weekColNos)

        weekNos = getWeekNumbers(sheet, weekColNos)
        assert len(weekNos) == expected, f"Number of week numbers did not match number of weeks in '{sheet.name}'"
        sheetInfo[WEEK_NUMBERS] = weekNos

        weekEndings = getWeekEndings(sheet, weekColNos)
        assert len(weekEndings) == expected, f"Number of week endings did not match number of weeks in '{sheet.name}'"
        sheetInfo[WEEK_ENDINGS] = weekEndings

        weekOffsets = getWeekOffsets(weekEndings)
        assert len(weekOffsets) == len(weekEndings), f"Number of week offsets did not match number of weeks in '{sheet.name}'"
        sheetInfo[WEEK_OFFSETS] = weekOffsets

        # Filled in later with the row number of each area found on this sheet
        sheetInfo["regions"] = {}

    processRegions(cache, sheetsInfo)
    processNations(cache, sheetsInfo)


def processWorkbook(cache, workbook):
    '''Pick out the relevant worksheets of a workbook and load them into the cache.

    Worksheets are recognised by name (ignoring case) and keyed by the cache
    field that they populate; all other worksheets are ignored.
    '''

    sheetsInfo = {}

    for sheet in workbook.sheets():
        sheetNameLower = sheet.name.lower()

        if sheetNameLower.startswith(WEEKLY_FIGURES_LOWER):
            fieldName = TOTAL_REGISTRATIONS
        elif sheetNameLower.startswith(ESTIMATED_TOTAL_DEATHS_LOWER):
            fieldName = TOTAL_OCCURRENCES
        elif sheetNameLower == COVID_WEEKLY_REGISTRATIONS_LOWER:
            fieldName = COVID_REGISTRATIONS
        elif sheetNameLower == COVID_WEEKLY_OCCURRENCES_LOWER:
            fieldName = COVID_OCCURRENCES
        else:
            continue

        sheetsInfo[fieldName] = {"sheet": sheet}

    processSheets(cache, sheetsInfo)


def getWorkbookYear(workbook):
    '''Work out which year a workbook covers.

    The year is the last four characters of the name of the "weekly figures"
    worksheet; if several worksheets qualify then the last one wins.
    Raises RuntimeError if no such worksheet exists.
    '''

    year = None

    for sheet in workbook.sheets():
        if not sheet.name.lower().startswith(WEEKLY_FIGURES_LOWER):
            continue

        year = int(sheet.name[-4:])

    if year is None:
        raise RuntimeError("Year could not be determined for workbook")

    return year

## Save Data

Write the cached numpy arrays to CSV files

In [12]:
def saveArea(cache, areaType, areaName):
    '''Write the cached array for one area to a CSV file.

    The file goes into data/ons-deaths/csv/weekly/deaths/<areaType> under the
    project folder, named after the area, with the field names as the header row.
    '''

    areaData = cache[areaName]
    columnNames = ','.join(areaData.dtype.names)

    # Create the output folder on first use
    outputDir = os.path.join(common_core.projdir, "data", "ons-deaths", "csv", "weekly", "deaths", areaType)
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # The area name is converted to a safe filename by common_core
    outputFile = os.path.join(outputDir, common_core.getSafeName(areaName) + ".csv")

    # comments='' stops numpy from prefixing the header line with "# "
    np.savetxt(outputFile, areaData, fmt='%s', delimiter=',', header=columnNames, comments='')


def saveRegions(cache):
    '''Write one CSV file per English region (area type "region").'''

    areaType = "region"

    for name in common_core.regionNames:
        saveArea(cache, areaType, name)
    
    
def saveNations(cache):
    '''Write one CSV file for England + Wales and one for Wales (area type "nation").'''

    areaType = "nation"

    for name in (ENGLAND_WALES, WALES):
        saveArea(cache, areaType, name)
    
    
def saveCache(cache):
    '''Write everything in the cache to CSV: regions first, then nations.'''

    for save in (saveRegions, saveNations):
        save(cache)

## Spreadsheet Interface

Main interface for converting from XLSX files to CSV files

In [13]:
def loadExcelFiles(partNames):
    '''Load the specified spreadsheets into a new cache.

    partNames - file names relative to data/ons-deaths/raw under the project folder

    Returns a dictionary mapping each area name to its structured array.
    Any exception raised whilst processing a workbook is reported with the
    year of the workbook and then re-raised.
    '''

    rawPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

    # Open all workbooks first to determine the year of each
    # (if two workbooks report the same year then the later one in partNames is used)
    workbooksByYear = {}
    for partName in partNames:
        workbook = open_workbook(os.path.join(rawPath, partName))
        workbooksByYear[getWorkbookYear(workbook)] = workbook

    # Iterate through the years in chronological order - required to handle the 2021 hybrid!
    cache = {}
    for year in sorted(workbooksByYear):
        try:
            processWorkbook(cache, workbooksByYear[year])
        except Exception:
            # Say which workbook failed, then let the original exception propagate
            print(f"ERROR: Exception raised whilst processing workbook for {year}")
            raise

    return cache

## Calculate Regional Occurrences

Use the ONS modelled occurrences to calculate regional occurrences

In [14]:
def loadCsvFile(areaType, areaName):
    '''Read the CSV file of one area back into a numpy structured array.

    The header row supplies the field names: week_ended is read as text and
    every other column as an unsigned integer, with blanks treated as zero.
    Failures are reported with the area name and then re-raised.
    '''
    csvFn = os.path.join(
        os.path.join(common_core.projdir, "data", "ons-deaths", "csv", "weekly", "deaths", areaType),
        common_core.getSafeName(areaName) + ".csv"
    )

    try:
        with open(csvFn, 'r') as f:
            # Consume the header row; numpy then reads the remaining lines from the same handle
            colNames = next(csv.reader(f, delimiter = ','))

            dtype = []
            converters = {}

            for colIdx, colName in enumerate(colNames):
                if colName == WEEK_ENDED:
                    dtype.append((colName, "U10"))
                    continue

                dtype.append((colName, "u4"))
                converters[colIdx] = lambda s: int(s or 0)

            data = np.genfromtxt(f, dtype=dtype, converters=converters, delimiter=",")

    except BaseException:
        print(f"Failed to load CSV data for {areaName}")
        raise

    return data


def loadCsvFiles():
    '''Rebuild the cache from the saved CSV files.

    Loads England + Wales and Wales from the "nation" folder, then every
    English region from the "region" folder. Returns a dictionary mapping
    each area name to its array.
    '''

    cache = {}

    for areaType, names in (("nation", [ENGLAND_WALES, WALES]), ("region", common_core.regionNames)):
        for name in names:
            cache[name] = loadCsvFile(areaType, name)

    return cache

In [15]:
def plotData(cache, areaNames, maxWeeks = 52):
    '''Plot data for visual inspection.

    cache     - dictionary mapping area names to arrays with named fields
    areaNames - the areas to plot; one figure is created per area
    maxWeeks  - number of most recent weeks (trailing rows) to show

    Each figure shows total occurrences, total registrations and COVID
    occurrences. A series is silently omitted when the area's data has no
    such field; any other plotting error is allowed to propagate.
    '''

    # (field name, legend label, line colour) in drawing order
    seriesToPlot = (
        (TOTAL_OCCURRENCES, "Total Occurrences", 'green'),
        (TOTAL_REGISTRATIONS, "Total Registrations", 'lightsteelblue'),
        (COVID_OCCURRENCES, "COVID Occurrences", 'red'),
    )

    for areaName in areaNames:
        areaData = cache[areaName]

        plt.figure(clear=True, figsize=(16, 6))
        plt.title(areaName)
        plt.ylabel('Number of deaths')

        for fieldName, label, color in seriesToPlot:
            try:
                y_points = areaData[fieldName][-maxWeeks:]
            except (KeyError, ValueError, IndexError):
                # Field not available for this area - leave the series out
                continue

            plt.plot(np.arange(len(y_points)), y_points, label = label, color=color)

        # Label every week on the x axis with its week ending date
        x_ticks = np.array(areaData[WEEK_ENDED])[-maxWeeks:]
        plt.xticks(np.arange(len(x_ticks)), x_ticks, rotation=90)

        # Fixed scale so that charts for different areas are directly comparable
        plt.yticks(np.arange(0, 24000, 1000))

        plt.legend()

## Interactive Testing

In [16]:
def calculateOccurrencesW53(cache):
    '''Calculate the estimated number of occurrences for week 53 of 2020.

    The England + Wales total occurrences for the week ended 2021-01-01 are
    replaced (in place) by a cubic spline interpolation through the 4 weeks
    before and the 4 weeks after. The estimate is rounded to the nearest whole
    number and never goes below zero, because the field is an unsigned integer.

    Raises IndexError if the cache has no row for the week ended 2021-01-01.
    '''

    ewData = cache[ENGLAND_WALES]
    occurrences = ewData[TOTAL_OCCURRENCES]

    indexW53 = np.where(ewData[WEEK_ENDED] == '2021-01-01')[0][0]

    # Take 4 weeks before and 4 weeks afterwards (fewer if the array ends sooner)
    yBefore = occurrences[indexW53 - 4:indexW53]
    yAfter = occurrences[indexW53 + 1:indexW53 + 5]
    y = np.hstack((yBefore, yAfter))

    # y values need to correspond to the x values; x = 4 is left free for week 53
    xBefore = np.arange(len(yBefore))
    xAfter = np.arange(len(yAfter)) + 5
    x = np.hstack((xBefore, xAfter))

    # Calculate the missing point using Cubic Spline
    cs = CubicSpline(x, y)
    estimate = float(cs(4))

    # Storing the raw float would truncate it (and a negative value would wrap)
    occurrences[indexW53] = max(0, int(round(estimate)))
    

def polyfillCache(cache):
    '''Fill in values that are missing from the source data.

    Currently the only gap handled is the England + Wales occurrences for
    week 53 of 2020.
    '''

    for fillGap in (calculateOccurrencesW53,):
        fillGap(cache)

    
def trimCache(cache):
    '''Remove rows from cache which don't contain any useful data.

    Every array is cut down to the span between its first and last row with
    a non-zero number of total registrations. An area without any
    registrations at all is reduced to an empty array.
    '''

    for areaName in cache:
        areaData = cache[areaName]
        populated = np.where(areaData[TOTAL_REGISTRATIONS] > 0)[0]

        if len(populated) == 0:
            # Nothing useful for this area - keep the dtype but drop every row
            cache[areaName] = areaData[:0]
            continue

        cache[areaName] = areaData[populated[0]:populated[-1] + 1]

In [17]:
# Script entry point - steps that are commented out can be re-enabled as required
if __name__ == '__main__':
    # Check / download latest spreadsheets
    # (disabled: uncomment both lines to rebuild the cache from the spreadsheets)
    #partNames = downloadDeaths()
    #cache = loadExcelFiles(partNames)
    
    # Tidy up the data, prior to saving in CSV format
    # (disabled: these steps need the cache built by loadExcelFiles above)
    #trimCache(cache)
    #polyfillCache(cache)
    #saveCache(cache)
    
    # Cache can either be re-used or loaded from CSV files
    # As it stands, the cache is loaded from the previously saved CSV files
    cache = loadCsvFiles()

    # Simple charts for review
    # (plotting is disabled; only the list of areas is prepared)
    #areaNames = [ENGLAND_WALES, WALES] + common_core.regionNames
    areaNames = [ENGLAND_WALES]
    #plotData(cache, areaNames)