# ONS Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard Python libraries plus determination of projdir, basic printable class, etc.

In [1]:
import os
import re
from datetime import datetime, timedelta

import csv
from xlrd import open_workbook

import numpy as np
import matplotlib.pyplot as plt

import common_core

## Configuration

Data to download from the ONS website

In [2]:
# The 4 nations in the UK (the names themselves are defined in common_core)
nationNames = common_core.nationNames

# The 9 regions in England (the names themselves are defined in common_core)
regionNames = common_core.regionNames

# Combine all of these area types into a single list of
# (area type, list of area names) pairs
areas = [("nation", nationNames), ("region", regionNames)]

In [3]:
# The latest ONS age bands: '<1' and '1-4' for the youngest, then five-year
# bands up to '85-89', and an open-ended '90+' band for the oldest
ageDemographics = [
    '<1', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29',
    '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90+'
]

# The legacy ONS age bands: six broader bands, ending with an open-ended '85+'
legacyAgeDemographics = [
    '01-14', '15-44', '45-64', '65-74', '75-84', '85+'
]

In [4]:
# ONS dataset page that is parsed for links to the weekly deaths spreadsheets
deathsUrl = "https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales"

# Local folder holding the raw spreadsheets
deathsPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

# Passed to WebDownload.downloadFiles; presumably (sub-folder, link regex) pairs
# - confirm against common_core. The pattern is a raw string so that "\." is a
# regex escape rather than an invalid string escape sequence.
deathsFiles = [
    ("weekly", r".*\.xlsx?$")
]

## Download Data

Download spreadsheets by parsing the HTML for suitable links

In [5]:
def downloadDeaths(skipExisting=common_core.skipExisting, verbose=common_core.verbose):
    '''Download the weekly deaths spreadsheets and return the list of part names.'''

    week53 = "weekly/publishedweek532020.xlsx"

    downloader = common_core.WebDownload(skipExisting=skipExisting, verbose=verbose)
    partNames = downloader.downloadFiles(deathsPath, deathsUrl, deathsFiles)

    # Page did not have a 2021 section and replaced 2020, so make sure that
    # the final 2020 workbook is always part of the result
    if week53 not in partNames:
        partNames += [week53]

    return partNames

## Parse Data

Parse the weekly deaths spreadsheets

In [6]:
# Base date, presumably for converting spreadsheet date serials.
# NOTE(review): getWeekEndings defines its own local epoch with the same value,
# so this module-level name is not referenced in the visible source - confirm
epoch = datetime(1900, 1, 1)

# Area names used as keys for the extracted data and for the saved CSV files
ENGLAND_WALES = "England + Wales"
ENGLAND_WALES_OCC = "England + Wales Occ"

# Row headings that identify the week number and week ending rows
# (matched case-insensitively by findRowNo)
WEEK_NUMBER = "Week number"
WEEK_ENDED = "Week ended"

# Regex patterns for the "all ages" rows (matched by regexFindRowNos)
TOTAL_DEATHS = "^Total deaths, all ages"
COVID_DEATHS = "^Deaths involving COVID-19, all ages"

# Regex pattern for rows mentioning ICD-10 codes J00-J99.
# NOTE(review): not referenced in the visible source - confirm it is still needed
RESPIRATORY_DISEASES = ".*ICD-10 J00-J99.*"

# Number of leading columns that are searched for row headings
MAX_COLS_WITH_HEADERS = 2

In [7]:
def findRowNo(sheet, heading, aliases=None):
    '''Find the row with the specified heading. Also check for possible aliases.

    The first MAX_COLS_WITH_HEADERS columns of every row are compared
    case-insensitively against the heading and any aliases listed for it.

    sheet   -- worksheet exposing nrows, ncols, name and cell(rowNo, colNo).value
    heading -- exact wording of the row heading
    aliases -- optional dict mapping a heading to a list of alternative wordings

    Returns the zero-based row number, or -1 if the heading was not found.
    Raises RuntimeError if more than one cell matches.
    '''

    # Search for row headings with precise wording
    wanted = {heading.lower()}

    # Aliases are still regarded as precise wording
    if aliases and heading in aliases:
        wanted.update(alias.lower() for alias in aliases[heading])

    # Never index beyond the last column of a narrow sheet
    numCols = min(MAX_COLS_WITH_HEADERS, sheet.ncols)

    matches = []
    for rowNo in range(sheet.nrows):
        for colNo in range(numCols):
            cellValue = sheet.cell(rowNo, colNo).value

            if isinstance(cellValue, str) and cellValue.lower() in wanted:
                matches.append(rowNo)

    if len(matches) > 1:
        raise RuntimeError(f"{heading} found in {sheet.name} multiple times - {matches}")

    return matches[0] if matches else -1


def regexFindRowNos(sheet, pattern):
    '''Find the row whose heading matches the specified regular expression.

    The first MAX_COLS_WITH_HEADERS columns of every row are tested with
    re.match, so the pattern is anchored at the start of the cell text and is
    case-sensitive unless the pattern itself says otherwise.

    sheet   -- worksheet exposing nrows, ncols, name and cell(rowNo, colNo).value
    pattern -- regular expression for the row heading

    Returns the zero-based row number, or -1 if nothing matched.
    Raises RuntimeError if more than one cell matches.
    '''

    # Pre-compile regex for minor speedup
    regex = re.compile(pattern)

    # Never index beyond the last column of a narrow sheet
    numCols = min(MAX_COLS_WITH_HEADERS, sheet.ncols)

    matches = []
    for rowNo in range(sheet.nrows):
        for colNo in range(numCols):
            cellValue = sheet.cell(rowNo, colNo).value

            if isinstance(cellValue, str) and regex.match(cellValue):
                matches.append(rowNo)

    if len(matches) > 1:
        raise RuntimeError(f"{pattern} found in {sheet.name} multiple times - {matches}")

    return matches[0] if matches else -1


def getWeekColNos(sheet, rowNo):
    '''Return the column numbers holding week 1, week 2, week 3... in the given row.

    Columns are scanned left to right and a column is only accepted when its
    value equals the next week number in sequence.
    '''
    weekCols = []

    for col in range(sheet.ncols):
        # The next expected week number is one more than the count found so far
        if sheet.cell(rowNo, col).value == len(weekCols) + 1:
            weekCols.append(col)

    return weekCols


def getWeekEndings(sheet, rowNo, colNos):
    '''Return the week endings (as YYYY-MM-DD strings) for the given columns of a row.

    Text cells are parsed as dd-Mon-yy; any other cell is treated as a number
    of days relative to 1 January 1900.
    '''
    serialBase = datetime(1900, 1, 1)

    def toDate(raw):
        if isinstance(raw, str):
            return datetime.strptime(raw, '%d-%b-%y')

        # NOTE(review): the offset of 2 presumably allows for spreadsheet date
        # serials being 1-based and counting a non-existent 29 Feb 1900 - confirm
        return serialBase + timedelta(days=raw - 2)

    return [
        toDate(sheet.cell(rowNo, colNo).value).strftime("%Y-%m-%d")
        for colNo in colNos
    ]


def getCellValue(sheet, rowNo, colNo):
    '''Return the value of a single worksheet cell as an integer.

    Cells containing ":" or an empty string count as zero. Any other value is
    converted with int(); if that fails a warning is printed and zero is
    returned instead.
    '''

    cellValue = sheet.cell(rowNo, colNo).value

    # 2011 switched from ICD-10 v 2001 to ICD-10 v 2010 (NCHS)
    # 2014 switched from ICD-10 v 2010 (NCHS) to ICD-10 v 2013 (IRIS)
    if cellValue == ":" or cellValue == "":
        return 0

    # Allow non-integers to be treated as zero but show a warning. Only
    # conversion errors are caught so that interrupts still propagate.
    try:
        return int(cellValue)
    except (TypeError, ValueError, OverflowError):
        print(f"Warning: Failed to convert '{cellValue}' to integer in '{sheet.name}' (row {rowNo + 1} col {colNo + 1})")
        return 0

In [8]:
def processAreas(data, sheetsInfo, regionNames):
    '''Append one row per week to data[areaName] for each of the named areas.

    data        -- dict of area name to list of rows; every name in regionNames
                   must already be present
    sheetsInfo  -- dict of sheet details, each holding "sheet", "weekColNos",
                   "weekEndings" and "regions" (area name to row number)
    regionNames -- names of the areas to extract

    Each row is [week ending, week number] followed by one value per sheet, in
    the order of sheetsInfo, and is padded with zeros to five columns.
    '''

    # The first sheet determines the weeks that are covered
    firstSheetInfo = list(sheetsInfo.values())[0]
    weekEndings = firstSheetInfo["weekEndings"]

    for regionName in regionNames:
        for i, weekEnding in enumerate(weekEndings):
            # Always use the first sheet's week endings; the loop below must
            # not influence the date written for later weeks
            row = [weekEnding, i + 1]

            for sheetInfo in sheetsInfo.values():
                if regionName in sheetInfo["regions"]:
                    sheet = sheetInfo["sheet"]
                    rowNo = sheetInfo["regions"][regionName]
                    weekColNos = sheetInfo["weekColNos"]

                    row.append(getCellValue(sheet, rowNo, weekColNos[i]))
                else:
                    # Area is missing from this sheet; hold its place so that
                    # values from later sheets stay in their own columns
                    row.append(0)

            # Pack out row with zeros
            row += [0] * (5 - len(row))

            data[regionName].append(row)


def processRegions(data, sheetsInfo):
    '''Locate every region in each worksheet, then extract its weekly figures.'''

    regionNames = common_core.regionNames

    for regionName in regionNames:
        # Record the row of this region in every sheet where it can be found
        for sheetInfo in sheetsInfo.values():
            foundRow = findRowNo(sheetInfo["sheet"], regionName, common_core.regionAliases)

            if foundRow >= 0:
                sheetInfo["regions"][regionName] = foundRow

        # Make sure there is a list to receive the rows for this region
        data.setdefault(regionName, [])

    processAreas(data, sheetsInfo, regionNames)


def processNations(data, sheetsInfo):
    '''Locate England + Wales and Wales in each worksheet, then extract their weekly figures.'''

    nationNames = [ENGLAND_WALES, "Wales"]

    for nationName in nationNames:
        for sheetInfo in sheetsInfo.values():
            sheet = sheetInfo["sheet"]

            if nationName != ENGLAND_WALES:
                foundRow = findRowNo(sheet, nationName)
            else:
                # England + Wales is the "all ages" row; try the total deaths
                # wording first and fall back to the COVID-19 wording
                foundRow = regexFindRowNos(sheet, TOTAL_DEATHS)

                if foundRow < 0:
                    foundRow = regexFindRowNos(sheet, COVID_DEATHS)

            if foundRow >= 0:
                sheetInfo["regions"][nationName] = foundRow

        # Make sure there is a list to receive the rows for this nation
        data.setdefault(nationName, [])

    processAreas(data, sheetsInfo, nationNames)


def processSheets(data, sheetsInfo):
    '''Work out the week layout of each worksheet, then extract regions and nations.'''

    for sheetInfo in sheetsInfo.values():
        sheet = sheetInfo["sheet"]

        # Columns containing week 1, week 2, etc
        weekColNos = getWeekColNos(sheet, findRowNo(sheet, WEEK_NUMBER))
        assert len(weekColNos) in (52, 53), f"Number of weeks was not 52 or 53 in '{sheet.name}'"
        sheetInfo["weekColNos"] = weekColNos

        # Week ending dates for those same columns
        weekEndings = getWeekEndings(sheet, findRowNo(sheet, WEEK_ENDED), weekColNos)
        assert len(weekEndings) == len(weekColNos), f"Number of week endings did not match number of weeks in '{sheet.name}'"
        sheetInfo["weekEndings"] = weekEndings

        # Rows of the individual areas are filled in later
        sheetInfo["regions"] = {}

    processRegions(data, sheetsInfo)
    processNations(data, sheetsInfo)


def processWorkbook(data, workbook):
    '''Pick out the worksheets of interest from a workbook and parse them.'''

    # Sheets recognised by their full (lower case) name
    exactNames = {
        "covid-19 - weekly registrations": "covid_registrations",
        "covid-19 - weekly occurrences": "covid_occurrences",
    }

    sheetsInfo = {}

    for sheet in workbook.sheets():
        sheetName = sheet.name.lower()

        # The total registrations sheet is recognised by its prefix
        if sheetName.startswith("weekly figures"):
            key = "total_registrations"
        else:
            key = exactNames.get(sheetName)

        if key is not None:
            sheetsInfo[key] = {"sheet": sheet}

    processSheets(data, sheetsInfo)

## Save Data

Turn data from lists into CSV format

In [9]:
def saveArea(data, areaType, areaName):
    '''Write the weekly rows of a single area to a CSV file.

    data     -- dict of area name to list of rows
    areaType -- name of the sub-folder for the CSV, e.g. "nation" or "region"
    areaName -- key into data, also used to derive the file name

    Note that the rows in data[areaName] are sorted in place.
    '''

    # Prepare headings
    headings = ["week_ended", "week_number",
                "total_registrations", "covid_registrations", "covid_occurrences"]

    # Sort data chronologically
    areaData = data[areaName]
    areaData.sort()

    # Ensure CSV path exists
    csvPath = os.path.join(common_core.projdir, "data", "ons-deaths", "csv", "weekly", "deaths", areaType)
    os.makedirs(csvPath, exist_ok=True)

    # Write the CSV; newline='' lets the csv module control the line endings,
    # which avoids blank lines between rows on Windows
    csvFn = os.path.join(csvPath, common_core.getSafeName(areaName) + ".csv")
    with open(csvFn, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(headings)
        writer.writerows(areaData)


def saveRegions(data):
    '''Write one CSV file for each region listed in common_core'''

    areaType = "region"

    for name in common_core.regionNames:
        saveArea(data, areaType, name)
    
    
def saveNations(data):
    '''Write one CSV file for England + Wales and one for Wales'''

    areaType = "nation"

    for name in (ENGLAND_WALES, "Wales"):
        saveArea(data, areaType, name)
    
    
def saveData(data):
    '''Write CSV files for the regions first, then for the nations'''

    for saveGroup in (saveRegions, saveNations):
        saveGroup(data)

## Interface

Main interface for converting from XLSX files to CSV files

In [10]:
def convertDeaths(partNames):
    '''Parse the listed spreadsheets for weekly deaths and save the results as CSV.'''

    rawPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

    # Rows for all areas are accumulated across the workbooks
    data = {}

    for partName in partNames:
        # NOTE(review): the week 1 2021 workbook is deliberately skipped; the
        # reason is not stated here - confirm
        if 'publishedweek012021.' in partName:
            continue

        workbook = open_workbook(os.path.join(rawPath, partName))
        processWorkbook(data, workbook)

    # Write everything that was extracted to CSV files
    saveData(data)

## Calculate Regional Occurrences

Use the ONS modelled occurrences to calculate regional occurrences

In [11]:
def loadAreaDeaths(areaType, areaName):
    '''Load the weekly deaths CSV of a single area using NumPy.

    areaType -- name of the sub-folder holding the CSV, e.g. "nation" or "region"
    areaName -- name of the area, used to derive the file name

    The "week_ended" column is read as text and every other column as an
    unsigned integer, with empty fields read as zero. Returns the result of
    np.genfromtxt, with fields named after the CSV headings.

    Any failure is reported and then re-raised.
    '''

    csvPath = os.path.join(common_core.projdir, "data", "ons-deaths", "csv", "weekly", "deaths", areaType)
    csvFn = os.path.join(csvPath, common_core.getSafeName(areaName) + ".csv")

    def toCount(s):
        # Empty fields are treated as zero
        return int(s or 0)

    try:
        with open(csvFn, 'r') as f:
            # The header row determines the field names and types
            reader = csv.reader(f, delimiter = ',')
            colNames = next(reader)

            dtype = []
            converters = {}

            for i, colName in enumerate(colNames):
                if colName == "week_ended":
                    dtype.append((colName, "U10"))
                else:
                    dtype.append((colName, "u4"))
                    converters[i] = toCount

            # The remaining lines of the file are the data rows
            data = np.genfromtxt(f, dtype=dtype, converters=converters, delimiter=",")

    except Exception:
        # Only names that exist in this function may be used here, otherwise a
        # NameError would hide the original error
        print(f"Failed to load {csvFn} for {areaName}")
        raise

    return data


def plotData(areasData):
    '''Plot data for visual inspection

    areasData -- dict of area name to loaded data; must include the entries
                 for ENGLAND_WALES_OCC and ENGLAND_WALES

    One figure is created per area. Each series is plotted on a best-effort
    basis and is left out if its values cannot be obtained.
    '''

    # week_ended, week_number, total_registrations, covid_registrations, covid_occurrences
    # week_ended, week_number, total_deaths

    occData = areasData[ENGLAND_WALES_OCC]
    ewData = areasData[ENGLAND_WALES]

    def plotSeries(getValues, label, color):
        '''Plot a single series, skipping it if anything goes wrong.'''
        try:
            y_points = getValues()
            x_points = np.arange(len(y_points))
            plt.plot(x_points, y_points, label = label, color=color)
        except Exception:
            # Deliberate best effort, e.g. the area does not have this column.
            # Interrupts are not caught.
            pass

    for areaName, areaData in areasData.items():
        plt.figure(clear=True, figsize=(16, 3))
        plt.title(areaName)
        plt.ylabel('Number of deaths')

        plotSeries(lambda: areaData["total_deaths"],
                   "Total Deaths", 'green')

        plotSeries(lambda: areaData["total_registrations"],
                   "Total Registrations", 'lightsteelblue')

        # Share out the England + Wales occurrences in proportion to registrations
        plotSeries(lambda: occData["total_deaths"] * areaData["total_registrations"] / ewData["total_registrations"],
                   "Estimated Occurrences", 'navy')

        plotSeries(lambda: areaData["covid_registrations"],
                   "COVID Registrations", 'red')

        # Label every 14th week on the x-axis
        x_ticks = np.array(areaData["week_ended"])
        plt.xticks(np.arange(0, len(x_ticks), step=14), x_ticks[::14], rotation=90)

        plt.legend()

    
def loadDeaths():
    '''Load the saved weekly deaths CSVs into a dict keyed by area name.

    Nations are loaded first (including the England + Wales occurrences),
    followed by the regions listed in common_core.
    '''

    nations = [ENGLAND_WALES_OCC, ENGLAND_WALES, "Wales"]

    areasData = {name: loadAreaDeaths("nation", name) for name in nations}

    areasData.update(
        (name, loadAreaDeaths("region", name)) for name in common_core.regionNames
    )

    return areasData

## Interactive Testing

In [12]:
# When run as a script, download the spreadsheets and convert them to CSV
if __name__ == '__main__':
    partNames = downloadDeaths()
    convertDeaths(partNames)

    # Optional visual check of the saved CSV files (disabled by default)
    #areasData = loadDeaths()
    #plotData(areasData)