# ONS Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard python libraries plus determination of projdir, basic printable class, etc

In [1]:
import os
import re
from datetime import datetime, timedelta

import csv
from xlrd import open_workbook

import common_core

## Configuration

Data to download from the ONS (Office for National Statistics) website

In [2]:
# The 4 nations in the UK (taken from the shared common_core module)
nationNames = common_core.nationNames

# The 9 regions in England (taken from the shared common_core module)
regionNames = common_core.regionNames

# Combine all of these area types into a single list of (areaType, areaNames) pairs
areas = [("nation", nationNames), ("region", regionNames)]

In [3]:
# The latest ONS age bands (labels only, listed from youngest to oldest)
ageDemographics = [
    '<1', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29',
    '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90+'
]

# The legacy ONS age bands (coarser labels, listed from youngest to oldest)
# NOTE(review): neither list is referenced by the code visible in this file - presumably used by importers
legacyAgeDemographics = [
    '01-14', '15-44', '45-64', '65-74', '75-84', '85+'
]

In [4]:
# ONS dataset page that is handed to the downloader as the source of the spreadsheets
deathsUrl = "https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales"
# Local directory (under the project directory) that receives the raw downloads
deathsPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

# File specifications passed to the downloader; the second item is a regex
# matching names ending in ".xls" or ".xlsx".
# NOTE(review): the first item looks like a label for the file group - confirm against WebDownload
# A raw string is used so that "\." is not treated as an (invalid) string escape.
deathsFiles = [
    ("weekly", r".*\.xlsx?$")
]

## Download Data

Download spreadsheets by parsing the HTML for suitable links

In [5]:
def downloadDeaths(skipExisting=common_core.skipExisting, verbose=common_core.verbose):
    '''Download the weekly deaths spreadsheets into deathsPath.

    skipExisting and verbose are passed straight through to common_core.WebDownload.
    Returns whatever WebDownload.downloadFiles returns; convertDeaths later treats
    each item as a file name relative to the raw data directory.
    '''

    downloader = common_core.WebDownload(skipExisting=skipExisting, verbose=verbose)
    return downloader.downloadFiles(deathsPath, deathsUrl, deathsFiles)

## Parse Data

Parse the weekly deaths spreadsheets

In [6]:
# Base date for converting numeric date cells
# NOTE(review): getWeekEndings defines its own local epoch, so this one appears unused in the visible code
epoch = datetime(1900, 1, 1)

# Exact row headings (compared case-insensitively by findRowNo)
WEEK_NUMBER = "Week number"
WEEK_ENDED = "Week ended"

# Regex patterns for row headings
# NOTE(review): presumably intended for regexFindRowNos - not referenced in the visible code
TOTAL_DEATHS = "^Total deaths, all ages"
RESPIRATORY_DISEASES = ".*ICD-10 J00-J99.*"

# Only the first few columns of each row are searched for headings
MAX_COLS_WITH_HEADERS = 2

In [7]:
def findRowNo(sheet, heading, aliases=None):
    '''Find the single row with the specified heading. Also check for possible aliases.

    sheet   - worksheet providing nrows, name and cell(rowNo, colNo).value
    heading - exact heading text to look for (compared case-insensitively)
    aliases - optional dict mapping a heading to a list of alternative wordings

    Only the first MAX_COLS_WITH_HEADERS columns of each row are examined.
    Returns the row number of the match. Raises RuntimeError if the heading is
    missing or is matched more than once (a row is counted once per matching column).
    '''

    # Search for row headings with precise wording, ignoring case
    accepted = {heading.lower()}

    # Aliases are still regarded as precise wording
    if aliases is not None and heading in aliases:
        accepted.update(alias.lower() for alias in aliases[heading])

    matches = []
    for rowNo in range(sheet.nrows):
        for colNo in range(MAX_COLS_WITH_HEADERS):
            cellValue = sheet.cell(rowNo, colNo).value

            # Non-text cells (numbers, blanks) can never be headings
            if isinstance(cellValue, str) and cellValue.lower() in accepted:
                matches.append(rowNo)

    if not matches:
        raise RuntimeError(f"{heading} not found in {sheet.name}")
    if len(matches) > 1:
        raise RuntimeError(f"{heading} found in {sheet.name} multiple times - {matches}")

    return matches[0]


def regexFindRowNos(sheet, pattern):
    '''Find the single row whose heading matches the specified regex pattern.

    sheet   - worksheet providing nrows, name and cell(rowNo, colNo).value
    pattern - regular expression applied with re.match (anchored at the start of the cell text)

    Only the first MAX_COLS_WITH_HEADERS columns of each row are examined.
    Returns the row number of the match. Raises RuntimeError if the pattern
    matches nothing or matches more than once.
    '''

    matches = []

    # Pre-compile regex for minor speedup
    regex = re.compile(pattern)

    for rowNo in range(sheet.nrows):
        for colNo in range(MAX_COLS_WITH_HEADERS):
            cellValue = sheet.cell(rowNo, colNo).value

            # Non-text cells (numbers, blanks) can never be headings
            if isinstance(cellValue, str) and regex.match(cellValue):
                matches.append(rowNo)

    # Report the pattern that was searched for (there is no "heading" in this function)
    if not matches:
        raise RuntimeError(f"{pattern} not found in {sheet.name}")
    if len(matches) > 1:
        raise RuntimeError(f"{pattern} found in {sheet.name} multiple times - {matches}")

    return matches[0]


def getWeekColNos(sheet, rowNo):
    '''Locate the columns that hold week numbers 1, 2, 3, ... in the specified row.

    Columns are scanned left to right and a column is only accepted when its
    value equals the next week number in sequence, so anything else is skipped.
    Returns the list of accepted column numbers (index 0 is week 1).
    '''
    weekCols = []
    for colNo in range(sheet.ncols):
        # The next expected week is always one more than the number found so far
        if sheet.cell(rowNo, colNo).value == len(weekCols) + 1:
            weekCols.append(colNo)

    return weekCols


def getWeekEndings(sheet, rowNo, colNos):
    '''Read the week-ending date from each of the given columns of the specified row.

    Text cells are parsed as day-month-year (e.g. "03-Jan-20"); any other cell is
    treated as a day count relative to 1900-01-01, minus 2 days
    (presumably a spreadsheet serial date - confirm against the source workbooks).
    Returns a list of "YYYY-MM-DD" strings in the same order as colNos.
    '''
    serialBase = datetime(1900, 1, 1)

    def toIsoDate(rawValue):
        # Text dates and numeric dates need different conversions
        if isinstance(rawValue, str):
            parsed = datetime.strptime(rawValue, '%d-%b-%y')
        else:
            parsed = serialBase + timedelta(days=rawValue - 2)
        return parsed.strftime("%Y-%m-%d")

    return [toIsoDate(sheet.cell(rowNo, colNo).value) for colNo in colNos]


def getCellValue(sheet, rowNo, colNo):
    '''Return the value of the specified cell as an integer.

    A cell containing ":" yields 0 without any warning. Any other value that
    cannot be converted with int() also yields 0, but a warning showing the
    offending value is printed.
    '''

    rawValue = sheet.cell(rowNo, colNo).value

    # 2011 switched from ICD-10 v 2001 to ICD-10 v 2010 (NCHS)
    # 2014 switched from ICD-10 v 2010 (NCHS) to ICD-10 v 2013 (IRIS)
    # NOTE(review): ":" presumably marks figures unavailable due to these switches - confirm
    if rawValue == ":":
        return 0

    # Allow non-integers to be treated as zero but show a warning
    try:
        return int(rawValue)
    except (TypeError, ValueError, OverflowError):
        # Report the original value, not the zero that replaces it
        print(f"Warning: Failed to convert row {rowNo} col {colNo} to integer - {rawValue}")
        return 0

In [8]:
def processRegions(data, sheet, weekColNos, weekEndings):
    '''Append the weekly deaths of every region found in the worksheet to data.

    data maps an area name to a list of [weekEnding, weekNumber, weeklyDeaths] rows;
    one row is added per entry in weekColNos for each name in common_core.regionNames.
    '''

    for regionName in common_core.regionNames:
        # Locate the region's row first so that a missing region fails before data is touched
        rowNo = findRowNo(sheet, regionName, common_core.regionAliases)

        regionRows = data.setdefault(regionName, [])

        # Week numbers are 1-based positions within weekColNos
        for weekIdx, colNo in enumerate(weekColNos):
            regionRows.append([weekEndings[weekIdx], weekIdx + 1, getCellValue(sheet, rowNo, colNo)])


def processNations(data, sheet, weekColNos, weekEndings):
    '''Append the weekly deaths of the supported nations (currently only Wales) to data.

    data maps an area name to a list of [weekEnding, weekNumber, weeklyDeaths] rows;
    one row is added per entry in weekColNos.
    '''

    for nationName in ("Wales",):
        # Locate the nation's row first so that a missing nation fails before data is touched
        rowNo = findRowNo(sheet, nationName)

        nationRows = data.setdefault(nationName, [])

        # Week numbers are 1-based positions within weekColNos
        for weekIdx, colNo in enumerate(weekColNos):
            nationRows.append([weekEndings[weekIdx], weekIdx + 1, getCellValue(sheet, rowNo, colNo)])


def processSheet(data, sheet):
    '''Extract the weekly deaths for all supported nations and regions from one worksheet.

    The week number row fixes which columns are read; the week ended row supplies
    the date for each of those columns. Results are appended to data.
    '''

    # Columns are identified by the sequence 1, 2, 3, ... in the week number row
    weekColNos = getWeekColNos(sheet, findRowNo(sheet, WEEK_NUMBER))
    assert len(weekColNos) in (52, 53), "Number of weeks was not 52 or 53"

    # Dates are read from the same columns of the week ended row
    weekEndings = getWeekEndings(sheet, findRowNo(sheet, WEEK_ENDED), weekColNos)
    assert len(weekColNos) == len(weekEndings), "Number of week endings did not match number of weeks"

    processNations(data, sheet, weekColNos, weekEndings)
    processRegions(data, sheet, weekColNos, weekEndings)

## Save Data

Turn data from lists into CSV format

In [9]:
def saveArea(data, areaType, areaName):
    '''Write the rows collected for one area to its CSV file.

    data     - dict mapping area names to lists of [week_ended, week_number, total_deaths]
    areaType - sub-directory name for the kind of area (callers pass "nation" or "region")
    areaName - key into data; also used (via common_core.getSafeName) for the file name

    The rows in data[areaName] are sorted in place before being written.
    '''

    # Prepare headings
    headings = ["week_ended", "week_number", "total_deaths"]

    # Sort data chronologically (rows start with the YYYY-MM-DD week ending)
    areaData = data[areaName]
    areaData.sort()

    # Ensure CSV path exists (exist_ok avoids a race between checking and creating)
    csvPath = os.path.join(common_core.projdir, "data", "ons-deaths", "csv", "weekly", "deaths", areaType)
    os.makedirs(csvPath, exist_ok=True)

    # Write the CSV - newline='' lets the csv module control line endings,
    # which prevents blank lines between rows on Windows
    csvFn = os.path.join(csvPath, common_core.getSafeName(areaName) + ".csv")
    with open(csvFn, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(headings)
        writer.writerows(areaData)


def saveRegions(data):
    '''Write one CSV file for each region listed in common_core.regionNames'''

    areaType = "region"
    for name in common_core.regionNames:
        saveArea(data, areaType, name)
    
    
def saveNations(data):
    '''Write one CSV file for each supported nation (currently only Wales)'''

    areaType = "nation"
    for name in ("Wales",):
        saveArea(data, areaType, name)
    
    
def saveData(data):
    '''Write every extracted area to CSV - nations first, then regions'''

    for saveAreaType in (saveNations, saveRegions):
        saveAreaType(data)

## Interface

Main interface for converting from XLSX files to CSV files

In [10]:
def convertDeaths(partNames):
    '''Convert the named spreadsheets of weekly deaths into per-area CSV files.

    partNames lists workbook file names relative to the raw ons-deaths directory.
    Only worksheets whose name starts with "weekly figures" (any case) are parsed.
    '''

    # Accumulates rows for all areas, keyed by area name
    data = {}

    rawDir = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

    # Iterate through all workbooks
    for partName in partNames:
        workbook = open_workbook(os.path.join(rawDir, partName))

        # Pick out the worksheets that contain useful data
        weeklySheets = (
            candidate for candidate in workbook.sheets()
            if candidate.name.lower().startswith("weekly figures")
        )
        for sheet in weeklySheets:
            processSheet(data, sheet)

    # Save all of the extracted data to CSV files
    saveData(data)

In [11]:
# Fetch the spreadsheets (using the default skipExisting / verbose settings from common_core)
partNames = downloadDeaths()

# Convert everything that downloadDeaths returned into CSV files
convertDeaths(partNames)