# ONS Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard python libraries plus determination of projdir, basic printable class, etc

In [1]:
import os
import re
from datetime import datetime, timedelta

import csv
from xlrd import open_workbook

import common_core

## Configuration

Data to download from the ONS website

In [2]:
# Names of the 4 UK nations, taken from common_core
nationNames = common_core.nationNames

# Names of the 9 English regions, taken from common_core
regionNames = common_core.regionNames

# Each area type paired with the names belonging to it
areas = [
    ("nation", nationNames),
    ("region", regionNames),
]

In [3]:
# The latest ONS age bands: two infant/child bands, then regular
# five-year bands from 5-9 up to 85-89, then an open-ended top band
ageDemographics = (
    ['<1', '1-4']
    + [f"{lower}-{lower + 4}" for lower in range(5, 90, 5)]
    + ['90+']
)

# The legacy ONS age bands (much coarser than the latest ones)
legacyAgeDemographics = [
    '01-14',
    '15-44',
    '45-64',
    '65-74',
    '75-84',
    '85+',
]

In [4]:
# ONS page listing the weekly provisional deaths spreadsheets
deathsUrl = "https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales"

# Local folder that receives the raw downloads
deathsPath = os.path.join(common_core.projdir, "data", "ons-deaths", "raw")

# Pairs of (name, filename regex) handed to WebDownload.downloadFiles.
# The regex accepts both .xls and .xlsx; it is a raw string so that "\."
# reaches the regex engine without being an invalid string escape.
# NOTE(review): the first element appears to be the sub-folder used for the
# downloaded files (see the "weekly/..." part names) - confirm in common_core.
deathsFiles = [
    ("weekly", r".*\.xlsx?$")
]

## Download Data

Download spreadsheets by parsing the HTML for suitable links

In [5]:
def downloadDeaths(skipExisting=common_core.skipExisting, verbose=common_core.verbose):
    '''Download the deaths spreadsheets linked from deathsUrl into deathsPath.

    Both options are passed straight through to common_core.WebDownload.
    Returns whatever WebDownload.downloadFiles returns (the part names that
    are later handed to convertDeaths).
    '''
    downloader = common_core.WebDownload(skipExisting=skipExisting, verbose=verbose)
    return downloader.downloadFiles(deathsPath, deathsUrl, deathsFiles)

## Convert to CSV

In [6]:
# Row headings that are matched by their exact wording (ignoring case)
WEEK_NUMBER = "Week number"
WEEK_ENDED = "Week ended"

# Row headings that are matched as regular expressions (via re.match)
TOTAL_DEATHS = r"^Total deaths, all ages"
RESPIRATORY_DISEASES = r".*ICD-10 J00-J99.*"

# Number of leading columns that are searched for row headings
MAX_COLS_WITH_HEADERS = 2


def findRowNos(sheet, headings, aliases, patterns):
    '''Find rows with the specified headings. Also check for possible aliases.

    Only text cells in the first MAX_COLS_WITH_HEADERS columns are examined
    (fewer if the sheet itself is narrower).

    sheet    - worksheet exposing nrows, ncols and cell(rowNo, colNo).value
    headings - row headings matched on their whole text, ignoring case
    aliases  - mapping of heading -> list of alternative wordings, which are
               also matched on their whole text, ignoring case
    patterns - regular expressions applied to the cell text with re.match
               (anchored at the start of the text, case-sensitive)

    Returns a dict mapping each heading / pattern that was found to its row
    number. Headings and patterns that were not found are simply omitted.

    Raises AssertionError if a heading or pattern matches more than one cell.
    '''
    # Never read beyond the last column of a narrow sheet
    numCols = min(MAX_COLS_WITH_HEADERS, sheet.ncols)

    # Gather the candidate text cells once, in row-major order, instead of
    # re-reading the whole sheet for every heading and pattern
    textCells = []
    for rowNo in range(sheet.nrows):
        for colNo in range(numCols):
            cellValue = sheet.cell(rowNo, colNo).value
            if isinstance(cellValue, str):
                textCells.append((rowNo, cellValue, cellValue.lower()))

    rowNos = {}

    # Search for row headings with precise wording
    for heading in headings:
        # Aliases are still regarded as precise wording
        wanted = {heading.lower()}
        if heading in aliases:
            wanted.update(alias.lower() for alias in aliases[heading])

        matches = [rowNo for rowNo, _, textLower in textCells if textLower in wanted]

        # Explicit raise (rather than assert) so the check survives python -O
        if len(matches) > 1:
            raise AssertionError(f"Found multiple rows containing {heading}")
        if matches:
            rowNos[heading] = matches[0]

    # Search for row headings using regular expressions
    for pattern in patterns:
        regex = re.compile(pattern)
        matches = [rowNo for rowNo, text, _ in textCells if regex.match(text)]

        if len(matches) > 1:
            raise AssertionError(f"Found multiple rows containing {pattern}")
        if matches:
            rowNos[pattern] = matches[0]

    return rowNos


def getWeekCols(sheet, rowNo):
    '''Return the column numbers holding week 1, week 2, ... in the given row.

    The row is read from left to right; a column is accepted only when its
    value equals the next week number that is expected, so anything else
    (labels, blanks, out-of-sequence numbers) is skipped.
    '''
    columns = []
    for colNo in range(sheet.ncols):
        # The next expected week is always one more than the weeks found so far
        if sheet.cell(rowNo, colNo).value == len(columns) + 1:
            columns.append(colNo)

    return columns


def getWeekEndings(sheet, rowNo, colNos):
    '''Return the week-ending date (as a datetime) for each requested column.

    Text cells are parsed using the format dd-Mon-yy. Any other cell value is
    treated as a number of days and converted relative to 1 January 1900.
    '''
    baseDate = datetime(1900, 1, 1)

    def toDate(value):
        if isinstance(value, str):
            return datetime.strptime(value, '%d-%b-%y')
        # NOTE(review): the offset of 2 presumably allows for spreadsheet
        # serial dates starting at 1 and counting a 29-Feb-1900 - confirm
        return baseDate + timedelta(days=value - 2)

    return [toDate(sheet.cell(rowNo, colNo).value) for colNo in colNos]


def getWeeklyDeaths(sheet, rowNo, colNos):
    '''Determine the weekly deaths from the cells in the specified row.

    sheet  - worksheet exposing cell(rowNo, colNo).value
    rowNo  - row holding the figures
    colNos - columns to read, one per week

    Returns a list of integers, one per requested column. A cell that cannot
    be converted to an integer is recorded as zero and a warning is printed.
    '''
    numDeaths = []
    for colNo in colNos:
        cellValue = sheet.cell(rowNo, colNo).value

        # A colon is used as a placeholder instead of a figure in some years:
        # 2011 switched from ICD-10 v 2001 to ICD-10 v 2010 (NCHS)
        # 2014 switched from ICD-10 v 2010 (NCHS) to ICD-10 v 2013 (IRIS)
        if cellValue == ":":
            cellValue = 0

        # Allow non-integers to be treated as zero but show a warning.
        # Only conversion failures are handled here, so that KeyboardInterrupt
        # and other unrelated errors are no longer swallowed.
        try:
            deaths = int(cellValue)
        except (TypeError, ValueError, OverflowError):
            deaths = 0
            print(f"Warning: Failed to convert row {rowNo} col {colNo} to integer - {cellValue}")

        numDeaths.append(deaths)

    return numDeaths


def processSheet(sheet):
    '''Parse the specified worksheet for weekly deaths.

    Returns a dict containing:
      WEEK_NUMBER - the column numbers holding weeks 1, 2, 3, ...
      WEEK_ENDED  - the week-ending date for each of those columns
      each name in common_core.regionNames, plus "Wales" - weekly deaths
      TOTAL_DEATHS and RESPIRATORY_DISEASES (keyed by pattern) - weekly deaths

    Raises KeyError if any expected row cannot be found in the worksheet and
    AssertionError if the worksheet does not contain 52 or 53 weeks.
    '''
    areaNames = common_core.regionNames + ["Wales"]
    headings = [WEEK_NUMBER, WEEK_ENDED] + areaNames
    patterns = [TOTAL_DEATHS, RESPIRATORY_DISEASES]
    rowNos = findRowNos(sheet, headings, common_core.regionAliases, patterns)

    # findRowNos omits anything it did not find, so report every missing row
    # here rather than failing later with a bare KeyError for just the first
    missing = [key for key in headings + patterns if key not in rowNos]
    if missing:
        raise KeyError(f"Worksheet is missing expected row(s): {missing}")

    data = {}

    # Explicit raise (rather than assert) so the check survives python -O
    weekCols = getWeekCols(sheet, rowNos[WEEK_NUMBER])
    if len(weekCols) not in (52, 53):
        raise AssertionError("Number of weeks was not 52 or 53")
    data[WEEK_NUMBER] = weekCols

    data[WEEK_ENDED] = getWeekEndings(sheet, rowNos[WEEK_ENDED], weekCols)
    assert len(data[WEEK_ENDED]) == len(weekCols), "Number of week endings did not match number of weeks"

    # Regions and Wales first, then the rows located by regular expression
    for rowKey in areaNames + patterns:
        data[rowKey] = getWeeklyDeaths(sheet, rowNos[rowKey], weekCols)
        assert len(data[rowKey]) == len(weekCols), f"{rowKey} did not have the correct number of weeks"

    return data


def saveData(data):
    '''Placeholder for saving the parsed data - nothing is written yet.

    The argument is currently unused; the function only walks the region
    names and produces no output.
    '''
    # TODO: save the figures for each English region, e.g. using
    # common_core.getSafeName(regionName) to build the file name
    for _ in common_core.regionNames:
        continue

    # TODO: save the figures for "Wales" in the same way
    return None



def convertDeaths(partNames):
    '''Parse each of the specified spreadsheets for weekly deaths.

    partNames - spreadsheet paths relative to deathsPath, as returned by
                downloadDeaths

    Every worksheet whose name starts with "weekly figures" (ignoring case)
    is parsed by processSheet and the result is passed to saveData.
    '''
    for partName in partNames:
        # Read from the same folder that downloadDeaths saves into, rather
        # than rebuilding the path by hand
        fileName = os.path.join(deathsPath, partName)
        print(f"Parsing {partName}...")
        workbook = open_workbook(fileName)

        for sheet in workbook.sheets():
            if sheet.name.lower().startswith("weekly figures"):
                data = processSheet(sheet)
                saveData(data)

In [7]:
# Download the spreadsheets, keeping the names returned for the next step
partNames = downloadDeaths()

# Parse the weekly figures in each of the downloaded spreadsheets
convertDeaths(partNames)

Parsing weekly/publishedweek532020.xlsx...
Parsing weekly/publishedweek522019.xls...
Parsing weekly/publishedweek522018withupdatedrespiratoryrow.xls...
Parsing weekly/publishedweek522017.xls...
Parsing weekly/publishedweek522016.xls...
Parsing weekly/publishedweek2015.xls...
Parsing weekly/publishedweek2014.xls...
Parsing weekly/publishedweek2013.xls...
Parsing weekly/publishedweek2012.xls...
Parsing weekly/publishedweek2011.xls...
Parsing weekly/publishedweek2010.xls...
