# PHE Core

Created by Michael George (AKA Logiqx)

Website: https://logiqx.github.io/covid-stats/

## Imports

Standard Python libraries, plus `common_core`, which provides `projdir`, the basic `Printable` class, etc.

In [1]:
import ast
import csv
import json
import os

import requests

import common_core

## Configuration

Data to download via the API - cases, patients, deaths

In [2]:
# Passed to common_core.WebDownload by downloadSurveillance()
# NOTE(review): presumably skips files that are already on disk - confirm in common_core
skipExisting = True

# Web page that links to the weekly flu and COVID-19 surveillance report files
surveillanceUrl = "https://www.gov.uk/government/statistics/national-flu-and-covid-19-surveillance-reports"
# Local folder that receives the raw surveillance downloads
surveillancePath = os.path.join(common_core.projdir, "data", "phe-surveillance", "raw")

# (name, filename regex) pairs passed to common_core.WebDownload.downloadFiles
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot rather than relying on an invalid string escape sequence
surveillanceFiles = [
    ("weekly", r"Weekly_Influenza_and_COVID19_report_data_.*\.xlsx$")
]

In [3]:
# Base URL of the coronavirus dashboard data API queried by Area.download()
ENDPOINT = "https://api.coronavirus.data.gov.uk/v1/data"

In [4]:
# The four UK nations - age demographics are only extracted for England (see Area.prepare)
nationNames = [
    "England",
    "Scotland",
    "Wales",
    "Northern Ireland",
]

# The 9 regions of England
regionNames = [
    "North West",
    "North East",
    "Yorkshire and The Humber",
    "West Midlands",
    "East Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

# The 7 NHS regions of England
nhsRegionNames = [
    "North West",
    "North East and Yorkshire",
    "Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

# A hand-picked selection of lower tier local authorities (LTLA)
ltlaNames = [
    # Dorset
    "Dorset",
    "Bournemouth, Christchurch and Poole",
    # Hertfordshire
    "Stevenage",
    "Welwyn Hatfield",
    "North Hertfordshire",
    "East Hertfordshire",
    # Birmingham
    "Sandwell",
    "Dudley",
    "Birmingham",
    # Derbyshire
    "Derbyshire Dales",
    "North East Derbyshire",
    "High Peak",
    "Sheffield",
    # London
    "Croydon",
]

# Every (area type, area names) pair to be processed
areas = [
    ("overview", ["United Kingdom"]),
    ("nation", nationNames),
    ("region", regionNames),
    ("nhsregion", nhsRegionNames),
    ("ltla", ltlaNames),
]

In [5]:
# Each structure maps the column name used in the downloaded CSV (key)
# to the metric requested from the API (value)

# Metrics relating to cases
casesStructure = {
    # Cases by specimen date
    "cases": "newCasesBySpecimenDate",
    # Rate of cases per 100K population in 7 day period
    "casesRollingRate": "newCasesBySpecimenDateRollingRate",
    # Demographics - 5 year age bands
    "casesAgeDemographics": "newCasesBySpecimenDateAgeDemographics",
    # Cases by date reported
    "casesReported": "newCasesByPublishDate",
}

# Metrics relating to hospital patients
patientsStructure = {
    # Patients admitted to hospital
    "admissions": "newAdmissions",
    # Rate of admissions per 100K population in 7 day period
    "admissionsRollingRate": "newAdmissionsRollingRate",
    # Patients in hospital
    "patients": "hospitalCases",
    # Patients in mechanical ventilation beds
    "patientsMv": "covidOccupiedMVBeds",
    # Demographics - 0-5, 6-17, 18-64, 65-84, 85+
    "cumAdmissionsByAge": "cumAdmissionsByAge",
}

# Metrics relating to deaths
deathsStructure = {
    # Deaths within 28d of +ve test by date of death
    "deaths": "newDeaths28DaysByDeathDate",
    # Rate of deaths per 100K population in 7 day period
    "deathsRollingRate": "newDeaths28DaysByDeathDateRollingRate",
    # Demographics - 5 year age bands
    "deathsAgeDemographics": "newDeaths28DaysByDeathDateAgeDemographics",
    # Deaths within 28d of +ve test by date reported
    "deathsReported": "newDeaths28DaysByPublishDate",
}

# Metrics sourced from the ONS
onsStructure = {
    # COVID-19 on the death certificate
    "deathsRegistered": "newOnsDeathsByRegistrationDate",
}

In [6]:
# 5 year age bands used for cases and deaths - '00_04' through '85_89', then '90+'
ageDemographics = [f"{lower:02d}_{lower + 4:02d}" for lower in range(0, 90, 5)]
ageDemographics.append('90+')

# Age bands used for hospital admissions
admissionDemographics = ['0_to_5', '6_to_17', '18_to_64', '65_to_84', '85+']

# Fields copied from each demographic record, in output order
demographicFields = [
    # newCasesBySpecimenDateAgeDemographics + newDeaths28DaysByDeathDateAgeDemographics
    'cases',
    'deaths',
    'rollingRate',
    # cumAdmissionsByAge
    'rate',
    'value',
]

## Area Class

Download data via the API and prepare it for analysis.

Supports the UK overview, nations, regions, NHS regions and LTLAs.

In [7]:
class Area(common_core.Printable):
    """A single area (overview, nation, region, NHS region or LTLA) on the PHE dashboard.

    Raw API responses are saved beneath "data/phe-dashboard/raw" and are then
    converted into one tidy CSV per category beneath "data/phe-dashboard/csv".
    """

    def __init__(self, areaType, areaName):
        """Initialise the area object

        areaType - area type used in the API filter and in directory names, e.g. "nation"
        areaName - area name used in the API filter, e.g. "England"
        """

        self.areaType = areaType
        self.areaName = areaName

        # File system friendly name - lower case, underscores instead of spaces, no commas
        self.safeName = areaName.lower().replace(' ', '_').replace(',', '')

        # Drop filler words, e.g. "east_of_england" becomes "east_england"
        # NOTE(review): this is a plain substring replacement so it would also alter any
        # word that merely ends in "of", "and" or "the" - left unchanged because the
        # names of existing data files depend on it
        for word in 'of', 'and', 'the':
            self.safeName = self.safeName.replace(word + '_', '')

        self.csvName = self.safeName + '.csv'


    def getRawPath(self, period):
        """Get path for raw data

        period - "daily" or "weekly"
        """
        rawPath = os.path.join(common_core.projdir, "data", "phe-dashboard", "raw", period, self.areaType)

        return rawPath


    def getCsvPath(self, period, category):
        """Get path for csv data

        period - "daily" or "weekly"
        category - "cases", "patients", "deaths" or "ons"
        """
        csvPath = os.path.join(common_core.projdir, "data", "phe-dashboard", "csv", period, category, self.areaType)

        return csvPath


    def download(self, period = "daily"):
        """Download data from PHE dashboard

        period - "weekly" requests the ONS metrics, anything else requests the daily metrics

        The response is saved as-is to the raw path for the period.
        Raises AssertionError if the API does not respond with HTTP 200.
        Any exception is reported via print() and then re-raised.
        """

        try:
            filters = [
                f"areaType={self.areaType}",
                f"areaName={self.areaName}"
            ]

            # Every request includes these two columns, followed by the category specific metrics
            structure = {
                "date": "date",
                "areaName": "areaName"
            }

            if period == "weekly":
                if self.areaType in ['overview', 'nation', 'region', 'ltla']:
                    structure.update(onsStructure)
            else:
                if self.areaType in ['overview', 'nation', 'region', 'ltla']:
                    structure.update(casesStructure)
                    structure.update(deathsStructure)
                if self.areaType in ['overview', 'nation', 'nhsregion']:
                    structure.update(patientsStructure)

            api_params = {
                "filters": ";".join(filters),
                "structure": json.dumps(structure, separators=(",", ":")),
                "format": "csv"
            }

            # Download raw data - hybrid of CSV and Python dictionaries
            response = requests.get(ENDPOINT, params=api_params, timeout=10)

            # Explicit check rather than an assert statement, which is stripped by "python -O"
            # AssertionError is retained so that existing callers see the same exception type
            if response.status_code != 200:
                raise AssertionError(f"Failed request for {self.areaName}: {response.status_code} {response.text}")

            # Ensure raw path exists
            rawPath = self.getRawPath(period)
            os.makedirs(rawPath, exist_ok=True)

            # Save raw data - fixed encoding and no newline translation, regardless of platform
            rawFn = os.path.join(rawPath, self.csvName)
            with open(rawFn, 'w', encoding='utf-8', newline='') as f:
                f.write(response.content.decode())

        # Report the failure then let the exception propagate
        except Exception:
            print(f"Failed to download {period} data for {self.areaName}")
            raise


    def downloadDaily(self):
        """Download daily data for analysis"""

        # Daily data is available for all area types
        print(f"Downloading {self.areaName}...")
        self.download()


    def downloadWeekly(self):
        """Download weekly data for analysis"""

        # ONS data is not available for 'nhsregion'
        if self.areaType in ['overview', 'nation', 'region', 'ltla']:
            print(f"Downloading {self.areaName}...")
            self.download("weekly")


    def prepare(self, category, period = "daily"):
        """Prepare data for analysis

        category - "cases", "patients", "deaths" or "ons"
        period - "daily" or "weekly", selects the raw file to read and the CSV folder to write

        Reads the raw file for this area, keeps the columns relevant to the category,
        expands any demographics column into one pair of columns per age band and
        writes the rows in ascending order to the CSV path.

        Raises ValueError for an unsupported category or an empty raw file.
        Any exception is reported via print() and then re-raised.
        """

        def getColNos(row, category):
            """Get column numbers relevant to the category"""

            if category == "cases":
                structure = casesStructure
            elif category == "patients":
                structure = patientsStructure
            elif category == "deaths":
                structure = deathsStructure
            elif category == "ons":
                structure = onsStructure
            else:
                raise ValueError(f"Unsupported category - {category}")

            # Always include date and areaName - see default "structure" in download()
            colNos = [0, 1]

            # Other columns are dependent on the category and the fields defined in its "structure"
            for colNo, colName in enumerate(row):
                if colName not in structure:
                    continue

                # Demographic columns are only kept for specific areas, which vary by category
                if category == "cases" and colName.endswith("Demographics"):
                    wanted = self.areaName == "England" or self.areaType in ["region", "ltla"]
                elif category == "patients" and colName == "cumAdmissionsByAge":
                    wanted = self.areaName == "England" or self.areaType == "nhsregion"
                elif category == "deaths" and colName.endswith("Demographics"):
                    # Age demographics are not available for deaths within "ltla"
                    wanted = self.areaName == "England" or self.areaType == "region"
                else:
                    wanted = True

                if wanted:
                    colNos.append(colNo)

            return colNos


        def getSpecialColNo(row, colNos):
            """Get position of special column - e.g. demographics"""

            for colNo in colNos:
                if row[colNo].endswith("Demographics") or row[colNo] == "cumAdmissionsByAge":
                    return colNo

            # Use -1 to indicate not present
            return -1


        def getColNames(row, colNos):
            """Get column names relevant to the category"""

            colNames = []

            for colNo in colNos:
                # Demographics are provided as a Python structure so expand to two columns per age band
                if row[colNo].endswith("Demographics"):
                    # Only use the demographic fields that are actually required for cases and deaths
                    for ageDemographic in ageDemographics:
                        colNames.append(f"{category}{ageDemographic}")
                        colNames.append(f"{category}RollingRate{ageDemographic}")
                elif row[colNo] == "cumAdmissionsByAge":
                    # Only use the demographic fields that are actually required for admissions
                    for ageDemographic in admissionDemographics:
                        colNames.append(f"cumAdmissionsRollingRate{ageDemographic}")
                        colNames.append(f"cumAdmissions{ageDemographic}")
                else:
                    # Regular columns (including date and areaName) keep their name
                    colNames.append(row[colNo])

            return colNames


        def getRowValues(row, colNos, specialColNo, demographics):
            """Get row values for specific field numbers"""

            tidyRow = []
            for colNo in colNos:
                if colNo != specialColNo:
                    # Copy value of regular column
                    tidyRow.append(row[colNo])
                    continue

                # The demographics are a Python literal (list of dictionaries) inside the CSV field
                # literal_eval only accepts literals so downloaded text can never execute code
                if row[colNo].startswith('['):
                    items = ast.literal_eval(row[colNo])
                else:
                    items = []

                # Pick out the age demographics that are required
                for demographic in demographics:
                    item = next((item for item in items if item["age"] == demographic), None)
                    if item is None:
                        # Some records do not include the age demographics, just an empty list
                        tidyRow.extend(["", ""])
                    else:
                        # NOTE(review): the column names assume exactly two of these fields
                        # are present in each record - confirm against the API output
                        tidyRow.extend(item[field] for field in demographicFields if field in item)

            return tidyRow


        try:
            # Determine raw filename
            rawFn = os.path.join(self.getRawPath(period), self.csvName)

            # Ensure the CSV path exists
            csvPath = self.getCsvPath(period, category)
            os.makedirs(csvPath, exist_ok=True)

            # Demographics vary for different categories
            if category == "patients":
                demographics = admissionDemographics
            else:
                demographics = ageDemographics

            # Convert the raw data before opening the output file
            # A failure will therefore not leave a truncated CSV behind
            with open(rawFn, 'r', encoding='utf-8', newline='') as f:
                reader = csv.reader(f, delimiter = ',')

                # First row is column names
                header = next(reader, None)
                if header is None:
                    raise ValueError(f"Raw file is empty - {rawFn}")

                colNos = getColNos(header, category)
                specialColNo = getSpecialColNo(header, colNos)
                colNames = getColNames(header, colNos)

                # Subsequent rows are actual data - blank lines are ignored
                rows = [getRowValues(row, colNos, specialColNo, demographics) for row in reader if row]

            # PHE publish data in reverse chronological order
            rows.sort()

            # Generate the CSV - the csv module requires newline='' on the file
            csvFn = os.path.join(csvPath, self.csvName)
            with open(csvFn, 'w', encoding='utf-8', newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(colNames)
                writer.writerows(rows)

        # Report the failure then let the exception propagate
        except Exception:
            print(f"Failed to convert {period} {category} for {self.areaName}")
            raise


    def prepareDaily(self):
        """Prepare daily data for analysis"""

        print(f"Preparing {self.areaName}...")

        # There is no daily case / death data for the "nhsregion" area type
        if self.areaType in ["overview", "nation", "region", "ltla"]:
            self.prepare("cases")
            self.prepare("deaths")

        # Patient data is only available for "nation" and "nhsregion", not "ltla"
        if self.areaType in ["overview", "nation", "nhsregion"]:
            self.prepare("patients")


    def prepareWeekly(self):
        """Prepare weekly data for analysis"""

        # Matches downloadWeekly - there is no weekly data for "nhsregion"
        if self.areaType in ["overview", "nation", "region", "ltla"]:
            print(f"Preparing {self.areaName}...")
            self.prepare("ons", "weekly")

In [8]:
def downloadSurveillance():
    """Download the surveillance report files matching surveillanceFiles"""
    # Hand the work to the shared web downloader, passing on the skipExisting setting
    webDownload = common_core.WebDownload(skipExisting=skipExisting)
    # NOTE(review): the result is assigned but not used by the code visible here
    files = webDownload.downloadFiles(surveillancePath, surveillanceUrl, surveillanceFiles)