In [303]:
import urllib
import urllib.request
from datetime import date, timedelta
from functools import reduce

import altair as alt
import numpy as np
import pandas as pd

In [304]:
# NHS File Format
# URL template for the daily NHS England workbook. The DIRECTORY and DATE
# placeholders are replaced with the report date rendered through the two
# strftime patterns below.
nhs_file_format = 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/DIRECTORY/COVID-19-total-announced-deaths-DATE.xlsx'
# Renders e.g. 4-April-2020. NOTE: `%-d` (day without zero padding) is a
# platform-specific strftime extension (not available on every platform, e.g.
# Windows), and `%B` follows the current locale.
nhs_file_date_format = '%-d-%B-%Y'
# Renders e.g. 2020/04 (year/month upload directory).
nhs_file_directory_format = '%Y/%m'

In [305]:
# Variables
# Codes of the NHS trusts this notebook reports on. Rows of the downloaded
# spreadsheet are kept only when their `Code` column is in this list; the
# trailing comment on each entry is the trust's name.
our_trust_codes = [
    'RYW', # Birmingham Community Healthcare NHS Foundation Trust
    'RNA', # The Dudley Group NHS Foundation Trust
    'RL4', # The Royal Wolverhampton NHS Trust
    'RXK', # Sandwell And West Birmingham Hospitals NHS Trust
    'RXW', # Shrewsbury And Telford Hospital NHS Trust
    'RRJ', # The Royal Orthopaedic Hospital NHS Foundation Trust
    'RRK', # University Hospitals Birmingham NHS Foundation Trust
    'RJE', # University Hospitals Of North Midlands NHS Trust
    'RBK', # Walsall Healthcare NHS Trust
    'RWP', # Worcestershire Acute Hospitals NHS Trust
    'R1A', # Worcestershire Health And Care NHS Trust
    'RL1', # The Robert Jones And Agnes Hunt Orthopaedic Hospital NHS Foundation Trust
    'RLY', # North Staffordshire Combined Healthcare NHS Trust
]

In [306]:
# Inclusive range of report dates to probe for a published spreadsheet.
start_date = date(2020, 4, 4) # as of today (2020-04-15) this is the first day/file to exist
end_date = date.today() # evaluated each time the cell runs, so the range grows over time

def filter_real_urls(day):
    """Return True when the URL stored in ``day['url']`` can be opened.

    Used as a predicate to drop days for which no spreadsheet was published.

    Parameters
    ----------
    day : dict
        Mapping with a ``'url'`` entry holding the address to probe.

    Returns
    -------
    bool
        True if ``urllib.request.urlopen`` succeeds, False if the request
        fails (HTTP error status, network failure, or malformed URL).
    """
    try:
        # Only reachability matters: the body is never read, and the context
        # manager closes the response so connections are not leaked.
        with urllib.request.urlopen(day['url']):
            return True
    except (OSError, ValueError):
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit through so the probe loop can be stopped.
        return False

def _nhs_file_url(day):
    """Return the expected spreadsheet URL for the report published on ``day``."""
    # `%-d` (non-padded day) is not supported by strftime on every platform
    # (e.g. Windows), so substitute the day number before formatting.
    date_format = nhs_file_date_format.replace('%-d', str(day.day))
    return nhs_file_format.replace(
        'DATE',
        day.strftime(date_format)
    ).replace(
        'DIRECTORY',
        day.strftime(nhs_file_directory_format)
    )

# Every calendar day from start_date to end_date, inclusive.
report_days = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# One dict per day whose spreadsheet actually exists. This is a list rather
# than a lazy filter/map iterator so that later cells can be re-run without
# silently receiving an already-exhausted iterator.
data = [
    candidate
    for candidate in ({'date': day, 'url': _nhs_file_url(day)} for day in report_days)
    if filter_real_urls(candidate)
]

In [307]:
# for each item in data (one dict per daily file that exists)
    # convert excel to csv
    # filter and clean
    # create dataframe
    # add a column for date reported based on the file date / path date
    # merge/append this into the new dataframe

# convert to csv, filter and clean; returns the input dict with the dataframe added under 'dataframe'
def map_nhsdeaths_to_dataframe(x):
    """Load one daily NHS workbook and attach a tidy per-trust frame to ``x``.

    Parameters
    ----------
    x : dict
        Needs ``'date'`` (used to name the CSV snapshot) and ``'url'``
        (location of the Excel workbook passed to ``pd.read_excel``).

    Returns
    -------
    dict
        The same ``x`` object, mutated in place: a ``'dataframe'`` key is added
        holding one row per trust per date of death, with columns ``code``,
        ``name``, ``date of death``, ``deaths``, ``cumulative_deaths`` and
        ``reldiff``.

    Side effects
    ------------
    Writes ``nhsdeathsbytrust-YYYYMMDD.csv`` to the current working directory.
    """
    csv_filename = 'nhsdeathsbytrust-' + x['date'].strftime('%Y%m%d') + '.csv'
    workbook = pd.read_excel(
        x['url'],
        header=15,  # row index 15 of the sheet is used as the column header row
        sheet_name='COVID19 total deaths by trust'
    )
    # Keep a CSV snapshot and continue from it.
    # NOTE(review): the round trip presumably is also what turns date-typed
    # column headers into the 'YYYY-...' strings matched below - verify
    # before removing it.
    workbook.to_csv(csv_filename)
    raw = pd.read_csv(csv_filename)

    ## Filtering down to our_trusts
    our_trusts = raw[raw['Code'].isin(our_trust_codes)] # filter to our trusts
    our_trusts = our_trusts.filter(regex='(^Code$|^Name$|[0-9]{4}-.+|^Up to.01-Mar-20$)', axis=1) # filter to only the required columns
    our_trusts = our_trusts.rename(columns={'Code': 'code', 'Name' : 'name'}) # clean up column names
    our_trusts = pd.melt(our_trusts, id_vars=['code','name'], var_name='date of death', value_name='deaths') # un-pivot the date columns
    # Same tolerant pattern as the column filter above, so whichever separator
    # character that filter accepted is also rewritten here.
    our_trusts['date of death'] = our_trusts['date of death'].str.replace(
        '^Up to.01-Mar-20$', '2020-02-29 00:00:00', regex=True
    )
    our_trusts['date of death'] = pd.to_datetime(our_trusts['date of death']) # clean all dates to YYYY-MM-DD

    ## Adding cumulative deaths
    # Select the 'deaths' column explicitly: cumsum over the whole grouped frame
    # would also try to accumulate the string/datetime columns. Grouped by
    # 'code' to match the reldiff calculation below. The running total follows
    # row order, i.e. the order of the date columns in the sheet.
    our_trusts['cumulative_deaths'] = our_trusts.groupby('code')['deaths'].cumsum()

    ## Added relative difference
    our_trusts['reldiff'] = our_trusts.groupby('code')['cumulative_deaths'].pct_change().convert_dtypes(convert_integer=True) # add daily variance

    ## Cleaning float
    our_trusts['deaths'] = our_trusts['deaths'].convert_dtypes(convert_integer=True)
    our_trusts['cumulative_deaths'] = our_trusts['cumulative_deaths'].convert_dtypes(convert_integer=True)

    ## Add dataframe as a new column
    x['dataframe'] = our_trusts

    return x

def map_add_reported_day(x):
    """Stamp every row of ``x['dataframe']`` with the file's report date.

    Adds a ``'date reported'`` column (set to ``x['date']``) to the frame in
    place and returns the same ``x`` dict.
    """
    reported_on = x['date']
    frame = x['dataframe']
    frame['date reported'] = reported_on
    return x

# Leading column order of the combined frame; any further columns produced by
# the per-day frames (e.g. 'reldiff') follow in their original order.
all_nhs_columns = ['code','name','date of death','date reported','deaths','cumulative_deaths']

# One tidy frame per available daily file, each stamped with its report date.
daily_frames = [
    map_add_reported_day(map_nhsdeaths_to_dataframe(day_file))['dataframe']
    for day_file in data
]

if daily_frames:
    # A single concat replaces repeated DataFrame.append (removed in pandas 2.0
    # and quadratic when chained). Each frame keeps its own row index, as
    # append did.
    combined = pd.concat(daily_frames)
    extra_columns = [col for col in combined.columns if col not in all_nhs_columns]
    all_nhs_data = combined.reindex(columns=all_nhs_columns + extra_columns)
else:
    # No files found: same empty, correctly-labelled frame the reduce seed gave.
    all_nhs_data = pd.DataFrame(columns=all_nhs_columns)

In [308]:
# Show the combined frame as this cell's rich output.
all_nhs_data

Unnamed: 0,code,name,date of death,date reported,deaths,cumulative_deaths,reldiff
0,RYW,BIRMINGHAM COMMUNITY HEALTHCARE NHS FOUNDATION...,2020-02-29,2020-04-04,0,0,
1,RXK,SANDWELL AND WEST BIRMINGHAM HOSPITALS NHS TRUST,2020-02-29,2020-04-04,1,1,
2,RXW,SHREWSBURY AND TELFORD HOSPITAL NHS TRUST,2020-02-29,2020-04-04,0,0,
3,RNA,THE DUDLEY GROUP NHS FOUNDATION TRUST,2020-02-29,2020-04-04,0,0,
4,RRJ,THE ROYAL ORTHOPAEDIC HOSPITAL NHS FOUNDATION ...,2020-02-29,2020-04-04,0,0,
...,...,...,...,...,...,...,...
593,RRK,UNIVERSITY HOSPITALS BIRMINGHAM NHS FOUNDATION...,2020-04-14,2020-04-15,0,484,0.000000
594,RJE,UNIVERSITY HOSPITALS OF NORTH MIDLANDS NHS TRUST,2020-04-14,2020-04-15,2,101,0.020202
595,RBK,WALSALL HEALTHCARE NHS TRUST,2020-04-14,2020-04-15,0,98,0.000000
596,RWP,WORCESTERSHIRE ACUTE HOSPITALS NHS TRUST,2020-04-14,2020-04-15,0,111,0.000000
