In [3]:
import pandas as pd
from datetime import date, datetime, timedelta
import untangle # xml
import requests # json
import re # regular expressions
from functools import reduce

In [4]:
# Configuration
today = date.today()
# Derived from `today` rather than a second date.today() call, so the two values
# cannot disagree if this cell happens to run across midnight.
yesterday = today - timedelta(days=1)

# Single source of truth: UTLA name -> UTLA code.
# Every group below is built from this mapping, so each code is typed exactly once.
_utla_code_by_name = {
    'Birmingham': 'E08000025',
    'Dudley': 'E08000027',
    'Sandwell': 'E08000028',
    'Shropshire': 'E06000051',
    'Staffordshire': 'E10000028',
    'Stoke-on-Trent': 'E06000021',
    'Telford and Wrekin': 'E06000020',
    'Walsall': 'E08000030',
    'Wolverhampton': 'E08000031',
    'Worcestershire': 'E10000034',
}


def _utla_codes(*names):
    """Return the UTLA codes for the given names, in the order given.

    Raises KeyError if a name is not present in `_utla_code_by_name`.
    """
    return [_utla_code_by_name[name] for name in names]


# Every UTLA we track, in the insertion order of the mapping above
our_utla_codes = list(_utla_code_by_name.values())

# Multi-area groupings
utlacodes_expressandstar = _utla_codes(
    'Birmingham', 'Dudley', 'Sandwell', 'Staffordshire',
    'Stoke-on-Trent', 'Walsall', 'Wolverhampton',
)
utlacodes_blackcountry = _utla_codes('Dudley', 'Sandwell', 'Walsall', 'Wolverhampton')
utlacodes_blackcountryandbirmingham = _utla_codes(
    'Birmingham', 'Dudley', 'Sandwell', 'Walsall', 'Wolverhampton',
)
utlacodes_sandwellandbirmingham = _utla_codes('Birmingham', 'Sandwell')
utlacodes_staffordshireandstoke = _utla_codes('Staffordshire', 'Stoke-on-Trent')

# Single-area groupings (kept as one-element lists so they can be used
# anywhere a multi-area grouping is accepted)
utlacodes_staffordshire = _utla_codes('Staffordshire')
utlacodes_wolverhampton = _utla_codes('Wolverhampton')
utlacodes_walsall = _utla_codes('Walsall')
utlacodes_dudley = _utla_codes('Dudley')
utlacodes_sandwell = _utla_codes('Sandwell')
utlacodes_birmingham = _utla_codes('Birmingham')
utlacodes_stoke = _utla_codes('Stoke-on-Trent')
utlacodes_worcestershire = _utla_codes('Worcestershire')

In [5]:
# Fetch the container listing as XML, then walk down to its list of Blob elements
_container_listing = untangle.parse(
    'https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list'
)
xmlblobs = _container_listing.EnumerationResults.Blobs.Blob

In [6]:
# Extract the cdata of the blob names, keep those matching the daily-data pattern,
# and map each to {'date': <datetime parsed from the name>, 'filename': <name>}.

# Raw string: `\.` must reach the regex engine as an escaped dot. In a normal
# string literal it is an invalid escape sequence, which newer Python versions warn about.
# Group 1 captures the YYYYMMDD stamp that follows the `data_` prefix.
pattern = r'^data_([0-9]{4}(0[0-9]|1[0-2])([0-2][0-9]|3[0-1])).+\.json$'

# Each name is searched once; `match.string` is the full blob name that was searched.
# NOTE: the pattern still admits stamps such as month/day `00`, for which
# datetime.strptime raises ValueError.
datafiles = [
    {
        'date': datetime.strptime(match.group(1), '%Y%m%d'),
        'filename': match.string,
    }
    for match in (re.search(pattern, blob.Name.cdata) for blob in xmlblobs)
    if match
]

In [7]:
def utla_day_to_dataframe(xmljsonref):
    """Download one data file and attach our UTLAs' daily cases to it as a DataFrame.

    Parameters
    ----------
    xmljsonref : dict
        Must contain a 'filename' key; it is appended to the CDN base URL to
        build the address of the JSON file to fetch.

    Returns
    -------
    dict
        The same dict object that was passed in (mutated in place), with a new
        'dataframe' key holding a DataFrame with columns
        'code', 'name', 'date of case' and 'cases'. It has one row per entry in
        `dailyConfirmedCases` for each UTLA whose code is in `our_utla_codes`.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # build a url to retrieve
    url = 'https://c19pub.azureedge.net/' + xmljsonref['filename']

    # Fail loudly on HTTP errors and on a stalled connection, instead of
    # surfacing later as a JSON decode error or hanging the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # One row per (UTLA, day) for the UTLAs we track:
    # [UTLA code, UTLA name, date of cases, number of cases]
    utla_data = [
        [code, data['name']['value'], day['date'], day['value']]
        for code, data in payload['utlas'].items()
        if code in our_utla_codes
        for day in data['dailyConfirmedCases']
    ]
    xmljsonref['dataframe'] = pd.DataFrame(
        data=utla_data,
        columns=['code', 'name', 'date of case', 'cases'],
    )

    return xmljsonref

In [8]:
def map_add_reported_day(x):
    """Stamp every row of x['dataframe'] with x['date'] in a 'date reported' column.

    The DataFrame is modified in place, and the same dict `x` is returned so the
    function can be used inside a `map` pipeline.
    """
    reported_on = x['date']
    frame = x['dataframe']
    frame['date reported'] = reported_on
    return x

In [9]:
# Download every daily file, stamp each resulting frame with the date it was
# reported, and stack them into one long frame.
_historical_columns = ['code', 'name', 'date of case', 'date reported', 'cases']
_daily_frames = [
    map_add_reported_day(utla_day_to_dataframe(datafile))['dataframe']
    for datafile in datafiles
]

# DataFrame.append was removed in pandas 2.0, and appending one frame at a time
# re-copies all accumulated rows on every step. Concatenate once instead.
if _daily_frames:
    our_utlas_historical = (
        pd.concat(_daily_frames, ignore_index=True)
        .reindex(columns=_historical_columns)  # keep the established column order
    )
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly-shaped frame
    our_utlas_historical = pd.DataFrame(columns=_historical_columns)

our_utlas_historical['cases'] = our_utlas_historical['cases'].convert_dtypes(convert_integer=True)

In [13]:
# copied from get_latest_deaths, which really ought to be very similar as it's non-cumulative
def get_latest_cases(code_group=False, df=None):
    """Summarise how total cases changed between the two most recent daily reports.

    Parameters
    ----------
    code_group : list of str, optional
        UTLA codes to combine into one summed row. When falsy (the default),
        one row per UTLA is returned instead.
    df : pandas.DataFrame, optional
        Frame with 'code', 'name', 'date reported' and 'cases' columns.
        Defaults to the module-level `our_utlas_historical`, looked up at call
        time so that re-running the data cell is picked up without having to
        re-define this function.

    Returns
    -------
    pandas.DataFrame
        Rows for the latest 'date reported' only. Without `code_group` the
        columns are 'code', 'name', 'date reported', 'cases' and
        'reporting diff', sorted by 'reporting diff' descending. With
        `code_group` the columns are 'date reported', 'cases' and
        'reporting diff', summed over the selected UTLAs.
        'reporting diff' is missing for a UTLA that has no rows reported on
        the day before the latest report.
    """
    if df is None:
        df = our_utlas_historical

    # The latest reporting date and the calendar day before it
    last_day = df['date reported'].max()
    penultimate_day = last_day - timedelta(days=1)

    # remove all but the last two days of data
    df = df[(df['date reported'] == last_day) | (df['date reported'] == penultimate_day)]

    # Total cases per UTLA per report; groupby sorts by its keys, so within each
    # UTLA the earlier report comes first.
    df = df.groupby(['code', 'name', 'date reported'], as_index=False)['cases'].sum()

    # Diff within each UTLA. A frame-wide diff would subtract a *different*
    # UTLA's total whenever a UTLA is missing from the earlier report.
    df['reporting diff'] = df.groupby('code')['cases'].diff().convert_dtypes(convert_integer=True)

    df = df[df['date reported'] == last_day]
    df = df.sort_values(by=['reporting diff'], ascending=False)

    if code_group:
        df = df[df.code.isin(code_group)]
        # Sum only the numeric columns; summing every column would concatenate
        # the 'code' and 'name' strings on pandas >= 2.0.
        df = df.groupby('date reported', as_index=False)[['cases', 'reporting diff']].sum()

    return df

In [15]:
# Latest reported totals and reporting diff, one row per UTLA
get_latest_cases()

Unnamed: 0,code,name,date reported,cases,reporting diff
7,E08000025,Birmingham,2020-04-21,2361,51
17,E10000028,Staffordshire,2020-04-21,1287,39
19,E10000034,Worcestershire,2020-04-21,916,29
13,E08000030,Walsall,2020-04-21,714,27
5,E06000051,Shropshire,2020-04-21,352,22
9,E08000027,Dudley,2020-04-21,607,21
11,E08000028,Sandwell,2020-04-21,776,18
15,E08000031,Wolverhampton,2020-04-21,599,18
1,E06000020,Telford and Wrekin,2020-04-21,180,13
3,E06000021,Stoke-on-Trent,2020-04-21,303,7


In [16]:
# Same summary, restricted to the codes in utlacodes_birmingham and summed into one row
get_latest_cases(utlacodes_birmingham)

Unnamed: 0,date reported,cases,reporting diff
0,2020-04-21,2361,51
