In [6]:
import urllib
import urllib.request
from datetime import date, timedelta
from functools import reduce

import altair as alt
import numpy as np
import pandas as pd

In [7]:
## Global variables
# Take one snapshot of the current date and derive `yesterday` from it, so the
# two values can never disagree if the cell happens to run across midnight.
today = date.today()
yesterday = today - timedelta(days=1)
import_directory = 'Imports/'

## NHS File Format
# URL template; 'DIRECTORY' and 'DATE' are literal placeholder words in the string.
nhs_file_format = 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/DIRECTORY/COVID-19-total-announced-deaths-DATE.xlsx'
# NOTE: '%-d' (day of month without zero padding) is a platform-specific
# strftime extension and is not portable (e.g. it is unsupported on Windows).
nhs_file_date_format = '%-d-%B-%Y'
nhs_file_directory_format = '%Y/%m'

In [3]:
# Upper bound on the numeric filename suffix; a later cell enumerates 0..nhs_max_suffix.
nhs_max_suffix = 5

In [4]:
# Inspect the range object; note that it stops one short of nhs_max_suffix itself.
range(nhs_max_suffix)

range(0, 5)

In [5]:
def map_suffix(n):
    """Return the filename ending for the suffix index ``n``.

    ``0`` yields the bare extension ``'.xlsx'``; any other value is placed
    in front of the extension as ``'-<n>.xlsx'``.
    """
    return '.xlsx' if n == 0 else f'-{n}.xlsx'

In [6]:
# Preview the candidate suffixes: highest number first, bare extension last.
[map_suffix(n) for n in range(nhs_max_suffix, -1, -1)]

['-5.xlsx', '-4.xlsx', '-3.xlsx', '-2.xlsx', '-1.xlsx', '.xlsx']

In [7]:
# Candidate days to look up: today first, then yesterday.
dates_to_try = [date.today() - timedelta(days=days_back) for days_back in range(2)]

In [8]:
# Preview each candidate day rendered with the filename date format.
[day.strftime(nhs_file_date_format) for day in dates_to_try]

['22-April-2020', '21-April-2020']

In [9]:
# Filename prefixes seen so far (each ends with '-').
# NOTE(review): later cells define their own `filenames` list with the same
# values rather than reading this variable.
nhs_filenames = [
    'COVID-19-total-announced-deaths-',
    'COVID-19-all-announced-deaths-',
]

In [10]:
# Preview each candidate day rendered with the directory date format.
[day.strftime(nhs_file_directory_format) for day in dates_to_try]

['2020/04', '2020/04']

In [24]:
start_date = date(2020, 4, 4) # as of today (2020-04-15) this is the first day/file to exist
end_date = date.today()
# list of days, from start_date through end_date inclusive
[
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

[datetime.date(2020, 4, 4),
 datetime.date(2020, 4, 5),
 datetime.date(2020, 4, 6),
 datetime.date(2020, 4, 7),
 datetime.date(2020, 4, 8),
 datetime.date(2020, 4, 9),
 datetime.date(2020, 4, 10),
 datetime.date(2020, 4, 11),
 datetime.date(2020, 4, 12),
 datetime.date(2020, 4, 13),
 datetime.date(2020, 4, 14),
 datetime.date(2020, 4, 15),
 datetime.date(2020, 4, 16),
 datetime.date(2020, 4, 17),
 datetime.date(2020, 4, 18),
 datetime.date(2020, 4, 19),
 datetime.date(2020, 4, 20),
 datetime.date(2020, 4, 21),
 datetime.date(2020, 4, 22)]

In [34]:
def map_suffix(n):
    """Build the trailing part of a candidate filename from the index ``n``.

    A non-zero ``n`` gives ``'-<n>.xlsx'``; zero gives plain ``'.xlsx'``.

    NOTE(review): this duplicates the ``map_suffix`` defined in an earlier
    cell of this notebook; the later definition shadows the earlier one.
    """
    if n != 0:
        return f'-{n}.xlsx'
    return '.xlsx'

# Candidate filename endings, highest numeric suffix first.
nhs_max_suffix = 2
suffixes = [map_suffix(index) for index in reversed(range(nhs_max_suffix + 1))]

# Every calendar day from start_date through today, iterated newest first.
start_date = date(2020, 4, 4) # as of today (2020-04-15) this is the first day/file to exist
end_date = date.today()
dates = reversed(
    [start_date + timedelta(days=offset) for offset in range((end_date - start_date).days + 1)]
)

# Building blocks of the download URL.
filenames = [
    'COVID-19-total-announced-deaths-',
    'COVID-19-all-announced-deaths-',
]
nhs_file_date_format = '%-d-%B-%Y'
nhs_file_directory_format = '%Y/%m/'
nhs_file_start = 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/'

def url_builder(start,date,filename,suffix):
    """Assemble a candidate download URL.

    The pieces are concatenated in this order: ``start``, ``date`` rendered
    with the global ``nhs_file_directory_format``, ``filename``, ``date``
    rendered with the global ``nhs_file_date_format``, and finally ``suffix``.
    """
    directory_part = date.strftime(nhs_file_directory_format)
    date_part = date.strftime(nhs_file_date_format)
    return start + directory_part + filename + date_part + suffix

# Probe every candidate URL and keep (and print) the ones that can be opened.
urls = []

for day in dates:
    for suffix in suffixes:
        for name in filenames:
            url = url_builder(nhs_file_start,day,name,suffix)
            try:
                # Only the success of the request matters; the context manager
                # closes the HTTP response straight away instead of leaking it.
                with urllib.request.urlopen(url):
                    pass
            except OSError:
                # URLError/HTTPError (e.g. a 404 for a file that does not
                # exist) are OSError subclasses: this candidate is unavailable,
                # so move on. Unlike a bare `except`, this leaves
                # KeyboardInterrupt and programming errors visible.
                continue
            urls.append(url)
            print(url)

https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-22-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-21-April-2020-1.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-21-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-14-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-13-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-12-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-11-April-2020.xlsx
https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-10-April-2020.xlsx
https:

_Disappointingly, on top of the naming-convention volatility, it seems older files are being deleted. I knew the path for April 15, and it isn't available at that URL anymore. Perhaps it was renamed within the day, but I doubt it._

In [36]:
def get_real_url(date, max_suffix=2):
    """Find the URL under which the NHS spreadsheet for ``date`` can be opened.

    Candidate URLs are built from every combination of numeric suffix
    (highest first, the un-suffixed name last) and known filename prefix.
    Each candidate is requested in turn until one opens successfully.

    Parameters
    ----------
    date : datetime.date
        Day to look up (any object providing ``day`` and ``strftime``).
    max_suffix : int, optional
        Highest numeric suffix to try (``-1.xlsx`` ... ``-<max_suffix>.xlsx``).
        Defaults to 2; the highest seen so far is 1 (as of 2020-04-22).

    Returns
    -------
    str or bool
        The first candidate URL that could be opened, or ``False`` when none
        of them could.
    """
    ## make a list of suffixes, highest number first
    suffixes = ['.xlsx' if n == 0 else f'-{n}.xlsx' for n in range(max_suffix, -1, -1)]

    ## filenames so far
    filenames = ['COVID-19-total-announced-deaths-','COVID-19-all-announced-deaths-',]

    ## directory date format
    directory_format = '%Y/%m/'

    ## domain and initial path
    file_start = 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/'

    ## date as it appears in the filename, only one format so far (!):
    ## unpadded day, full month name, year (e.g. 4-April-2020).
    ## Built from `date.day` because the '%-d' directive is platform-specific,
    ## and kept local so the function no longer depends on a notebook global.
    date_stamp = str(date.day) + date.strftime('-%B-%Y')

    directory = file_start + date.strftime(directory_format)

    # testing
    for suffix in suffixes:
        for name in filenames:
            url = directory + name + date_stamp + suffix
            try:
                # Returning from inside the `with` closes the response.
                with urllib.request.urlopen(url):
                    return url
            except OSError:
                # URLError/HTTPError (e.g. 404) are OSError subclasses:
                # this candidate does not exist, try the next one.
                continue

    return False

In [37]:
# Look up the spreadsheet URL for 21 April 2020.
get_real_url(date(2020, 4, 21))

'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-total-announced-deaths-21-April-2020-1.xlsx'