In [None]:
import pandas as pd

import numpy as np

import json

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load UK NPI data

In [None]:
# Read the raw UK NPI table and convert its 'date' column to datetimes.
uk_npi_df = (
    pd.read_csv('../../data/raw_data_w_sources/uk_JBC_NPI_data.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Read the LTLA metadata: a JSON document with a 'features' list whose
# entries each carry an 'attributes' dict.
with open('../../data/raw_data_w_sources/uk_ltla_info.json') as json_file:
    uk_ltla_info_dict = json.load(json_file)

# One row per feature, with the source column codes renamed to readable
# names and the LTLA name ('area') used as the index.
uk_ltla_info_df = (
    pd.DataFrame([feature['attributes'] for feature in uk_ltla_info_dict['features']])
    .rename(columns={
        'LAU117NM': 'area',
        'NUTS318NM': 'NUTS3',
        'NUTS118NM': 'region',
        'NUTS218NM': 'NUTS2',
    })
    .set_index('area')
)

In [None]:
def ltla_to_nuts3_lookup(ltla, lookup_df=None):
    """Return the NUTS3 region name recorded for an LTLA.

    Parameters
    ----------
    ltla : hashable
        LTLA name, looked up in the index of the lookup table.
    lookup_df : pandas.DataFrame, optional
        Table indexed by LTLA name with a 'NUTS3' column. Defaults to the
        notebook-level ``uk_ltla_info_df``.

    Returns
    -------
    The 'NUTS3' value stored for ``ltla``. When ``ltla`` is not in the index,
    a notice is printed and the string 'unknown NUTS3' is returned instead.
    """
    if lookup_df is None:
        lookup_df = uk_ltla_info_df

    if ltla not in lookup_df.index:
        print(f'{ltla} missing from lookup')
        return 'unknown NUTS3'

    # A single .loc call selects row and column together (no chained indexing).
    nuts3 = lookup_df.loc[ltla, 'NUTS3']

    # A duplicated index label yields a Series rather than a scalar; take the
    # first entry so callers always receive a single value.
    if isinstance(nuts3, pd.Series):
        nuts3 = nuts3.iloc[0]
    return nuts3
    
# Label every NPI row with the NUTS3 value looked up from its 'ltla' entry.
nuts3_per_row = uk_npi_df['ltla'].map(ltla_to_nuts3_lookup)
uk_npi_df['NUTS3'] = nuts3_per_row

# Load case and death data

In [None]:
# Load the per-area case/death table.
# No infer_datetime_format argument is passed: it only applies to columns
# listed in parse_dates (none are given) and is deprecated in pandas 2.x.
# The 'date' column is converted explicitly below instead.
uk_df = pd.read_csv('../../data/raw_data_w_sources/uk_case_deaths.csv')

# Discard the area code and the by-publish-date counts.
uk_df = uk_df.drop(columns=['areaCode', 'newCasesByPublishDate', 'newDeaths28DaysByPublishDate'])

# Overwrite areaType with the constant 'UK'; it is renamed to 'country' next.
uk_df['areaType'] = 'UK'
uk_df = uk_df.rename(columns={
    'areaType': 'country',
    'areaName': 'area',
    'newCasesBySpecimenDate': 'new_cases',
    'newDeaths28DaysByDeathDate': 'new_deaths',
})

# Index rows by (area, date) so one area's time series can be pulled with .loc.
uk_df['date'] = pd.to_datetime(uk_df['date'])
uk_df = uk_df.set_index(['area', 'date'])

In [None]:
# Order the rows by the 'date' index level (the second level), ascending.
uk_df = uk_df.sort_index(level=['date'], ascending=[True])

# Construct the data set to model on

In [None]:
# The UK NPI data only runs until late November, so the daily modelling
# window stops at the end of that month.
Ds = pd.date_range(start='2020-08-01', end='2020-11-30')

In [None]:
# Region names to include in the modelling set. They are compared against the
# 'NUTS3' column of the NPI table when the per-region arrays are built.
regions = [
    'Durham CC',
    'Warwickshire',
    'West Surrey',
    'East Merseyside',
    'Greater Manchester North West',
    'Berkshire',
    'Gloucestershire',
    'Worcestershire',
    'Leeds',
    'Barnsley, Doncaster and Rotherham',
    'Leicestershire CC and Rutland',
    'Hertfordshire',
    'Merton, Kingston upon Thames and Sutton',
    'Tyneside',
    'Liverpool',
    'Sheffield',
    'Wirral',
    'Sunderland',
    'Sandwell',
    'South and West Derbyshire',
    'Cheshire West and Chester',
    'Essex Haven Gateway',
    'Barnet',
    'Hounslow and Richmond upon Thames',
    'Cambridgeshire CC',
    'Cheshire East',
    'Central Hampshire',
    'Croydon',
    'Oxfordshire',
    'Manchester',
    'Haringey and Islington',
    'Calderdale and Kirklees',
    'Ealing',
    'Kent Thames Gateway',
    'South Hampshire',
    'Heart of Essex',
    'East Riding of Yorkshire',
    'Dudley',
    'West Sussex (North East)',
    'Lambeth',
    'North Northamptonshire',
    'West Essex',
    'Enfield',
    'Derby',
    'Sefton',
    'Buckinghamshire CC',
    'Hackney and Newham',
    'South Teesside',
    'Nottingham',
    'Bedford',
    'Southend-on-Sea',
    'East Derbyshire',
    'Wiltshire',
    'South Nottinghamshire',
    'Mid Kent',
    'West Sussex (South West)',
    'Kingston upon Hull, City of',
    'West Kent',
    'Northumberland',
    'Medway',
    'Kensington & Chelsea and Hammersmith & Fulham',
    'Lancaster and Wyre',
    'East Lancashire',
    'Coventry',
    'Milton Keynes',
    'North and North East Lincolnshire',
    'Stoke-on-Trent',
    'Plymouth',
    'Isle of Wight',
    'Peterborough',
    'Camden and City of London',
    'Southampton',
    'Swindon',
    'Brighton and Hove',
    'Telford and Wrekin',
    'Bristol, City of',
    'Torbay',
    'Portsmouth',
    'York',
    'Breckland and South Norfolk',
]

# NOTE(review): uk_df_list is not referenced by any other cell shown here.
uk_df_list = []

# NPI columns are picked purely by position: skip the first 6 columns of the
# NPI table and the last 115.
# NOTE(review): these offsets depend on the CSV's column layout, and the
# trailing count presumably includes the 'NUTS3' column added to uk_npi_df
# earlier; re-check both numbers if the schema changes.
NPI_FIRST_COL = 6
NPI_TRAILING_COLS = 115
npis = uk_npi_df.columns[NPI_FIRST_COL:-NPI_TRAILING_COLS]

In [None]:
# Per-region model inputs over the date window Ds:
#   all_active_cms[r, c, d] is 1 when NPI c is non-zero on day d in *every*
#                           LTLA of region r, otherwise 0
#   new_cases[r, d] / new_deaths[r, d] sum the daily counts over the region's
#                           LTLAs that appear in the case/death table
all_active_cms = np.zeros((len(regions), len(npis), len(Ds)))
new_cases = np.zeros((len(regions), len(Ds)))
new_deaths = np.zeros((len(regions), len(Ds)))

# Areas present in the case/death table; computed once instead of per LTLA.
areas_with_counts = uk_df.index.unique(0)

for region_i, region in enumerate(regions):
    filtered_df = uk_npi_df.loc[uk_npi_df['NUTS3'] == region]

    # npi data first
    ltlas = filtered_df['ltla'].unique()
    n_ltlas = len(ltlas)

    if n_ltlas == 0:
        # np.all over an empty axis is True, which would mark every NPI as
        # active on every day for a region that has no NPI rows at all.
        # Report it and leave the region's rows at their zero initialisation.
        print(f'{region} has no LTLAs in the NPI data')
        continue

    active_cms = np.zeros((n_ltlas, len(npis), len(Ds)))
    filtered_df = filtered_df.set_index(['ltla', 'date'])

    for ltla_index, ltla in enumerate(ltlas):
        # Rows for this LTLA, restricted to (and ordered by) the dates in Ds.
        ltla_df = filtered_df.loc[ltla].loc[Ds]
        for npi_index, npi in enumerate(npis):
            active_cms[ltla_index, npi_index, :] = ltla_df[npi]

        if ltla in areas_with_counts:
            ltla_counts = uk_df.loc[ltla].loc[Ds]
            new_cases[region_i, :] += ltla_counts['new_cases'].to_numpy()
            new_deaths[region_i, :] += ltla_counts['new_deaths'].to_numpy()
        else:
            # Make the omission visible: this LTLA adds nothing to the
            # region's case and death totals.
            print(f'{ltla} missing from case and death data')

    # An NPI counts as active for the region only if it is active in all of
    # the region's LTLAs.
    all_active_cms[region_i, :, :] = np.all(active_cms, axis=0)

In [None]:
# Bundle the arrays with their axis labels (region names, dates, NPI column
# names) into a single dictionary.
data_dict = dict(
    new_cases=new_cases,
    new_deaths=new_deaths,
    active_cms=all_active_cms,
    regions=regions,
    days=Ds,
    CMs=list(npis),
)

In [None]:
import pickle

# Write the data set to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('uk_test_set.pkl', 'wb') as pkl_file:
    pickle.dump(data_dict, pkl_file)