# What was the effective region size in the UK?

We want to decide between running the analysis at the LTLA level (LAU1) or at the NUTS3 level. To do this, we will use the JBC LTLA data.

In [None]:
import pandas as pd

import numpy as np

import json

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## load UK NPI data

In [None]:
# Read the JBC NPI table and convert its `date` column to datetimes so that
# later cells can index rows by date.
uk_npi_df = pd.read_csv('../../data/raw_data_w_sources/UK_JBC_NPI_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
# Parse the LTLA metadata file: each entry of `features` carries an
# `attributes` record, and those records become the rows of the lookup table.
with open('../../data/raw_data_w_sources/uk_ltla_info.json') as json_file:
    uk_ltla_info_dict = json.load(json_file)

ltla_attribute_records = [feature['attributes'] for feature in uk_ltla_info_dict['features']]

# Give the coded columns readable names and index the table by LTLA name
# ('area') so a region can be looked up directly from an LTLA.
uk_ltla_info_df = (
    pd.DataFrame(ltla_attribute_records)
    .rename(columns={
        'LAU117NM': 'area',
        'NUTS318NM': 'NUTS3',
        'NUTS118NM': 'region',
        'NUTS219NM': 'NUTS2',
    })
    .set_index('area')
)

In [None]:
def ltla_to_nuts3_lookup(ltla):
    """Return the NUTS3 region name for an LTLA name.

    Looks `ltla` up in the index of the notebook-level `uk_ltla_info_df`.

    Parameters
    ----------
    ltla : str
        LTLA name as it appears in the NPI data.

    Returns
    -------
    The value of the 'NUTS3' column for that LTLA, or the placeholder
    'unknown NUTS3' (after printing a notice) when the LTLA is not in the
    lookup table.
    """
    if ltla not in uk_ltla_info_df.index:
        print(f'{ltla} missing from lookup')
        return 'unknown NUTS3'
    return uk_ltla_info_df.loc[ltla, 'NUTS3']


# Resolve every distinct LTLA exactly once. Mapping the function over the full
# column would repeat the lookup (and the "missing" notice) for every date row
# of the same LTLA.
nuts3_by_ltla = {ltla: ltla_to_nuts3_lookup(ltla) for ltla in uk_npi_df['ltla'].unique()}

# Fall back to the same placeholder for any value that cannot be matched in the
# dictionary, so the resulting column never contains missing entries.
uk_npi_df['NUTS3'] = uk_npi_df['ltla'].map(
    lambda ltla: nuts3_by_ltla.get(ltla, 'unknown NUTS3')
)

In [None]:
# Distinct NUTS3 regions and distinct LTLAs present in the NPI data.
NUTS3_regions, ltlas = (uk_npi_df[column].unique() for column in ('NUTS3', 'ltla'))

In [None]:
# in case you want to make a csv with this information in!
#uk_npi_df.to_csv('england_jbc_npi_w_nuts3.csv')

# Note: some ltlas missing

There are ~400 LTLAs in the UK (380 regions according to the cases and deaths data on the dashboard), so it seems that some areas are missing here.

In [None]:
# Report how many LTLAs and NUTS3 regions appear in the NPI data.
for summary_line in (
    f'There are {len(ltlas)} LTLAs represented here',
    f'There are {len(NUTS3_regions)} NUTS3 area represented',
):
    print(summary_line)

# How many LTLAs are in each NUTS3 region?

In [None]:
# One counter per entry of NUTS3_regions (same order), filled in by the next cell.
ltlas_per_nuts3 = np.zeros((len(NUTS3_regions),))

In [None]:
# Count the distinct LTLAs inside each NUTS3 region. The 'unknown NUTS3'
# placeholder (LTLAs missing from the lookup) is skipped, so its slot stays 0.
for region_index, nuts3_region in enumerate(NUTS3_regions):
    if 'unknown' in nuts3_region:
        continue

    # Loop-local name on purpose: assigning to `ltlas` here would overwrite the
    # notebook-wide array of all LTLAs created in an earlier cell.
    region_ltlas = uk_npi_df.loc[uk_npi_df['NUTS3'] == nuts3_region, 'ltla'].unique()

    ltlas_per_nuts3[region_index] = len(region_ltlas)

In [None]:
# Histogram of how many LTLAs each NUTS3 region contains.
ltla_count_ax = sns.histplot(ltlas_per_nuts3)
ltla_count_ax.set_title('LTLAs per NUTS3 Region')

In [None]:
# Position of the largest LTLA count, used to name the corresponding region.
largest_region_index = np.argmax(ltlas_per_nuts3)
print(f'The NUTS3 with the most LTLAs is {NUTS3_regions[largest_region_index]}')

This is confirmed here: https://www.ons.gov.uk/methodology/geography/ukgeographies/eurostat#london

In [None]:
# Number of NUTS3 regions that contain more than one LTLA; only these regions
# can show disagreement between their LTLAs.
n_multi_ltla_regions = np.sum(ltlas_per_nuts3 > 1)
print(f'There are {n_multi_ltla_regions} NUTS3 regions with more than 1 LTLA')

This is about half of the represented NUTS3 regions.

# Load NPI data per NUTS3 region

In [None]:
# Daily analysis window, from 1 Aug 2020 to 1 Dec 2020 (both ends included).
Ds = pd.date_range('2020-08-01', '2020-12-01', freq='D')

In [None]:
# Maps NUTS3 region name -> array of NPI values (LTLA x NPI x day); filled below.
nuts3_npi_dict = dict()

In [None]:
# The NPI columns are selected purely by position.
# NOTE(review): this assumes the first 6 and the last 114 columns of uk_npi_df
# are not NPI indicators — confirm against the CSV schema. The trailing count
# presumably includes the 'NUTS3' column added by an earlier cell, so running
# this before that cell would select a different set of columns.
N_LEADING_NON_NPI_COLUMNS = 6
N_TRAILING_NON_NPI_COLUMNS = 114
npis = uk_npi_df.columns[N_LEADING_NON_NPI_COLUMNS:-N_TRAILING_NON_NPI_COLUMNS]

In [None]:
# Show the selected NPI column names as a sanity check on the positional slice.
npi_summary = f'The NPIs we are considering are {npis}'
print(npi_summary)

In [None]:
# For every NUTS3 region, build an array of shape
# (number of LTLAs, number of NPIs, number of days) holding each LTLA's NPI
# values over the analysis window `Ds`, and store it in nuts3_npi_dict.
for nuts3_region in NUTS3_regions:
    if 'unknown' in nuts3_region:
        # Placeholder for LTLAs that were missing from the lookup table.
        continue

    region_npi_df = uk_npi_df.loc[uk_npi_df['NUTS3'] == nuts3_region]

    # Loop-local names so the notebook-wide `ltlas` array is not overwritten.
    region_ltlas = region_npi_df['ltla'].unique()
    n_region_ltlas = len(region_ltlas)

    active_cms = np.zeros((n_region_ltlas, len(npis), len(Ds)))
    region_npi_df = region_npi_df.set_index(['ltla', 'date'])

    # A region with a single LTLA cannot disagree with itself, so its array is
    # left as zeros. Every region with two or more LTLAs must be loaded:
    # skipping two-LTLA regions would leave them as all zeros and make them
    # look perfectly consistent regardless of the data.
    if n_region_ltlas > 1:
        for ltla_index, ltla in enumerate(region_ltlas):
            # Rows of this LTLA, restricted to and ordered by the dates in Ds.
            ltla_df = region_npi_df.loc[ltla].loc[Ds]
            for npi_index, npi in enumerate(npis):
                active_cms[ltla_index, npi_index, :] = ltla_df[npi]

    nuts3_npi_dict[nuts3_region] = active_cms

# Compute discrepancies per NPI

In [None]:
# Maps NUTS3 region name -> (NPI x day) matrix of 1.0 where all LTLAs agree, else 0.0.
nuts3_npi_consistency_dict = dict()

In [None]:
# For each real NUTS3 region, mark every (NPI, day) cell in which all of the
# region's LTLAs report the same value as the region's first LTLA.
for nuts3_region in NUTS3_regions:
    if 'unknown' not in nuts3_region:
        active_cms = nuts3_npi_dict[nuts3_region]
        # These names are kept at notebook level; a later cell reads `nDs`.
        nLTLAs, nCMs, nDs = active_cms.shape

        # Broadcasting compares every LTLA slice with the first one in a single
        # step; the float cast gives the 0.0 / 1.0 matrix used downstream.
        matches_first_ltla = active_cms == active_cms[0]
        cms_consistent = np.all(matches_first_ltla, axis=0).astype(float)

        nuts3_npi_consistency_dict[nuts3_region] = cms_consistent

Now compute discrepancy per NPI

In [None]:
# Total number of (region, day) cells that were checked for consistency.
# Counting the entries of the consistency dict (which holds only the real,
# checked regions) stays correct whether or not an 'unknown NUTS3' placeholder
# exists, and len(Ds) avoids relying on a variable left over from a loop.
total_nuts3_days = len(nuts3_npi_consistency_dict) * len(Ds)
# Per-NPI total of consistent (region, day) cells, accumulated in the next cell.
npi_consistent_days = np.zeros(len(npis))

In [None]:
# Per-NPI number of consistent (region, day) cells, summed over all regions.
# The total is rebuilt from a fresh zero vector each time, so re-running this
# cell gives the same result instead of adding the counts a second time.
npi_consistent_days = sum(
    (
        np.sum(npi_consistency_mat, axis=-1)
        for npi_consistency_mat in nuts3_npi_consistency_dict.values()
    ),
    np.zeros(len(npis)),
)

In [None]:
# Per NPI: checked (region, day) cells minus the consistent ones.
total_days_inconsistent = np.subtract(total_nuts3_days, npi_consistent_days)

In [None]:
# Bar chart of the number of inconsistent (region, day) cells per NPI.
plt.figure(figsize=(7, 3), dpi=300)
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x=list(npis), y=total_days_inconsistent)
# Take the date range in the title from Ds so it always matches the window
# that was actually analysed.
window_label = f"{Ds[0].strftime('%d %b %Y')} - {Ds[-1].strftime('%d %b %Y')}"
plt.title(f'Total Num days NPIs in NUTS3 region were inhomogenous\n(out of {total_nuts3_days} datapoints)\n{window_label}')
plt.xticks(rotation=-90)

In [None]:
# Positions (within `npis`) of the NPIs that have at least one inconsistent day.
npis_with_inconsistencies = np.flatnonzero(total_days_inconsistent)

Some more questions:

- How many NUTS3 areas have these inconsistencies?
- What do the inconsistencies look like?

In [None]:
# Collect (and report) every region that has at least one (NPI, day) cell in
# which its LTLAs disagree.
nuts3_regions_with_inconsistencies = []

for nuts3_region, npi_consistency_mat in nuts3_npi_consistency_dict.items():
    n_inconsistent_cells = (npi_consistency_mat == 0).sum()
    if n_inconsistent_cells <= 0:
        # Fully consistent region: nothing to report.
        continue

    nuts3_regions_with_inconsistencies.append(nuts3_region)
    print(f'{nuts3_region} has {n_inconsistent_cells} inconsistencies')

In [None]:
# The denominator is the number of regions that were actually checked (the
# entries of the consistency dict), so the 'unknown NUTS3' placeholder is not
# counted as a region.
n_checked_regions = len(nuts3_npi_consistency_dict)
print(f'{len(nuts3_regions_with_inconsistencies)} out of {n_checked_regions} NUTS3 regions have some inconsistency')

In [None]:
# One figure per inconsistent region, with one panel per NPI that shows any
# inconsistency; each line in a panel is one LTLA's NPI value over time.

# Fixed seed so the per-line jitter, and therefore every figure, is reproducible.
jitter_rng = np.random.default_rng(0)

# Two panels per row. Keep the 6-row layout and only add rows when more than
# 12 NPIs need a panel, which a fixed 6x2 grid could not hold.
n_panel_cols = 2
n_panel_rows = max(6, (len(npis_with_inconsistencies) + 1) // n_panel_cols)

for inconsistent_nuts3 in nuts3_regions_with_inconsistencies:
    plt.figure(figsize=(10, 8), dpi=300)
    cms = nuts3_npi_dict[inconsistent_nuts3]
    n_ltlas, _, nDs = cms.shape
    day_offsets = np.arange(nDs)

    for plot_i, npi_i in enumerate(npis_with_inconsistencies):
        plt.subplot(n_panel_rows, n_panel_cols, plot_i + 1)
        plt.title(npis[npi_i])
        for ltla in range(n_ltlas):
            # A small random vertical shift per LTLA keeps overlapping lines
            # distinguishable.
            plt.plot(day_offsets, cms[ltla, npi_i, :] + 0.05 * jitter_rng.normal())
        plt.xlabel('days after 1st aug')
        plt.yticks([0, 1], ['NPI Inactive', 'NPI_Active'])
    plt.suptitle(inconsistent_nuts3)
    plt.tight_layout()

# Conclusions

NUTS3 looks mostly fine in the UK. There are a small number of discrepancies in some regions (<10% of them), and these discrepancies only affect the above NPIs.

Sampling will mean we only pick up a small number of these regions, and in most regions the discrepancies look OK (i.e., delays rather than completely different policies). There are some NUTS3 regions where the discrepancies look a bit problematic - namely Leicester, Lancashire, and some of Manchester.

An open question remains on getting case and death data at this level, as some local areas seem to be in multiple NUTS3 regions.