# Loading NPI data 

Some quick notes: due to inconsistent data formatting (and sometimes simply wrong dates), I'm currently not getting all the NPIs; I'm consistently getting around half in each country. Austria is almost perfect by default (go team Austria!). I spent a bit of time making exceptions to get 90% of the UK NPIs. I haven't spent any time on making exceptions for Italy or Germany. I expect it to take at most a few hours to get 90+% of all NPIs. Mrinank, if you still plan to do a run tomorrow and think it's worth it, I can probably do a few hours of work in the morning to get up to 90%. Let me know.

In [None]:
import os
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from datetime import datetime as dt

In [None]:
# Describes how each NPI (non-pharmaceutical intervention) is read from a row of the
# country CSVs. Each entry maps an NPI name to:
#   'type'           -- 'binary' (recorded as 1 while active) or 'value' (the level
#                       is read from 'value_col'); see process_cm_dict
#   'start_date_col' -- name of the column holding the start date
#   'end_date_col'   -- name of the column holding the end date
#   'value_col'      -- ('value' entries only) name of the column holding the level
# The key order matters: create_active_cms_mat enumerates this dict, so the position
# of a key is its index along the NPI axis of the resulting matrix.
npi_information_dict = {
    # --- Gathering limits expressed as a maximum number of people ---
    'Public Outdoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Outdoors)',
        'end_date_col': 'End date (Public Outdoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Public Outdoors)'
    },
    'Public Indoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Indoors)',
        'end_date_col': 'End date (Public Indoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Public Indoors)'
    },
    'Private Outdoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Outdoors)',
        'end_date_col': 'End date (Private Outdoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Private Outdoors)'
    },
    'Private Indoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Indoors)',
        'end_date_col': 'End date (Private Indoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Private Indoors)'
    },
    # --- Gathering limits expressed as a maximum number of households ---
    # These reuse the same start/end date columns as the person limits above;
    # only the value column differs.
    'Public Outdoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Outdoors)',
        'end_date_col': 'End date (Public Outdoors)',
        'value_col': 'Limit on number of households (Public Outdoors)'
    },
    'Public Indoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Indoors)',
        'end_date_col': 'End date (Public Indoors)',
        'value_col': 'Limit on number of households (Public Indoors)'
    },
    'Private Outdoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Outdoors)',
        'end_date_col': 'End date (Private Outdoors)',
        'value_col': 'Limit on number of households (Private Outdoors)'
    },
    'Private Indoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Indoors)',
        'end_date_col': 'End date (Private Indoors)',
        'value_col': 'Limit on number of households (Private Indoors)'
    },
    # --- Mask wearing, stored as a level taken from the '(0-4)' column ---
    'Mandatory Mask Wearing': {
        'type': 'value',
        'start_date_col': 'Start date (Mask Wearing)',
        'end_date_col': 'End date (Mask Wearing)',
        'value_col': 'Level of NPI (0-4) (Mask Wearing)'
    },
    # --- Binary NPIs: only a start and an end date are read ---
    'Some Face-to-Face Businesses Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Some Face2Face)',
        'end_date_col': 'End date (Some Face2Face)'
    },
    'Gastronomy Closed': {
        'type': 'binary',
        # NOTE(review): double space after 'Start date' -- presumably mirrors the CSV header; confirm
        'start_date_col': 'Start date  (Gastronomy Closed)',
        'end_date_col': 'End date (Gastronomy Closed)'
    },
    'Leisure Venues Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Leisure Venue)',
        'end_date_col': 'End date (Leisure Venue)'
    },
    'Retail Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Retail)',
        'end_date_col': 'End date (Retail)'
    },
    'All Face-to-Face Businesses Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (All Face2Face)',
        'end_date_col': 'End date (All Face2Face)'
    },
    'Stay at Home Order': {
        'type': 'binary',
        'start_date_col': 'Start date (Stay Home)',
        'end_date_col': 'End date (Stay Home)'
    },
    'Curfew': {
        'type': 'binary',
        'start_date_col': 'Start date (Curfew)',
        'end_date_col': 'End date (Curfew)'
    },
    'Childcare Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Childcare)',
        # NOTE(review): double space after 'End date' -- presumably mirrors the CSV header; confirm
        'end_date_col': 'End date  (Childcare)'
    },
    'Primary Schools Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Primary Schools)',
        'end_date_col': 'End date (Primary Schools)'
    },
    'Secondary Schools Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Secondary Schools)',
        'end_date_col': 'End date (Secondary Schools)'
    },
    'Universities Away': {
        'type': 'binary',
        'start_date_col': 'Start date (Unis Away)',
        'end_date_col': 'End date (Unis Away)'
    },
}

In [None]:
def process_start_date_str(start_date_str):
    """Convert a free-text NPI start date into a pandas timestamp.

    Args:
        start_date_str: Cell content as a string (surrounding whitespace is
            ignored).

    Returns:
        * ``Timestamp('2020-08-01')`` for any of the known "before 1 August"
          spellings (the NPI was already active when the window opens);
        * ``None`` for the known "not implemented" markers
          ('no', 'No', 'nan', 'N/A', 'NA');
        * the parsed timestamp (day-first) otherwise;
        * ``None`` after printing a message when the text cannot be parsed
          or parses to ``NaT`` (e.g. an empty string).
    """
    cleaned = start_date_str.strip()

    if cleaned in ['Before 1 August', 'before 1 August 2020', 'Before 1 August 2020', 'Before August 1', '1 August 2020', 'Before 1st of August', 'before 01/08/2020', 'before 1/08/2020',
                   'before 1/8/2020', 'before 13/7/2020', 'Before 1st August']:
        return pd.to_datetime('2020-08-01')

    if cleaned in ['no', 'No', 'nan', 'N/A', 'NA']:
        return None

    try:
        # `infer_datetime_format` is deprecated in pandas 2.x and has no effect
        # there, so it is intentionally not passed.
        parsed = pd.to_datetime(cleaned, dayfirst=True)
    except ValueError:
        parsed = pd.NaT

    # NaT is not a usable date; report it the same way as a parse failure so
    # callers only ever see a real timestamp or None.
    if pd.isna(parsed):
        print(f'Could not convert start date {start_date_str}')
        return None
    return parsed
        
def process_end_date_str(end_date_str):
    """Convert a free-text NPI end date into a pandas timestamp.

    Args:
        end_date_str: Cell content as a string (surrounding whitespace is
            ignored).

    Returns:
        * ``Timestamp('2021-01-09')`` when the cell is one of the known
          "no end date" markers ('no', 'No', 'nan', 'N/A', 'NA',
          'After 9 January 2021');
        * the parsed timestamp (day-first) otherwise;
        * ``None`` after printing a message when the text cannot be parsed
          or parses to ``NaT`` (e.g. an empty string).
    """
    cleaned = end_date_str.strip()

    if cleaned in ['no', 'No', 'nan', 'N/A', 'NA', 'After 9 January 2021']:
        return pd.to_datetime('2021-01-09')

    try:
        # `infer_datetime_format` is deprecated in pandas 2.x and has no effect
        # there, so it is intentionally not passed.
        parsed = pd.to_datetime(cleaned, dayfirst=True)
    except ValueError:
        parsed = pd.NaT

    # NaT is not a usable date; report it the same way as a parse failure so
    # callers only ever see a real timestamp or None.
    if pd.isna(parsed):
        print(f'Could not convert end date {end_date_str}')
        return None
    return parsed
        
def process_value(value):
    """Convert an NPI level / limit cell into an integer.

    Args:
        value: Raw cell content (string or number).

    Returns:
        0 when the cell reads 'no', 'No', 'nan' or 'NaN' (after stripping
        whitespace), otherwise the cell converted to ``int``.

    Raises:
        ValueError: If the cell is neither a marker above nor numeric.
    """
    text = str(value).strip()
    if text in ['no', 'No', 'nan', 'NaN']:
        return 0

    try:
        return int(value)
    except ValueError:
        # int() rejects float-formatted strings such as '6.0', which appear
        # when a numeric column is read as text; go through float first.
        return int(float(text))
        
def process_cm_dict(row, cm_dict):
    """Read one NPI from a data row.

    Args:
        row: Mapping-like row indexed by column name.
        cm_dict: One entry of the NPI description dict, with 'type',
            'start_date_col', 'end_date_col' and, for non-binary NPIs,
            'value_col'.

    Returns:
        ``(start, end, value)``. All three are ``None`` when the start date
        resolves to ``None``. ``value`` is 1 for 'binary' NPIs and the
        processed content of the value column otherwise.
    """
    start_text = str(row[cm_dict['start_date_col']])
    end_text = str(row[cm_dict['end_date_col']])

    start = process_start_date_str(start_text)
    # The end date is always parsed, even if the start turns out to be
    # missing, so any conversion message for it is still printed.
    end = process_end_date_str(end_text)

    if start is None:
        return (None, None, None)

    if cm_dict['type'] == 'binary':
        level = 1
    else:
        level = process_value(row[cm_dict['value_col']])
    return (start, end, level)
        
def datetime_to_index(dt, Ds):
    """Return the position of a date within the sequence of days ``Ds``.

    Args:
        dt: Timestamp to look up.
        Ds: Sequence of timestamps (e.g. a ``pd.date_range``).

    Returns:
        * 0 when ``dt`` is before 2020-08-01;
        * the position of ``dt`` in ``Ds`` when present;
        * otherwise the position of ``dt`` shifted back by one year, if that
          date is present (a message is printed);
        * ``None`` when neither lookup succeeds (a message is printed).
    """
    if dt < pd.to_datetime('2020-08-01'):
        return 0

    # Materialise once; both lookups below reuse the same list.
    days = list(Ds)
    try:
        return days.index(dt)
    except ValueError:
        # Only "not found" is expected here; anything else should surface.
        pass

    error_str = f'Date {dt} was not in my list'
    # Retry with the same month and day one year earlier.
    new_dt = dt - pd.DateOffset(years=1)
    try:
        ind = days.index(new_dt)
        error_str = f'{error_str} -- Used month and date'
    except ValueError:
        ind = None
        error_str = f'{error_str} -- failed'
    print(error_str)
    return ind

In [None]:
def create_active_cms_mat(df, Rs, npi_information_dict, Ds):
    """Build the NPI activation array for a set of regions.

    Args:
        df: DataFrame indexed by region name; a region may have one or
            several rows.
        Rs: Region names to extract, in output order.
        npi_information_dict: NPI description dict; its key order defines
            the NPI axis.
        Ds: Sequence of days defining the time axis.

    Returns:
        Float array of shape ``(len(Rs), len(npi_information_dict), len(Ds))``,
        zero where no NPI is recorded and set to the NPI value on every day
        from its start to its end (inclusive). When several rows cover the
        same days, later rows overwrite earlier ones.

    Raises:
        KeyError: If a region in ``Rs`` is not in ``df``'s index.
    """
    nRs = len(Rs)
    nDs = len(Ds)
    nCMs = len(npi_information_dict)

    active_cms = np.zeros((nRs, nCMs, nDs))

    for r_i, r in enumerate(Rs):
        # Selecting with a list always yields a DataFrame, even when the
        # region has a single row (a scalar label would yield a Series,
        # which has no iterrows()).
        sub_df = df.loc[[r]]
        for _, row in sub_df.iterrows():
            for cm_i, cm_dict in enumerate(npi_information_dict.values()):
                sd_dt, ed_dt, value = process_cm_dict(row, cm_dict)
                if sd_dt is None or ed_dt is None:
                    continue

                start_ind = datetime_to_index(sd_dt, Ds)
                end_ind = datetime_to_index(ed_dt, Ds)
                # A date that could not be placed on the time axis comes back
                # as None. Skip the entry instead of slicing with None, which
                # would fill from day 0 or fail on `None + 1`.
                if start_ind is None or end_ind is None:
                    continue

                active_cms[r_i, cm_i, start_ind:end_ind + 1] = value
    return active_cms

# produce active cms from DF

In [None]:
# Load the England NPI sheet directly from the CSV: the first two lines of the
# file are skipped and rows that are entirely empty are discarded.
uk_df = pd.read_csv('../../data/npi_data/england.csv', skiprows=2).dropna(axis='index', how='all')

# Bookkeeping columns, selected by substring, are dropped. The scan has to run
# over uk_df's own columns so this cell does not depend on a `df` variable
# defined by some other cell.
droplist = [c for c in uk_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row', 'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
       'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
       'How long did you need to collect this data?'])

uk_df = uk_df.drop(droplist, axis=1)
# Header whitespace is stripped only after dropping, because the names listed
# above are matched against the raw headers.
uk_df = uk_df.rename(columns=lambda x: x.strip())
# Region names are stripped too, so they can be looked up by their clean form.
uk_df['Local area'] = uk_df['Local area'].apply(lambda x: str(x).strip())

uk_df = uk_df.set_index('Local area')

In [None]:
# English local areas to extract, in output order.
# NOTE(review): 'Enfield' and 'Buckinghamshire CC' each appear twice, so these
# regions get two identical slots in the output -- confirm whether intended.
uk_Rs = [
    'Lincolnshire',
    'Greater Manchester South West',
    'Redbridge and Waltham Forest',
    'Enfield',
    'Buckinghamshire CC',
    'Portsmouth',
    'Southampton',
    'Brighton and Hove',
    'Coventry',
    'Walsall',
    'North Yorkshire CC',
    'Essex Haven Gateway',
    'Southend-on-Sea',
    'Gloucestershire',
    'East Derbyshire',
    'Enfield',
    'Buckinghamshire CC',
]

In [None]:
# Daily time axis shared by all countries. pandas reads these strings
# month-first, i.e. 1 August 2020 through 9 January 2021 (inclusive).
start_date = '08-01-2020'
end_date = '01-09-2021'
Ds = pd.date_range(start_date, end_date)

In [None]:
# England: (region, NPI, day) activation array over the shared date range.
uk_active_cms = create_active_cms_mat(
    df=uk_df,
    Rs=uk_Rs,
    npi_information_dict=npi_information_dict,
    Ds=Ds,
)

In [None]:
# Load the Austria NPI sheet directly from the CSV: the first two lines of the
# file are skipped and rows that are entirely empty are discarded.
at_df = pd.read_csv('../../data/npi_data/austria.csv', skiprows=2)
at_df = at_df.dropna(axis='index', how='all')

# Bookkeeping columns are picked by substring, then the fixed names are appended.
droplist = [
    c for c in at_df.columns
    if any(fragment in c for fragment in ('Sources', 'Quotes', 'Description', 'What is the reason', 'How many', 'Unnamed'))
]
droplist += [
    'Person who entered this row',
    'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
    'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
    'How long did you need to collect this data?',
    'Local area',
]

# Drop first (names above match the raw headers), then strip header whitespace
# and index the rows by 'Region'.
at_df = at_df.drop(columns=droplist).rename(columns=str.strip).set_index('Region')

# Region labels are kept exactly as written, including the trailing space in
# 'Voralberg ' -- the 'Region' values are not stripped in this cell, so they
# presumably have to match the CSV verbatim (TODO confirm).
at_Rs = [
    'Wien',
    'Burgenland',
    'Steiermark',
    'Oberösterreich',
    'Nieder­österreich',
    'Voralberg ',
    'Tirol',
    'Karnten/Carinthia',
    'Salzburg',
]

In [None]:
# Austria: (region, NPI, day) activation array over the shared date range.
at_active_cms = create_active_cms_mat(
    df=at_df,
    Rs=at_Rs,
    npi_information_dict=npi_information_dict,
    Ds=Ds,
)

In [None]:
# Load the Germany NPI sheet directly from the CSV: the first two lines of the
# file are skipped and rows that are entirely empty are discarded.
de_df = pd.read_csv('../../data/npi_data/germany.csv', skiprows=2).dropna(axis='index', how='all')

# Bookkeeping columns, selected by substring, are dropped. The scan has to run
# over de_df's own columns so this cell does not depend on a `df` variable
# defined by some other cell.
droplist = [c for c in de_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row', 'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
       'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
       'How long did you need to collect this data?'])

de_df = de_df.drop(droplist, axis=1)
# Header whitespace is stripped only after dropping, because the names listed
# above are matched against the raw headers.
de_df = de_df.rename(columns=lambda x: x.strip())
# Region names are stripped too, so they can be looked up by their clean form.
de_df['Local area'] = de_df['Local area'].apply(lambda x: str(x).strip())

# German local areas to extract, in output order.
de_Rs = ['Nürnberg', 'SK Aschaffenburg', 'Fürth', 'Landsberg am Lech',
       'LK Donau-Ries', 'Minden-Lübbecke', 'Mönchengladbach', 'Münster',
       'Rhein-Kreis Neuss', 'LK Ennepe-Ruhr-Kreis', 'LK Rems-Murr-Kreis',
       'LK Breisgau-Hochschwarzwald', 'LK Enzkreis', 'LK Hildesheim',
       'LK Gifhorn']
de_df = de_df.set_index('Local area')

In [None]:
# Germany: (region, NPI, day) activation array over the shared date range.
de_active_cms = create_active_cms_mat(
    df=de_df,
    Rs=de_Rs,
    npi_information_dict=npi_information_dict,
    Ds=Ds,
)

In [None]:
# Load the Italy NPI sheet directly from the CSV: the first two lines of the
# file are skipped and rows that are entirely empty are discarded.
it_df = pd.read_csv('../../data/npi_data/italy.csv', skiprows=2)
it_df = it_df.dropna(axis='index', how='all')

# Bookkeeping columns are picked by substring, then the fixed names are appended.
droplist = [
    c for c in it_df.columns
    if any(fragment in c for fragment in ('Sources', 'Quotes', 'Description', 'What is the reason', 'How many', 'Unnamed'))
]
droplist += [
    'Person who entered this row',
    'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
    'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
    'How long did you need to collect this data?',
    'Local area',
]

# Drop first (names above match the raw headers), then strip header whitespace
# and index the rows by 'Region'.
it_df = it_df.drop(columns=droplist).rename(columns=str.strip).set_index('Region')

# Region labels are kept exactly as written, including the trailing spaces in
# 'Liguria ' and 'Tuscania ' -- the 'Region' values are not stripped in this
# cell, so they presumably have to match the CSV verbatim (TODO confirm).
it_Rs = [
    'Abruzzo',
    'Aosta Valley',
    'Apulia (AKA Puglia)',
    'Basilicata',
    'Calabria',
    'Campania',
    'Emilia-Romagna',
    'Friuli-Venezia Giulia',
    'Lazio',
    'Liguria ',
    'Lombardy',
    'Marche',
    'Molise',
    'Piedmont',
    'Sardinia',
    'Sicily',
    'Trentino-South Tyrol',
    'Tuscania ',
    'Umbria',
    'Veneto',
]

In [None]:
# Italy: (region, NPI, day) activation array over the shared date range.
it_active_cms = create_active_cms_mat(
    df=it_df,
    Rs=it_Rs,
    npi_information_dict=npi_information_dict,
    Ds=Ds,
)

In [None]:
# All regions in country order (England, Austria, Germany, Italy), plus a
# parallel list giving the country of each region.
all_Rs = uk_Rs + at_Rs + de_Rs + it_Rs
all_Cs = (
    ['england'] * len(uk_Rs)
    + ['austria'] * len(at_Rs)
    + ['germany'] * len(de_Rs)
    + ['italy'] * len(it_Rs)
)