# Loading and Merging NPI data

In [33]:
import numpy as np
import pandas as pd

import json
import copy

import re

In [34]:
# this dict contains information for loading the info
npi_information_dict = {
    'Public Outdoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Outdoors)',
        'end_date_col': 'End date (Public Outdoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Public Outdoors)'
    },
    'Public Indoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Indoors)',
        'end_date_col': 'End date (Public Indoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Public Indoors)'
    },
    'Private Outdoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Outdoors)',
        'end_date_col': 'End date (Private Outdoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Private Outdoors)'
    },
    'Private Indoor Gathering Person Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Indoors)',
        'end_date_col': 'End date (Private Indoors)',
        'value_col': 'Limit on number of people (only X people or fewer) (Private Indoors)'
    },
    'Public Outdoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Outdoors)',
        'end_date_col': 'End date (Public Outdoors)',
        'value_col': 'Limit on number of households (Public Outdoors)'
    },
    'Public Indoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Public Indoors)',
        'end_date_col': 'End date (Public Indoors)',
        'value_col': 'Limit on number of households (Public Indoors)'
    },
    'Private Outdoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Outdoors)',
        'end_date_col': 'End date (Private Outdoors)',
        'value_col': 'Limit on number of households (Private Outdoors)'
    },
    'Private Indoor Household Limit': {
        'type': 'value',
        'start_date_col': 'Start date (Private Indoors)',
        'end_date_col': 'End date (Private Indoors)',
        'value_col': 'Limit on number of households (Private Indoors)'
    },
    'Mandatory Mask Wearing': {
        'type': 'value',
        'start_date_col': 'Start date (Mask Wearing)',
        'end_date_col': 'End date (Mask Wearing)',
        'value_col': 'Level of NPI (0-4) (Mask Wearing)'
    },
    'Some Face-to-Face Businesses Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Some Face2Face)',
        'end_date_col': 'End date (Some Face2Face)'
    },
    'Gastronomy Closed': {
        'type': 'binary',
        'start_date_col': 'Start date  (Gastronomy Closed)',
        'end_date_col': 'End date (Gastronomy Closed)'
    },
    'Leisure Venues Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Leisure Venue)',
        'end_date_col': 'End date (Leisure Venue)'
    },
    'Retail Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Retail)',
        'end_date_col': 'End date (Retail)'
    },
    'All Face-to-Face Businesses Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (All Face2Face)',
        'end_date_col': 'End date (All Face2Face)'
    },
    'Stay at Home Order': {
        'type': 'binary',
        'start_date_col': 'Start date (Stay Home)',
        'end_date_col': 'End date (Stay Home)'
    },
    'Curfew': {
        'type': 'binary',
        'start_date_col': 'Start date (Curfew)',
        'end_date_col': 'End date (Curfew)'
    },
    'Childcare Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Childcare)',
        'end_date_col': 'End date  (Childcare)'
    },
    'Primary Schools Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Primary Schools)',
        'end_date_col': 'End date (Primary Schools)'
    },
    'Secondary Schools Closed': {
        'type': 'binary',
        'start_date_col': 'Start date (Secondary Schools)',
        'end_date_col': 'End date (Secondary Schools)'
    },
    'Universities Away': {
        'type': 'binary',
        'start_date_col': 'Start date (Unis Away)',
        'end_date_col': 'End date (Unis Away)'
    },
}

In [35]:
# if there is a string, this lookup converts the month string into the month int and year int
# not the cleanest way of doing this....
lookup_months = {
    'January': (1, 2021),
    'Jan': (1, 2021),
    'September': (9, 2020),
    'Septemeber': (9, 2020),
    'December': (12, 2020),
    'August': (8, 2020),
    'November': (11, 2020),
}

In [36]:
def process_start_date_str(start_date_str):
    if start_date_str.strip() in ['Before 1 August', 'before 1 August 2020', 'Before 1 August 2020', 'Before August 1', '1 August 2020', 'Before 1st of August', 'before 01/08/2020', 'before 1/08/2020',  
                                  'before 1/8/2020', 'before 13/7/2020', 'Before 1st August']:
        return pd.to_datetime('2020-08-01')
    
    elif start_date_str.strip() in ['no', 'No', 'nan', 'N/A', 'NA']:
        return None
    
    try:
        return pd.to_datetime(start_date_str, dayfirst=True, infer_datetime_format=True)
    except ValueError:
        day = int(re.search('[0-9]+', start_date_str)[0])
        for month_str, (m, y) in lookup_months.items():
            if month_str in start_date_str:
                dt = pd.Timestamp(day=day, month=m, year=y)
                print(f'Regex Succeeded: Converted {start_date_str} to {dt}')
                return dt
        print(f'Could not convert start date {start_date_str}')
        
def process_end_date_str(end_date_str):
    if end_date_str.strip() in ['no', 'No', 'nan', 'N/A', 'NA', 'After 9 January 2021']:
        return pd.to_datetime('2021-01-09')

    try:
        return pd.to_datetime(end_date_str,  dayfirst=True, infer_datetime_format=True)
    except ValueError:
        day = int(re.search('[0-9]+', end_date_str)[0])
        for month_str, (m, y) in lookup_months.items():
            if month_str in end_date_str:
                dt = pd.Timestamp(day=day, month=m, year=y)
                print(f'Regex Succeeded: Converted {end_date_str} to {dt}')
                return dt
        
        print(f'Could not convert end date {end_date_str}')
                    
        
        
def process_value(value):
    if str(value).strip() in ['no', 'No', 'nan', 'NaN']:
        return 0

    if ',' in str(value):
        value = value.replace(',','')
    return int(value)
        
def process_cm_dict(row, cm_dict):
    sd = str(row[cm_dict['start_date_col']])
    ed = str(row[cm_dict['end_date_col']])

    sd_dt = process_start_date_str(sd)
    ed_dt = process_end_date_str(ed)

    if sd_dt is None:
        return (None, None, None)
    else:
        # sd is not None
        value = 1 if cm_dict['type'] == 'binary' else process_value(row[cm_dict['value_col']])
        return (sd_dt, ed_dt, value)
        
def datetime_to_index(dt, Ds):
    ind = None
    
    if dt < pd.to_datetime('2020-08-01'):
        ind = 0
    else:
        try:
            ind = list(Ds).index(dt)
        except:
            error_str = f'Date {dt} was not in my list'
            new_dt = dt - pd.DateOffset(years=1)
            if new_dt in Ds:
                ind = list(Ds).index(new_dt)
                error_str = f'{error_str} -- Used month and date'
            else:
                error_str = f'{error_str} -- failed'
            print(error_str)
    return ind



In [37]:
def create_active_cms_mat(df, Rs, npi_information_dict, Ds):
    CMs = list(npi_information_dict.keys())

    nRs = len(Rs)
    nDs = len(Ds)
    nCMs = len(CMs)
    
    active_cms = np.zeros((nRs, nCMs, nDs))
    
    for r_i, r in enumerate(Rs):
        sub_df = df.loc[r]
        for _, row in sub_df.iterrows():
            for cm_i, (cm_name, cm_dict) in enumerate(npi_information_dict.items()):
                sd_dt, ed_dt, value = process_cm_dict(row, cm_dict)

                if sd_dt is not None and ed_dt is not None:
                    start_ind = datetime_to_index(sd_dt, Ds)           
                    end_ind = datetime_to_index(ed_dt, Ds)
                    # the NPI should be active on the end date. that's why we need the "+1"
                    active_cms[r_i, cm_i, start_ind:end_ind+1] = value
    return active_cms


def load_new_cases_deaths_from_timeseries_df(Rs, timeseries_df, Ds):
    new_cases = np.zeros((len(Rs), len(Ds)))
    new_deaths = np.zeros((len(Rs), len(Ds)))
    
    for r_i, r in enumerate(Rs):
        new_cases[r_i, :] = timeseries_df.loc[r].loc[Ds]['new_cases']
        new_deaths[r_i, :] = timeseries_df.loc[r].loc[Ds]['new_deaths']
        
    return new_cases, new_deaths

# Load all data

## UK

In [38]:
# currently reads directly from the CSV
uk_df = pd.read_csv('../../data/npi_data/england.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
droplist = [c for c in uk_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row', 'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
       'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
       'How long did you need to collect this data?'])

uk_df = uk_df.drop(droplist, axis=1)
uk_df = uk_df.rename(columns=lambda x: x.strip())
uk_df['Local area'] = uk_df['Local area'].apply(lambda x: str(x).strip())

uk_df = uk_df.set_index('Local area')

In [39]:
uk_Rs = ['Lincolnshire', 'Greater Manchester South West',
       'Redbridge and Waltham Forest', 'Enfield', 'Buckinghamshire CC',
       'Portsmouth', 'Southampton', 'Brighton and Hove', 'Coventry',
       'Walsall', 'North Yorkshire CC', 'Essex Haven Gateway',
       'Southend-on-Sea', 'Gloucestershire', 'East Derbyshire']

In [40]:
start_date = '08-01-2020'
end_date = '01-09-2021'
Ds = pd.date_range(start=start_date, end=end_date)

In [41]:
uk_active_cms = create_active_cms_mat(uk_df, uk_Rs, npi_information_dict, Ds)

Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Regex Succeeded: Converted January 3 to 2021-01-03 00:00:00
Date 2021-06-15 00:00:00 was not in my list -- failed
Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Regex Succeeded: Converted August 31 to 2020-08-31 00:00:00
Regex Succeeded: Converted August 31 to 2020-08-31 00:00:00
Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Regex Succeeded: Converted 20 December to 2020-12-20 00:00:00
Regex Succeeded: Converted 20 December to 2020-12-20 00:00:00
Regex Succeeded: Converted 20 December to 2020-12-20 00:00:00
Date 2021-11-05 00:00:00 was not in my list -- Used month and date
Regex Succeeded: Converted 20 December to 2020-12-20 00:00:00
Regex Succeeded: Converted 19 December to 2020-12-19 00:00:00
Regex Succeeded: Converted 19 December to 2020-12-19 0

In [42]:
with open('../../data/raw_data_w_sources/uk_ltla_info.json') as json_file:
    uk_ltla_info_dict = json.load(json_file)

uk_ltla_info_df = pd.DataFrame([d['attributes'] for d in uk_ltla_info_dict['features']])
uk_ltla_info_df = uk_ltla_info_df.rename({'LAU117NM': 'area', 'NUTS318NM': 'NUTS3', 'NUTS118NM': 'region'} ,axis=1)
uk_ltla_info_df = uk_ltla_info_df.set_index('area')

uk_df = pd.read_csv('../../data/raw_data_w_sources/uk_case_deaths.csv', infer_datetime_format=True)
uk_df = uk_df.drop(['areaCode', 'newCasesByPublishDate', 'newDeaths28DaysByPublishDate'], axis=1)
uk_df['areaType'] = 'UK'
uk_df = uk_df.rename({'areaType': 'country', 'areaName':'area', 'newCasesBySpecimenDate': 'new_cases', 'newDeaths28DaysByDeathDate': 'new_deaths'}, axis=1)
uk_df = uk_df.set_index(['area', 'date'])

def NUTS3_lookup(ltla):
    try:
        nuts3 = uk_ltla_info_df.loc[ltla]['NUTS3']
    except KeyError:
#         print(f'{ltla} missing in my lookup table')
        nuts3 = 'unknown'
    return nuts3

nuts3_uk_df = uk_df.reset_index()
nuts3_uk_df['NUTS3'] = nuts3_uk_df['area'].map(NUTS3_lookup)
days = nuts3_uk_df['date'].unique()
nuts3_regions = nuts3_uk_df['NUTS3'].unique()
nuts3_df_list = []
nuts3_uk_df_merged = None

for nuts3_region in nuts3_regions:
    if nuts3_region == 'unknown':
        continue
    
    filtered_df = nuts3_uk_df.loc[nuts3_uk_df['NUTS3'] == nuts3_region]
    
    case_death_series = filtered_df.groupby('date').sum()
    case_death_series['area'] = nuts3_region
    
    if nuts3_uk_df_merged is None:
        nuts3_uk_df_merged = copy.deepcopy(case_death_series)
    else:
        nuts3_uk_df_merged = nuts3_uk_df_merged.append(case_death_series)
    
nuts3_uk_df_merged = nuts3_uk_df_merged.reset_index()
nuts3_uk_df_merged['date'] = pd.to_datetime(nuts3_uk_df_merged['date'])
nuts3_uk_df_merged = nuts3_uk_df_merged.set_index(['area', 'date'])
nuts3_uk_df_merged = nuts3_uk_df_merged.sort_index(level=[1],ascending=[True])

In [43]:
uk_cases, uk_deaths = load_new_cases_deaths_from_timeseries_df(uk_Rs, nuts3_uk_df_merged, Ds)

## Austria

In [44]:
# currently reads directly from the CSV
at_df = pd.read_csv('../../data/npi_data/austria.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
droplist = [c for c in at_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row', 'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
       'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
       'How long did you need to collect this data?', 'Local area'])

at_df = at_df.drop(droplist, axis=1)
at_df = at_df.rename(columns=lambda x: x.strip())

at_df = at_df.set_index('Region')

at_Rs = ['Wien', 'Burgenland', 'Steiermark', 'Oberösterreich',
       'Nieder­österreich', 'Voralberg ', 'Tirol', 'Karnten/Carinthia',
       'Salzburg']

In [45]:
at_active_cms = create_active_cms_mat(at_df, at_Rs, npi_information_dict, Ds)

Regex Succeeded: Converted 6 Jan to 2021-01-06 00:00:00
Regex Succeeded: Converted 29 September to 2020-09-29 00:00:00
Regex Succeeded: Converted 29 September to 2020-09-29 00:00:00
Could not convert start date 26 Decemeber 


In [46]:
austria_ltla_lookup = pd.read_csv('../../data/raw_data_w_sources/at_lau_lookup.csv')
austria_ltla_lookup = austria_ltla_lookup.set_index('GKZ')

def at_ltla_lookup(ltla):
    if ltla in austria_ltla_lookup.index:
        return austria_ltla_lookup.loc[ltla]['State Code (middle column of HASC)']
    return 'Vienna'

austria_df = pd.read_csv('../../data/raw_data_w_sources/at_case_deaths.csv', error_bad_lines=False, delimiter=';', skiprows=1)
austria_df = austria_df.drop([' number of cases total',
       ' number of cases of 7 days', ' seven days of incidence cases',' number of total totals',
       ' number of held daily', ' number of healing total'], axis=1)
austria_df[' GKZ'] = austria_df[' GKZ'].map(at_ltla_lookup)
austria_df = austria_df.rename({'Time': 'date', ' district': 'area', ' GKZ': 'region', ' number of inhabitants': 'population', ' number of cases': 'new_cases', ' number of dead daily': 'new_deaths'}, axis=1)
austria_df = austria_df.drop('population', axis=1)
austria_df['date'] = pd.to_datetime(austria_df['date'], format='%d.%m.%Y %M:%H:%S')

austria_timeseries_df = austria_df.set_index(['area', 'date'])

austria_nuts2_regions = austria_timeseries_df['region'].unique()

austria_nuts2_df_list = []

austria_nuts2_df_merged = None

for nuts2_region in austria_nuts2_regions:    
    filtered_df = austria_timeseries_df.loc[austria_timeseries_df['region'] == nuts2_region]
    
    case_death_series = filtered_df.groupby('date').sum()
    case_death_series['area'] = nuts2_region
    
    if austria_nuts2_df_merged is None:
        austria_nuts2_df_merged = copy.deepcopy(case_death_series)
    else:
        austria_nuts2_df_merged = austria_nuts2_df_merged.append(case_death_series)
    
austria_nuts2_df_merged = austria_nuts2_df_merged.reset_index()
austria_nuts2_df_merged = austria_nuts2_df_merged.set_index(['area', 'date'])
austria_nuts2_df_merged = austria_nuts2_df_merged.sort_index(level=[1],ascending=[True])

austria_nuts2_timeseries_df = austria_nuts2_df_merged

timeseries_df_map_dict = {
    'BU': 'Burgenland',
    'Vienna': 'Wien',
    'ST': 'Steiermark',
    'OO': 'Oberösterreich',
    'TR': 'Tirol',
    'VO': 'Voralberg ',
    'KA': 'Karnten/Carinthia',
    'OO': 'Oberösterreich',
    'NO': 'Nieder\xadösterreich',
    'SZ': 'Salzburg',
}

austria_nuts2_timeseries_df.index = austria_nuts2_timeseries_df.index.map(lambda x: (timeseries_df_map_dict[x[0]], x[1]))

In [47]:
at_cases, at_deaths = load_new_cases_deaths_from_timeseries_df(at_Rs, austria_nuts2_timeseries_df, Ds)

# Germany

In [48]:
# currently reads directly from the CSV
de_df = pd.read_csv('../../data/npi_data/germany.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
droplist = [c for c in de_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row'])

de_df = de_df.drop(droplist, axis=1)
de_df = de_df.rename(columns=lambda x: x.strip())
de_df['Local area'] = de_df['Local area'].apply(lambda x: str(x).strip())

de_Rs = ['Nürnberg', 'LK Aschaffenburg', 'Fürth', 'Landsberg am Lech',
       'LK Donau-Ries', 'Minden-Lübbecke', 'Mönchengladbach', 'Münster',
       'Rhein-Kreis Neuss', 'LK Ennepe-Ruhr-Kreis', 'LK Rems-Murr-Kreis',
       'LK Breisgau-Hochschwarzwald', 'LK Enzkreis', 'LK Hildesheim',
       'LK Gifhorn']
de_df = de_df.set_index('Local area')

In [49]:
de_active_cms = create_active_cms_mat(de_df, de_Rs, npi_information_dict, Ds)

In [50]:
# ags dict contains information about the local areas of germany
with open('../../data/raw_data_w_sources/de_ags.json') as json_file:
    ags_info_dict = json.load(json_file)
    
cases_df = pd.read_csv('../../data/raw_data_w_sources/de_cases-rki-by-ags.csv')
cases_df = cases_df.drop('sum_cases', axis=1)
cases_df = cases_df.rename({'time_iso8601': 'date'}, axis=1)
cases_df['date'] = pd.to_datetime(cases_df['date'])
cases_df['date'] = pd.to_datetime(cases_df['date'].dt.date)
cases_df = cases_df.set_index('date')
cases_df = cases_df.diff()

deaths_df = pd.read_csv('../../data/raw_data_w_sources/de_deaths-rki-by-ags.csv')
deaths = deaths_df.drop('sum_deaths', axis=1)
deaths_df = deaths_df.rename({'time_iso8601': 'date'}, axis=1)
deaths_df['date'] = pd.to_datetime(deaths_df['date'])
deaths_df['date'] = pd.to_datetime(deaths_df['date'].dt.date)
deaths_df = deaths_df.set_index('date')
deaths_df = deaths_df.diff()

ags_time_series_list = []

Ds = pd.date_range('2020-03-02', '2021-01-09')
for ags in ags_info_dict.keys():
    if ags == '3152':
        continue
        
    for d in Ds:
        ags_dict = {
            'area': ags_info_dict[ags]['name'],
            'date': d
        }
        ags_dict['new_cases'] = cases_df[ags][d]
        ags_dict['new_deaths'] = deaths_df[ags][d]
        
        ags_time_series_list.append(ags_dict)

germany_timeseries_df = pd.DataFrame(ags_time_series_list)
germany_timeseries_df = germany_timeseries_df.set_index(['area', 'date'])

In [51]:
de_Rs_conv = [
 'SK Nürnberg',
 'LK Aschaffenburg',
 'LK Fürth',
 'LK Landsberg a.Lech',
 'LK Donau-Ries',
 'LK Minden-Lübbecke',
 'SK Mönchengladbach',
 'SK Münster',
 'LK Rhein-Kreis Neuss',
 'LK Ennepe-Ruhr-Kreis',
 'LK Rems-Murr-Kreis',
 'LK Breisgau-Hochschwarzwald',
 'LK Enzkreis',
 'LK Hildesheim',
 'LK Gifhorn' 
]

In [52]:
for cs_name, npi_name in zip(de_Rs_conv, de_Rs):
    print(f'Cases and deaths from {cs_name}, NPIs from {npi_name}')

Cases and deaths from SK Nürnberg, NPIs from Nürnberg
Cases and deaths from LK Aschaffenburg, NPIs from LK Aschaffenburg
Cases and deaths from LK Fürth, NPIs from Fürth
Cases and deaths from LK Landsberg a.Lech, NPIs from Landsberg am Lech
Cases and deaths from LK Donau-Ries, NPIs from LK Donau-Ries
Cases and deaths from LK Minden-Lübbecke, NPIs from Minden-Lübbecke
Cases and deaths from SK Mönchengladbach, NPIs from Mönchengladbach
Cases and deaths from SK Münster, NPIs from Münster
Cases and deaths from LK Rhein-Kreis Neuss, NPIs from Rhein-Kreis Neuss
Cases and deaths from LK Ennepe-Ruhr-Kreis, NPIs from LK Ennepe-Ruhr-Kreis
Cases and deaths from LK Rems-Murr-Kreis, NPIs from LK Rems-Murr-Kreis
Cases and deaths from LK Breisgau-Hochschwarzwald, NPIs from LK Breisgau-Hochschwarzwald
Cases and deaths from LK Enzkreis, NPIs from LK Enzkreis
Cases and deaths from LK Hildesheim, NPIs from LK Hildesheim
Cases and deaths from LK Gifhorn, NPIs from LK Gifhorn


In [53]:
start_date = '08-01-2020'
end_date = '01-09-2021'
Ds = pd.date_range(start=start_date, end=end_date)

de_cases, de_deaths = load_new_cases_deaths_from_timeseries_df(de_Rs_conv, germany_timeseries_df, Ds)

## Italy

In [54]:
# currently reads directly from the CSV
it_df = pd.read_csv('../../data/npi_data/italy.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
droplist = [c for c in it_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row', 'At any point in time, did the local area ever implement NPIs of interest in only some part of the local area. If yes, describe the situation a bit more.',
       'Would it be very easy to collect further data on neighboring local areas? If yes, for which local areas?',
       'How long did you need to collect this data?', 'Local area'])

it_df = it_df.drop(droplist, axis=1)
it_df = it_df.rename(columns=lambda x: x.strip())

it_df = it_df.set_index('Region')

it_Rs = ['Abruzzo', 'Aosta Valley', 'Apulia (AKA Puglia)','Basilicata','Calabria','Campania','Emilia-Romagna',
        'Friuli-Venezia Giulia','Lazio','Liguria ','Lombardy','Marche','Molise','Piedmont','Sardinia','Sicily',
         'Trentino (aka Trento)', 'South Tyrol (aka Bolzano aka Alto-Adige)', 'Tuscania ','Umbria','Veneto']

In [55]:
it_Rs

['Abruzzo',
 'Aosta Valley',
 'Apulia (AKA Puglia)',
 'Basilicata',
 'Calabria',
 'Campania',
 'Emilia-Romagna',
 'Friuli-Venezia Giulia',
 'Lazio',
 'Liguria ',
 'Lombardy',
 'Marche',
 'Molise',
 'Piedmont',
 'Sardinia',
 'Sicily',
 'Trentino (aka Trento)',
 'South Tyrol (aka Bolzano aka Alto-Adige)',
 'Tuscania ',
 'Umbria',
 'Veneto']

In [56]:
# it_Rs = ['Abruzzo', 'Aosta Valley',
#                       'Apulia (AKA Puglia)',
#                                'Basilicata',
#                                  'Calabria',
#                                  'Campania',
#                            'Emilia-Romagna',
#                     'Friuli-Venezia Giulia',
#                                     'Lazio',
#                                  'Liguria ',
#                                  'Lombardy',
#                                    'Marche',
#                                    'Molise',
#                                  'Piedmont',
#                                  'Sardinia',
#                                    'Sicily',
#                      'Trentino-South Tyrol',
#                                 'Tuscania ',
#                                    'Umbria',
#                                    'Veneto']

In [57]:
it_active_cms = create_active_cms_mat(it_df, it_Rs, npi_information_dict, Ds)

Regex Succeeded: Converted 21 Septemeber 2020 to 2020-09-21 00:00:00
Date 2021-12-11 00:00:00 was not in my list -- Used month and date
Regex Succeeded: Converted 6 November to 2020-11-06 00:00:00
Regex Succeeded: Converted 14 November to 2020-11-14 00:00:00
Regex Succeeded: Converted 14 November to 2020-11-14 00:00:00
Regex Succeeded: Converted 14 November to 2020-11-14 00:00:00
Regex Succeeded: Converted 14 November to 2020-11-14 00:00:00
Regex Succeeded: Converted 6 November to 2020-11-06 00:00:00


In [58]:
italy_df = pd.read_csv('../../data/raw_data_w_sources/it_cases_deaths.csv', delimiter=',')
italy_df['date'] = pd.to_datetime(italy_df['date'])
italy_df['date'] = italy_df['date'].dt.date
italy_df = italy_df.set_index(['area', 'date'])
italy_df['new_deaths'] = italy_df.groupby('area').diff()['total_deaths']
italy_df = italy_df.drop('total_deaths', axis=1)

italy_timeseries_df = italy_df

In [59]:
it_Rs_conv = ['Abruzzo',
 "Valle d'Aosta",
 'Puglia',
 'Basilicata',
 'Calabria',
 'Campania',
 'Emilia-Romagna',
 'Friuli Venezia Giulia',
 'Lazio',
 'Liguria',
 'Lombardia',
 'Marche',
 'Molise',
 'Piemonte',
 'Sardegna',
 'Sicilia',
 'P.A. Trento',
 'P.A. Bolzano', # or P.A. Trento
 'Toscana',
 'Umbria',
 'Veneto']

In [60]:
for cs_name, npi_name in zip(it_Rs_conv, it_Rs):
    print(f'Cases and deaths from {cs_name}, NPIs from {npi_name}')

Cases and deaths from Abruzzo, NPIs from Abruzzo
Cases and deaths from Valle d'Aosta, NPIs from Aosta Valley
Cases and deaths from Puglia, NPIs from Apulia (AKA Puglia)
Cases and deaths from Basilicata, NPIs from Basilicata
Cases and deaths from Calabria, NPIs from Calabria
Cases and deaths from Campania, NPIs from Campania
Cases and deaths from Emilia-Romagna, NPIs from Emilia-Romagna
Cases and deaths from Friuli Venezia Giulia, NPIs from Friuli-Venezia Giulia
Cases and deaths from Lazio, NPIs from Lazio
Cases and deaths from Liguria, NPIs from Liguria 
Cases and deaths from Lombardia, NPIs from Lombardy
Cases and deaths from Marche, NPIs from Marche
Cases and deaths from Molise, NPIs from Molise
Cases and deaths from Piemonte, NPIs from Piedmont
Cases and deaths from Sardegna, NPIs from Sardinia
Cases and deaths from Sicilia, NPIs from Sicily
Cases and deaths from P.A. Trento, NPIs from Trentino (aka Trento)
Cases and deaths from P.A. Bolzano, NPIs from South Tyrol (aka Bolzano aka A

In [61]:
it_cases, it_deaths = load_new_cases_deaths_from_timeseries_df(it_Rs_conv, italy_timeseries_df, Ds)

## Czech

In [62]:
# currently reads directly from the CSV
# cz_df = pd.read_csv('../../data/npi_data/czech.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
cz_df = pd.read_csv('../../data/npi_data/czech.csv', skiprows=2).dropna(axis='index', how='all')#.set_index('Code')
droplist = [c for c in cz_df.columns if 'Sources' in c or 'Quotes' in c or 'Description' in c or 'What is the reason' in c or 'How many' in c or 'Unnamed' in c]
droplist.extend(['Person who entered this row'])

cz_df = cz_df.drop(droplist, axis=1)
cz_df = cz_df.rename(columns=lambda x: x.strip())
cz_df['Region'] = cz_df['Region'].apply(lambda x: str(x).strip())

cz_df = cz_df.set_index('Region')

In [63]:
cz_Rs = list(cz_df.index.unique())
cz_Rs

['Ústí nad Labem',
 'Prague',
 'Moravian-Silesian',
 'Central Bohemian',
 'South Bohemian',
 'Vysočina',
 'Plzeň',
 'Karlovy Vary',
 'Liberec',
 'Hradec Králové',
 'Pardubice',
 'Olomouc',
 'South Moravian',
 'Zlín']

In [64]:
start_date = '08-01-2020'
end_date = '01-09-2021'
Ds = pd.date_range(start=start_date, end=end_date)

In [65]:
cz_active_cms = create_active_cms_mat(cz_df, cz_Rs, npi_information_dict, Ds)

Date 2021-12-23 00:00:00 was not in my list -- Used month and date


In [66]:
czech_df = pd.read_csv('../../data/raw_data_w_sources/cz_cases_deaths.csv', delimiter=',')
czech_df = czech_df.rename(columns = {'datum': 'date', 'kraj_nuts_kod':'NUTS3 Unit', 'okres_lau_kod':'LAU Unit', 'kumulativni_pocet_vylecenych':'Recovered', 'kumulativni_pocet_nakazenych': 'Infected', 'kumulativni_pocet_umrti': 'Deaths'})
czech_df['date'] = pd.to_datetime(czech_df['date'], format = '%Y-%m-%d')
cz_nuts3 = ['CZ042', 'CZ010', 'CZ080', 'CZ020', 'CZ031','CZ063', 'CZ032','CZ041','CZ051', 'CZ052','CZ053', 'CZ071', 'CZ064', 'CZ072']
czech_df = czech_df[czech_df['NUTS3 Unit'].isin(cz_nuts3)]
cz_nuts3_lookup = {cz_nuts3[i]:cz_Rs[i] for i in range(len(cz_Rs))}
czech_df = czech_df.replace({'NUTS3 Unit': cz_nuts3_lookup})
czech_df = czech_df.rename(columns = {'NUTS3 Unit':'area'}) #, 'Infected': 'new_cases', 'Deaths': 'new_deaths'})
cz_timeseries_df = czech_df.groupby(['area', 'date']).sum()

cz_timeseries_df['new_cases'] = cz_timeseries_df.groupby(level=[0]).diff()['Infected']
cz_timeseries_df['new_deaths'] = cz_timeseries_df.groupby(level=[0]).diff()['Deaths']

# czech_df = czech_df.drop(columns=['LAU Unit', 'Recovered'])

In [67]:
czech_cases, czech_deaths = load_new_cases_deaths_from_timeseries_df(cz_Rs, cz_timeseries_df, Ds)

# create merged CSV file

In [68]:
CMs = list(npi_information_dict.keys())
countries = ['England', 'Austria', 'Germany', 'Italy', 'Czech']
new_cases = [uk_cases, at_cases, de_cases, it_cases, czech_cases]
new_deaths = [uk_deaths, at_deaths, de_deaths, it_deaths, czech_deaths]
active_cms = [uk_active_cms, at_active_cms, de_active_cms, it_active_cms, cz_active_cms]
Rs = [uk_Rs, at_Rs, de_Rs, it_Rs, cz_Rs]

In [69]:
print(Rs)

[['Lincolnshire', 'Greater Manchester South West', 'Redbridge and Waltham Forest', 'Enfield', 'Buckinghamshire CC', 'Portsmouth', 'Southampton', 'Brighton and Hove', 'Coventry', 'Walsall', 'North Yorkshire CC', 'Essex Haven Gateway', 'Southend-on-Sea', 'Gloucestershire', 'East Derbyshire'], ['Wien', 'Burgenland', 'Steiermark', 'Oberösterreich', 'Nieder\xadösterreich', 'Voralberg ', 'Tirol', 'Karnten/Carinthia', 'Salzburg'], ['Nürnberg', 'LK Aschaffenburg', 'Fürth', 'Landsberg am Lech', 'LK Donau-Ries', 'Minden-Lübbecke', 'Mönchengladbach', 'Münster', 'Rhein-Kreis Neuss', 'LK Ennepe-Ruhr-Kreis', 'LK Rems-Murr-Kreis', 'LK Breisgau-Hochschwarzwald', 'LK Enzkreis', 'LK Hildesheim', 'LK Gifhorn'], ['Abruzzo', 'Aosta Valley', 'Apulia (AKA Puglia)', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria ', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Trentino (aka Trento)', 'South Tyrol (aka Bolzano aka Alto-Adige)', 'Tuscania 

In [86]:
def set_household_limits(active_CMs, household_NPI_index, gathering_NPI_index):
    nRs, _, nDs = active_CMs.shape
    new_acms = np.copy(active_CMs)
    for r in range(nRs):
        for day in range(nDs):
            if active_CMs[r, household_NPI_index, day] == 0 or active_CMs[r, gathering_NPI_index, day] < active_CMs[r, household_NPI_index, day]:
                new_acms[r, household_NPI_index, day] = active_CMs[r, gathering_NPI_index, day]
    return new_acms

def set_all_household_limits(active_CMs):
    new_acms = np.copy(active_CMs)
    new_acms = set_household_limits(new_acms, 4, 0)
    new_acms = set_household_limits(new_acms, 5, 1)
    new_acms = set_household_limits(new_acms, 6, 2)
    new_acms = set_household_limits(new_acms, 7, 3)
    return new_acms

active_cms = [set_all_household_limits(active_cm) for active_cm in active_cms]


In [87]:
all_rows = []
 
for c, ncs, nds, acms, rs in zip(countries, new_cases, new_deaths, active_cms, Rs):
    print(c)
    for r_i, r in enumerate(rs):
        for d_i, d in enumerate(Ds):
            row_dict = {
                'Country': c,
                'Area': r,
                'Date': d,
                'New Cases': ncs[r_i, d_i],
                'New Deaths': nds[r_i, d_i]
            }
            
            for cm_i, cm in enumerate(CMs):
                row_dict[cm] = acms[r_i, cm_i, d_i]

            all_rows.append(row_dict)

England
Austria
Germany
Italy
Czech


In [88]:
merged_df = pd.DataFrame(all_rows).set_index(['Area', 'Date'])

In [89]:
merged_df.to_csv('../../data/all_merged_data.csv')
