In [1]:
import requests
from contextlib import closing
import csv
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
#era5 files
states_file = '../era5_ne_states.csv'
era5_states = pd.read_csv(states_file).to_xarray()

countries_file = '../era5_ne_countries.csv'
era5_countries = pd.read_csv(countries_file).to_xarray()

#covid19 url
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'


In [3]:
def find_era5_index(lat, lon, era5):
    '''
    Returns the era5 index corresponding with the closet lat-lon coordinate

            Parameters:
                    lat (float): latitude value from covid19 data
                    lon (float): longitude value from covid19 data
                    era5 (xr.Dataset): era5 dataset for either states or countries. 
                        Accepts: `era5_states` or `era5_countries`

            Returns:
                    index (int): index from era5 dataset that maps to a state or country
    '''
    era5_index = era5.index.values
    era5_lat = era5.cent_lat.values
    era5_lon = era5.cent_lon.values

    dist = [((lat - era5_lat[i])**2 + (lon - era5_lon[i])**2)**0.5 for i in era5_index]
    index = dist.index(min(dist))
    return index

def read_covid19_ds(url, era5_states, era5_countries):
    '''
    Returns the era5 index corresponding with the closet lat-lon coordinate

            Parameters:
                    url (string): latitude value from covid19 data
                    era5_states (xr.Dataset): era5 states dataset
                    era5_countries (xr.Dataset): era5 countries dataset

            Returns:
                    ds (xr.Dataset): covid19 datasest with era5 index attributes 
    '''
    with closing(requests.get(url, stream=True)) as r:
        f = (line.decode('utf-8') for line in r.iter_lines())
        reader = csv.reader(f, delimiter=',', quotechar='"')

        header = next(reader)
        time = pd.date_range(start=header[4], end=header[-1])

        ds_dict = {} 
        for line in reader:
            state = line[0]
            country = line[1]
            lat = float(line[2])
            lon = float(line[3])
            cases = [int(i) for i in line[4:]]

            if state:
                region_name = (state +'_' + country).replace(' ', '_').lower()
            else:
                region_name = country.replace(' ', '_').replace(',', '').lower()     

            states_index = find_era5_index(lat, lon, era5_states)
            countries_index = find_era5_index(lat, lon, era5_countries)

            da = xr.DataArray(cases, coords = [time], dims = ['time'], name = region_name)
            da.attrs['lat'] = lat
            da.attrs['lon'] = lon
            da.attrs['state'] = state
            da.attrs['era5_state'] = era5_states.name_en[states_index].values
            da.attrs['state_index'] = states_index
            da.attrs['country'] = country
            da.attrs['era5_country'] = era5_countries.NAME_EN[countries_index].values
            da.attrs['country_index'] = countries_index

            ds_dict[region_name] = da.to_dict()

    ds = xr.Dataset.from_dict(ds_dict)
    return ds

In [4]:
ds = read_covid19_ds(url, era5_states, era5_countries)
ds

<xarray.Dataset>
Dimensions:                                  (time: 70)
Dimensions without coordinates: time
Data variables:
    afghanistan                              (time) int64 0 0 0 ... 120 170 174
    albania                                  (time) int64 0 0 0 ... 212 223 243
    algeria                                  (time) int64 0 0 0 ... 511 584 716
    andorra                                  (time) int64 0 0 0 ... 334 370 376
    angola                                   (time) int64 0 0 0 0 0 ... 5 7 7 7
    antigua_and_barbuda                      (time) int64 0 0 0 0 0 ... 7 7 7 7
    argentina                                (time) int64 0 0 0 ... 745 820 1054
    armenia                                  (time) int64 0 0 0 ... 424 482 532
    australian_capital_territory_australia   (time) int64 0 0 0 0 ... 77 78 80
    new_south_wales_australia                (time) int64 0 0 0 ... 2032 2032
    northern_territory_australia             (time) int64 0 0 0 0 ... 15 15 

In [5]:
ds['armenia']

<xarray.DataArray 'armenia' (time: 70)>
array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   4,   8,  18,  26,  52,  78,
        84, 115, 136, 160, 194, 235, 249, 265, 290, 329, 407, 424, 482, 532])
Dimensions without coordinates: time
Attributes:
    lat:            40.0691
    lon:            45.0382
    state:          
    era5_state:     Ararat Province
    state_index:    300
    country:        Armenia
    era5_country:   Armenia
    country_index:  109

In [6]:
def combine_duplicated_indices(ds, region_level='states'):
    '''
    Checks if more than one region in the xr.Dataset correspond to the same state or country index 
    and adds COVID19 cases together for those regions.

            Parameters:
                    ds (xr.Dataset): COVID19 data
                    region_level (str): specification of combining regions based on state or country index 
                        Accepts: `states` or `countries`

            Returns:
                    ds_era5_regions (xr.Dataset): COVID19 data mapped to indices of etiher states or countries
    '''
    ds = ds.copy()
    era5_regions_dict = {}

    index_set = set()
    for region in ds:
        
        if region_level == 'states':
            name = ds[region].attrs['era5_state']
            index = ds[region].attrs['state_index']
                                     
        elif region_level == 'countries':
            name = ds[region].attrs['era5_country']
            index = ds[region].attrs['country_index']
            
            del ds[region].attrs['era5_state']
            del ds[region].attrs['state_index']
        
        region_dict = ds[region].to_dict()
        if index not in index_set:
            index_set.add(index)
            era5_regions_dict[name] = region_dict
        else:
            era5_regions_dict[name]['data'] = np.add(era5_regions_dict[name]['data'], region_dict['data'])
            
    ds_era5_regions = xr.Dataset.from_dict(era5_regions_dict)
    return ds_era5_regions

In [7]:
ds_states = combine_duplicated_indices(ds, region_level='states')
ds_states

<xarray.Dataset>
Dimensions:                                          (time: 70)
Dimensions without coordinates: time
Data variables:
    Urozgan                                          (time) int64 0 0 ... 174
    Elbasan County                                   (time) int64 0 0 ... 243
    GhardaÃ¯a Province                               (time) int64 0 0 ... 716
    Andorra la Vella                                 (time) int64 0 0 ... 376
    BiÃ© Province                                    (time) int64 0 0 0 ... 7 7
    Gnagna Province                                  (time) int64 0 0 ... 3995
    nan                                              (time) int64 0 0 ... 94 112
    Ararat Province                                  (time) int64 0 0 ... 532
    Australian Capital Territory                     (time) int64 0 0 ... 78 80
    Jervis Bay Territory                             (time) int64 0 0 ... 2032
    LautÃ©m Municipality                             (time) int64 0 0 ... 15 

In [8]:
ds_states['Tasmania']

<xarray.DataArray 'Tasmania' (time: 70)>
array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  2,  2,  2,  3,  3,  5,  5,  6,
        7,  7, 10, 10, 10, 16, 22, 28, 28, 36, 47, 47, 62, 66, 66, 69])
Dimensions without coordinates: time
Attributes:
    lat:            -41.4545
    lon:            145.9707
    state:          Tasmania
    era5_state:     Tasmania
    state_index:    2901
    country:        Australia
    era5_country:   Australia
    country_index:  137

In [9]:
ds_countries = combine_duplicated_indices(ds, region_level='countries')
ds_countries

<xarray.Dataset>
Dimensions:                              (time: 70)
Dimensions without coordinates: time
Data variables:
    Afghanistan                          (time) int64 0 0 0 0 ... 120 170 174
    Albania                              (time) int64 0 0 0 0 ... 212 223 243
    Algeria                              (time) int64 1 1 2 ... 258531 294159
    Switzerland                          (time) int64 0 0 0 ... 16403 17101
    Angola                               (time) int64 0 0 0 0 0 0 ... 4 5 7 7 7
    Togo                                 (time) int64 0 0 0 ... 17856 20719
    Armenia                              (time) int64 0 0 0 0 ... 424 482 532
    Australia                            (time) int64 0 0 0 0 ... 3313 3657 3799
    East Timor                           (time) int64 0 0 0 0 0 ... 16 16 16 18
    New Caledonia                        (time) int64 0 0 0 0 ... 671 704 759
    Austria                              (time) int64 0 0 0 ... 8788 9618 10180
    Azerbaijan 

In [10]:
ds_countries['Bangladesh']

<xarray.DataArray 'Bangladesh' (time: 70)>
array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  3,  3,  3,  3,  3,  3,  5,
        8, 10, 14, 17, 20, 25, 27, 33, 39, 39, 44, 48, 48, 48, 49, 51])
Dimensions without coordinates: time
Attributes:
    lat:            23.685
    lon:            90.3563
    state:          
    country:        Bangladesh
    era5_country:   Bangladesh
    country_index:  99