In [30]:
%load_ext autoreload
%autoreload 2

import os
from datetime import date
from io import BytesIO
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile

import geopandas as gpd
import lxml.html as lh
import numpy as np
import pandas as pd
import pyproj
import requests
from census import Census

from src import utils as cutil
# Shorthand for building MultiIndex slicers, e.g. ``df.loc[idx['FRA', :], ...]``.
idx = pd.IndexSlice

# Date tag embedded in the names of the raw-data directories used below.
datestamp = '20200315'
# NOTE(review): absolute, user-specific path; it is not referenced anywhere else
# in the visible notebook -- confirm whether it is still needed.
sacredentials_fpath = '/Users/ianbolliger/service-accounts/bolliger32.json'

# Directory that holds the extracted Natural Earth admin-1 shapefile.
adm1_shp_dir = cutil.DATA_RAW / 'multi_country' / f'ne_10m_admin_1_states_provinces_{datestamp}'
# URL template for the per-country GADM 3.6 GeoPackage zip (``{iso3}`` is filled in later).
adm3_url_fmt = 'https://biogeo.ucdavis.edu/data/gadm3.6/gpkg/gadm36_{iso3}_gpkg.zip'
# File name template of the GeoPackage looked up by ``get_gpkg_path``.
adm3_gpkg_fmt = 'gadm36_{iso3}.gpkg'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
def get_gpkg_path(iso3):
    """Return the expected path of the GADM GeoPackage for country ``iso3``.

    The file name comes from ``adm3_gpkg_fmt`` and is placed inside the
    directory returned by ``cutil.get_adm3_dir(iso3, datestamp)``.
    """
    country_dir = cutil.get_adm3_dir(iso3, datestamp)
    gpkg_name = adm3_gpkg_fmt.format(iso3=iso3)
    return country_dir / gpkg_name

def download_and_extract(url,out_dir):
    """Download the zip archive at ``url`` and extract it into ``out_dir``.

    The download is skipped when ``out_dir`` already exists *and* contains at
    least one entry, so repeated runs reuse the local copy. An existing but
    empty directory (for example one that was pre-created) is not mistaken for
    a finished download.

    Parameters
    ----------
    url : str
        Location of the zip archive (anything ``urllib.request.urlopen`` accepts).
    out_dir : str or pathlib.Path
        Target directory; created by the extraction if it does not exist.
    """
    out_dir = Path(out_dir)
    if out_dir.is_dir() and any(out_dir.iterdir()):
        return

    # ZipFile needs a seekable file object, so buffer the whole payload in
    # memory; the HTTP response is closed before extraction starts.
    with urlopen(url) as zipresp:
        payload = BytesIO(zipresp.read())
    with ZipFile(payload) as zfile:
        zfile.extractall(out_dir)
            
def process_gadm(in_gdf, iso3):
    """Reduce a raw GADM layer to names, geometry and centroid coordinates.

    Keeps ``NAME_1``, ``NAME_2``, ``geometry`` (plus ``NAME_3`` when the layer
    has it), renames the name columns to ``adm1_name``/``adm2_name``/
    ``adm3_name``, adds ``latitude``/``longitude`` taken from the geometry
    centroids, sets ``adm0_name`` to ``iso3`` and indexes the result by all
    available ``adm*_name`` levels.

    Raises ``KeyError`` if ``NAME_1`` or ``NAME_2`` is missing from ``in_gdf``.
    """
    has_adm3 = 'NAME_3' in in_gdf.columns

    keep = ['NAME_1', 'NAME_2', 'geometry']
    new_names = {
        'NAME_0': 'adm0_name',
        'NAME_1': 'adm1_name',
        'NAME_2': 'adm2_name',
    }
    index_levels = ['adm0_name', 'adm1_name', 'adm2_name']
    if has_adm3:
        keep.append('NAME_3')
        new_names['NAME_3'] = 'adm3_name'
        index_levels.append('adm3_name')

    out = in_gdf[keep].rename(columns=new_names)

    centroids = out['geometry'].centroid
    out = out.assign(
        latitude=centroids.y,
        longitude=centroids.x,
        adm0_name=iso3,
    )

    return out.set_index(index_levels)

## Global adm1

In [32]:
# get file
# Zip archive of the Natural Earth 10m admin-1 states/provinces shapefile.
adm1_url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip'
# No-op when adm1_shp_dir already exists (see download_and_extract).
download_and_extract(adm1_url, adm1_shp_dir)

In [33]:
# process
# Read the extracted shapefile directory and keep only the columns used below.
in_gdf = gpd.read_file(adm1_shp_dir)
adm_gdf = in_gdf[['adm0_a3','name','geometry', 'latitude','longitude','gadm_level']]
# gadm_level is put in the index temporarily so that duplicates are detected
# per (country, name, level) combination.
adm_gdf = adm_gdf.rename(columns={
    'adm0_a3':'adm0_name',
    'name': 'adm1_name'
}).set_index(['adm0_name','adm1_name','gadm_level'])

# for now, when there are duplicates, just drop the second one without any better information
# could not find a data dictionary for the shapefile
# (gadm_level is moved back to a regular column afterwards)
adm_gdf = adm_gdf[~adm_gdf.index.duplicated(keep='first')].reset_index(drop=False, level='gadm_level')

# we know france is actually admin 2
adm_gdf.loc[idx['FRA',:],'gadm_level']=2

# separate into levels
# (rows whose gadm_level is neither 1 nor 2 end up in neither table)
adm1_gdf = adm_gdf[adm_gdf.gadm_level==1].drop(columns='gadm_level')
adm2_gdf = adm_gdf[adm_gdf.gadm_level==2].drop(columns='gadm_level')
# For the level-2 rows the name level is really an adm2 name, so relabel it;
# at this point adm2_gdf is indexed by (adm0_name, adm2_name) only.
adm2_gdf.index = adm2_gdf.index.set_names('adm2_name',level='adm1_name')

# Set up an adm3 dataset that is currently empty
# It reuses adm2_gdf's columns and CRS, then gains the two missing name
# columns so that it can be indexed by all four adm levels.
adm3_gdf = gpd.GeoDataFrame(columns = adm2_gdf.reset_index(drop=False).columns, crs=adm_gdf.crs)
adm3_gdf['adm3_name'] = []
adm3_gdf['adm1_name'] = []
adm3_gdf = adm3_gdf.set_index(['adm0_name','adm1_name','adm2_name','adm3_name'])

## adm2+

### FRA

In [35]:
adm2_fr_fpath = cutil.DATA / 'interim' / 'france' / 'departement_info.dta'
adm2_fr = pd.read_stata(adm2_fr_fpath, index_col='departement_name', columns=['departement_name','adm1_name','cheflieu','densitehabitantskm2', 'superficiekmâ', 'population'])
adm2_fr.index = adm2_fr.index.str.encode('ISO-8859-1').str.decode('utf-8')
adm2_fr.cheflieu = adm2_fr.cheflieu.str.encode('ISO-8859-1').str.decode('utf-8')
adm2_fr.index.name = 'adm2_name'
adm2_fr = adm2_fr.rename(columns={
    "cheflieu": "capital",
    "densitehabitantskm2":"pop_density_km2",
    "superficiekmâ": "area_km2"
})

# manually correct some differences in naming btwn 2 datasets
# Keys are labels currently found in adm2_gdf's adm2_name level, values are the
# labels they are renamed to (presumably the spelling used in adm2_fr's index).
# NOTE(review): some entries look like typos on one side or the other
# ("Haute-Rhin", "Seien-et-Marne", "Vandée") -- confirm they mirror the actual
# spellings in the two datasets before "fixing" them.
name_map = {
    "Guyane française": "Guyane",
    "Haute-Rhin": "Haut-Rhin",
    "Vendée": "Vandée",
    "Côtes-d'Armor": "Côtes d'Armor",
    "Seine-Saint-Denis": "Seine-St-Denis",
    "Val-d'Oise": "Val-D'Oise",
    "Seien-et-Marne": "Seine-et-Marne"
}
# The rename is applied to every row of adm2_gdf, not only to French ones.
adm2_gdf = adm2_gdf.rename(index=name_map, level='adm2_name')

# merge back in
# Outer join on the adm2_name index level, so rows present in only one of the
# two tables are kept; adm1_name comes from adm2_fr and becomes an index level.
adm2_gdf = adm2_gdf.join(adm2_fr, on='adm2_name', how='outer').reset_index(drop=False).set_index(['adm0_name','adm1_name','adm2_name'])

### Others

All of these come from the same source (GADM 3.6), but:
- some work with the gpkg file, others with the shapefile
- some are adm3, some are adm2

In [36]:
isos = ['ITA','USA','CHN','KOR','IRN']

for iso3 in isos:
    # get_adm3_dir is used with the `/` operator elsewhere, so it is presumably
    # already a pathlib.Path; wrapping it is harmless either way.
    adm_dir = Path(cutil.get_adm3_dir(iso3, datestamp))
    gpkg_url = adm3_url_fmt.format(iso3=iso3)

    # sometimes the shapefile has finer resolution, sometimes the gpkg file does...
    try:
        download_and_extract(gpkg_url, adm_dir)
        in_gdf = process_gadm(gpd.read_file(get_gpkg_path(iso3)), iso3)
    except KeyError:
        # process_gadm raises KeyError when the layer that was read lacks
        # NAME_1/NAME_2; fall back to the shapefile distribution.
        # download_and_extract cannot be reused here: it skips the download
        # whenever the target directory exists, and the gpkg attempt above has
        # already created it. So check for the shapefiles themselves instead.
        if not any(adm_dir.glob('*.shp')):
            with urlopen(gpkg_url.replace('gpkg','shp')) as zipresp:
                with ZipFile(BytesIO(zipresp.read())) as zfile:
                    zfile.extractall(adm_dir)
        in_gdf = process_gadm(gpd.read_file(adm_dir), iso3)

    if 'adm3_name' in in_gdf.index.names:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        adm3_gdf = pd.concat([adm3_gdf, in_gdf], sort=False)

        # now aggregate to level 2 to insert into that level
        in_gdf = in_gdf.dissolve(by=['adm0_name','adm1_name','adm2_name'])
        cent = in_gdf.geometry.centroid
        in_gdf['latitude'] = cent.y
        in_gdf['longitude'] = cent.x

    # insert into level 2 dataset
    # (the country must not already be present, otherwise rows would be duplicated)
    assert iso3 not in adm2_gdf.index.get_level_values('adm0_name').unique()
    adm2_gdf = pd.concat([adm2_gdf, in_gdf], sort=False)

    # now aggregate to level 1 to replace that level with better/more consistent data
    in_gdf = in_gdf.dissolve(by=['adm0_name','adm1_name'])
    cent = in_gdf.geometry.centroid
    in_gdf['latitude'] = cent.y
    in_gdf['longitude'] = cent.x
    adm1_gdf = adm1_gdf[adm1_gdf.index.get_level_values('adm0_name')!=iso3]
    adm1_gdf = pd.concat([adm1_gdf, in_gdf], sort=False)

## Manual name adjustments

Some manual adjustments so that the names match those used in the data produced by the country teams.

### ITA

In [38]:
# adm1 labels: {label in the adm tables -> label to rename it to}
region_dict = {
    'Emilia-Romagna': 'Emilia Romagna',
    'Friuli-Venezia Giulia': 'Friuli Venezia Giulia',
    'Apulia': 'Puglia',
    'Sicily': 'Sicilia',
}

# extra adm1 entries to add, each with one adm2 entry named after it
# (the adm1 label without its 'P.A. ' prefix)
add_regions = ['P.A. Bolzano', 'P.A. Trento']
add_regions_prov = [region.replace('P.A. ', '') for region in add_regions]

# adm2 labels: {label in the adm tables -> label to rename it to}
province_dict = {
    "Forli' - Cesena": 'Forlì-Cesena',
    "Reggio Nell'Emilia": "Reggio nell'Emilia",
    "Padua": "Padova",
    "Reggio Di Calabria": "Reggio di Calabria",
    "Pesaro E Urbino": "Pesaro e Urbino",
    "Syracuse": "Siracusa",
    "Florence": "Firenze",
    "Mantua": "Mantova",
    "Monza and Brianza": "Monza e della Brianza",
}

# extra adm2 entries to add, paired by position with the adm1 they belong to
add_provinces = ["Sud Sardegna"]
add_provinces_reg = ['Sardegna']

In [39]:
# add new regions/provinces
# Both frames have no data columns: they only carry the
# (adm0_name, adm1_name, adm2_name) index of the entries to add.
n_reg = len(add_regions)
new_reg = pd.DataFrame({
    'adm0_name': ['ITA'] * n_reg,
    'adm1_name': add_regions,
    'adm2_name': add_regions_prov,
}).set_index(['adm0_name', 'adm1_name', 'adm2_name'])

n_prov = len(add_provinces)
new_prov = pd.DataFrame({
    'adm0_name': ['ITA'] * n_prov,
    'adm1_name': add_provinces_reg,
    'adm2_name': add_provinces,
}).set_index(['adm0_name', 'adm1_name', 'adm2_name'])

In [40]:
# Harmonize adm1 labels in every table that carries an adm1_name level ...
adm1_gdf = adm1_gdf.rename(index=region_dict, level='adm1_name')
adm2_gdf = adm2_gdf.rename(index=region_dict, level='adm1_name')
adm3_gdf = adm3_gdf.rename(index=region_dict, level='adm1_name')
# ... and adm2 labels in the tables that carry an adm2_name level.
adm2_gdf = adm2_gdf.rename(index=province_dict, level='adm2_name')
adm3_gdf = adm3_gdf.rename(index=province_dict, level='adm2_name')

# Add the extra entries as attribute-less rows (all columns NaN).
for new_rows in [new_reg,new_prov]:
    new_adm1 = new_rows.reset_index(level='adm2_name',drop=True)
    # A new province may belong to a region that is already in adm1_gdf;
    # only add regions that are genuinely missing so that no duplicate,
    # all-NaN copy of an existing region is created.
    new_adm1 = new_adm1[~new_adm1.index.isin(adm1_gdf.index)]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    adm1_gdf = pd.concat([adm1_gdf, new_adm1], sort=False)
    adm2_gdf = pd.concat([adm2_gdf, new_rows], sort=False)

## Pop

### US

In [43]:
# The Census API key is read from the CENSUS_API_KEY environment variable so
# that it is not committed with the notebook. The key that used to be
# hard-coded here should be treated as leaked and revoked.
# The value is None when the variable is not set.
census_apikey = os.environ.get('CENSUS_API_KEY')

In [44]:
# Query the ACS 5-year endpoint for every place and every county in every
# state (Census.ALL for both arguments). 'B01003_001E' is the value that is
# later renamed to 'pop' -- presumably the ACS total-population estimate; confirm.
c = Census(census_apikey)
pop_city = pd.DataFrame(c.acs5.state_place(('NAME', 'B01003_001E'),Census.ALL,Census.ALL))
pop_cty = pd.DataFrame(c.acs5.state_county(('NAME', 'B01003_001E'),Census.ALL,Census.ALL))

#### Place-level

In [45]:
# save the place-level populations
pop_city[['adm3_name','adm_1_name']] = pd.DataFrame(pop_city.NAME.str.split(', ').values.tolist(), index= pop_city.index)
pop_city = pop_city.rename(columns={'B01003_001E':'pop'}).drop(columns='NAME')
pop_city = pop_city.set_index(['adm3_name','adm_1_name'])
pop_city.to_csv(cutil.DATA / 'interim' / 'usa' / 'adm3_pop.csv', index=True)

#### County-level

In [46]:
## get county-level populations
hasc_fips_url='http://www.statoids.com/yus.html'
#Create a handle, page, to handle the contents of the website
# (time out instead of hanging forever, and fail loudly on an HTTP error
# rather than trying to parse an error page)
page = requests.get(hasc_fips_url, timeout=60)
page.raise_for_status()

#Store the contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data
# The table is a block of preformatted text inside a <pre> element.
tr_elements = doc.xpath('//*[@id="yui-main"]/div/div/pre/text()[1]')
if not tr_elements:
    raise ValueError(f'Could not find the county table in {hasc_fips_url}; has the page layout changed?')
# drop the first and the last line of the block
row_list = tr_elements[0].split("\r\n")[1:-1]
headers = row_list[0].split()
# skip blank lines, header lines ("Name...") and separator lines ("----...")
valid_rows = [r for r in row_list if r != "" and r[:4] not in ["Name","----"]]
# Every remaining row is fixed-width; the slices below are character offsets.
name = [r[:23].rstrip() for r in valid_rows]
t = [r[23] for r in valid_rows]
hasc = [r[25:33] for r in valid_rows]
fips = [r[34:39] for r in valid_rows]
# numeric columns are right-aligned and use thousands separators
pop = [int(r[40:49].lstrip().replace(',','')) for r in valid_rows]
area_km2 = [int(r[50:57].lstrip().replace(',','')) for r in valid_rows]
area_mi2 = [int(r[58:65].lstrip().replace(',','')) for r in valid_rows]
z = [r[66] for r in valid_rows]
capital = [r[68:] for r in valid_rows]

# turn into dataframe
# (indexed by HASC code so it can be joined onto the GADM HASC_2 column;
# area_mi2 and z are parsed above but not carried over)
us_county_df = pd.DataFrame({
    'name': name,
    'type': t,
    'hasc': hasc,
    'fips': fips,
    'population': pop,
    'area_km2': area_km2,
    'capital': capital
}).set_index('hasc')

##### Merge in us adm2 dataset

In [47]:
# Load the USA GeoPackage and keep only the rows that carry a HASC_2 code,
# which is the key used to join the county table.
us_gdf = gpd.read_file(get_gpkg_path('USA'))
us_gdf = us_gdf.loc[us_gdf['HASC_2'].notna()]

In [48]:
# Attach the scraped county attributes by HASC code, keep the name and
# attribute columns, derive the density and index like the global adm2 table.
us_pops = (
    us_gdf
    .join(us_county_df, on='HASC_2', how='left')
    [['NAME_1','NAME_2','fips','population','area_km2','capital']]
    .rename(columns={'NAME_1':'adm1_name','NAME_2':'adm2_name'})
    .assign(
        pop_density_km2=lambda df: df['population'] / df['area_km2'],
        adm0_name='USA',
    )
    .set_index(['adm0_name','adm1_name','adm2_name'])
)

##### Merge back into global adm datasets

Doing this for France as well, because we haven't merged the adm2 populations into adm1 for France yet either.

In [49]:
# Fill gaps in adm2_gdf from us_pops (aligned on index and column labels).
adm2_gdf = adm2_gdf.fillna(us_pops)
# Sum adm2 populations per (adm0, adm1); min_count=1 keeps the total NaN when
# no adm2 population is available for that adm1 unit.
st_pops = adm2_gdf['population'].groupby(level=['adm0_name','adm1_name']).sum(min_count=1)
adm1_gdf['population'] = st_pops

### ITA

This pop data is copied directly from `GPL_covid/data/raw/italy/italy_policy.gsheet:Population`, which is from Google Public Data. None of the pops are comprehensive for that administrative level, so we will not be aggregating and applying to the higher-up level.

In [51]:
pop_str = """
adm1	Population (2018)	adm2	Population (2018)	adm3	Population (2018)
Lombardy	10036300	Lodi	229765	Bertonico	1127
Veneto	4905000	Padua	936740	Casalpusterlengo	15280
Marche	1531800	Pesaro and Urbino	360125	Castelgerundo	1489
Liguria	1557000	Savona	277810	Castiglione d'Adda	4651
Piedmont	4375900	Alessandria	424174	Codogno	15901
Emilia-Romagna	4452600	Asti	215884	Fombio	2325
Campania	5826900	Modena	701896	Maleo	3133
Sicilia	5027000	Novara	369595	San Fiorano	1841
Friuli Venezia Giulia	1215500	Parma	450256	Somaglia	3797
Abruzzo	1315200	Piacenza	286781	Terranova dei Passerini	918
Apulia	4048200	Reggio nell'Emilia	532575	Vo'Eugane	3341
		Rimini	337325	Taranto	198283
		Verbano-Cusio-Ossola	159159	Messina	234293
		Vercelli	172307		
		Napoli	3101000		
		Palermo	1260200		
		Taranto	580319		
		Messina	631297		
"""

# adm2 labels used in pop_str -> labels they are replaced with before merging
ita_pop_2_maps = {
    'Pesaro and Urbino': 'Pesaro e Urbino',
    'Padua': 'Padova',
}

# adm1 labels used in pop_str -> labels they are replaced with before merging
ita_pop_1_maps = {
    'Lombardy': 'Lombardia',
    'Piedmont': 'Piemonte',
    'Emilia-Romagna': 'Emilia Romagna',
    'Apulia': 'Puglia',
}

In [52]:
# Split the tab-separated block into rows of fields; the first row is the header.
# All three 'Population (2018)' columns are renamed, so pop_df has three
# columns called 'population' and is sliced by position below.
pop_data = [line.split("\t") for line in pop_str.strip().split("\n")]
pop_df = pd.DataFrame(data=pop_data[1:], columns=pop_data[0])
pop_df = pop_df.rename(columns={'Population (2018)':'population'})

#### adm3

In [53]:
# Last two columns of pop_df: adm3 name and its population.
ita_pop_3 = pop_df.iloc[:,4:]
ita_pop_3 = (
    # keep only rows where both fields are present and non-empty
    ita_pop_3[(ita_pop_3.notnull() & (ita_pop_3 != '')).all(axis=1)]
    .rename(columns={'adm3':'adm3_name'})
    .assign(
        population=lambda df: df.population.astype(int),
        adm0_name='ITA',
    )
    .set_index(['adm0_name','adm3_name'])
)

In [54]:
# Align two adm3 spellings with the ones used in the population table.
adm3_gdf = adm3_gdf.rename(lambda x: x.replace(
    "d' Adda", "d'Adda").replace(
    "Terranova Dei Passerini", "Terranova dei Passerini"), level='adm3_name')

# these two municipalities merged
merged_names = ['Cavacurta','Camairago']
castel = gpd.GeoDataFrame(adm3_gdf.loc[idx[:,:,:,merged_names],['geometry']]).dissolve(by=['adm0_name','adm1_name','adm2_name'])
castel['adm3_name'] = ['Castelgerundo']
castel_centroid = castel.geometry.centroid
castel['latitude'] = castel_centroid.y
castel['longitude'] = castel_centroid.x
castel = castel.set_index('adm3_name', append=True)
# replace the two old rows by the merged one
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
adm3_gdf = pd.concat(
    [adm3_gdf[~adm3_gdf.index.get_level_values('adm3_name').isin(merged_names)], castel],
    sort=False,
)

# this municipality not in dataset
# The adm2 level of adm3_gdf has already been renamed with province_dict
# ('Padua' -> 'Padova'), so the placeholder row must use 'Padova' to sit in the
# same province as its neighbours.
# NOTE(review): "Vo'Eugane" is kept as-is because it is the join key used in
# the population table.
adm3_gdf.loc[idx['ITA','Veneto','Padova',"Vo'Eugane"],:] = pd.Series({
    'geometry': None,
    'latitude': np.nan,
    'longitude': np.nan
})

In [56]:
# Attach the ITA adm3 populations, matching on the adm0_name and adm3_name
# index levels (ita_pop_3 is indexed by exactly those two). how='outer' keeps
# rows from both sides.
# NOTE(review): an adm3 name that occurs in more than one province would
# receive the same population in each -- confirm that names are unique enough.
adm3_gdf = adm3_gdf.join(ita_pop_3, on=['adm0_name','adm3_name'],how='outer')

#### adm2

In [57]:
# Middle two columns of pop_df: adm2 name and its population.
ita_pop_2 = pop_df.iloc[:,2:4]
ita_pop_2 = (
    ita_pop_2[ita_pop_2.notnull().all(axis=1)]
    .rename(columns={'adm2':'adm2_name'})
    .assign(
        population=lambda df: df.population.astype(int),
        adm0_name='ITA',
        # translate the few names that are spelled differently in the adm tables
        adm2_name=lambda df: df.adm2_name.apply(lambda name: ita_pop_2_maps.get(name, name)),
    )
    .set_index(['adm0_name','adm2_name'])
    .population
)

In [58]:
# ita_pop_2 is indexed by (adm0_name, adm2_name) only, so adm1_name is moved
# out of the index first to let fillna align on those two levels.
new_pop = pd.DataFrame(adm2_gdf.reset_index(level='adm1_name', drop=False).population.fillna(ita_pop_2))
# Re-attach adm1_name by position (row order is unchanged from adm2_gdf) and
# rebuild the three-level index.
new_pop['adm1_name'] = adm2_gdf.index.get_level_values('adm1_name')
new_pop = new_pop.reset_index(drop=False).set_index(['adm0_name','adm1_name','adm2_name'])
# NOTE(review): new_pop is a one-column DataFrame rather than a Series; this
# relies on pandas accepting it for a single-column assignment -- confirm.
adm2_gdf.population = new_pop

In [59]:
# NOTE(review): this cell looks redundant. The next cell rebuilds ita_pop_1
# from pop_df from scratch (adding the adm0_name level), and nothing in between
# reads the value produced here -- consider deleting this cell.
# First two columns of pop_df: adm1 name and its population.
ita_pop_1 = pop_df.iloc[:,:2]
# drop the rows in which both fields are empty strings
ita_pop_1 = ita_pop_1[(ita_pop_1!="").any(axis=1)].rename(columns={'adm1':'adm1_name'})
ita_pop_1.population = ita_pop_1.population.astype(int)
# translate the names that have an entry in ita_pop_1_maps
ita_pop_1.adm1_name = ita_pop_1.adm1_name.apply(lambda x: ita_pop_1_maps[x] if x in ita_pop_1_maps.keys() else x)
ita_pop_1 = ita_pop_1.set_index('adm1_name')

#### adm1

In [60]:
# First two columns of pop_df: adm1 name and its population.
ita_pop_1 = pop_df.iloc[:,:2]
ita_pop_1 = (
    # drop the rows in which both fields are empty strings
    ita_pop_1[(ita_pop_1 != "").any(axis=1)]
    .rename(columns={'adm1':'adm1_name'})
    .assign(
        population=lambda df: df.population.astype(int),
        # translate the names that are spelled differently in the adm tables
        adm1_name=lambda df: df.adm1_name.apply(lambda name: ita_pop_1_maps.get(name, name)),
        adm0_name='ITA',
    )
    .set_index(['adm0_name','adm1_name'])
)

# Only fill adm1 populations that are still missing.
adm1_gdf['population'] = adm1_gdf['population'].fillna(ita_pop_1['population'])

## Area

In [63]:
def finishing_touches(df, area_crs='EPSG:6933'):
    """Fill in area, population density and centroid coordinates, then sort.

    Existing values always win; only missing ones are filled:

    * ``area_km2`` -- polygon area measured after reprojecting to ``area_crs``
    * ``pop_density_km2`` -- ``population / area_km2``
    * ``longitude`` / ``latitude`` -- centroid of the geometry

    Parameters
    ----------
    df : geopandas.GeoDataFrame
        Needs a CRS plus ``geometry``, ``population``, ``latitude`` and
        ``longitude`` columns. It is not modified.
    area_crs : str, optional
        Projected CRS used to measure area. It must be an *equal-area*
        projection; the default, EPSG:6933, is a global cylindrical equal-area
        grid. World Mercator (EPSG:3395), used previously, is conformal rather
        than equal-area and overstates areas increasingly with latitude.

    Returns
    -------
    geopandas.GeoDataFrame
        A completed copy of ``df``, sorted by index.
    """
    # work on a copy so the caller's frame is left untouched
    df = df.copy()

    # area (rows without a geometry cannot be reprojected and stay NaN)
    area_km2_geom = df[df.geometry.notna()].to_crs(area_crs).geometry.area / 1e6
    if 'area_km2' in df.columns:
        df['area_km2'] = df['area_km2'].fillna(area_km2_geom)
    else:
        df['area_km2'] = area_km2_geom

    # pop density
    pop_density = df['population'].astype(float) / df['area_km2']
    if 'pop_density_km2' in df.columns:
        df['pop_density_km2'] = df['pop_density_km2'].fillna(pop_density)
    else:
        df['pop_density_km2'] = pop_density

    # lat/lon
    centroids = df.geometry.centroid
    df['longitude'] = df['longitude'].fillna(centroids.x)
    df['latitude'] = df['latitude'].fillna(centroids.y)

    return df.sort_index()

# Complete and sort each administrative level in turn.
adm1_gdf, adm2_gdf, adm3_gdf = (
    finishing_touches(level_gdf) for level_gdf in (adm1_gdf, adm2_gdf, adm3_gdf)
)

## Save

In [65]:
# Write every level twice: as a shapefile (with geometry) and as a plain CSV
# (without geometry), each into its own adm<level> directory.
for level, level_gdf in enumerate([adm1_gdf,adm2_gdf,adm3_gdf], start=1):
    stem = f'adm{level}'
    target_dir = cutil.DATA_INTERIM / 'adm' / stem
    target_dir.mkdir(parents=True, exist_ok=True)
    level_gdf.to_file(target_dir / f'{stem}.shp', index=True)
    level_gdf.drop(columns='geometry').to_csv(target_dir / f'{stem}.csv', index=True)