-
Notifications
You must be signed in to change notification settings - Fork 218
/
load_household_data.py
38 lines (30 loc) · 1.58 KB
/
load_household_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
Load country household size from UN data source.
Returns dictionary with country name as key and average household size as value,
for insertion in covasim/data.
"""
import numpy as np
import pandas as pd
import covasim as cv
url = 'https://population.un.org/household/exceldata/population_division_UN_Houseshold_Size_and_Composition_2019.xlsx'
df_raw: pd.DataFrame = pd.read_excel(url, sheet_name='UN HH Size and Composition 2019', skiprows=4)
assert len(df_raw) > 1, "Loaded UN Household size."
# Select and rename columns
target_columns = ['Country or area', 'Reference date (dd/mm/yyyy)', 'Average household size (number of members)']
df: pd.DataFrame = df_raw[target_columns].copy()
df.columns = ['country', 'date', 'size']
# Convert date column to datetime type and replace nodata with NA.
df['date'] = df['date'].apply(lambda d: pd.to_datetime(d, format='%d/%m/%Y'))
df['size'] = df['size'].apply(lambda s: np.nan if isinstance(s, str) and s == '..' else s)
df = df.dropna()
# Take the most recent household size for each country.
df = df.sort_values(by=['country', 'date']).groupby(by=['country']).last()[['size']]
un_country_households_dict = df.to_dict()['size'].copy()
# Add to the dictionary commonly used aliases like USA for United States of America.
us_countries = [k.lower() for k in un_country_households_dict]
country_mappings_dict = cv.data.loaders.get_country_aliases()
for alias, name in country_mappings_dict.items():
if name.lower() in us_countries:
un_country_households_dict[alias] = un_country_households_dict[name]
# Copy this into
print(un_country_households_dict)