# Devlog 2023-07-07

Re-creating the us_states and us_counties GEOs as part of a data reconciliation effort.

NOTE: running this notebook requires a US Census API key, exposed to the kernel as the `CENSUS_API_KEY` environment variable. (A `.env` file will work!)

In [1]:
import dataclasses
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pygris
from census import Census

from epymorph.data_shape import Shapes
from epymorph.data_type import CentroidDType
from epymorph.error import GeoValidationException
from epymorph.geo.spec import LABEL, StaticGeoSpec, Year
from epymorph.geo.static import StaticGeo
from epymorph.geo.static import StaticGeoFileOps as F
from epymorph.geography.us_census import StateScope
from epymorph.simulation import AttributeDef

# Data year used for every query below, plus the node counts that later cells use
# for sanity checks.
YEAR = 2015
NUM_STATES = 52
NUM_COUNTIES = 3220

# Geographic scopes: all states for YEAR, and the next level down from that scope
# (presumably counties, per the variable name -- confirm against StateScope).
state_scope = StateScope.all(year=YEAR)
county_scope = state_scope.lower_granularity()

# Both state and county geo will have the same attributes, just different scope.
geo_attributes = [
    LABEL,
    AttributeDef('geoid', str, Shapes.N),
    AttributeDef('centroid', CentroidDType, Shapes.N),
    AttributeDef('population', int, Shapes.N),
    AttributeDef('commuters', int, Shapes.NxN),
]

state_spec = StaticGeoSpec(
    attributes=geo_attributes,
    scope=state_scope,
    time_period=Year(YEAR),
)

# The county spec is a copy of the state spec with only the scope swapped out.
county_spec = dataclasses.replace(state_spec, scope=county_scope)

# Initialize Census API (raises KeyError if CENSUS_API_KEY is not set)
census = Census(os.environ['CENSUS_API_KEY'])

## States data

In [2]:
# Request NAME and B01003_001E (renamed to 'population' further down) for every
# state from the ACS 5-year dataset.
state_query = {'for': 'state: *'}
state_data = census.acs5.get(
    ('NAME', 'B01003_001E'),
    state_query,
    year=YEAR,
)

# State geometries for the same year; used later to compute centroids.
state_geog = pygris.states(year=YEAR, cache=True)

In [3]:
# NOTE: this commflows data is also needed for counties... might as well just load it once.

# Every column is read as a string so FIPS codes keep their leading zeros.
commflows_raw = pd.read_excel(
    'https://www2.census.gov/programs-surveys/demo/tables/metro-micro/2015/commuting-flows-2015/table1.xlsx',
    skiprows=6,
    skipfooter=2,
    names=[
        "res_state_fips",
        "res_county_fips",
        "res_state",
        "res_county",
        "wrk_state_fips",
        "wrk_county_fips",
        "wrk_state",
        "wrk_county",
        "workers",
        "moe",
    ],
    dtype=str
)
# non-US destinations will have NaN in either state or county fips column (or both)
# filter those out, then reformat work state FIPS to two digits.
# Done as one chain (dropna + assign) so we never assign a column onto a filtered
# frame, which is what triggers pandas' SettingWithCopyWarning.
commflows_raw = (
    commflows_raw
    .dropna()
    .assign(wrk_state_fips=lambda df: df['wrk_state_fips'].str.slice(1))
)

# Worker counts as integers; shared by the county-level and state-level tables.
workers = commflows_raw['workers'].astype(np.int64)

# No further NaN filtering is needed below: every row with a missing value was
# dropped above, and concatenating non-missing strings cannot produce NaN.
commflows_by_county = pd.DataFrame({
    'res_geoid': commflows_raw['res_state_fips'] + commflows_raw['res_county_fips'],
    'wrk_geoid': commflows_raw['wrk_state_fips'] + commflows_raw['wrk_county_fips'],
    'workers': workers,
})

# county GEOID -> index
# Sorted explicitly so the index order matches data sorted by geoid, instead of
# depending on the row order of the spreadsheet.
county_geoids = {geoid: i for i, geoid
                 in enumerate(sorted(commflows_by_county['res_geoid'].unique()))}

# Collapse county-to-county flows into state-to-state totals.
commflows_by_state = pd.DataFrame({
    'res_geoid': commflows_raw['res_state_fips'],
    'wrk_geoid': commflows_raw['wrk_state_fips'],
    'workers': workers,
}).groupby(['res_geoid', 'wrk_geoid'], as_index=False).sum()

# state GEOID -> index
state_geoids = {geoid: i for i, geoid
                in enumerate(sorted(commflows_by_state['res_geoid'].unique()))}

In [4]:
# Get label, geoid, and population
d = (
    pd.DataFrame.from_records(state_data)
    .astype({
        'NAME': np.str_,
        'B01003_001E': np.int64,
        'state': np.str_,
    })
    .rename(columns={
        'NAME': 'label',
        'B01003_001E': 'population',
        'state': 'geoid'
    })
    .sort_values(by='geoid')
    .reset_index(drop=True)
)

# Get centroid and merge
# (inner merge: a state without a geometry record is dropped, which the count
# check at the bottom of this cell will then report)
d = d.merge(pd.DataFrame({
    'geoid': state_geog['GEOID'],
    'centroid': state_geog['geometry'].apply(lambda geom: geom.centroid.coords[0])
}), on='geoid')

# Get commuters
# Matrix positions come from the row order of `d` itself, so row/column i of the
# commuter matrix is guaranteed to describe the same state as label[i], geoid[i]
# and population[i].
state_index = {geoid: i for i, geoid in enumerate(d['geoid'])}
res_idx = commflows_by_state['res_geoid'].map(state_index)
wrk_idx = commflows_by_state['wrk_geoid'].map(state_index)

# Series.map leaves NaN where a GEOID has no entry in `d`; fail loudly rather than
# silently writing a misaligned or incomplete matrix.
unmatched = commflows_by_state[res_idx.isna() | wrk_idx.isna()]
if not unmatched.empty:
    unknown_geoids = sorted(
        (set(unmatched['res_geoid']) | set(unmatched['wrk_geoid'])) - set(state_index)
    )
    raise ValueError(
        f"{len(unmatched)} commuting flows reference state GEOIDs "
        f"missing from the census data: {unknown_geoids}"
    )

# Sized from the data so the matrix always agrees with the other attributes.
c = np.zeros(shape=(len(d), len(d)), dtype=np.int64)
c[
    res_idx.to_numpy(dtype=np.int64),
    wrk_idx.to_numpy(dtype=np.int64),
] = commflows_by_state['workers'].to_numpy(dtype=np.int64)

states_values = {
    'label': d['label'].to_numpy(dtype=np.str_),
    'geoid': d['geoid'].to_numpy(dtype=np.str_),
    'centroid': d['centroid'].to_numpy(dtype=CentroidDType),
    'population': d['population'].to_numpy(dtype=np.int64),
    'commuters': c,
}

num_states = len(states_values['label'])
if num_states != NUM_STATES:
    print(f"WARNING! Unexpected number of states ({num_states})!")

In [5]:
# Build the geo from the spec and values, validate it, and write it to the archive.
# A validation failure is printed rather than raised, and nothing is saved.
geofile = Path('epymorph/data/geo', F.to_archive_filename('us_states_2015'))
try:
    states_geo = StaticGeo(state_spec, states_values)
    states_geo.validate()
    states_geo.save(geofile)
except GeoValidationException as err:
    print(err.pretty())

In [6]:
# Verify that we can load the file back.
# `geofile` is whichever path was assigned most recently (it is reassigned for the
# counties file further down), so run this right after the states save cell.
F.load_from_archive(geofile)

<epymorph.geo.static.StaticGeo at 0x7f0c8a6dc390>

## County data

In [7]:
# Request NAME and B01003_001E (renamed to 'population' further down) for every
# county from the ACS 5-year dataset.
county_query = {'for': 'county: *'}
county_data = census.acs5.get(
    ('NAME', 'B01003_001E'),
    county_query,
    year=YEAR,
)

# County geometries for the same year; used later to compute centroids.
county_geog = pygris.counties(year=YEAR, cache=True)

In [8]:
# Get label, geoid, and population
# The county geoid is the state code followed by the county code.
d = (
    pd.DataFrame.from_records(county_data)
    .astype({
        'NAME': np.str_,
        'B01003_001E': np.int64,
    })
    .assign(geoid=lambda df: df['state'] + df['county'])
    .drop(columns=['state', 'county'])
    .rename(columns={
        'B01003_001E': 'population',
        'NAME': 'label'
    })
    .sort_values(by='geoid')
    .reset_index(drop=True)
)

# Get centroid and merge
# (inner merge: a county without a geometry record is dropped, which the count
# check at the bottom of this cell will then report)
d = d.merge(pd.DataFrame({
    'geoid': county_geog['GEOID'],
    'centroid': county_geog['geometry'].apply(lambda geom: geom.centroid.coords[0])
}), on='geoid')

# Get commuters
# Matrix positions come from the row order of `d` itself, so row/column i of the
# commuter matrix is guaranteed to describe the same county as label[i], geoid[i]
# and population[i].
county_index = {geoid: i for i, geoid in enumerate(d['geoid'])}
res_idx = commflows_by_county['res_geoid'].map(county_index)
wrk_idx = commflows_by_county['wrk_geoid'].map(county_index)

# Series.map leaves NaN where a GEOID has no entry in `d`; fail loudly rather than
# silently writing a misaligned or incomplete matrix.
unmatched = commflows_by_county[res_idx.isna() | wrk_idx.isna()]
if not unmatched.empty:
    unknown_geoids = sorted(
        (set(unmatched['res_geoid']) | set(unmatched['wrk_geoid'])) - set(county_index)
    )
    raise ValueError(
        f"{len(unmatched)} commuting flows reference county GEOIDs "
        f"missing from the census data: {unknown_geoids}"
    )

# Sized from the data so the matrix always agrees with the other attributes.
# A single vectorized assignment replaces the per-row Python loop.
c = np.zeros(shape=(len(d), len(d)), dtype=np.int64)
c[
    res_idx.to_numpy(dtype=np.int64),
    wrk_idx.to_numpy(dtype=np.int64),
] = commflows_by_county['workers'].to_numpy(dtype=np.int64)

counties_values = {
    'label': d['label'].to_numpy(dtype=str),
    'geoid': d['geoid'].to_numpy(dtype=str),
    'centroid': d['centroid'].to_numpy(dtype=CentroidDType),
    'population': d['population'].to_numpy(dtype=np.int64),
    'commuters': c
}

num_counties = len(counties_values['label'])
if num_counties != NUM_COUNTIES:
    print(f"WARNING! Unexpected number of counties ({num_counties})!")

In [9]:
# Build the geo from the spec and values, validate it, and write it to the archive.
# A validation failure is printed rather than raised, and nothing is saved.
geofile = Path('epymorph/data/geo', F.to_archive_filename('us_counties_2015'))
try:
    counties_geo = StaticGeo(county_spec, counties_values)
    counties_geo.validate()
    counties_geo.save(geofile)
except GeoValidationException as err:
    print(err.pretty())

In [10]:
# Verify that we can load the file back.
# `geofile` is whichever path was assigned most recently, so run this right after
# the counties save cell.
F.load_from_archive(geofile)

<epymorph.geo.static.StaticGeo at 0x7f0c8f1db850>