# Devlog 2023-07-07

Re-creating the us_states and us_counties GEOs as part of a data reconciliation effort.

In [1]:
import pandas as pd
import numpy as np
import pygris
from census import Census
from epymorph.geo import Attribute, CentroidDType, Schema, validate_schema

# Data year passed to both the ACS5 queries and the pygris geometry downloads.
YEAR = 2015


def _geo_schema(n: int) -> Schema:
    """Build the attribute schema for a GEO with `n` nodes.

    The county and state GEOs carry the same five attributes; only the node
    count, and therefore the expected array shapes, differs between them.
    """
    return {
        'label':      Attribute(np.str_, (n,)),
        'geoid':      Attribute(np.str_, (n,)),
        'centroid':   Attribute(CentroidDType, (n,)),
        'population': Attribute(np.int64, (n,)),
        'commuters':  Attribute(np.int64, (n, n)),
    }


# Number of nodes in the county GEO.
Nc = 3220
COUNTIES_SCHEMA: Schema = _geo_schema(Nc)

# Number of nodes in the state GEO.
Ns = 52
STATES_SCHEMA: Schema = _geo_schema(Ns)


def save_compressed_geo(id, data):
    """Write a GEO's attribute arrays to a compressed .npz archive.

    Args:
        id: identifier used to build the output file name.
        data: mapping of attribute name to array; each entry becomes a named
            array in the archive.

    Raises:
        Exception: if `data` has no 'label' entry. Nothing is written then.

    The archive is saved to ./epymorph/data/geo/{id}_geo.npz, relative to the
    current working directory.
    """
    if 'label' in data:
        np.savez_compressed(f"./epymorph/data/geo/{id}_geo.npz", **data)
        return
    raise Exception(
        f"Geo {id} must have a 'label' attribute in order to be saved and loaded."
    )

## Data fetching

In [2]:
import os

# Census API key: read from the CENSUS_API_KEY environment variable when set.
# NOTE(review): the fallback literal is a credential committed to source.
# It is kept only so this cell behaves as before when the variable is unset;
# rotate the key and remove the fallback.
census = Census(os.environ.get(
    'CENSUS_API_KEY',
    '6cf86640c2654b91dfa0418755fae477ccfe0be5',
))

# ACS5 variables requested for every geography: the place name and
# B01003_001E (used as the 'population' attribute further down).
acs_fields = ('NAME', 'B01003_001E')

# Counties: ACS5 records plus geometries from pygris.
county_query = {'for': 'county: *'}
county_data = census.acs5.get(acs_fields, county_query, year=YEAR)
county_geog = pygris.counties(year=YEAR, cache=True)

# States: ACS5 records plus geometries from pygris.
state_query = {'for': 'state: *'}
state_data = census.acs5.get(acs_fields, state_query, year=YEAR)
state_geog = pygris.states(year=YEAR, cache=True)

In [3]:
# Column names applied to the commuting-flows spreadsheet, in sheet order.
commflows_columns = [
    "res_state_fips",
    "res_county_fips",
    "res_state",
    "res_county",
    "wrk_state_fips",
    "wrk_county_fips",
    "wrk_state",
    "wrk_county",
    "workers",
    "moe",
]

commflows_raw = (
    pd.read_excel(
        'https://www2.census.gov/programs-surveys/demo/tables/metro-micro/2015/commuting-flows-2015/table1.xlsx',
        skiprows=6,
        skipfooter=2,
        names=commflows_columns,
        dtype=str,
    )
    # non-US destinations will have NaN in either state or county fips column
    # (or both); drop every row that has a NaN anywhere
    .dropna()
    # reformat work state FIPS to two digits by dropping its first character;
    # `assign` builds a new frame, so nothing is written into a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning)
    .assign(wrk_state_fips=lambda df: df['wrk_state_fips'].str.slice(1))
)

# Rows with NaN are already gone, so the columns derived below cannot contain
# NaN and need no further filtering.
workers = commflows_raw['workers'].astype(np.int64)

# County-to-county flows keyed by 5-character GEOID (state FIPS + county FIPS).
commflows_by_county = pd.DataFrame({
    'res_geoid': commflows_raw['res_state_fips'] + commflows_raw['res_county_fips'],
    'wrk_geoid': commflows_raw['wrk_state_fips'] + commflows_raw['wrk_county_fips'],
    'workers': workers,
})

# county GEOID -> index
# Sorted so the index follows ascending GEOID order regardless of the row
# order in the spreadsheet; the node tables are sorted the same way
# (`sort_values(by='geoid')`).
county_geoids = {geoid: i for i, geoid
                 in enumerate(sorted(commflows_by_county['res_geoid'].unique()))}

# State-to-state flows: sum the county-level rows for each
# (residence state, work state) pair.
commflows_by_state = (
    pd.DataFrame({
        'res_geoid': commflows_raw['res_state_fips'],
        'wrk_geoid': commflows_raw['wrk_state_fips'],
        'workers': workers,
    })
    .groupby(['res_geoid', 'wrk_geoid'], as_index=False)
    .sum()
)

# state GEOID -> index (ascending GEOID order, as for counties)
state_geoids = {geoid: i for i, geoid
                in enumerate(sorted(commflows_by_state['res_geoid'].unique()))}

## County GEO processing

In [4]:
# Get label, geoid, and population
# One row per county, sorted by GEOID (state FIPS + county FIPS).
d = (
    pd.DataFrame.from_records(county_data)
    .astype({'NAME': np.str_, 'B01003_001E': np.int64})
    .assign(geoid=lambda df: df['state'] + df['county'])
    .drop(columns=['state', 'county'])
    .rename(columns={'NAME': 'label', 'B01003_001E': 'population'})
    .sort_values(by='geoid')
    .reset_index(drop=True)
)

# Get centroid and merge
# Default (inner) merge on geoid: only geometries whose GEOID appears in `d`
# are kept.
centroids = pd.DataFrame({
    'geoid': county_geog['GEOID'],
    'centroid': county_geog['geometry'].apply(lambda geom: geom.centroid.coords[0]),
})
d = d.merge(centroids, on='geoid')

# Get commuters
# Index nodes by their row position in `d`, so the matrix axes line up with
# the label/geoid/centroid/population arrays by construction rather than
# relying on the row order of the commuting-flows spreadsheet. A flow that
# references a GEOID missing from `d` raises KeyError instead of being placed
# in the wrong cell.
node_index = {geoid: i for i, geoid in enumerate(d['geoid'])}
res_idx = [node_index[geoid] for geoid in commflows_by_county['res_geoid']]
wrk_idx = [node_index[geoid] for geoid in commflows_by_county['wrk_geoid']]

# c[i, j] = workers residing in node i and working in node j; pairs without a
# recorded flow stay 0. One indexed assignment replaces the per-row loop.
c = np.zeros(shape=(Nc, Nc), dtype=np.int64)
c[res_idx, wrk_idx] = commflows_by_county['workers'].to_numpy()

counties = {
    'label': d['label'].to_numpy(dtype=np.str_),
    'geoid': d['geoid'].to_numpy(dtype=np.str_),
    'centroid': d['centroid'].to_numpy(dtype=CentroidDType),
    'population': d['population'].to_numpy(dtype=np.int64),
    'commuters': c
}

validate_schema(COUNTIES_SCHEMA, counties)
save_compressed_geo('us_counties_2015', counties)

## State GEO processing

In [5]:
# Get label, geoid, and population
# One row per state, sorted by GEOID (the state FIPS code).
d = (
    pd.DataFrame.from_records(state_data)
    .astype({'NAME': np.str_, 'B01003_001E': np.int64, 'state': np.str_})
    .rename(columns={
        'NAME': 'label',
        'B01003_001E': 'population',
        'state': 'geoid',
    })
    .sort_values(by='geoid')
    .reset_index(drop=True)
)

# Get centroid and merge
# Default (inner) merge on geoid: only geometries whose GEOID appears in `d`
# are kept.
centroids = pd.DataFrame({
    'geoid': state_geog['GEOID'],
    'centroid': state_geog['geometry'].apply(lambda geom: geom.centroid.coords[0]),
})
d = d.merge(centroids, on='geoid')

# Get commuters
# Index nodes by their row position in `d`, so the matrix axes line up with
# the label/geoid/centroid/population arrays by construction rather than
# relying on the ordering of the commuting-flows table. A flow that
# references a GEOID missing from `d` raises KeyError instead of being placed
# in the wrong cell.
node_index = {geoid: i for i, geoid in enumerate(d['geoid'])}
res_idx = [node_index[geoid] for geoid in commflows_by_state['res_geoid']]
wrk_idx = [node_index[geoid] for geoid in commflows_by_state['wrk_geoid']]

# c[i, j] = workers residing in node i and working in node j; pairs without a
# recorded flow stay 0. One indexed assignment replaces the per-row loop.
c = np.zeros(shape=(Ns, Ns), dtype=np.int64)
c[res_idx, wrk_idx] = commflows_by_state['workers'].to_numpy()

states = {
    'label': d['label'].to_numpy(dtype=np.str_),
    'geoid': d['geoid'].to_numpy(dtype=np.str_),
    'centroid': d['centroid'].to_numpy(dtype=CentroidDType),
    'population': d['population'].to_numpy(dtype=np.int64),
    'commuters': c
}

validate_schema(STATES_SCHEMA, states)
save_compressed_geo('us_states_2015', states)