# devlog 2024-06-12

_author: Trevor Johnson_

We have a new class of ADRIO capable of loading data from CSV files with shapes N, TxN, and NxN. The CSV files must have a column (or two, in the NxN case) identifying geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included when loading time-series data. Available formats for geographic identifiers are `state_abbrev` (AZ), `county_state` (Maricopa, Arizona), and `geoid` (04013).

The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates CSV ADRIOs to load the data into NDArrays. Census ADRIOs are also created that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly.

In [1]:
from datetime import date, datetime
from pathlib import Path
from unittest.mock import Mock

import numpy as np
from pandas import DataFrame, read_csv

from epymorph import *
from epymorph.data_shape import SimDimensions
from epymorph.geo.adrio import acs5, us_tiger
from epymorph.geo.adrio.commuting_flows import Commuters
from epymorph.geo.adrio.csv import CSV, CSVMatrix, CSVTimeSeries
from epymorph.geo.spec import Year
from epymorph.geography.us_census import (STATE, CountyScope, StateScope,
                                          get_us_counties, get_us_states)
from epymorph.simulation import NamespacedAttributeResolver
from epymorph.simulator.data import evaluate_param

# create and store 'pei_population.csv'

# Make sure the output directory exists so the notebook runs from a fresh checkout;
# DataFrame.to_csv does not create missing parent directories.
Path("./scratch").mkdir(parents=True, exist_ok=True)

# State postal codes, listed in ascending FIPS order (AZ=04 ... VA=51).
states_list = ['AZ', 'FL', 'GA', 'MD', 'NY', 'NC', 'SC', 'VA']
scope = StateScope.in_states_by_code(states_list, 2015)

# Stand-in context objects; the later cells reuse these same three names
# for every evaluate_in_context call.
data = Mock(spec=NamespacedAttributeResolver)
dim = Mock(spec=SimDimensions)
rng = Mock(spec=np.random.Generator)

adrio = acs5.Population()
result = adrio.evaluate_in_context(data, dim, scope, rng)

# NOTE(review): labels and values are paired positionally, which assumes `result`
# comes back in the same order as `states_list` — confirm against the scope's
# node ordering.
df = DataFrame({'label': states_list, 'population': result})

# Sort by value so the rows are deliberately NOT in geographic order, then store
# without a header row or index column.
df = df.sort_values(by='population')
df.to_csv("./scratch/pei_population.csv", header=False, index=False)

Evaluating Population ADRIO...
Completed Population ADRIO (1.729s).


In [2]:
# Request only a subset of the states stored in the CSV (FIPS codes for
# FL, GA, MD, NC, SC, VA), so the CSV ADRIO has to filter as well as sort.
scope = StateScope.in_states(['12', '13', '24', '37', '45', '51'], 2015)

# The file has no header row: column 0 is the state abbreviation, column 1 the value.
csv_adrio = CSV(
    file_path=Path("./scratch/pei_population.csv"),
    key_col=0,
    data_col=1,
    data_type=int,
    key_type="state_abbrev",
    skiprows=None,
)
census_adrio = acs5.Population()

# Evaluate the CSV ADRIO first, then the Census ADRIO, against the same scope.
csv_result, census_result = (
    source.evaluate_in_context(data, dim, scope, rng)
    for source in (csv_adrio, census_adrio)
)

Evaluating CSV ADRIO...
Completed CSV ADRIO (0.006s).
Evaluating Population ADRIO...
Completed Population ADRIO (0.594s).


In [3]:
# The CSV ADRIO and the Census ADRIO were evaluated against the same scope,
# so their results should match element for element.
results_match = np.array_equal(csv_result, census_result)
if not results_match:
    raise Exception("Data not equal.")

In [5]:
# create and store 'us_sw_counties_population.csv'

# get population-by-age data from acs5 for every county in the listed states
# (state FIPS codes: 04=AZ, 08=CO, 49=UT, 35=NM, 32=NV)
states_list = ['04', '08', '49', '35', '32']
scope = CountyScope.in_states(states_list, year=2015)

rume_params = {
    'ipm::beta': 0.4,
    'ipm::gamma': 1 / 5,
    'ipm::xi': 1 / 90,
    'mm::phi': 40.0,
    'population': acs5.Population(),
    'centroid': us_tiger.InternalPoint(),
    'population_by_age_table': acs5.PopulationByAgeTable(),
    'population_00-19': acs5.PopulationByAge(0, 19),
    'population_20-64': acs5.PopulationByAge(20, 64),
    'population_65-79': acs5.PopulationByAge(65, 79),
    'geo::label': us_tiger.Name(),
}
rume = Rume.single_strata(
    ipm=ipm_library['sirs'](),
    mm=mm_library['centroids'](),
    init=init.SingleLocation(location=0, seed_size=10_000),
    scope=scope,
    time_frame=TimeFrame.of("2015-01-01", 300),
    params=rume_params,
)

# evaluate the three age brackets, in this order
young, adult, elderly = (
    evaluate_param(rume, attribute)
    for attribute in ('population_00-19', 'population_20-64', 'population_65-79')
)

# get county and state info from shapefiles and convert to dataframes
# NOTE(review): geography is fetched for 2010 while the scope above uses 2015 —
# confirm the county lists agree for these states.
counties_info = get_us_counties(2010)
states_info = get_us_states(2010)
counties_info_df = DataFrame({
    'state_geoid': list(map(STATE.extract, counties_info.geoid)),
    'geoid': counties_info.geoid,
    'name': counties_info.name,
})
states_info_df = DataFrame({
    'state_geoid': states_info.geoid,
    'state_name': states_info.name,
})

# merge dataframes, create "County, State" name column, keep only the selected states
merged_df = counties_info_df.merge(states_info_df, on='state_geoid')
merged_df = merged_df.assign(
    county_name=merged_df['name'] + ", " + merged_df['state_name'],
)
in_selected_states = merged_df['state_geoid'].isin(states_list)
merged_df = merged_df.loc[in_selected_states]

# assemble the table to be written out; every row carries the same date
df = DataFrame({
    'Date': [date(2015, 1, 1)] * len(merged_df.index),
    'County': merged_df['county_name'],
    'Young': young,
    'Adult': adult,
    'Elderly': elderly,
})

# sort incorrectly (by value instead of by location) and store as csv
df = df.sort_values('Young')
df.to_csv("./scratch/us_sw_counties_population.csv", index=False)

Evaluating PopulationByAgeTable ADRIO...
Completed PopulationByAgeTable ADRIO (2.485s).
Evaluating PopulationByAge ADRIO...
Completed PopulationByAge ADRIO (0.000s).
Evaluating PopulationByAge ADRIO...
Completed PopulationByAge ADRIO (0.001s).
Evaluating PopulationByAge ADRIO...
Completed PopulationByAge ADRIO (0.000s).


In [6]:
# One CSV ADRIO per age-bracket column. All three read the same file, keyed on the
# "County, State" name in column 1, and differ only in which data column they load
# (2 = Young, 3 = Adult, 4 = Elderly). skiprows=1 is presumably there to skip the
# header row that to_csv wrote.
population_csv_path = Path("./scratch/us_sw_counties_population.csv")
shared_options = dict(key_col=1, data_type=int, key_type="county_state", skiprows=1)

young_csv = CSV(file_path=population_csv_path, data_col=2, **shared_options)
adult_csv = CSV(file_path=population_csv_path, data_col=3, **shared_options)
elderly_csv = CSV(file_path=population_csv_path, data_col=4, **shared_options)

In [7]:
# Load all three age brackets through their CSV ADRIOs before checking any of them.
young_csv_result, adult_csv_result, elderly_csv_result = (
    csv_source.evaluate_in_context(data, dim, scope, rng)
    for csv_source in (young_csv, adult_csv, elderly_csv)
)

# Source-of-truth values obtained directly from the Census ADRIOs.
census_df = DataFrame({'Young': young, 'Adult': adult, 'Elderly': elderly})

# Compare each CSV result with its Census column; stop at the first mismatch.
comparisons = (
    (young_csv_result, 'Young', "Young data not equal."),
    (adult_csv_result, 'Adult', "Adult data not equal."),
    (elderly_csv_result, 'Elderly', "Elderly data not equal."),
)
for loaded_values, census_column, failure_message in comparisons:
    if not np.array_equal(loaded_values, census_df[census_column]):
        raise Exception(failure_message)

Evaluating CSV ADRIO...
Completed CSV ADRIO (0.027s).
Evaluating CSV ADRIO...
Completed CSV ADRIO (0.030s).
Evaluating CSV ADRIO...
Completed CSV ADRIO (0.025s).


In [4]:
# create and store 'vaccination_time_series.csv'

# Build the quoted, comma-separated county list used in the query's `fips in(...)` filter.
county_fips = ['08001', '35001', '04013', '04017']
fips = ",".join(f"'{node}'" for node in county_fips)
url = f"https://data.cdc.gov/resource/8xkx-amqh.csv?$select=date,fips,series_complete_yes&$where=fips%20in({fips})&$limit=1962781"

# Read fips as text so leading zeros are preserved.
df = read_csv(url, dtype={'fips': str})

# Convert the date strings to datetime.date objects (slashes normalized to dashes first).
df['date'] = df['date'].map(
    lambda week: datetime.fromisoformat(week.replace('/', '-')).date()
)

# Keep calendar year 2021 only (both endpoints inclusive).
first_day, last_day = date(2021, 1, 1), date(2021, 12, 31)
within_2021 = (df['date'] >= first_day) & (df['date'] <= last_day)
df = df[within_2021]

df.to_csv('./scratch/vaccination_time_series.csv', index=False)

In [5]:
# Ask for three of the four counties stored in the file, listed in a different order.
scope = CountyScope.in_counties(['08001', '04013', '35001'], 2021)

# File columns: 0 = date, 1 = fips, 2 = series_complete_yes.
csv_adrio = CSVTimeSeries(
    file_path=Path("./scratch/vaccination_time_series.csv"),
    time_col=0,
    time_period=Year(2021),
    key_col=1,
    data_col=2,
    data_type=float,
    key_type="geoid",
    skiprows=1,
)

In [6]:
# Load the vaccination time series through the CSV ADRIO and show the resulting array.
csv_result = csv_adrio.evaluate_in_context(
    data,
    dim,
    scope,
    rng,
)
display(csv_result)

Evaluating CSVTimeSeries ADRIO...
Completed CSVTimeSeries ADRIO (0.012s).


array([[8.200000e+01, 0.000000e+00, 0.000000e+00],
       [9.400000e+01, 0.000000e+00, 0.000000e+00],
       [1.140000e+02, 0.000000e+00, 0.000000e+00],
       ...,
       [2.395809e+06, 3.307850e+05, 4.484610e+05],
       [2.398973e+06, 3.312020e+05, 4.489280e+05],
       [2.402173e+06, 3.316520e+05, 4.495000e+05]])

In [10]:
# create and store 'counties_commuters_2020.csv'
scope = CountyScope.in_counties(['08001', '35001', '04013', '04017'], 2020)
commuters_adrio = Commuters()

# NOTE(review): the column lookups below need a DataFrame-like result, yet a later
# cell passes the output of Commuters().evaluate_in_context straight to
# np.array_equal — confirm what this call returns in the current epymorph version.
df = commuters_adrio.evaluate_in_context(data, dim, scope, rng)

# Residence geoid = state code + county code.
df['res_geoid'] = df['res_state_code'] + df['res_county_code']

# Work geoid: the first character of the work state code is dropped before joining;
# presumably that code carries one extra leading character — confirm.
work_state_code = df['wrk_state_code'].apply(lambda code: code[1:])
df['wrk_geoid'] = work_state_code + df['wrk_county_code']

# sort incorrectly (by value instead of by location) and keep only the needed columns
df = df.sort_values(by='workers')

csv_columns = ['res_geoid', 'wrk_geoid', 'workers']
df.to_csv('./scratch/counties_commuters_2020.csv', columns=csv_columns, index=False)

In [11]:
# Ask for three of the four counties stored in the file.
scope = CountyScope.in_counties(['35001', '04013', '04017'], 2020)

# File columns: 0 = res_geoid (from), 1 = wrk_geoid (to), 2 = workers.
csv_adrio = CSVMatrix(
    file_path=Path("./scratch/counties_commuters_2020.csv"),
    from_key_col=0,
    to_key_col=1,
    data_col=2,
    data_type=int,
    key_type="geoid",
    skiprows=1,
)
census_adrio = Commuters()
census_adrio = Commuters()

In [12]:
# Evaluate the CSV ADRIO first, then the Census ADRIO, against the same scope and
# require identical results.
csv_matrix = csv_adrio.evaluate_in_context(data, dim, scope, rng)
census_matrix = census_adrio.evaluate_in_context(data, dim, scope, rng)
if not np.array_equal(csv_matrix, census_matrix):
    raise Exception("Data not equal.")