# devlog 2024-06-12

_author: Trevor Johnson_

We have a new class of ADRIO capable of loading data from CSV files with shapes N, TxN, and NxN. Each CSV file must have a column (or two in the NxN case) identifying geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included when loading time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).

The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates geos with CSV ADRIOs to load the data into NDArrays. These geos also contain Census ADRIOs that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly.

In [1]:
from datetime import date
from pathlib import Path

from numpy import array_equal
from datetime import datetime

from epymorph.data_shape import Shapes
from epymorph.geo.adrio import adrio_maker_library
from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus
from epymorph.geo.adrio.file.adrio_csv import (CSVSpec, CSVSpecMatrix,
                                               CSVSpecTime)
from epymorph.geo.dynamic import DynamicGeo
from epymorph.geo.spec import DynamicGeoSpec, Year
from epymorph.geography.us_census import (STATE, CountyScope, StateScope,
                                          get_us_counties, get_us_states)
from epymorph.simulation import geo_attrib
from pandas import DataFrame, concat, read_csv

# Create and store 'pei_population.csv': 2015 state populations fetched through
# the Census ADRIO, keyed by state abbreviation and written without a header.
census_maker = ADRIOMakerCensus()
states_list = ['AZ', 'FL', 'GA', 'MD', 'NY', 'NC', 'SC', 'VA']
population = census_maker.make_adrio(
    geo_attrib('population', int, Shapes.N),
    StateScope.in_states_by_code(states_list),
    Year(2015),
)
# NOTE(review): pairing labels with values by position assumes the ADRIO
# returns populations in the same order as `states_list` -- confirm.
df = DataFrame({'label': states_list, 'population': population.get_value()})

# Sort by population so the rows are deliberately out of geographic order;
# the CSV ADRIO is expected to put them back in order when loading.
df.sort_values(by='population', inplace=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the scratch directory exists before the first file is written to it.
Path("./scratch").mkdir(parents=True, exist_ok=True)
df.to_csv("./scratch/pei_population.csv", header=False, index=False)

In [2]:
# Geo spec comparing CSV-backed and Census-backed state populations.
# The scope lists six state codes, fewer than the eight rows written to
# 'pei_population.csv', so the CSV ADRIO has to filter as well as sort.
pei_population_source = CSVSpec(
    file_path=Path("./scratch/pei_population.csv"),
    key_col=0,
    data_col=1,
    key_type="state_abbrev",
    skiprows=None,  # no rows are skipped when reading this file
    time_col=None,
)

spec = DynamicGeoSpec(
    attributes=[
        geo_attrib('label', str, Shapes.N),
        geo_attrib('population', int, Shapes.N),
        geo_attrib('population_census', int, Shapes.N),
    ],
    time_period=Year(2015),
    scope=StateScope.in_states(['12', '13', '24', '37', '45', '51']),
    source={
        'label': 'Census:name',
        'population': pei_population_source,
        'population_census': 'Census:population',
    },
)

In [3]:
# Build the geo described by `spec` and validate it.
geo = DynamicGeo.from_library(spec, adrio_maker_library)
geo.validate()

# The CSV-backed attribute must hold exactly the same values as the
# Census-backed attribute.
csv_population = geo['population']
census_population = geo['population_census']
populations_match = array_equal(csv_population, census_population)
if not populations_match:
    raise Exception("Data not equal.")

In [4]:
# create and store 'us_sw_counties_population.csv'

# Get population-by-age data (three values per county) for 2015 and 2016
# through the Census ADRIO maker (presumably ACS5 -- TODO confirm).
states_list = ['04', '08', '49', '35', '32']
population_2015 = census_maker.make_adrio(
    geo_attrib('population_by_age', int, Shapes.NxA(3)),
    CountyScope.in_states(states_list),
    Year(2015),
).get_value()
population_2016 = census_maker.make_adrio(
    geo_attrib('population_by_age', int, Shapes.NxA(3)),
    CountyScope.in_states(states_list),
    Year(2016),
).get_value()

# get county and state info from shapefiles and convert to dataframes
counties_info = get_us_counties(2010)
states_info = get_us_states(2010)
counties_info_df = DataFrame({
    'state_geoid': [STATE.extract(county_id) for county_id in counties_info.geoid],
    'geoid': counties_info.geoid,
    'name': counties_info.name,
})
states_info_df = DataFrame({
    'state_geoid': states_info.geoid,
    'state_name': states_info.name,
})

# merge dataframes, create the "County, State" name column, and keep only
# the counties belonging to the states of interest
merged_df = counties_info_df.merge(states_info_df, on='state_geoid')
merged_df['county_name'] = merged_df['name'] + ", " + merged_df['state_name']
merged_df = merged_df.loc[merged_df['state_geoid'].isin(states_list)]


def _age_group_frame(day, county_names, populations):
    """Return a CSV-ready frame with one row per county, every row dated `day`.

    `populations` is iterated row by row; elements 0, 1 and 2 of each row
    become the 'Young', 'Adult' and 'Elderly' columns respectively.
    """
    # NOTE(review): rows are paired by position, which assumes `populations`
    # is ordered like `county_names` -- confirm against the ADRIO's ordering.
    return DataFrame({
        'Date': [day] * len(county_names),
        'County': county_names,
        'Young': [pop[0] for pop in populations],
        'Adult': [pop[1] for pop in populations],
        'Elderly': [pop[2] for pop in populations],
    })


# create and merge dataframes to be converted to csvs
df_2015 = _age_group_frame(
    date(2015, 1, 1), merged_df['county_name'], population_2015)
df_2016 = _age_group_frame(
    date(2016, 1, 1), merged_df['county_name'], population_2016)
df = concat([df_2015, df_2016])

# sort incorrectly and store as csv
df.sort_values('Young', inplace=True)
df.to_csv("./scratch/us_sw_counties_population.csv", index=False)

In [5]:
# Geo spec for the county-level age groups: each age group is read from its
# own data column of 'us_sw_counties_population.csv', while
# 'population_by_age' comes from the Census source for comparison.
sw_counties_csv = "./scratch/us_sw_counties_population.csv"

# attribute name -> index of the CSV column holding its values
age_group_columns = {
    'population_0-19': 2,
    'population_20-64': 3,
    'population_65+': 4,
}

spec = DynamicGeoSpec(
    attributes=[
        geo_attrib('label', str, Shapes.N),
        geo_attrib('population', int, Shapes.N),
        geo_attrib('population_0-19', int, Shapes.N),
        geo_attrib('population_20-64', int, Shapes.N),
        geo_attrib('population_65+', int, Shapes.N),
        geo_attrib('population_by_age', int, Shapes.NxA(3)),
    ],
    time_period=Year(2015),
    scope=CountyScope.in_states(['04', '08', '49', '35', '32']),
    source={
        'label': 'Census:name',
        'population': 'Census',
        # one CSVSpec per age group; only the data column differs
        **{
            attrib_name: CSVSpec(
                file_path=Path(sw_counties_csv),
                time_col=0,
                key_col=1,
                data_col=column,
                key_type="county_state",
                skiprows=1,
            )
            for attrib_name, column in age_group_columns.items()
        },
        'population_by_age': 'Census',
    },
)

In [6]:
# Build the geo described by `spec` and validate it.
geo = DynamicGeo.from_library(spec, adrio_maker_library)
geo.validate()

# Split the Census 'population_by_age' rows into one column per age group
# (element 0 -> Young, 1 -> Adult, 2 -> Elderly).
census_df = DataFrame({
    group: [pop[position] for pop in geo['population_by_age']]
    for position, group in enumerate(('Young', 'Adult', 'Elderly'))
})

# Each CSV-backed attribute must equal the matching Census column; the first
# mismatch raises with that age group's message.
comparisons = (
    ('population_0-19', 'Young', "Young data not equal."),
    ('population_20-64', 'Adult', "Adult data not equal."),
    ('population_65+', 'Elderly', "Elderly data not equal."),
)
for attrib_name, group, failure_message in comparisons:
    if not array_equal(geo[attrib_name], census_df[group]):
        raise Exception(failure_message)

In [7]:
# create and store 'vaccination_time_series.csv'

# Quote each county FIPS code and comma-separate them for the query's
# in(...) clause.
fips = ','.join(f"'{code}'" for code in ('08001', '35001', '04013', '04017'))
url = (
    "https://data.cdc.gov/resource/8xkx-amqh.csv"
    "?$select=date,fips,series_complete_yes"
    f"&$where=fips%20in({fips})"
    "&$limit=1962781"
)
# Read the 'fips' column as text rather than letting pandas infer a type.
df = read_csv(url, dtype={'fips': str})


def _to_date(stamp):
    """Parse a date string ('/' separators are turned into '-') into a `date`."""
    return datetime.fromisoformat(stamp.replace('/', '-')).date()


# Replace the raw date strings with `date` objects (time of day is dropped).
df['date'] = [_to_date(stamp) for stamp in df['date']]

df.to_csv('./scratch/vaccination_time_series.csv', index=False)

In [8]:
# Geo spec for the vaccination time series: 'vaccinations' is a TxN attribute
# read from 'vaccination_time_series.csv' through CSVSpecTime, restricted to
# the three county geoids listed in the scope.
vaccinations_source = CSVSpecTime(
    file_path=Path("./scratch/vaccination_time_series.csv"),
    time_col=0,
    key_col=1,
    data_col=2,
    key_type="geoid",
    skiprows=1,
)

spec = DynamicGeoSpec(
    attributes=[
        geo_attrib('label', str, Shapes.N),
        geo_attrib('population', int, Shapes.N),
        geo_attrib('vaccinations', int, Shapes.TxN),
    ],
    time_period=Year(2021),
    scope=CountyScope.in_counties(['08001', '04013', '35001']),
    source={
        'label': 'Census:name',
        'population': 'Census',
        'vaccinations': vaccinations_source,
    },
)

In [9]:
# Build the geo from `spec` and the ADRIO maker library.
geo = DynamicGeo.from_library(spec, adrio_maker_library)

# Only validation is run in this cell; no comparison against Census values
# is made here.
geo.validate()

In [10]:
# create and store 'counties_commuters_2020.csv'
counties_list = ['08001', '35001', '04013', '04017']
commuter_scope = CountyScope.in_counties(counties_list)
df = census_maker.fetch_commuters(commuter_scope, 2020)

# Assemble residence and work geoids from their state and county code
# columns; the first character of each work state code is dropped before
# joining.
res_geoid = df['res_state_code'] + df['res_county_code']
wrk_state_code = df['wrk_state_code'].apply(lambda code: code[1:])
df['res_geoid'] = res_geoid
df['wrk_geoid'] = wrk_state_code + df['wrk_county_code']

# Order rows by worker count rather than by location before writing.
df.sort_values(by='workers', inplace=True)

df.to_csv(
    './scratch/counties_commuters_2020.csv',
    columns=['res_geoid', 'wrk_geoid', 'workers'],
    index=False,
)

In [11]:
# Geo spec comparing CSV-backed and Census-backed commuter matrices (NxN) for
# the three county geoids listed in the scope.
commuters_source = CSVSpecMatrix(
    file_path=Path("./scratch/counties_commuters_2020.csv"),
    from_key_col=0,
    to_key_col=1,
    data_col=2,
    key_type="geoid",
    skiprows=1,
    time_col=None,
)

spec = DynamicGeoSpec(
    attributes=[
        geo_attrib('label', str, Shapes.N),
        geo_attrib('population', int, Shapes.N),
        geo_attrib('commuters', int, Shapes.NxN),
        geo_attrib('commuters_census', int, Shapes.NxN),
    ],
    time_period=Year(2020),
    scope=CountyScope.in_counties(['35001', '04013', '04017']),
    source={
        'label': 'Census:name',
        'population': 'Census',
        'commuters': commuters_source,
        'commuters_census': 'Census:commuters',
    },
)

In [12]:
# Build the geo described by `spec` and validate it.
geo = DynamicGeo.from_library(spec, adrio_maker_library)
geo.validate()

# The CSV-backed commuter matrix must equal the Census-backed one exactly.
csv_commuters = geo['commuters']
census_commuters = geo['commuters_census']
commuters_match = array_equal(csv_commuters, census_commuters)
if not commuters_match:
    raise Exception("Data not equal.")