# devlog 2024-06-12

_author: Trevor Johnson_

We have a new class of ADRIO capable of loading in data from CSV files with shapes N, TxN, and NxN. CSV files used must have a column (or two in the NxN case) to identify geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included if loading in time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).

The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates CSV ADRIOs to load the data into NDArrays. Census ADRIOs are also created that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly.

## Creating sample .csv files

In [1]:
from datetime import date
from pathlib import Path
from urllib.parse import quote

import numpy as np
from pandas import DataFrame, read_csv, to_datetime

from epymorph.kit import *
from epymorph.adrio import acs5, commuting_flows, csv
from epymorph.geography.scope import GeoScope
from epymorph.geography.us_census import (
    get_counties,
    get_states,
)
from epymorph.util import dict_map


def placeholder_dim(scope: GeoScope, time_frame: TimeFrame) -> SimDimensions:
    """Build simulation dimensions from a geo scope and a time frame.

    The start date and number of days are taken from `time_frame` and the
    node count from `scope`. The remaining values are fixed: a single tau
    step of length 1.0, one compartment, and zero events. In this notebook
    the result is only used as the `dim` entry of ADRIO contexts.
    """
    first_day = time_frame.start_date
    num_days = time_frame.duration_days
    num_nodes = scope.nodes
    return SimDimensions.build(
        tau_step_lengths=[1.0],
        start_date=first_day,
        days=num_days,
        nodes=num_nodes,
        compartments=1,
        events=0,
    )


# Directory where the sample CSV files are written and later read back.
# It is created (including missing parents) when this cell runs; an
# already-existing directory is not an error.
TEST_FILE_PATH = Path("./scratch/test")
TEST_FILE_PATH.mkdir(parents=True, exist_ok=True)


def create_pei_population() -> None:
    """Write 'pei_population.csv' unless the file already exists.

    The file has two columns and no header row: a state label looked up
    from each node's FIPS code, and the ACS5 population for that state in
    2015. Rows are ordered by population rather than by geography so the
    CSV ADRIO has to re-sort them on load.
    """
    csv_path = TEST_FILE_PATH / "pei_population.csv"
    if csv_path.exists():
        return

    scope = StateScope.in_states(
        ["AZ", "FL", "GA", "MD", "NY", "NC", "SC", "VA"],
        year=2015,
    )
    dim = placeholder_dim(scope, TimeFrame.year(2015))
    states_info = get_states(2015)

    labels = dict_map(states_info.state_fips_to_code, scope.node_ids)
    population = acs5.Population().with_context(scope=scope, dim=dim).evaluate()

    table = DataFrame({"label": labels, "population": population})
    # mis-sort to test ADRIO sorting
    table = table.sort_values(by="population")
    table.to_csv(csv_path, header=False, index=False)


def create_us_sw_counties_population() -> None:
    """Write 'us_sw_counties_population.csv' unless the file already exists.

    The file has a header row and the columns Date, County, Young, Adult
    and Elderly. Date is the constant 2015-01-01, County is a name looked
    up from each node's FIPS code, and the three age columns hold ACS5
    population counts for ages 0-19, 20-64 and 65-79 respectively.
    """
    csv_path = TEST_FILE_PATH / "us_sw_counties_population.csv"
    if csv_path.exists():
        return

    scope = CountyScope.in_states(["04", "08", "49", "35", "32"], year=2015)
    context = {
        "scope": scope,
        "dim": placeholder_dim(scope, TimeFrame.year(2015)),
        "params": {
            "population_by_age_table": acs5.PopulationByAgeTable(),
        },
    }
    counties_info = get_counties(2015)

    # Column name -> inclusive age range; insertion order fixes CSV column order.
    age_groups = {
        "Young": (0, 19),
        "Adult": (20, 64),
        "Elderly": (65, 79),
    }

    columns = {
        "Date": date(2015, 1, 1),
        "County": dict_map(counties_info.county_fips_to_name, scope.node_ids),
    }
    for column, (min_age, max_age) in age_groups.items():
        columns[column] = (
            acs5.PopulationByAge(min_age, max_age).with_context(**context).evaluate()
        )

    table = DataFrame(columns)
    # mis-sort to test ADRIO sorting
    table = table.sort_values("Young")
    table.to_csv(csv_path, index=False)


def create_vaccination_time_series() -> None:
    """Write 'vaccination_time_series.csv' unless the file already exists.

    Downloads the `date`, `fips` and `series_complete_yes` columns of the
    data.cdc.gov resource `8xkx-amqh` for four counties, restricted to
    dates between 2021-01-01 and 2021-12-31, and saves them with a header
    row. The `fips` column is read as text and `date` is reduced to a
    plain calendar date before writing.
    """
    csv_path = TEST_FILE_PATH / "vaccination_time_series.csv"
    if csv_path.exists():
        return

    county_fips = ["08001", "35001", "04013", "04017"]
    fips = ",".join(f"'{node}'" for node in county_fips)
    # The filter clause contains spaces and quotes, so it must be URL-escaped.
    where = quote(f"fips in({fips}) and date between '2021-01-01' and '2021-12-31'")
    query = "&".join(
        [
            "$select=date,fips,series_complete_yes",
            f"$where={where}",
            "$limit=10000",
        ]
    )
    url = f"https://data.cdc.gov/resource/8xkx-amqh.csv?{query}"

    vaccinations = read_csv(url, dtype={"fips": str})
    vaccinations["date"] = to_datetime(vaccinations["date"]).dt.date
    vaccinations.to_csv(csv_path, index=False)


def create_counties_commuters() -> None:
    """Write 'counties_commuters_2020.csv' unless the file already exists.

    Evaluates the commuting-flows `Commuters` ADRIO for four counties in
    2020 and stores the resulting square array in long form: one row per
    (res_geoid, wrk_geoid) pair with its `workers` value, plus a header
    row. Rows are ordered by `workers` rather than by geography.
    """
    csv_path = TEST_FILE_PATH / "counties_commuters_2020.csv"
    if csv_path.exists():
        return

    scope = CountyScope.in_counties(["08001", "35001", "04013", "04017"], year=2020)
    dim = placeholder_dim(scope, TimeFrame.year(2020))
    commuters = (
        commuting_flows.Commuters().with_context(scope=scope, dim=dim).evaluate()
    )

    # Pair every node ID with every node ID; "ij" indexing keeps the first
    # grid varying along rows, so flattening lines up with commuters.flatten().
    geoids = scope.node_ids
    home, work = np.meshgrid(geoids, geoids, indexing="ij")

    records = DataFrame(
        {
            "res_geoid": home.flatten(),
            "wrk_geoid": work.flatten(),
            "workers": commuters.flatten(),
        }
    )
    # mis-sort to test ADRIO sorting
    records.sort_values(by="workers").to_csv(csv_path, index=False)


# Generate every sample CSV file; each function skips files that already exist.
for _create_file in (
    create_pei_population,
    create_us_sw_counties_population,
    create_vaccination_time_series,
    create_counties_commuters,
):
    _create_file()

## Load .csvs with ADRIOs and compare with known values

In [2]:
from epymorph.adrio.adrio import Adrio


def test_adrios(context: dict, a: Adrio, b: Adrio, description: str) -> None:
    """Evaluate two ADRIOs in the same context and require equal results.

    Both ADRIOs are evaluated inside `sim_messaging()`, first `a` and then
    `b`, with `context` expanded into keyword arguments for `with_context`.
    Prints "<description> data ✓" when the arrays are equal.

    Raises:
        Exception: if the two results are not equal per `np.array_equal`.
    """
    with sim_messaging():
        a_value, b_value = [
            adrio.with_context(**context).evaluate() for adrio in (a, b)
        ]

    if not np.array_equal(a_value, b_value):
        raise Exception(f"{description} data not equal.")
    print(f"{description} data ✓\n")

In [3]:
# Check pei_population.csv
# The scope lists six states while the file was written with eight, so the
# CSV ADRIO must filter rows as well as re-sort them.
scope = StateScope.in_states(["12", "13", "24", "37", "45", "51"], year=2015)
context = dict(
    scope=scope,
    dim=placeholder_dim(scope, TimeFrame.year(2015)),
)

# The file has no header row, hence skiprows=None.
pei_population_csv = csv.CSV(
    file_path=TEST_FILE_PATH / "pei_population.csv",
    key_col=0,
    data_col=1,
    data_type=np.int64,
    key_type="state_abbrev",
    skiprows=None,
)

test_adrios(context, pei_population_csv, acs5.Population(), "Population")

Loading epymorph.adrio.csv.CSV:
  |####################| 100%  (0.002s)
Loading epymorph.adrio.acs5.Population:
  |####################| 100%  (1.201s)
Population data ✓



In [4]:
# Check us_sw_counties_population.csv
scope = CountyScope.in_states(["04", "08", "49", "35", "32"], year=2015)
context = {
    "scope": scope,
    "dim": placeholder_dim(scope, TimeFrame.year(2015)),
    "params": {
        "population_by_age_table": acs5.PopulationByAgeTable(),
    },
}

# One check per age-group column: (CSV data column, inclusive age range, label).
# Column 0 is Date and column 1 is County, so the data columns start at 2.
for data_col, (min_age, max_age), label in [
    (2, (0, 19), "Young"),
    (3, (20, 64), "Adult"),
    (4, (65, 79), "Elderly"),
]:
    test_adrios(
        context,
        csv.CSV(
            file_path=TEST_FILE_PATH / "us_sw_counties_population.csv",
            key_col=1,
            data_col=data_col,
            data_type=np.int64,
            key_type="county_state",
            skiprows=1,  # skip the header row
        ),
        acs5.PopulationByAge(min_age, max_age),
        label,
    )

Loading epymorph.adrio.csv.CSV:
  |####################| 100%  (0.008s)
Loading epymorph.adrio.acs5.PopulationByAgeTable:
  |####################| 100%  (12.587s)
Loading epymorph.adrio.acs5.PopulationByAge:
  |####################| 100%  (0.000s)
Young data ✓

Loading epymorph.adrio.csv.CSV:
  |####################| 100%  (0.006s)
Loading epymorph.adrio.acs5.PopulationByAgeTable:
  |####################| 100%  (0.528s)
Loading epymorph.adrio.acs5.PopulationByAge:
  |####################| 100%  (0.000s)
Adult data ✓

Loading epymorph.adrio.csv.CSV:
  |####################| 100%  (0.009s)
Loading epymorph.adrio.acs5.PopulationByAgeTable:
  |####################| 100%  (0.507s)
Loading epymorph.adrio.acs5.PopulationByAge:
  |####################| 100%  (0.000s)
Elderly data ✓



In [5]:
# Check vaccination_time_series.csv
# The scope lists three counties while the file was requested for four.
scope = CountyScope.in_counties(["08001", "04013", "35001"], year=2021)
context = dict(
    scope=scope,
    dim=placeholder_dim(scope, TimeFrame.year(2021)),
)

with sim_messaging():
    vax_adrio = csv.CSVTimeSeries(
        file_path=TEST_FILE_PATH / "vaccination_time_series.csv",
        time_col=0,
        time_frame=TimeFrame.year(2021),
        key_col=1,
        data_col=2,
        data_type=np.float64,
        key_type="geoid",
        skiprows=1,  # skip the header row
    )
    vax_csv = vax_adrio.with_context(**context).evaluate()

# Only the shape is verified here; there is no reference ADRIO to compare
# against. Presumably 365 = days in 2021 and 3 = counties in scope.
if vax_csv.shape != (365, 3):
    raise Exception("Vaccination data is an invalid shape.")
print("Vaccination data ✓")

Loading epymorph.adrio.csv.CSVTimeSeries:
  |####################| 100%  (0.007s)
Vaccination data ✓


In [6]:
# Check counties_commuters_2020.csv
# The scope lists three counties while the file was written for four, so
# the matrix ADRIO must filter rows on both key columns.
scope = CountyScope.in_counties(["35001", "04013", "04017"], year=2020)
context = dict(
    scope=scope,
    dim=placeholder_dim(scope, TimeFrame.year(2020)),
)

commuters_csv = csv.CSVMatrix(
    file_path=TEST_FILE_PATH / "counties_commuters_2020.csv",
    from_key_col=0,
    to_key_col=1,
    data_col=2,
    data_type=np.int64,
    key_type="geoid",
    skiprows=1,  # skip the header row
)

test_adrios(context, commuters_csv, commuting_flows.Commuters(), "Commuters")

Loading epymorph.adrio.csv.CSVMatrix:
  |####################| 100%  (0.004s)
Loading epymorph.adrio.commuting_flows.Commuters:
  |####################| 100%  (6.561s)
Commuters data ✓

