# devlog 2024-06-12

_author: Trevor Johnson_

We have a new class of ADRIO capable of loading data from CSV files with shapes N, TxN, and NxN. Each CSV file must have a column (or two, in the NxN case) identifying geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included when loading time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).

The following notebook creates a series of deliberately mis-sorted CSV files with various data formats and geographic identifiers, then creates CSV ADRIOs to load the data into NDArrays. Where an equivalent exists, Census ADRIOs that fetch identical data are also created and serve as the source of truth for whether the CSV ADRIOs fetched, filtered, and sorted their data correctly; the vaccination time series has no Census counterpart here, so only the shape of its result is checked.

## Creating sample .csv files

In [1]:
from datetime import date, datetime
from pathlib import Path

import numpy as np
from pandas import DataFrame, read_csv

from epymorph import *
from epymorph.adrio import acs5, commuting_flows, csv
from epymorph.geography.us_census import (
    STATE,
    CountyScope,
    StateScope,
    get_us_counties,
    get_us_states,
)
from epymorph.simulation import TimeFrame
from epymorph.simulator.data import evaluate_param


def placeholder_rume(scope, time_frame, params):
    """Build a throwaway single-strata RUME that only exists to evaluate ADRIOs.

    The IPM and movement model are the ``"no"`` entries of their respective
    libraries and the initializer is ``init.NoInfection()``; only the
    geography, time frame, and parameters vary between callers.

    Args:
        scope: geographic scope handed to the RUME unchanged.
        time_frame: time frame handed to the RUME unchanged.
        params: mapping of parameter name to value/ADRIO, handed over unchanged.

    Returns:
        Whatever ``SingleStrataRume.build`` returns for these arguments.
    """
    build_args = {
        "ipm": ipm_library["no"](),
        "mm": mm_library["no"](),
        "init": init.NoInfection(),
        "scope": scope,
        "time_frame": time_frame,
        "params": params,
    }
    return SingleStrataRume.build(**build_args)


def create_pei_population() -> None:
    """Create ``./scratch/pei_population.csv`` unless it already exists.

    Evaluates the ACS5 population ADRIO for eight states (year 2015) and
    writes a header-less, index-less CSV with two columns: state abbreviation
    and population. Rows are sorted by population so that the file is
    deliberately *not* in scope order.
    """
    out_path = Path("./scratch/pei_population.csv")
    if out_path.exists():
        return

    # Labels are paired with the evaluated values purely by position.
    # NOTE(review): this assumes the scope orders its nodes the same way as
    # this list — TODO confirm before adding or reordering states.
    states_list = ["AZ", "FL", "GA", "MD", "NY", "NC", "SC", "VA"]
    scope = StateScope.in_states_by_code(states_list, year=2015)

    rume = placeholder_rume(
        scope, TimeFrame.year(2015), {"population": acs5.Population()}
    )
    result = evaluate_param(rume, "population")

    df = DataFrame({"label": states_list, "population": result})
    # Sort "incorrectly" (by value rather than by geography) on purpose.
    df.sort_values(by="population", inplace=True)

    # to_csv does not create missing directories; make sure ./scratch exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, header=False, index=False)


def create_us_sw_counties_population() -> None:
    """Create ``./scratch/us_sw_counties_population.csv`` unless it already exists.

    Evaluates three ACS5 population-by-age ADRIOs (ages 0-19, 20-64, 65-79)
    for every county in five states (year 2015) and writes a CSV with a header
    row and the columns ``Date``, ``County`` ("County, State" names),
    ``Young``, ``Adult`` and ``Elderly``. Rows are sorted by ``Young`` so that
    the file is deliberately *not* in scope order.
    """
    out_path = Path("./scratch/us_sw_counties_population.csv")
    if out_path.exists():
        return

    # get population-by-age data from acs5
    states_list = ["04", "08", "49", "35", "32"]
    scope = CountyScope.in_states(states_list, year=2015)
    rume = placeholder_rume(
        scope,
        TimeFrame.year(2015),
        {
            # Never evaluated directly below; presumably the source table the
            # PopulationByAge ADRIOs depend on — verify against epymorph.
            "population_by_age_table": acs5.PopulationByAgeTable(),
            "population_00-19": acs5.PopulationByAge(0, 19),
            "population_20-64": acs5.PopulationByAge(20, 64),
            "population_65-79": acs5.PopulationByAge(65, 79),
        },
    )

    young = evaluate_param(rume, "population_00-19")
    adult = evaluate_param(rume, "population_20-64")
    elderly = evaluate_param(rume, "population_65-79")

    # get county and state info from shapefiles and convert to dataframes
    # NOTE(review): the shapefile year (2010) differs from the scope year
    # (2015). County names are paired with the evaluated values by position
    # below, which assumes both years yield the same counties in the same
    # order — TODO confirm.
    counties_info = get_us_counties(2010)
    states_info = get_us_states(2010)
    counties_info_df = DataFrame(
        {
            "state_geoid": [
                STATE.extract(county_id) for county_id in counties_info.geoid
            ],
            "geoid": counties_info.geoid,
            "name": counties_info.name,
        }
    )
    states_info_df = DataFrame(
        {
            "state_geoid": states_info.geoid,
            "state_name": states_info.name,
        }
    )

    # merge dataframes and create "County, State" name column
    merged_df = counties_info_df.merge(states_info_df, on="state_geoid")
    merged_df["county_name"] = merged_df["name"] + ", " + merged_df["state_name"]
    merged_df = merged_df.loc[merged_df["state_geoid"].isin(states_list)]

    # create the dataframe to be converted to csv; every row gets the same date
    df = DataFrame(
        {
            "Date": [date(2015, 1, 1)] * len(merged_df),
            "County": merged_df["county_name"],
            "Young": young,
            "Adult": adult,
            "Elderly": elderly,
        }
    )

    # sort incorrectly and store as csv
    df.sort_values("Young", inplace=True)
    # to_csv does not create missing directories; make sure ./scratch exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)


def create_vaccination_time_series() -> None:
    """Create ``./scratch/vaccination_time_series.csv`` unless it already exists.

    Downloads the ``date``, ``fips`` and ``series_complete_yes`` columns of
    the data.cdc.gov resource ``8xkx-amqh`` for four county FIPS codes, keeps
    only rows dated within 2021, and writes them to CSV with a header row.
    Requires network access when the file is missing.
    """
    out_path = Path("./scratch/vaccination_time_series.csv")
    if out_path.exists():
        return

    # Quote each FIPS code so it can be dropped into the query's in(...) list.
    fips = ",".join(f"'{node}'" for node in ["08001", "35001", "04013", "04017"])
    url = (
        "https://data.cdc.gov/resource/8xkx-amqh.csv"
        "?$select=date,fips,series_complete_yes"
        f"&$where=fips%20in({fips})"
        "&$limit=1962781"
    )
    # Read fips as text so leading zeros ("04013") are preserved.
    df = read_csv(url, dtype={"fips": str})

    # Normalize any slash-separated dates to ISO form, then keep the date part.
    df["date"] = [
        datetime.fromisoformat(raw.replace("/", "-")).date() for raw in df["date"]
    ]

    in_2021 = (df["date"] >= date(2021, 1, 1)) & (df["date"] <= date(2021, 12, 31))
    df = df[in_2021]

    # to_csv does not create missing directories; make sure ./scratch exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)


def create_counties_commuters() -> None:
    """Create ``./scratch/counties_commuters_2020.csv`` unless it already exists.

    Evaluates the commuting-flows ``Commuters`` ADRIO for four counties
    (year 2020) and writes the resulting square array in long format: one row
    per (``res_geoid``, ``wrk_geoid``) pair with its ``workers`` value. Rows
    are sorted by ``workers`` so that the file is deliberately *not* in scope
    order.
    """
    out_path = Path("./scratch/counties_commuters_2020.csv")
    if out_path.exists():
        return

    scope = CountyScope.in_counties(["08001", "35001", "04013", "04017"], year=2020)
    rume = placeholder_rume(
        scope=scope,
        time_frame=TimeFrame.year(2020),
        params={
            "commuters": commuting_flows.Commuters(),
        },
    )

    commuters = evaluate_param(rume, "commuters")

    # Convert square numpy array to DataFrame:
    # with "ij" indexing, element [i, j] of each grid is (geoids[i], geoids[j]),
    # so flattening all three arrays pairs commuters[i, j] with that geoid pair.
    geoids = scope.get_node_ids()
    home, work = np.meshgrid(geoids, geoids, indexing="ij")
    df = DataFrame(
        {
            "res_geoid": home.flatten(),
            "wrk_geoid": work.flatten(),
            "workers": commuters.flatten(),
        }
    )
    # Sort "incorrectly" (by value rather than by geography) on purpose.
    df.sort_values(by="workers", inplace=True)

    # to_csv does not create missing directories; make sure ./scratch exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(
        out_path,
        columns=["res_geoid", "wrk_geoid", "workers"],
        index=False,
    )


# Generate every sample CSV in turn; each generator returns early when its
# file is already present.
for _create_sample in (
    create_pei_population,
    create_us_sw_counties_population,
    create_vaccination_time_series,
    create_counties_commuters,
):
    _create_sample()

## Load .csvs with ADRIOs and compare with known values

In [2]:
# Check pei_population.csv
# The scope holds six of the eight states written to the file, so the CSV
# ADRIO has to filter rows as well as re-order them to match the Census ADRIO.
pei_scope = StateScope.in_states(["12", "13", "24", "37", "45", "51"], year=2015)
pei_time_frame = TimeFrame.year(2015)
pei_params = {
    # The file was written without a header row, hence skiprows=None.
    "csv_result": csv.CSV(
        file_path=Path("./scratch/pei_population.csv"),
        key_col=0,
        data_col=1,
        data_type=np.int64,
        key_type="state_abbrev",
        skiprows=None,
    ),
    "census_result": acs5.Population(),
}
rume = placeholder_rume(
    scope=pei_scope,
    time_frame=pei_time_frame,
    params=pei_params,
)

pei_from_csv = evaluate_param(rume, "csv_result")
pei_from_census = evaluate_param(rume, "census_result")
if not np.array_equal(pei_from_csv, pei_from_census):
    raise Exception("Data not equal.")
print("✓")

✓


In [3]:
# Check us_sw_counties_population.csv
# One CSV ADRIO per age-group column; all three read the same file and key
# column and skip the header row, differing only in which data column they load.
sw_scope = CountyScope.in_states(["04", "08", "49", "35", "32"], year=2015)
sw_time_frame = TimeFrame.year(2015)
sw_params = {
    param_name: csv.CSV(
        file_path=Path("./scratch/us_sw_counties_population.csv"),
        key_col=1,
        data_col=column,
        data_type=np.int64,
        key_type="county_state",
        skiprows=1,
    )
    for param_name, column in (("young_csv", 2), ("adult_csv", 3), ("elderly_csv", 4))
}
sw_params["population_by_age_table"] = acs5.PopulationByAgeTable()
sw_params["young_census"] = acs5.PopulationByAge(0, 19)
sw_params["adult_census"] = acs5.PopulationByAge(20, 64)
sw_params["elderly_census"] = acs5.PopulationByAge(65, 79)

rume = placeholder_rume(
    scope=sw_scope,
    time_frame=sw_time_frame,
    params=sw_params,
)

# Compare each CSV-backed parameter against its Census counterpart, stopping
# at the first mismatch.
for csv_name, census_name, failure_message in (
    ("young_csv", "young_census", "Young data not equal."),
    ("adult_csv", "adult_census", "Adult data not equal."),
    ("elderly_csv", "elderly_census", "Elderly data not equal."),
):
    sw_from_csv = evaluate_param(rume, csv_name)
    sw_from_census = evaluate_param(rume, census_name)
    if not np.array_equal(sw_from_csv, sw_from_census):
        raise Exception(failure_message)
    print("✓")

✓
✓
✓


In [4]:
# Check vaccination_time_series.csv
# The file holds four counties; the scope asks for three of them. There is no
# Census ADRIO to compare against here, so only the result's shape is checked.
vax_scope = CountyScope.in_counties(["08001", "04013", "35001"], year=2021)
vax_time_frame = TimeFrame.year(2021)
vax_params = {
    "vax_csv": csv.CSVTimeSeries(
        file_path=Path("./scratch/vaccination_time_series.csv"),
        time_col=0,
        time_frame=TimeFrame.year(2021),
        key_col=1,
        data_col=2,
        data_type=np.float64,
        key_type="geoid",
        skiprows=1,
    ),
}
rume = placeholder_rume(
    scope=vax_scope,
    time_frame=vax_time_frame,
    params=vax_params,
)

result = evaluate_param(rume, "vax_csv")
# Expected TxN shape: 365 days of 2021 by the 3 counties in scope.
if result.shape != (365, 3):
    raise Exception("Vaccination data is an invalid shape.")
print("✓")

✓


In [5]:
# Check counties_commuters_2020.csv
# The file holds all pairs among four counties; the scope asks for three of
# them, so the CSV ADRIO has to filter both key columns and re-order the rows.
commuters_scope = CountyScope.in_counties(["35001", "04013", "04017"], year=2020)
commuters_time_frame = TimeFrame.year(2020)
commuters_params = {
    "commuters_csv": csv.CSVMatrix(
        file_path=Path("./scratch/counties_commuters_2020.csv"),
        from_key_col=0,
        to_key_col=1,
        data_col=2,
        data_type=np.int64,
        key_type="geoid",
        skiprows=1,
    ),
    "commuters_census": commuting_flows.Commuters(),
}
rume = placeholder_rume(
    scope=commuters_scope,
    time_frame=commuters_time_frame,
    params=commuters_params,
)

commuters_from_csv = evaluate_param(rume, "commuters_csv")
commuters_from_census = evaluate_param(rume, "commuters_census")
if not np.array_equal(commuters_from_csv, commuters_from_census):
    raise Exception("Data not equal.")
print("✓")

✓
