# devlog 2024-10-11

_Author: Tyler Coles_

Methods for group-and-aggregate on geo-series data.

epymorph output data can be thought of as three-dimensional -- the axes are 1. time, 2. geospatial node, 3. simulation data values (which can include simulation state and transition information; aka compartments and events). A very common requirement in data processing is to be able to summarize the geography axis of this data. For example, computing the time-series of infections summed across all nodes, or grouping by some geographic hierarchy and combining those -- say simulating at county level but summing to state level.

To determine the best approach, we generate some (suitably large) example data representing Census Tracts in four states, group by county, and sum to compute the time series for each of the data values present.

In [1]:
import numpy as np
import pandas as pd
from epymorph.geography.us_census import TractScope
from epymorph.geography.us_tiger import get_states
from epymorph.geography.us_geography import CensusGranularity

# Build a tract-level scope covering the first four states listed for this year.
year = 2020
states = get_states(year).geoid[0:4]
scope = TractScope.in_states(states, year)
print(f"{scope.nodes=}")

# Benchmark parameters shared by every approach below:
# aggregate with "sum", grouping the scope's nodes at "county" granularity.
agg_operation = "sum"
group_granularity = "county"
groups = CensusGranularity.of(group_granularity).truncate_list(scope.node_ids)
print(f"groups={len(groups)}")

# Generate some dummy data shaped (T, N, Q): T time steps, N nodes, Q data values.
# Consecutive integers make every cell distinct, so the approaches can be
# compared for exact equality afterwards.
T, N, Q = 366, scope.nodes, 6
data = np.arange(T * N * Q).reshape((T, N, Q))
print(f"{data.shape=}")

# Preview: all Q values of the first four nodes at the first time step.
data[0, 0:4, :]

scope.nodes=4202
groups=187
data.shape=(366, 4202, 6)


array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])

### Approach 1: use numpy masks on a group-by-group basis

In [2]:
def apply_1(agg, scope, grouping, data):
    """
    Aggregate the node axis of `data` by group, one masked reduction per group.

    `agg` selects the reduction ("sum", "min" or "max"; anything else raises
    ValueError). `scope.node_ids` supplies the node IDs, `grouping` is the
    granularity name handed to `CensusGranularity.of`, and `data` must unpack
    as a (T, N, Q) array. Returns an array of shape (T, number of groups, Q)
    with the same dtype as `data`.
    """
    if agg == "sum":
        reduce_fn = np.ma.sum
    elif agg == "min":
        reduce_fn = np.ma.min
    elif agg == "max":
        reduce_fn = np.ma.max
    else:
        raise ValueError(agg)

    node_ids = scope.node_ids

    T, N, Q = data.shape
    group_keys = CensusGranularity.of(grouping).truncate_list(node_ids)

    out = np.empty(shape=(T, len(group_keys), Q), dtype=data.dtype)
    for col, key in enumerate(group_keys):
        # A node belongs to the group when its ID starts with the group key.
        is_member = np.char.startswith(node_ids, key)
        # numpy masks flag the entries to *ignore*, so mask the non-members,
        # stretched (without copying) over the time and value axes.
        excluded = np.broadcast_to(
            (~is_member).reshape((1, N, 1)),
            shape=(T, N, Q),
        )
        masked = np.ma.masked_array(data, excluded)
        # Reduce over the node axis and store in this group's result column.
        out[:, col, :] = reduce_fn(masked, axis=1, keepdims=False)
    return out


# Time approach 1, then keep one result for the equality check at the end.
%timeit apply_1(agg_operation, scope, group_granularity, data)
res1 = apply_1(agg_operation, scope, group_granularity, data)
print(f"{res1.shape=}")

14 s ± 268 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
res1.shape=(366, 187, 6)


### Approach 2: use numpy masks for the whole array at once

In [3]:
def apply_2(agg, scope, grouping, data):
    """
    Aggregate the node axis of `data` by group with a single masked reduction.

    Instead of reducing group by group, the data and the group-membership
    mask are both broadcast to shape (T, N, G, Q) and reduced over the node
    axis in one call.

    `agg` selects the reduction ("sum", "min" or "max"; anything else raises
    ValueError). `scope.node_ids` supplies the node IDs, `grouping` is the
    granularity name handed to `CensusGranularity.of`, and `data` must unpack
    as a (T, N, Q) array. Returns the unmasked `.data` of the reduced array,
    shaped (T, G, Q).
    """
    if agg == "sum":
        reduce_fn = np.ma.sum
    elif agg == "min":
        reduce_fn = np.ma.min
    elif agg == "max":
        reduce_fn = np.ma.max
    else:
        raise ValueError(agg)

    node_ids = scope.node_ids

    T, N, Q = data.shape
    group_keys = CensusGranularity.of(grouping).truncate_list(node_ids)
    G = len(group_keys)

    # Membership matrix: column j marks the nodes whose ID starts with
    # group j's key.
    member = np.empty(shape=(N, G), dtype=np.bool_)
    for col, key in enumerate(group_keys):
        member[:, col] = np.char.startswith(node_ids, key)

    # numpy masks flag the entries to *ignore*, so invert the membership
    # before stretching it over the time and value axes.
    full_shape = (T, N, G, Q)
    excluded = np.broadcast_to(
        (~member).reshape((1, N, G, 1)),
        shape=full_shape,
    )

    # Repeat the data along a new group axis so that every group sees
    # every node; the mask then hides the nodes outside each group.
    replicated = np.broadcast_to(
        data.reshape((T, N, 1, Q)),
        shape=full_shape,
    )
    masked = np.ma.masked_array(replicated, excluded)
    reduced = reduce_fn(masked, axis=1, keepdims=False)
    return reduced.data


# Time approach 2, then keep one result for the equality check at the end.
%timeit apply_2(agg_operation, scope, group_granularity, data)
res2 = apply_2(agg_operation, scope, group_granularity, data)
print(f"{res2.shape=}")

6.21 s ± 50.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
res2.shape=(366, 187, 6)


### Approach 3: use Pandas, adding a column for group membership

In [4]:
def apply_3(agg, scope, grouping, data):
    """
    Aggregate the node axis of `data` by group using a Pandas group-by.

    The (T, N, Q) array is flattened to one row per (time, node) pair, each
    row is labelled with its time index and its group, and Pandas aggregates
    over the (tick, geo_group) pairs.

    `agg` is passed straight to `.agg`, so it is not validated here.
    `scope.node_ids` supplies the node IDs and `grouping` is the granularity
    name handed to `CensusGranularity.of`. Returns an array reshaped to
    (T, -1, Q), i.e. one column per group.
    """
    gran = CensusGranularity.of(grouping)
    node_ids = scope.node_ids

    # Group label of every node, in node order.
    node_groups = np.array(list(map(gran.truncate, node_ids)))

    T, N, Q = data.shape

    # One row per (time, node): each tick repeats for N consecutive rows,
    # while the per-node group labels repeat once per tick.
    frame = pd.DataFrame(data=data.reshape((-1, Q)))
    frame["tick"] = np.repeat(np.arange(T), N)
    frame["geo_group"] = np.tile(node_groups, T)

    # sort=False keeps the groups in order of first appearance.
    grouped = frame.groupby(["tick", "geo_group"], sort=False)
    aggregated = grouped.agg(func=agg).reset_index(drop=True)
    return aggregated.to_numpy().reshape((T, -1, Q))


# Time approach 3, then keep one result for the equality check at the end.
%timeit apply_3(agg_operation, scope, group_granularity, data)
res3 = apply_3(agg_operation, scope, group_granularity, data)
print(f"{res3.shape=}")

218 ms ± 7.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
res3.shape=(366, 187, 6)


### Approach 4: a basic Python loop

In [5]:
def apply_4(agg, scope, grouping, data):
    from math import inf

    match agg:
        case "sum":
            op_f = lambda a, b: a + b  # noqa: E731
            agg_zero = 0
        case "min":
            op_f = lambda a, b: min(a, b)  # noqa: E731
            agg_zero = inf
        case "max":
            op_f = lambda a, b: max(a, b)  # noqa: E731
            agg_zero = -inf
        case x:
            raise ValueError(x)

    node_ids = scope.node_ids

    gran = CensusGranularity.of(grouping)
    groups = gran.truncate_list(node_ids)
    T, N, Q = data.shape
    G = len(groups)

    result = np.full(shape=(T, G, Q), fill_value=agg_zero, dtype=data.dtype)
    for i, n in enumerate(node_ids):
        g = groups.index(gran.truncate(n))
        for t in range(T):
            for q in range(Q):
                result[t, g, q] = op_f(result[t, g, q], data[t, i, q])
    return result


# Time approach 4, then keep one result for the equality check at the end.
%timeit apply_4(agg_operation, scope, group_granularity, data)
res4 = apply_4(agg_operation, scope, group_granularity, data)
print(f"{res4.shape=}")

3.33 s ± 36.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
res4.shape=(366, 187, 6)


### Approach 5: hybrid numpy and iterate-over-groups

In [6]:
def apply_5(agg, scope, grouping, data):
    """
    Aggregate the node axis of `data` by group: loop over groups, reduce in numpy.

    For each group, the member nodes are picked out with boolean indexing
    and reduced over the node axis with the plain (unmasked) numpy function.

    `agg` selects the reduction ("sum", "min" or "max"; anything else raises
    ValueError). `scope.node_ids` supplies the node IDs, `grouping` is the
    granularity name handed to `CensusGranularity.of`, and `data` must unpack
    as a three-axis array (T, N, Q). Returns an array of shape
    (T, number of groups, Q) with the same dtype as `data`.
    """
    if agg == "sum":
        reduce_fn = np.sum
    elif agg == "min":
        reduce_fn = np.min
    elif agg == "max":
        reduce_fn = np.max
    else:
        raise ValueError(agg)

    node_ids = scope.node_ids

    group_keys = CensusGranularity.of(grouping).truncate_list(node_ids)
    T, _, Q = data.shape

    out = np.empty(shape=(T, len(group_keys), Q), dtype=data.dtype)
    for col, key in enumerate(group_keys):
        # A node belongs to the group when its ID starts with the group key.
        is_member = np.char.startswith(node_ids, key)
        # Boolean indexing extracts just the member nodes, so no mask is needed.
        members_only = data[:, is_member, :]
        out[:, col, :] = reduce_fn(members_only, axis=1)
    return out


# Time approach 5, then keep one result for the equality check at the end.
%timeit apply_5(agg_operation, scope, group_granularity, data)
res5 = apply_5(agg_operation, scope, group_granularity, data)
print(f"{res5.shape=}")

333 ms ± 568 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
res5.shape=(366, 187, 6)


In [7]:
from itertools import pairwise

# Check that we get the same answer with each approach.
# Comparing each result with its neighbor is enough: if every adjacent pair
# is equal then all five results are equal.
all(np.array_equal(a, b) for a, b in pairwise([res1, res2, res3, res4, res5]))

True

## Conclusion:

Numpy's masked operations are horrible if your primary concern is performance (14 s and 6.21 s for approaches 1 and 2). In this case, accomplishing the "mask" using boolean selection syntax and iterating over the groups (approach 5, 333 ms) is a much better option. Pandas (approach 3, 218 ms) is very close though -- slightly faster in this run -- and the comparison is quite sensitive to the number of groups in the result. The mechanism of grouping on an added column is very simple, so we might use the Pandas approach just to be consistent with time aggregates.

Note: I also tested using Polars for this as a curiosity. It was actually slower than the Pandas approach, though it's possible I don't know enough about how to optimize it. Either way, Polars is either not universally faster or not trivially so.