In [3]:
from pathlib import Path
from typing import Tuple, List
import pandas as pd

def read_timestamped_simplices(
    nverts_path: str | Path,
    simplices_path: str | Path,
    times_path: str | Path,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read a timestamped simplex dataset (three integer-per-line files) and
    return:
      - simplices_df: one row per simplex with columns [sid, time, size, nodes]
      - incidence_df: one row per (sid, node) with columns [sid, node, time, size]
    """
    nverts_path     = Path(nverts_path)
    simplices_path  = Path(simplices_path)
    times_path      = Path(times_path)

    # --- Load as integer Series (robust to trailing spaces/newlines) ---
    nverts = pd.read_csv(nverts_path, header=None, names=["size"], dtype="Int64").iloc[:, 0]
    times  = pd.read_csv(times_path,  header=None, names=["time"], dtype="Int64").iloc[:, 0]
    nodes  = pd.read_csv(simplices_path, header=None, names=["node"], dtype="Int64").iloc[:, 0]

    # Basic integrity checks
    if nverts.isna().any() or times.isna().any() or nodes.isna().any():
        raise ValueError("Non-integer or missing values detected in one of the files.")
    if len(nverts) != len(times):
        raise ValueError(f"Length mismatch: nverts={len(nverts)} vs times={len(times)}")
    expected_nodes = int(nverts.sum())
    if len(nodes) != expected_nodes:
        raise ValueError(f"Nodes vector length ({len(nodes)}) != sum(nverts) ({expected_nodes}).")

    # --- Build simplex ids and slice nodes per simplex efficiently ---
    offsets = nverts.cumsum().shift(fill_value=0).astype(int)
    ends    = nverts.cumsum().astype(int)
    sid     = pd.RangeIndex(start=0, stop=len(nverts), step=1, name="sid")

    # Collect nodes per simplex
    nodes_per_simplex: List[List[int]] = [
        nodes.iloc[offsets[i]:ends[i]].astype(int).tolist() for i in sid
    ]

    simplices_df = pd.DataFrame({
        "sid": sid,
        "time": times.astype(int).values,
        "size": nverts.astype(int).values,
        "nodes": nodes_per_simplex,
    })

    incidence_df = (
        simplices_df[["sid", "time", "size", "nodes"]]
        .explode("nodes", ignore_index=True)
        .rename(columns={"nodes": "node"})
        .astype({"sid": int, "time": int, "size": int, "node": int})
        .sort_values(["sid"])
        .reset_index(drop=True)
    )

    return simplices_df, incidence_df


In [22]:

if __name__ == "__main__":
    # Load the NDC-substances dataset; point the three paths at another
    # dataset's nverts / simplices / times files to analyse it instead.
    simplices_df, incidence_df = read_timestamped_simplices(
        nverts_path="NDC-substances/NDC-substances-nverts.txt",
        simplices_path="NDC-substances/NDC-substances-simplices.txt",
        times_path="NDC-substances/NDC-substances-times.txt",
    )

    # Quick look at both tables.
    print(simplices_df.head())   # simplex table: one row per simplex
    print(incidence_df.head())   # incidence table: one row per (sid, node)

    # To persist the tables, uncomment:
    # simplices_df.to_csv("simplices_table.csv", index=False)
    # incidence_df.to_csv("incidence_table.csv", index=False)


   sid            time  size nodes
0    0  63474192000000     1   [1]
1    1  61521120000000     1   [2]
2    2  63551001600000     1   [3]
3    3  63551001600000     1   [3]
4    4  63594288000000     1   [4]
   sid            time  size  node
0    0  63474192000000     1     1
1    1  61521120000000     1     2
2    2  63551001600000     1     3
3    3  63551001600000     1     3
4    4  63594288000000     1     4


In [30]:
# Derive calendar columns (`datetime`, `date`, `day_index`) from the raw
# integer `time` column of both tables.
#
# The `datetime` assignment must run before `date`: `date` is read from
# `frame["datetime"]`, so without it this cell raises KeyError on a fresh
# kernel.
#
# NOTE(review): the unit of `time` is not established in this notebook.
# `datetime` uses the scaling that reproduces the saved outputs
# (time / 86_400 read as seconds since the Unix epoch), while `day_index`
# divides by the number of microseconds in a day. These two conventions do
# not describe the same calendar day — confirm the intended unit against the
# dataset documentation before relying on either column.
SECONDS_PER_DAY = 24 * 60 * 60
US_PER_DAY = 86_400_000_000  # microseconds per day

for frame in (simplices_df, incidence_df):
    frame["datetime"] = pd.to_datetime(frame["time"] / SECONDS_PER_DAY, unit="s")
    # Optional: date only
    frame["date"] = frame["datetime"].dt.date
    frame["day_index"] = (frame["time"] // US_PER_DAY).astype("int64")


In [31]:
# Preview the first rows of the simplex table, including the derived
# datetime / date / day_index columns.
simplices_df.head()

Unnamed: 0,sid,time,size,nodes,datetime,date,day_index
0,0,63474192000000,1,[1],1993-04-12 22:50:00,1993-04-12,734
1,1,61521120000000,1,[2],1992-07-25 07:40:00,1992-07-25,712
2,2,63551001600000,1,[3],1993-04-23 05:46:40,1993-04-23,735
3,3,63551001600000,1,[3],1993-04-23 05:46:40,1993-04-23,735
4,4,63594288000000,1,[4],1993-04-29 00:56:40,1993-04-29,736


In [32]:
# Preview the first rows of the (sid, node) incidence table, including the
# derived datetime / date / day_index columns.
incidence_df.head()

Unnamed: 0,sid,time,size,node,datetime,date,day_index
0,0,63474192000000,1,1,1993-04-12 22:50:00,1993-04-12,734
1,1,61521120000000,1,2,1992-07-25 07:40:00,1992-07-25,712
2,2,63551001600000,1,3,1993-04-23 05:46:40,1993-04-23,735
3,3,63551001600000,1,3,1993-04-23 05:46:40,1993-04-23,735
4,4,63594288000000,1,4,1993-04-29 00:56:40,1993-04-29,736
