# Merge/align the behavioural annotation logs with the accelerometer (ACC) logs

In [None]:
import pandas as pd
import numpy as np


# Input: accelerometer (ACC) CSV; it is read in chunks further down in this cell.
ACC_PATH = "../src/test_data/AlgorithmData-Acceleration-[20251001-20251031]-part2.csv"
# Input: behaviour annotation CSV (one row per annotated bout).
ANN_PATH = "../src/test_data/11-44-01_2a_annotations.csv"
# Output: ACC rows inside the annotation window, plus a "behavior_label" column.
OUT_PATH = "../src/test_data/acc_labeled_testwindow.csv"

# Name of the timestamp column in the ACC file.
ACC_TIME_COL = "Collecting time"
# Names of the bout start/end timestamp columns and the behaviour column
# in the annotation file.
ANN_START_COL = "start_datetime"
ANN_END_COL = "end_datetime"
ANN_BEHAVIOR_COL = "behavior"

def to_utc(dt_series):
    """Parse *dt_series* into timezone-aware UTC timestamps.

    Values that cannot be parsed become ``NaT`` instead of raising, so callers
    are expected to drop or otherwise handle missing timestamps afterwards.
    """
    parsed = pd.to_datetime(dt_series, utc=True, errors="coerce")
    return parsed

def _epoch_ns(utc_series):
    """Return tz-aware UTC timestamps as int64 nanoseconds since the Unix epoch.

    The values are converted to ``datetime64[ns]`` explicitly before the
    integer cast, so the result is in nanoseconds whatever resolution pandas
    chose when parsing the column. The input must not contain ``NaT``.
    """
    naive_utc = utc_series.dt.tz_convert(None).to_numpy()
    return naive_utc.astype("datetime64[ns]").astype("int64")


# First load the annotations, parse the bout bounds as tz-aware UTC
# timestamps, and drop bouts with an unparsable bound or a missing behaviour.
ann = pd.read_csv(ANN_PATH)
ann[ANN_START_COL] = to_utc(ann[ANN_START_COL])
ann[ANN_END_COL] = to_utc(ann[ANN_END_COL])
ann = (
    ann.dropna(subset=[ANN_START_COL, ANN_END_COL, ANN_BEHAVIOR_COL])
    .sort_values(ANN_START_COL)
    .reset_index(drop=True)
)

print("Loaded annotations:", len(ann))

# With no usable bout the window below would be NaT, every accelerometer row
# would be filtered out, and the run would look successful. Fail loudly.
if ann.empty:
    raise ValueError(f"No usable annotation rows found in '{ANN_PATH}'")

window_start = ann[ANN_START_COL].min()
window_end = ann[ANN_END_COL].max()
print("Annotation window:", window_start, "to", window_end)

# Precompute bout bounds as int64 ns for fast, tz-safe comparisons
ann_start_ns = _epoch_ns(ann[ANN_START_COL])
ann_end_ns = _epoch_ns(ann[ANN_END_COL])
ann_beh = ann[ANN_BEHAVIOR_COL].astype(str).to_numpy()

# Process the accelerometer file in chunks to keep memory bounded, labelling
# each chunk from the annotation bouts and appending it to the output file.
chunksize = 250000
first = True
total_rows = 0

for chunk in pd.read_csv(ACC_PATH, chunksize=chunksize):
    if ACC_TIME_COL not in chunk.columns:
        raise ValueError(f"Column '{ACC_TIME_COL}' not found in accelerometer file. Found: {list(chunk.columns)}")

    chunk[ACC_TIME_COL] = to_utc(chunk[ACC_TIME_COL])
    chunk = chunk.dropna(subset=[ACC_TIME_COL])

    # Restrict to the overall annotation time window (both ends inclusive)
    in_window = (chunk[ACC_TIME_COL] >= window_start) & (chunk[ACC_TIME_COL] <= window_end)
    chunk = chunk[in_window]
    if chunk.empty:
        continue

    t_ns = _epoch_ns(chunk[ACC_TIME_COL])

    # Rows not covered by any bout keep the literal string "None".
    labels = np.full(len(chunk), "None", dtype=object)

    # Bouts are applied in start order with inclusive bounds, so where two
    # bouts overlap (or touch) the later-starting one wins.
    for start_ns, end_ns, behavior in zip(ann_start_ns, ann_end_ns, ann_beh):
        labels[(t_ns >= start_ns) & (t_ns <= end_ns)] = behavior

    # assign() builds a new frame rather than writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    chunk = chunk.assign(behavior_label=labels)

    chunk.to_csv(OUT_PATH, index=False, mode="w" if first else "a", header=first)
    first = False
    total_rows += len(chunk)

print("Done.")
print("Rows written:", total_rows)
if total_rows:
    print("Output saved to:", OUT_PATH)
else:
    # Nothing was written, so any file already at OUT_PATH is from an earlier run.
    print("No accelerometer rows fell inside the annotation window; nothing was written to:", OUT_PATH)


Loaded annotations: 9
Annotation window: 2025-10-11 11:44:01.133333+00:00 to 2025-10-11 11:45:48.666667+00:00
Done.
Rows written: 10813
Output saved to: ../src/test_data/acc_labeled_testwindow.csv


In [None]:
# Check the temporal resolution of the annotation file so it can be matched in the ACC file before merging

import pandas as pd

ANN_PATH = "../src/test_data/11-44-01_2a_annotations.csv"
ann = pd.read_csv(ANN_PATH)

# Never truncate columns when printing, so the exact start/end values are visible.
pd.set_option("display.max_columns", None)

# Keep only the "sitting" bouts; the comparison ignores case.
sitting = ann.loc[ann["behavior"].astype(str).str.lower().eq("sitting")]

# Print behaviour and bout bounds as a plain table without the index column.
print(sitting.loc[:, ["behavior", "start_datetime", "end_datetime"]].to_string(index=False))


behavior             start_datetime               end_datetime
 Sitting 2025-10-11 11:44:01.133333 2025-10-11 11:44:06.733333
 Sitting 2025-10-11 11:44:09.400000 2025-10-11 11:44:54.266667
 Sitting 2025-10-11 11:45:29.733333 2025-10-11 11:45:30.200000
 Sitting 2025-10-11 11:45:34.066667 2025-10-11 11:45:37.200000
