In [32]:
import pickle
import pandas as pd
from datetime import datetime
from collections import deque

In [7]:
# Load noyes data from pickle file
noyes_pickle_file = 'sample_data/cta_train_noyes.pkl'
with open(noyes_pickle_file, 'rb') as f:
    noyes_data = pickle.load(f)
print("Loaded noyes data from pickle file.")

Loaded noyes data from pickle file.


In [55]:
seen_live = set()   # (pred_arr, direction) to avoid rematching same live ETA
sched_queue = {
    "Howard": deque(),
    "Linden": deque()
}
matches = []

In [56]:
for etas, snap_ts in noyes_data:
    for eta in etas:
        direction = "Howard" if "Howard" in eta["stop_description"] else (
                    "Linden" if "Linden" in eta["stop_description"] else None)
        if direction is None:
            print(f"Skipping ETA with unknown direction: {eta['stop_description']}")
            continue

        arrival_time = eta["arrival_time"]
        if not arrival_time:
            print(f"Skipping ETA with no arrival time: {eta}")
            continue

        # (A) scheduled rows go into stack
        if eta["is_scheduled"]:
            sched_queue[direction].append({
                "scheduled_arr": arrival_time,
                "first_seen": snap_ts,
                "direction": direction,
                "run_number": eta["run_number"],
            })

        # (B) live ETAs try to pair with stack top (if not seen before)
        elif not eta["is_scheduled"]:
            key = (arrival_time, direction)
            if key in seen_live:
                continue
            seen_live.add(key)

            if sched_queue[direction]:
                # Match live ETA with the top of the scheduled stack
                sched = sched_queue[direction].popleft()
                matches.append({
                    "direction": direction,
                    "scheduled_arr": sched["scheduled_arr"],
                    "actual_arr": arrival_time,
                    "first_seen": sched["first_seen"],
                })

In [57]:
df_match = pd.DataFrame(matches)

In [58]:
df_match["error_min"] = (df_match["actual_arr"] - df_match["scheduled_arr"]).dt.total_seconds() / 60.0

In [59]:
print(df_match[["direction", "scheduled_arr", "actual_arr", "error_min"]].head())

  direction       scheduled_arr          actual_arr   error_min
0    Linden 2025-04-01 00:24:04 2025-04-01 06:43:55  379.850000
1    Linden 2025-04-01 00:39:04 2025-04-01 07:03:57  384.883333
2    Linden 2025-04-01 00:23:44 2025-04-01 07:17:14  413.500000
3    Linden 2025-04-01 00:38:44 2025-04-01 07:29:49  411.083333
4    Linden 2025-04-01 00:28:24 2025-04-01 06:39:28  371.066667


In [49]:
print(f"\nMean absolute error: {df_match['error_min'].abs().mean():.2f} min")


Mean absolute error: 2534.43 min


In [50]:
df_match.to_csv("sample_data/cta_train_noyes_matches.csv", index=False)