## Raw KoDa Data


In [1]:
from shared.parse import unzip_gtfs_archive
from koda.koda_fetch import fetch_gtfs_realtime_archive
from koda.koda_parse import get_rt_hour_dir_path, DATA_DIR
from koda.koda_transform import read_rt_hour_to_df
from shared.constants import OperatorsWithRT, FeedType
import os

operator = OperatorsWithRT.X_TRAFIK
feed_type = FeedType.TRIP_UPDATES
date = "2025-02-19"

rt_archive_path = fetch_gtfs_realtime_archive(operator, feed_type, date)
if rt_archive_path is None:
    raise ValueError(f"Failed to fetch realtime data for {operator.value} on {date}")
_ = unzip_gtfs_archive(rt_archive_path, DATA_DIR, remove_archive_after=False, force=True)

snapshots_path = get_rt_hour_dir_path(operator.value, feed_type.value, date, 12)
print(f"Snapshots path: {snapshots_path}")
os.listdir(snapshots_path)

File already exists.
Unzipping ./dev_data/koda_download/xt_rt_2025_02_19.7z
Snapshots path: ./dev_data/koda_data/xt_rt_2025_02_19\xt\TripUpdates\2025\02\19\12


['xt-tripupdates-2025-02-19T11-59-39Z.pb',
 'xt-tripupdates-2025-02-19T11-59-53Z.pb',
 'xt-tripupdates-2025-02-19T12-00-06Z.pb',
 'xt-tripupdates-2025-02-19T12-00-21Z.pb',
 'xt-tripupdates-2025-02-19T12-00-49Z.pb',
 'xt-tripupdates-2025-02-19T12-01-03Z.pb',
 'xt-tripupdates-2025-02-19T12-01-17Z.pb',
 'xt-tripupdates-2025-02-19T12-01-31Z.pb',
 'xt-tripupdates-2025-02-19T12-01-45Z.pb',
 'xt-tripupdates-2025-02-19T12-01-58Z.pb',
 'xt-tripupdates-2025-02-19T12-02-13Z.pb',
 'xt-tripupdates-2025-02-19T12-02-27Z.pb',
 'xt-tripupdates-2025-02-19T12-02-41Z.pb',
 'xt-tripupdates-2025-02-19T12-02-55Z.pb',
 'xt-tripupdates-2025-02-19T12-03-09Z.pb',
 'xt-tripupdates-2025-02-19T12-03-23Z.pb',
 'xt-tripupdates-2025-02-19T12-03-51Z.pb',
 'xt-tripupdates-2025-02-19T12-04-05Z.pb',
 'xt-tripupdates-2025-02-19T12-04-19Z.pb',
 'xt-tripupdates-2025-02-19T12-04-33Z.pb',
 'xt-tripupdates-2025-02-19T12-04-47Z.pb',
 'xt-tripupdates-2025-02-19T12-05-01Z.pb',
 'xt-tripupdates-2025-02-19T12-05-15Z.pb',
 'xt-tripup

In [2]:
# Parse, accumulate and deduplicate one folder (hour) of data
df, _ = read_rt_hour_to_df(operator, feed_type, date, 12)
df.head()

Unnamed: 0,index,id,trip_id,start_date,schedule_relationship,vehicle_id,timestamp,stop_sequence,stop_id,arrival_delay,arrival_time,departure_delay,departure_time,arrival_uncertainty,departure_uncertainty
0,121,217990501021798027,217990000038496545,20250219,SCHEDULED,9031021000444381,1739962816,55,9022021483392002,34,1739962918,34,1739962918,,
1,122,217990501021798027,217990000038496545,20250219,SCHEDULED,9031021000444381,1739962816,56,9022021483126002,38,1739962958,45,1739962965,,
2,123,217990501021798027,217990000038496545,20250219,SCHEDULED,9031021000444381,1739962816,57,9022021483184001,44,1739963070,56,1739963082,,
3,124,217990501021798027,217990000038496545,20250219,SCHEDULED,9031021000444381,1739962816,58,9022021483155001,49,1739963155,58,1739963164,,
4,125,217990501021798027,217990000038496545,20250219,SCHEDULED,9031021000444381,1739962816,59,9022021483173001,59,1739963218,70,1739963229,,


## Processed KoDa Data

In [8]:
import pandas as pd
from koda.koda_pipeline import get_koda_data_for_day

pd.options.mode.copy_on_write = True

# Parse all data for a day, fetch static data (routes, trips, stops)
trips_df, route_types_map_df, stop_count_df, stop_location_map_df = get_koda_data_for_day(date, operator)
trips_df.head()
# stop_location_map_df.head()

Old or missing feather version (-1) found for xt on 2025-02-19
Cleaning ./dev_data/koda_data/xt_rt_2025_02_19
Fetching realtime data for xt on 2025-02-19
File already exists.
Unzipping ./dev_data/koda_download/xt_rt_2025_02_19.7z
Removing ./dev_data/koda_download/xt_rt_2025_02_19.7z
Unzipped realtime data to ./dev_data/koda_data\xt_rt_2025_02_19


Reading xt TripUpdates 2025-02-19:  12%|█▎        | 3/24 [00:01<00:07,  2.74it/s]

No data found in ./dev_data/koda_data/xt_rt_2025_02_19\xt\TripUpdates\2025\02\19\01
No data found in ./dev_data/koda_data/xt_rt_2025_02_19\xt\TripUpdates\2025\02\19\02
No data found in ./dev_data/koda_data/xt_rt_2025_02_19\xt\TripUpdates\2025\02\19\03


Reading xt TripUpdates 2025-02-19: 100%|██████████| 24/24 [01:28<00:00,  3.71s/it]


Removing ./dev_data/koda_data/xt_rt_2025_02_19\xt
Fetching static data for xt on 2025-02-19
File is ready.
Unzipping ./dev_data/koda_download/xt_static_2025_02_19.7z
Removing ./dev_data/koda_download/xt_static_2025_02_19.7z
Unzipped static data to ./dev_data/koda_data\xt_static_2025_02_19
Fetching static data for xt on 2025-02-19
File is ready.
Unzipping ./dev_data/koda_download/xt_static_2025_02_19.7z
File already unzipped to ./dev_data/koda_data\xt_static_2025_02_19.
Removing ./dev_data/koda_download/xt_static_2025_02_19.7z
Unzipped static data to ./dev_data/koda_data\xt_static_2025_02_19
Fetching static data for xt on 2025-02-19 for stops
File is ready.
Unzipping ./dev_data/koda_download/xt_static_2025_02_19.7z
File already unzipped to ./dev_data/koda_data\xt_static_2025_02_19.
Removing ./dev_data/koda_download/xt_static_2025_02_19.7z
Unzipped static data to ./dev_data/koda_data\xt_static_2025_02_19
Removing ./dev_data/koda_data/xt_static_2025_02_19


Unnamed: 0,id,trip_id,start_date,schedule_relationship,vehicle_id,timestamp,stop_sequence,stop_id,arrival_delay,arrival_time,arrival_uncertainty,departure_delay,departure_time,departure_uncertainty
0,217990501017848631,217990000038467560,20250218,SCHEDULED,9031021000444507,1739919594,35,9022021484234001,25.0,1739919011,0.0,25,1739919011,0.0
1,217990501017848631,217990000038467560,20250218,SCHEDULED,9031021000444507,1739919594,36,9022021484235001,27.0,1739919032,0.0,27,1739919032,0.0
2,217990501017848631,217990000038467560,20250218,SCHEDULED,9031021000444507,1739919594,47,9022021461003002,-10.0,1739919621,,0,1739919631,
3,217990501025165705,217990000038611825,20250218,SCHEDULED,9031021000557719,1739919814,50,9022021480334019,-54.0,1739919863,,0,1739919917,
4,217990501025165705,217990000038611825,20250218,SCHEDULED,9031021000557719,1739919814,51,9022021480119007,-146.0,1739919934,,0,1739920080,


## Transit Delay Features (delay_fg)

In [9]:
from shared.features import build_feature_group

# Join with static data, calculate time-dependent metrics, accumulate values (max, min, var...), bin values by hour at the end
final_metrics = build_feature_group(trips_df, route_types_map_df, stop_count_df=stop_count_df)
final_metrics.head()

Unnamed: 0,route_type,arrival_time_bin,mean_delay_change_seconds,max_delay_change_seconds,min_delay_change_seconds,var_delay_change_seconds,mean_arrival_delay_seconds,max_arrival_delay_seconds,min_arrival_delay_seconds,var_arrival_delay,...,max_departure_delay_seconds,min_departure_delay_seconds,var_departure_delay,mean_on_time_percent,mean_final_stop_delay_seconds,mean_arrival_delay_seconds_lag_5stops,mean_departure_delay_seconds_lag_5stops,mean_delay_change_seconds_lag_5stops,stop_count,trip_update_count
0,100,2025-02-19 04:00:00,-14.127778,100.0,-102.0,7169.7,153.283333,680.0,-129.0,82598.102381,...,935.0,0.0,142340.697619,78.888889,53.322222,0.0,0.0,0.0,4.0,9
1,100,2025-02-19 05:00:00,-11.569841,100.0,-240.0,8297.49765,63.362411,775.0,-320.0,56861.257877,...,935.0,3.0,71108.52403,86.744505,72.477302,26.2,86.1,0.0,21.0,26
2,100,2025-02-19 06:00:00,0.170015,503.0,-241.0,19571.617641,-15.335942,687.0,-320.0,40741.953803,...,1090.0,-13.0,50131.871626,81.088222,53.340449,-24.955556,87.822222,-16.493333,55.0,30
3,100,2025-02-19 07:00:00,-1.380041,273.0,-257.0,24147.705165,-4.140569,192.0,-243.0,11451.866596,...,779.0,-13.0,37626.657592,97.455357,240.673941,-13.942857,120.371429,11.32,76.0,24
4,100,2025-02-19 08:00:00,31.473256,411.0,-281.0,36596.478152,63.184952,609.0,-208.0,45578.586554,...,1077.0,-19.0,57470.293615,79.703784,313.068051,16.05,145.075,18.542857,77.0,27
