# Testing KoDA fetching, parsing and transforming

## Initial setup

In [1]:
import koda.koda_fetch as kf
import koda.koda_parse as kp
import koda.koda_transform as kt

In [2]:
# Operator code, service day and realtime feed used throughout the notebook.
operator, date, feed_type = "xt", "2023-01-05", "TripUpdates"

# Fetch the static GTFS archive for that operator and day
# (the helper reports when the file already exists locally).
print(f"Fetching GTFS static data for {operator} on {date}")
static_zip_path = kf.fetch_gtfs_static_archive(operator, date)

# Fetch the GTFS realtime archive for the same day and the chosen feed type.
print(f"Fetching GTFS realtime data for {operator} on {date}")
rt_zip_path = kf.fetch_gtfs_realtime_archive(operator, feed_type, date)

Fetching GTFS static data for xt on 2023-01-05
File already exists.
Fetching GTFS realtime data for xt on 2023-01-05
File already exists.


In [3]:
# Hour of the day (kept as a string) whose realtime data is loaded below.
hour = "13"

# Static archive: GTFS txt files in a flat folder structure.
static_folder_path = kp.unzip_gtfs_archive(static_zip_path)
print(f"Unzipped static data to {static_folder_path}")

# Realtime archive: Protocol Buffer files in a nested folder structure.
rt_folder_path = kp.unzip_gtfs_archive(rt_zip_path)
print(f"Unzipped realtime data to {rt_folder_path}")

# NOTE(review): operator/date could presumably also be recovered with
# kp.get_rt_dir_info(rt_folder_path) instead of reusing the values set earlier.
df = kt.read_rt_hour_to_df(operator, feed_type, date, hour)
print(f"Read {len(df)} rows from realtime data")

# Persist the hourly frame as a zstd-compressed feather file at the path the
# transform module assigns to this operator/feed/date/hour combination.
rt_feather_path = kt.get_rt_feather_path(operator, feed_type, date, hour)
df.to_feather(rt_feather_path, compression="zstd", compression_level=9)
print(f"Saved realtime data to {rt_feather_path}")

# Last expression: render the hourly frame as the cell output.
df

Unzipping ./dev_data/koda_download/xt_static_2023_01_05.7z
File already unzipped.
Unzipped static data to ./dev_data/koda_data\xt_static_2023_01_05
Unzipping ./dev_data/koda_download/xt_rt_2023_01_05.7z
File already unzipped.
Unzipped realtime data to ./dev_data/koda_data\xt_rt_2023_01_05
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T13.feather
Read 54402 rows from realtime data
Saved realtime data to ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T13.feather


Unnamed: 0,index,id,trip_id,start_date,schedule_relationship,timestamp,vehicle_id,stop_sequence,stop_id,arrival_delay,arrival_time,departure_delay,departure_time,arrival_uncertainty,departure_uncertainty
0,7,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,72,9022021483109001,456,1672919388,475,1672919407,0.0,0.0
1,17,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,82,9022021421038002,496,1672919959,507,1672919970,0.0,
2,30,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,95,9022021421073002,394,1672920600,395,1672920601,,
3,31,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,96,9022021421042002,391,1672920626,391,1672920626,,
4,32,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,97,9022021421047002,388,1672920649,418,1672920679,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54397,3310,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,46,9022021480077001,117,1672926750,122,1672926755,,
54398,3311,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,47,9022021480064001,88,1672926782,90,1672926784,,
54399,3312,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,48,9022021480062001,64,1672926808,69,1672926813,,
54400,3313,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,49,9022021480029001,11,1672926865,33,1672926887,,


In [4]:
# Load the realtime feed for the whole day into a DataFrame.
# Warning: This may take some time (the recorded run steps through 24 hourly chunks).
# NOTE: Some hours appear to have no realtime data (e.g. 2023-01-05-02 and -03);
# for those the recorded output logs "No data found in <folder>".
# NOTE(review): this rebinds `df`, replacing the single-hour frame loaded earlier.
df = kt.read_rt_day_to_df(operator, feed_type, date)
df  # last expression: render the day-level frame as the cell output

Reading xt TripUpdates 2023-01-05:   0%|          | 0/24 [00:00<?, ?it/s]

Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T0.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T1.feather
Reading 257 files with 14 processes


Reading xt TripUpdates 2023-01-05:  12%|█▎        | 3/24 [00:01<00:08,  2.36it/s]

Read 257 files
No data found in ./dev_data/koda_data/xt_rt_2023_01_05\xt\TripUpdates\2023\01\05\02
Reading 257 files with 14 processes


Reading xt TripUpdates 2023-01-05:  46%|████▌     | 11/24 [00:02<00:02,  5.79it/s]

Read 257 files
No data found in ./dev_data/koda_data/xt_rt_2023_01_05\xt\TripUpdates\2023\01\05\03
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T4.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T5.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T6.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T7.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T8.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T9.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T10.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T11.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T12.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T13.feather
Reading from ./dev_data/kod

Reading xt TripUpdates 2023-01-05: 100%|██████████| 24/24 [00:02<00:00,  8.53it/s]

Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T16.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T17.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T18.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T19.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T20.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T21.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T22.feather
Reading from ./dev_data/koda_data/xt_rt_2023_01_05/xt-tripupdates-2023-01-05T23.feather





Unnamed: 0,id,trip_id,start_date,schedule_relationship,timestamp,vehicle_id,stop_sequence,stop_id,arrival_delay,arrival_time,arrival_uncertainty,departure_delay,departure_time,departure_uncertainty
0,217990500811380121,217990000029636565,20230104,SCHEDULED,1672873041,9031021000444062,34,9022021484232001,21.0,1672872632,0.0,21,1672872632,0.0
1,217990500831603630,217990000029915771,20230104,SCHEDULED,1672873193,9031021001241753,32,9022021480209002,576.0,1672872652,0.0,576,1672872652,0.0
2,217990500831603630,217990000029915771,20230104,SCHEDULED,1672873193,9031021001241753,40,9022021480227002,516.0,1672873215,,516,1672873215,
3,217990500831603630,217990000029915771,20230104,SCHEDULED,1672873193,9031021001241753,41,9022021480219002,519.0,1672873247,,520,1672873248,
4,217990500831603630,217990000029915771,20230104,SCHEDULED,1672873193,9031021001241753,51,9022021480119007,454.0,1672874134,,474,1672874154,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12044,217990500828590669,217990000029047775,20230106,SCHEDULED,1672959371,9031021001271556,43,9022021480359001,35.0,1672962602,,39,1672962606,
12045,217990500828590669,217990000029047775,20230106,SCHEDULED,1672959371,9031021001271556,44,9022021480467001,28.0,1672962651,,34,1672962657,
12046,217990500828590669,217990000029047775,20230106,SCHEDULED,1672959371,9031021001271556,45,9022021480427001,25.0,1672962682,,28,1672962685,
12047,217990500828590669,217990000029047775,20230106,SCHEDULED,1672959371,9031021001271556,46,9022021480242001,25.0,1672962739,,32,1672962746,


## Tests based on pre-processed feather files

In [5]:
import koda.koda_transform as kt
import pandas as pd

# Settings identifying the previously saved hourly feather file.
operator, date, feed_type, hour = "xt", "2023-01-05", "TripUpdates", "13"

# Resolve where that file lives, then load it back into a DataFrame.
rt_feather_path = kt.get_rt_feather_path(operator, feed_type, date, hour)
df = pd.read_feather(rt_feather_path)

In [6]:
# Report the size of the hourly frame before and after de-duplication.
print(f"Read {len(df)} rows from realtime data")
# NOTE: drop_tripupdates_duplicates modifies `df` in place (its return value is
# not used here), so re-running this cell alone starts from the already reduced
# frame; re-run the loading cell first to get the full row count again.
# TODO: Check if duplicates are useful for our task
kt.drop_tripupdates_duplicates(df)
print(f"Removed duplicates and now have {len(df)} rows")

# Columns needed for the delay analysis; all other columns are left out.
columns_to_keep = [
    'trip_id', 'start_date', 'schedule_relationship', 'timestamp',
    'vehicle_id', 'stop_sequence', 'stop_id', 'arrival_delay',
    'arrival_time', 'departure_delay', 'departure_time'
]

# Narrow the de-duplicated frame to those columns.
df_filtered = df[columns_to_keep]

# Mean arrival and departure delay per trip, one row per trip_id.
# The aggregation now runs on df_filtered (previously that frame was built but
# never used); the result is unchanged because both delay columns and trip_id
# are part of columns_to_keep.
average_delays = (
    df_filtered
    .groupby('trip_id')[['arrival_delay', 'departure_delay']]
    .mean()
    .reset_index()
)

# Last expression: render the per-trip averages as the cell output.
average_delays

Read 54402 rows from realtime data
Removed duplicates and now have 8615 rows


Unnamed: 0,trip_id,arrival_delay,departure_delay
0,217990000023580585,-18.185185,27.814815
1,217990000023581154,0.307692,6.269231
2,217990000023633371,-221.214286,-35.142857
3,217990000025164429,61.037037,77.333333
4,217990000025588572,50.709677,78.451613
...,...,...,...
285,217990000029906029,131.097561,144.682927
286,217990000029906077,122.219512,140.634146
287,217990000029916217,210.448276,218.413793
288,217990000029916276,164.423077,176.865385


## Merging with static data
`pykoda` has a `get_data_range` function which shows how to merge the realtime GTFS TripUpdates with the static GTFS data.
It is still an open question whether the prediction task needs any of the static data.