In [1]:
import koda.koda_fetch as kf
import koda.koda_parse as kp
import koda.koda_transform as kt

In [2]:
# Download GTFS data
operator = "xt"
date = "2023-01-05"
feed_type = "TripUpdates"
print(f"Fetching GTFS static data for {operator} on {date}")
static_zip_path = kf.fetch_gtfs_static_archive(operator, date)
print(f"Fetching GTFS realtime data for {operator} on {date}")
rt_zip_path = kf.fetch_gtfs_realtime_archive(operator, feed_type, date)

Fetching GTFS static data for xt on 2023-01-05
File already exists.
Fetching GTFS realtime data for xt on 2023-01-05
File already exists.


In [3]:
hour = "13"

# NOTE: The static data is in GTFS txt files in a flat folder structure
static_folder_path = kp.unzip_gtfs_archive(static_zip_path)
print(f"Unzipped static data to {static_folder_path}")

# NOTE: The realtime data is in Protocol Buffer files in a nested folder structure
rt_folder_path = kp.unzip_gtfs_archive(rt_zip_path)
print(f"Unzipped realtime data to {rt_folder_path}")

# operator, date = kp.get_rt_dir_info(rt_folder_path)
df = kt.read_rt_hour_to_df(operator, feed_type, date, hour)
print(f"Read {len(df)} rows from realtime data")

rt_feather_path = kt.get_rt_feather_path(rt_folder_path, operator, feed_type, date, hour)
df.to_feather(rt_feather_path, compression='zstd', compression_level=9)
print(f"Saved realtime data to {rt_feather_path}")
df

Unzipping ./dev_data/koda_download/xt_static_2023_01_05.7z
File already unzipped.
Unzipped static data to ./dev_data/koda_data\xt_static_2023_01_05
Unzipping ./dev_data/koda_download/xt_rt_2023_01_05.7z
File already unzipped.
Unzipped realtime data to ./dev_data/koda_data\xt_rt_2023_01_05
Reading 258 files with 14 processes
Read 54402 rows from realtime data
Saved realtime data to ./dev_data/koda_data\xt_rt_2023_01_05/xt-tripupdates-2023-01-05T13.feather


Unnamed: 0,index,id,trip_id,start_date,schedule_relationship,timestamp,vehicle_id,stop_sequence,stop_id,arrival_delay,arrival_time,departure_delay,departure_time,arrival_uncertainty,departure_uncertainty
0,7,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,72,9022021483109001,456,1672919388,475,1672919407,0.0,0.0
1,17,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,82,9022021421038002,496,1672919959,507,1672919970,0.0,
2,30,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,95,9022021421073002,394,1672920600,395,1672920601,,
3,31,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,96,9022021421042002,391,1672920626,391,1672920626,,
4,32,217990500824825801,217990000029118861,20230105,SCHEDULED,1672919979,9031021000444433,97,9022021421047002,388,1672920649,418,1672920679,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54397,3310,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,46,9022021480077001,117,1672926750,122,1672926755,,
54398,3311,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,47,9022021480064001,88,1672926782,90,1672926784,,
54399,3312,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,48,9022021480062001,64,1672926808,69,1672926813,,
54400,3313,217990500806555531,217990000029012757,20230105,SCHEDULED,1672923554,9031021001271564,49,9022021480029001,11,1672926865,33,1672926887,,


In [4]:
import pandas as pd

# Read the saved feather file
operator = "xt"
date = "2023-01-05"
feed_type = "TripUpdates"
hour = "13"
rt_feather_path = kt.get_rt_feather_path(operator, feed_type, date, hour)
df = pd.read_feather(rt_feather_path)
# Specify the columns to keep
columns_to_keep = [
    'trip_id', 'start_date', 'schedule_relationship', 'timestamp',
    'vehicle_id', 'stop_sequence', 'stop_id', 'arrival_delay',
    'arrival_time', 'departure_delay', 'departure_time'
]

# Extract the specified columns into a new DataFrame
df_filtered = df[columns_to_keep]
removed = df.drop_duplicates(inplace=True)
print(f"Removed {removed} duplicates")

# Group by trip_id and calculate the mean of arrival_delay and departure_delay
average_delays = df.groupby('trip_id')[['arrival_delay', 'departure_delay']].mean().reset_index()


average_delays

Removed None duplicates


Unnamed: 0,trip_id,arrival_delay,departure_delay
0,217990000023580585,-30.608696,19.760870
1,217990000023581154,15.879699,49.924812
2,217990000023633371,-221.214286,-35.142857
3,217990000025164429,77.046729,91.084112
4,217990000025588572,80.489362,97.840426
...,...,...,...
285,217990000029906029,107.625954,137.977099
286,217990000029906077,122.219512,140.634146
287,217990000029916217,204.698630,214.595890
288,217990000029916276,146.152574,154.628676
