** This file is the initial code with the data analysis and formatting

In [21]:
import requests
from dotenv import load_dotenv
import json
import os
import pandas as pd
import pprint
import zipfile
import io
import pandas as pd
import requests
import pytz

In [4]:
base_url = "https://nextrip-public-api.azure-api.net/octranspo/"
endpoint = "gtfs-rt-tp/beta/v1/TripUpdates?format=json"
url = f"{base_url}{endpoint}"

load_dotenv()

header = {
        "Ocp-Apim-Subscription-Key": os.getenv('api_key')
    }

In [5]:
try:
    response = requests.get(url, headers = header)
    response.raise_for_status()
    trip_updates_data = response.json()
    #print(json.dumps(trip_updates_data,indent=2))

except requests.exceptions.RequestException as e:
    print(f"Error making API reqest: {e}")
except Exception as e:
    print(f"An unexpected error occured: {e}")

In [7]:
parsed_updates = []

if trip_updates_data:
    for entity in trip_updates_data.get('Entity', []):
        if "TripUpdate" in entity and entity["TripUpdate"] is not None:
            trip_update = entity["TripUpdate"]
            
            trip_id = None
            route_id = None
            direction_id = None
            trip_info = trip_update.get("Trip")
            if trip_info:
                trip_id = trip_info.get("TripId")
                route_id = trip_info.get("RouteId")
                direction_id = trip_info.get("DirectionId")
            vehicle_id = None
            vehicle_info = trip_update.get("Vehicle")
            if vehicle_info:
                vehicle_id = vehicle_info.get("Id")

            for stop_time_update in trip_update.get("StopTimeUpdate",[]):
                stop_seq_no = stop_time_update.get("StopSequence")
                stop_id = stop_time_update.get("StopId")
                schedule_relationship = stop_time_update.get("ScheduleRelationship")
                arrival_data = stop_time_update.get("Arrival")
                if arrival_data is None:
                    arrival_data = {}
                arrival_timestamp = arrival_data.get("Time")
                departure_data = stop_time_update.get("Departure")
                if departure_data is None:
                    departure_data = {}
                departure_timestamp = departure_data.get("Time")

                parsed_updates.append({
                        'trip_id': trip_id,
                        'route_id': route_id,
                        'direction_id': direction_id,
                        'vehicle_id': vehicle_id,
                        'stop_id': stop_id,
                        'stop_sequence_no': stop_seq_no,
                        'schedule_relationship': schedule_relationship,
                        'arrival_timestamp': arrival_timestamp,
                        'departure_timestamp': departure_timestamp
                    })
                
    trip_updates_df = pd.DataFrame(parsed_updates)

    # pd.set_option('display.max_rows', None)        # Show all rows
    # pd.set_option('display.max_columns', None)     # Show all columns
    # pd.set_option('display.max_colwidth', None)    # Show full content of cells (useful for long strings)
    # pd.set_option('display.width', None)           # Set display width to allow more columns to be shown on one line

    # pd.reset_option('display.max_rows')
    # pd.reset_option('display.max_columns')
    # pd.reset_option('display.max_colwidth')
    # pd.reset_option('display.width')

    print("\n--- Complete DataFrame display ---")
    print(trip_updates_df.head())


--- Complete DataFrame display ---
    trip_id route_id  direction_id vehicle_id stop_id  stop_sequence_no  \
0  13110050       75             0       4732    4848                 6   
1  13110050       75             0       4732    4853                 7   
2  13110050       75             0       4732    3462                 8   
3  13110050       75             0       4732    2380                 9   
4  13110050       75             0       4732    2381                10   

   schedule_relationship  arrival_timestamp  departure_timestamp  
0                      0       1.753414e+09                  NaN  
1                      0       1.753415e+09                  NaN  
2                      0       1.753415e+09                  NaN  
3                      0       1.753415e+09                  NaN  
4                      0       1.753415e+09                  NaN  


In [20]:
print(trip_updates_df.head())
print(trip_updates_df.describe())
print(trip_updates_df.dtypes)

    trip_id route_id  direction_id vehicle_id stop_id  stop_sequence_no  \
0  13110050       75             0       4732    4848                 6   
1  13110050       75             0       4732    4853                 7   
2  13110050       75             0       4732    3462                 8   
3  13110050       75             0       4732    2380                 9   
4  13110050       75             0       4732    2381                10   

   schedule_relationship  arrival_timestamp  departure_timestamp  
0                      0       1.753414e+09                  NaN  
1                      0       1.753415e+09                  NaN  
2                      0       1.753415e+09                  NaN  
3                      0       1.753415e+09                  NaN  
4                      0       1.753415e+09                  NaN  
       direction_id  stop_sequence_no  schedule_relationship  \
count        4461.0       4461.000000                 4461.0   
mean            0.0

In [23]:
trip_updates_df['arrival_datetime_utc'] = pd.to_datetime(trip_updates_df['arrival_timestamp'], unit='s', errors='coerce', utc=True)
trip_updates_df['departure_datetime_utc'] = pd.to_datetime(trip_updates_df['departure_timestamp'], unit='s', errors='coerce', utc=True)

print(trip_updates_df[['arrival_timestamp', 'arrival_datetime_utc', 'departure_timestamp', 'departure_datetime_utc']].head())

print(trip_updates_df.dtypes)

   arrival_timestamp      arrival_datetime_utc  departure_timestamp  \
0       1.753414e+09 2025-07-25 03:34:31+00:00                  NaN   
1       1.753415e+09 2025-07-25 03:35:02+00:00                  NaN   
2       1.753415e+09 2025-07-25 03:35:37+00:00                  NaN   
3       1.753415e+09 2025-07-25 03:36:06+00:00                  NaN   
4       1.753415e+09 2025-07-25 03:37:18+00:00                  NaN   

  departure_datetime_utc  
0                    NaT  
1                    NaT  
2                    NaT  
3                    NaT  
4                    NaT  
trip_id                                object
route_id                               object
direction_id                            int64
vehicle_id                             object
stop_id                                object
stop_sequence_no                        int64
schedule_relationship                   int64
arrival_timestamp                     float64
departure_timestamp                   float

In [24]:
ottawa_tz = pytz.timezone('America/Toronto')
trip_updates_df['arrival_datetime_local'] = trip_updates_df['arrival_datetime_utc'].dt.tz_convert(ottawa_tz)
trip_updates_df['departure_datetime_local'] = trip_updates_df['departure_datetime_utc'].dt.tz_convert(ottawa_tz)
print(trip_updates_df[['arrival_datetime_utc', 'arrival_datetime_local', 'departure_datetime_utc', 'departure_datetime_local']].head())
print(trip_updates_df.dtypes)

       arrival_datetime_utc    arrival_datetime_local departure_datetime_utc  \
0 2025-07-25 03:34:31+00:00 2025-07-24 23:34:31-04:00                    NaT   
1 2025-07-25 03:35:02+00:00 2025-07-24 23:35:02-04:00                    NaT   
2 2025-07-25 03:35:37+00:00 2025-07-24 23:35:37-04:00                    NaT   
3 2025-07-25 03:36:06+00:00 2025-07-24 23:36:06-04:00                    NaT   
4 2025-07-25 03:37:18+00:00 2025-07-24 23:37:18-04:00                    NaT   

  departure_datetime_local  
0                      NaT  
1                      NaT  
2                      NaT  
3                      NaT  
4                      NaT  
trip_id                                              object
route_id                                             object
direction_id                                          int64
vehicle_id                                           object
stop_id                                              object
stop_sequence_no                             

In [10]:
dataframes = {}
def load_static_files_to_df(url):
    try:
        response = requests.get(url, stream = True)
        response.raise_for_status()

        print("Opening zip file in memory")
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            filenames = z.namelist()
            print("Files in the zip folder are: ", filenames)

            for file in filenames:
                if file.endswith('.txt'):
                    try:
                        with z.open(file) as f:
                            df = pd.read_csv(f)
                            dataframes[file.replace('.txt', '')] = df
                            print(f"Successfully loaded {file} with {len(df)} rows.")
                    except pd.errors.EmptyDataError:
                        print("Warning: file is empty and could not be loaded")
                    except Exception as e:
                        print("Error loading file")
                else:
                    print(f"skiping non .txt file {file}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the zip file: {e}")
    except zipfile.BadZipFile as e:
        print(f"Error: The downloaded file is not a valid zip file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return dataframes

static_files_url = "https://oct-gtfs-emasagcnfmcgeham.z01.azurefd.net/public-access/GTFSExport.zip"

all_files_df = load_static_files_to_df(static_files_url)

if all_files_df:
    print("\n--- Summary of Loaded DataFrames ---")
    for name, df in all_files_df.items():
        print(f"\nDataFrame for {name}:")
        print(df.head()) # Display the first few rows of each DataFrame
        print(f"Shape: {df.shape}")
else:
    print("\nNo DataFrames were loaded or an error occurred.")

Opening zip file in memory
Files in the zip folder are:  ['agency.txt', 'stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt', 'calendar.txt', 'calendar_dates.txt', 'shapes.txt', 'feed_info.txt']
Successfully loaded agency.txt with 1 rows.
Successfully loaded stops.txt with 5566 rows.
Successfully loaded routes.txt with 125 rows.
Successfully loaded trips.txt with 79811 rows.


  df = pd.read_csv(f)
  df = pd.read_csv(f)


Successfully loaded stop_times.txt with 3172835 rows.
Successfully loaded calendar.txt with 20 rows.
Successfully loaded calendar_dates.txt with 50 rows.
Successfully loaded shapes.txt with 689200 rows.
Successfully loaded feed_info.txt with 1 rows.

--- Summary of Loaded DataFrames ---

DataFrame for agency:
   agency_id agency_name                 agency_url agency_timezone  \
0          1  OC Transpo  https://www.octranspo.com  Canada/Eastern   

  agency_lang  agency_phone                      agency_fare_url  agency_email  
0          en  613-560-5000  https://www.octranspo.com/en/fares/           NaN  
Shape: (1, 8)

DataFrame for stops:
  stop_id stop_code                 stop_name  tts_stop_name  stop_desc  \
0    4043       NaN     DOMINION STN OFF ONLY            NaN        NaN   
1   10012       NaN  PROMENADE / TERRASSES #2            NaN        NaN   
2   10377       NaN         WESTBORO OFF ONLY            NaN        NaN   
3   10449       NaN           LAVAL / LAURIER   

In [17]:
print(all_files_df["stop_times"].head())
print(all_files_df["stop_times"].describe())
print(all_files_df["stop_times"].dtypes)

    trip_id arrival_time departure_time stop_id  stop_sequence stop_headsign  \
0  10000070     17:37:00       17:37:00   10149              1           NaN   
1  10000070     17:41:07       17:41:07     590              2           NaN   
2  10000070     17:45:00       17:45:00     591              3           NaN   
3  10000070     17:52:00       17:52:00    9477              4           NaN   
4  10000070     17:57:00       17:57:00    9476              5           NaN   

   pickup_type  drop_off_type  shape_dist_traveled  timepoint  
0            0              1                 0.00        1.0  
1            0              0              3797.77        0.0  
2            0              0              7248.69        1.0  
3            0              0              9676.43        1.0  
4            0              0             13444.37        1.0  
            trip_id  stop_sequence   pickup_type  drop_off_type  \
count  3.172835e+06   3.172835e+06  3.172835e+06   3.172835e+06   
m

In [19]:
print(all_files_df["stop_times"]['stop_sequence'].max())

100
