## Pulling TTC data

Data contains:
- All TTC transportation modes (subway, streetcars, and buses)
- Scheduling information (route definitions, stop patterns, stop locations, and schedules)

### Getting URL

In [1]:
import requests
	
# Toronto Open Data is stored in a CKAN instance. It's APIs are documented here:
# https://docs.ckan.org/en/latest/api/
	
# To hit our API, you'll be making requests to:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"
		
# Datasets are called "packages". Each package can contain many "resources"
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:	
url = base_url + "/api/3/action/package_show"
params = { "id": "merged-gtfs-ttc-routes-and-schedules"}
package = requests.get(url, params = params).json()



In [2]:
# let's see what's available
for resource in package["result"]["resources"]:
    print(f"Resource name: {resource['name']}")
    print(f"Resource format: {resource['format']}")
    print(f"Resource url: {resource['url']}")
    print(f"Last updated: {resource['last_modified']}")

Resource name: Complete GTFS
Resource format: ZIP
Resource url: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b811ead4-6eaf-4adb-8408-d389fb5a069c/resource/c920e221-7a1c-488b-8c5b-6d8cd4e85eaf/download/completegtfs.zip
Last updated: 2025-10-14T14:16:01


### Downloading

In [1]:
from pathlib import Path

# set paths
PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / "data" 
RAW_DIR = DATA_DIR / "raw"
GTFS_DIR = DATA_DIR / "gtfs"

# make dir if doesn't yet exist 
for d in [DATA_DIR, RAW_DIR, GTFS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

In [4]:
# downlaods GTFS zip
ZIP_PATH = RAW_DIR / "ttc_gtfs.zip"

gtfs_url = None
for resource in package["result"]["resources"]:
    if resource["format"].lower() == "zip":
        gtfs_url = resource["url"]
        print("GTFS download URL:", gtfs_url)
        break

if gtfs_url:
    print(f"Downloading TTC GTFS data to {ZIP_PATH} ...")
    response = requests.get(gtfs_url)
    response.raise_for_status()  # Raises error if request fails

    with open(ZIP_PATH, "wb") as f:
        f.write(response.content)
    print("Downloaded TTC GTFS data -> ttc_gtfs.zip")
else:
    print("GTFS ZIP not found in package.") 

GTFS download URL: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b811ead4-6eaf-4adb-8408-d389fb5a069c/resource/c920e221-7a1c-488b-8c5b-6d8cd4e85eaf/download/completegtfs.zip
Downloading TTC GTFS data to /Users/michaelb/Documents/Code Projects/ttc-transit-data/data/raw/ttc_gtfs.zip ...
Downloaded TTC GTFS data -> ttc_gtfs.zip


In [5]:
# verify and extract
import zipfile

print(f"ðŸ“¦ Extracting files to {GTFS_DIR}")
with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(GTFS_DIR)

print("âœ… Extraction complete. Extracted files:")
for name in zip_ref.namelist():
    print(" -", name)

ðŸ“¦ Extracting files to /Users/michaelb/Documents/Code Projects/ttc-transit-data/data/gtfs
âœ… Extraction complete. Extracted files:
 - agency.txt
 - calendar.txt
 - calendar_dates.txt
 - routes.txt
 - shapes.txt
 - stops.txt
 - stop_times.txt
 - trips.txt


### Exploring the data

In [22]:
import pandas as pd

# load GTFS tables 
stops = pd.read_csv(GTFS_DIR / "stops.txt")
routes = pd.read_csv(GTFS_DIR / "routes.txt")
trips = pd.read_csv(GTFS_DIR / "trips.txt")
stop_times = pd.read_csv(GTFS_DIR / "stop_times.txt")
calendar = pd.read_csv(GTFS_DIR / "calendar.txt")
shapes = pd.read_csv(GTFS_DIR / "shapes.txt")

# Quick check
print("Stops shape:", stops.shape)
print(stops.head())

  trips = pd.read_csv(GTFS_DIR / "trips.txt")
  stop_times = pd.read_csv(GTFS_DIR / "stop_times.txt")


Stops shape: (9350, 12)
   stop_id  stop_code                                 stop_name  stop_desc  \
0        1       7978        Millwood Rd at McRae Dr South Side        NaN   
1        2        155                Bathurst St at Horsham Ave        NaN   
2        3       1514          Royal York Rd at King Georges Rd        NaN   
3        4       2593           Finch Ave West at Driftwood Ave        NaN   
4        5       1231  Lawrence Ave East at Wanless Cres (West)        NaN   

    stop_lat   stop_lon  zone_id  stop_url  location_type  parent_station  \
0  43.704652 -79.367698      NaN       NaN            NaN             NaN   
1  43.768449 -79.441504      NaN       NaN            NaN             NaN   
2  43.650225 -79.512691      NaN       NaN            NaN             NaN   
3  43.757924 -79.513523      NaN       NaN            NaN             NaN   
4  43.727149 -79.392116      NaN       NaN            NaN             NaN   

   stop_timezone  wheelchair_boarding  
0   

  shapes = pd.read_csv(GTFS_DIR / "shapes.txt")


In [3]:
# number of stops 
print(f"Total stops: {stops.shape[0]}")

# Stops per route
stops_per_route = trips.merge(stop_times, on="trip_id").merge(routes, on="route_id")
stops_per_route.groupby("route_short_name")["stop_id"].nunique().sort_values(ascending=False)

Total stops: 9350


route_short_name
300    230
301    210
52     193
96     192
501    176
      ... 
101     16
944     14
4       10
900      6
882      2
Name: stop_id, Length: 224, dtype: int64

In [4]:
# Number of routes per mode
routes["route_type"].value_counts()

route_type
700    203
900     18
400      3
Name: count, dtype: int64

In [5]:
# subway routes are 400
routes[routes['route_type']==400]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
215,1,1,1,LINE 1 (YONGE-UNIVERSITY),,400,,D5C82B,000000
216,2,1,2,LINE 2 (BLOOR - DANFORTH),,400,,008000,FFFFFF
217,4,1,4,LINE 4 (SHEPPARD),,400,,B300B3,FFFFFF


In [6]:
# Trips per day (approx - looking at Monday for e.g.)
service_ids = calendar[calendar["monday"]==1]['service_id'] # filer for services that run on Mon
daily_trips = trips[trips['service_id'].isin(service_ids)] # only trips on Monday
daily_trips.groupby("route_id").size().sort_values(ascending=False)

route_id
504    1656
36     1323
63     1274
47     1186
32     1015
       ... 
404      16
400      14
405      14
403      12
882       1
Length: 222, dtype: int64

In [10]:
import folium
from IPython.display import display

# Map first 50 stops
m = folium.Map(location=[43.6532, -79.3832], zoom_start=12)
for idx, row in stops.head(50).iterrows():
    folium.CircleMarker(
        location=[row.stop_lat, row.stop_lon],
        radius=3,
        color='blue',
        fill=True
    ).add_to(m)
# display(m)

In [8]:
routes[routes['route_id']==1]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
215,1,1,1,LINE 1 (YONGE-UNIVERSITY),,400,,D5C82B,0


In [11]:
# get all stops for line 1 subway
trips_for_route = trips[trips['route_id'] == 1]

# merge with stop_times to find all stop events 
route_stop_events = stop_times.merge(
    trips_for_route[['trip_id']], 
    on='trip_id', 
    how='inner'
)

# extract list of unique stop ids
unique_stop_ids = route_stop_events['stop_id'].unique()

# filter master stops table 
final_stops_list = stops[stops['stop_id'].isin(unique_stop_ids)]
final_stops_list

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
8583,18373,13818,ST ANDREW STATION - SOUTHBOUND PLATFORM,,43.647605,-79.385004,,,,,,0
9147,14111,14111,Finch Station - Southbound Platform,,43.780147,-79.415692,,,,,,1
9148,13789,13789,North York Centre Station - Southbound Platform,,43.767347,-79.412492,,,,,,1
9149,13860,13860,Sheppard-Yonge Station - Southbound Platform,,43.760348,-79.410691,,,,,,1
9150,13792,13792,York Mills Station - Southbound Platform,,43.743248,-79.405991,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9291,15665,15665,Downsview Park Station - Southbound Platform,,43.753311,-79.478693,,,,,,1
9292,15661,15661,Highway 407 Station - Northbound Platform,,43.783359,-79.523454,,,,,,1
9293,15660,15660,Highway 407 Station - Southbound Platform,,43.783359,-79.523454,,,,,,1
9294,15662,15662,Vaughan Metropolitan Centre Station - Subway P...,,43.794021,-79.527906,,,,,,1


### Headway calc example 

Let's calculate headway for route_id == 1 (I think this is the main subway line)

To do this we'll:
- join stops_time df with the trips_df b/c stops has stop time info, while trips has routeId and serviceId (day of week) info
- convert everything to time from midnight so we can do math 
- sort the df so we do this for each service day (sort of days of week, but can run multiple overlapping services on same day)
- calculate headway by duplicating df and shifting by 1

In [12]:
temp = stop_times.merge(
        trips[['trip_id', 'route_id', 'service_id']], 
        on='trip_id', 
        how='inner'
    ) 

In [13]:
temp = temp[temp['route_id']==1]

In [14]:
temp.sort_values(['arrival_time'])

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,route_id,service_id
4170069,131337109,05:31:21,05:31:21,14111,1,,0,0,,,1,1
4170070,131337109,05:33:45,05:33:45,13789,2,,0,0,1.5224,,1,1
4170071,131337109,05:35:00,05:35:00,13860,3,,0,0,2.3171,,1,1
4170149,131337110,05:36:08,05:36:08,14111,1,,0,0,,,1,1
4170072,131337109,05:38:06,05:38:06,13792,4,,0,0,4.2739,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4221806,131339169,26:30:26,26:30:26,15662,38,,0,0,38.9837,,1,2
4290344,131341848,26:30:26,26:30:26,15662,38,,0,0,38.9837,,1,4
4170059,131337406,26:31:52,26:31:52,15656,36,,0,0,36.0378,,1,1
4170060,131337406,26:33:55,26:33:55,15661,37,,0,0,37.4393,,1,1


In [15]:
def time_to_seconds(time_str):
    if pd.isna(time_str):
        return 0 
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s

temp['arrival_time_sec'] = temp['arrival_time'].apply(time_to_seconds)


In [16]:
temp = temp.sort_values(['service_id', 'stop_id', 'arrival_time_sec'])


In [17]:
temp['previous_arrival_time_sec'] = temp.groupby(['stop_id', 'service_id'])['arrival_time_sec'].shift(1)
temp['headway_sec'] = temp['arrival_time_sec'] - temp['previous_arrival_time_sec']

In [18]:
# Filter out the first stop in each group (which will have a NaN for headway)
headways = temp.dropna(subset=['headway_sec'])
headways

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,route_id,service_id,arrival_time_sec,previous_arrival_time_sec,headway_sec
4170150,131337110,05:38:32,05:38:32,13789,2,,0,0,1.5224,,1,1,20312,20025.0,287.0
4170568,131337111,05:43:19,05:43:19,13789,2,,0,0,1.5224,,1,1,20599,20312.0,287.0
4170530,131337112,05:48:01,05:48:01,13789,2,,0,0,1.5224,,1,1,20881,20599.0,282.0
4170492,131337114,05:52:43,05:52:43,13789,2,,0,0,1.5224,,1,1,21163,20881.0,282.0
4170454,131337115,05:57:25,05:57:25,13789,2,,0,0,1.5224,,1,1,21445,21163.0,282.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4294675,131342075,25:19:40,25:19:40,18373,21,,0,0,22.9676,,1,4,91180,90820.0,360.0
4294637,131342076,25:25:40,25:25:40,18373,21,,0,0,22.9676,,1,4,91540,91180.0,360.0
4294599,131342077,25:31:40,25:31:40,18373,21,,0,0,22.9676,,1,4,91900,91540.0,360.0
4294561,131342078,25:37:40,25:37:40,18373,21,,0,0,22.9676,,1,4,92260,91900.0,360.0


In [19]:
# aggregate to get stop-level stats
stop_stats = headways.groupby('stop_id').agg(
    avg_headway_minutes = ('headway_sec', lambda x : x.mean() / 60),
    median_headway_minutes = ('headway_sec', lambda x: x.median() / 60),
    total_trips_served=('trip_id', 'count')
).reset_index()


In [20]:
# add stop names 
stop_stats = stop_stats.merge(
    stops[['stop_id', 'stop_name']], 
    on='stop_id', 
    how='left'
).drop_duplicates(subset=['stop_id'])

In [21]:
stop_stats

Unnamed: 0,stop_id,avg_headway_minutes,median_headway_minutes,total_trips_served,stop_name
0,13789,4.904202,4.566667,936,North York Centre Station - Southbound Platform
1,13790,4.849146,4.566667,976,North York Centre Station - Northbound Platform
2,13791,4.849146,4.566667,976,York Mills Station - Northbound Platform
3,13792,4.904006,4.550000,936,York Mills Station - Southbound Platform
4,13793,4.904024,4.558333,936,Lawrence Station - Southbound Platform
...,...,...,...,...,...
71,15664,4.951775,4.566667,967,Downsview Park Station - Northbound Platform
72,15665,5.214212,5.000000,876,Downsview Park Station - Southbound Platform
73,15666,4.951775,4.550000,967,York University - Northbound Platform
74,15667,5.214593,5.000000,876,York University - Southbound Platform


### Route geometries

In [23]:
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,shp-100-02,43.677011,-79.358044,1,0.0
1,shp-100-02,43.676933,-79.358197,2,15.08
2,shp-100-02,43.676855,-79.358351,3,30.16
3,shp-100-02,43.676784,-79.358457,4,41.8
4,shp-100-02,43.676712,-79.358563,5,53.44
