# Generating Transit Travel Times with r5py

This notebook is adapted from the Access presentation by Dr Willem and calculates access to opportunities using r5py.

This is a generally computationally intensive process, and has been the main technical hurdle to overcome in this process.

In this workbook, we are going to generate transit travel times using a relatively new Python library, `r5py`. R5py is designed to allow Python users to access the open-source R5 engine, a powerful engine that is the spiritual successor to OpenTripPlanner. You can read more about r5py via [their documentation](https://r5py.readthedocs.io), or check out [R5's own github page](https://github.com/conveyal/r5).
In this workbook we are going to generate transit travel time using the Python library r5py.

In our example analysis, we want to answer three questions:
- How is the access to hospitals distributed across different populations?
- How is the access to child care spaces distributed across different populations?
- How does peak-period and evening service change this access for various populations?

For this we need to generate *two* travel time matrices: one for a peak period (7-9am) and one for an evening period (9-11pm).

The beauty of the R5 engine is that it allows us to measure a median peak-period value very easily. To do this, we start our analysis at the beginning of our specified time period and set the duration of the analysis.

Let's start by loading the appropriate data and setting some key settings for R5py. One little quirk we are going to need is that R5py requires our origin/destination points to be named `id`, not anything else like `dauid`. We'll make that change now.

In [8]:
import os
import sys
import shapely
import numpy as np
#os.environ['GDAL_DATA'] = os.path.join(f'{os.sep}'.join(sys.executable.split(os.sep)[:-1]), 'Library', 'share', 'gdal')

import pandas as pd
import geopandas as gpd

import warnings
warnings.filterwarnings('ignore')
# This sets the amount of memory we are using for R5py calculations (currently disabled)
#sys.argv.append(["--max-memory", "8G"])

# Alternative (Guadalajara) inputs, kept for reference but disabled:
#da_centroids = gpd.read_file("data/UnAdj_100m_2020.geojson").rename(columns={"pointid":"id"})#
#dest_po = gpd.read_file("data/guadalajara_greenspace_perimeterpoints.geojson")
#dest_po["id"] = range(len(da_centroids)+ 1, len(da_centroids) + len(dest_po) + 1) #adding id column

# Calgary inputs. The origin identifier column `dauid` is renamed to `id`,
# the column name used for origins/destinations throughout this notebook.
da_centroids = gpd.read_file("data/data_CA/da_centroids_with_locations.geojson").rename(columns={"dauid":"id"}) #1675 rows
daycares = gpd.read_file("data/data_CA/daycare_locations.geojson") #368 (total: OD = 616400)
#hospitals = gpd.read_file("data/data_CA/hospital_locations.geojson") # 5 (OD = 8375)

# Sanity check: number of origins, then number of destinations.
print(len(da_centroids))
print(len(daycares))


1675
368


In [106]:
import geopandas as gpd
import pandas as pd
import h3
import datetime
from r5py import TransportNetwork, TravelTimeMatrixComputer, TransportMode
from tqdm import tqdm

def find_dest_h3(orig, dest, h3_res=9, ring_size=3):
    """
    Pair every origin with the destinations located in its H3 neighbourhood.

    Each point is assigned to an H3 cell at resolution ``h3_res``. An origin is
    matched with every destination whose cell lies within ``ring_size`` steps of
    the origin's cell (the origin's own cell included).

    Approximate cell areas: h3_res 7 ~5.2 km^2, 8 ~0.74 km^2, 9 ~0.11 km^2.
    A ring of size 1 covers 7 cells (centre + 6 neighbours); size 2 about 19.

    Parameters
    ----------
    orig, dest : geopandas.GeoDataFrame
        Point layers, each with an ``id`` column. They are reprojected to
        EPSG:4326 internally; the caller's frames are not modified.
    h3_res : int
        H3 resolution used to index the points.
    ring_size : int
        Number of steps around the origin cell to search for destinations.

    Returns
    -------
    od_pairs : geopandas.GeoDataFrame
        One row per matched pair with columns ``origin_id``, ``dest_id``,
        ``o_geometry`` (active geometry) and ``d_geometry``.
    grouped_od_pairs : geopandas.GeoDataFrame
        One row per origin that has at least one match; ``dest_id`` and
        ``d_geometry`` hold lists. Origins without any match are absent.

    Both frames are empty (but carry the columns above) when nothing matches.
    """
    # h3-py 4.x renamed geo_to_h3 -> latlng_to_cell and k_ring -> grid_disk.
    # Resolve whichever pair the installed version provides.
    to_cell = getattr(h3, "latlng_to_cell", None) or h3.geo_to_h3
    cells_within = getattr(h3, "grid_disk", None) or h3.k_ring

    # H3 indexes latitude/longitude, so make sure both layers are in WGS84.
    orig = orig.to_crs("EPSG:4326")
    dest = dest.to_crs("EPSG:4326")

    # Convert origins and destinations to H3 indexes (lat = y, lng = x).
    orig["h3_i"] = [to_cell(pt.y, pt.x, h3_res) for pt in orig.geometry]
    dest["h3_i"] = [to_cell(pt.y, pt.x, h3_res) for pt in dest.geometry]

    results = []
    for o_id, o_geom, o_cell in zip(orig["id"], orig.geometry, orig["h3_i"]):
        o_ring = set(cells_within(o_cell, ring_size))
        matches = dest[dest["h3_i"].isin(o_ring)]
        for d_id, d_geom in zip(matches["id"], matches.geometry):
            results.append({
                "origin_id": o_id,
                "dest_id": d_id,
                "o_geometry": o_geom,
                "d_geometry": d_geom})

    if not results:
        # No pair matched (or an input was empty): return empty, correctly
        # shaped frames instead of failing on the missing "o_geometry" column.
        od_pairs = gpd.GeoDataFrame(
            columns=["origin_id", "dest_id", "o_geometry", "d_geometry"],
            geometry="o_geometry", crs=orig.crs)
        grouped_od_pairs = gpd.GeoDataFrame(
            columns=["origin_id", "o_geometry", "dest_id", "d_geometry"],
            geometry="o_geometry", crs="EPSG:4326")
        return od_pairs, grouped_od_pairs

    od_pairs = gpd.GeoDataFrame(results, geometry="o_geometry", crs=orig.crs)
    od_pairs["d_geometry"] = gpd.GeoSeries(od_pairs["d_geometry"], crs=orig.crs)

    # Collapse to one row per origin, collecting its destinations into lists.
    grouped_od_pairs = od_pairs.groupby("origin_id").agg(
        {"o_geometry": "first", "dest_id": list, "d_geometry": list}).reset_index()
    grouped_od_pairs = gpd.GeoDataFrame(grouped_od_pairs, geometry="o_geometry", crs="EPSG:4326")

    return od_pairs, grouped_od_pairs

def compute_tt(grouped_od_pairs, transport_network, departure_time, time_window, modes):
    """
    Compute travel times for origin-destination pairs using r5py.

    One ``TravelTimeMatrixComputer`` is run per row of ``grouped_od_pairs``,
    from that row's single origin to that row's list of destinations.

    Parameters
    ----------
    grouped_od_pairs : geopandas.GeoDataFrame
        One row per origin with ``origin_id``, ``o_geometry`` and the
        list-valued columns ``dest_id`` and ``d_geometry``.
    transport_network : r5py.TransportNetwork
        Network the travel times are routed on.
    departure_time : datetime.datetime
        Passed to r5py as ``departure``.
    time_window : datetime.timedelta
        Passed to r5py as ``departure_time_window``.
    modes : list
        Passed to r5py as ``transport_modes``.

    Returns
    -------
    pandas.DataFrame
        All per-origin results concatenated with a fresh index. No per-origin
        aggregation (e.g. nearest destination) is applied. When
        ``grouped_od_pairs`` has no rows, an empty frame with the columns
        ``from_id``, ``to_id`` and ``travel_time`` is returned.
    """
    all_results = []

    # Iterate through each origin
    for _, row in tqdm(grouped_od_pairs.iterrows(), total=len(grouped_od_pairs), desc="Processing origins"):

        # Create GeoDataFrame for the current origin
        origin = gpd.GeoDataFrame({"id": [row.origin_id], "geometry": [row.o_geometry]}, crs=grouped_od_pairs.crs)

        # Create GeoDataFrame for all destinations for this origin
        destinations = gpd.GeoDataFrame({"id": row.dest_id, "geometry": row.d_geometry}, crs=grouped_od_pairs.crs)

        travel_time_computer = TravelTimeMatrixComputer(
                transport_network,
                origins=origin,
                destinations=destinations,
                departure=departure_time,
                departure_time_window=time_window,
                transport_modes=modes)

        all_results.append(travel_time_computer.compute_travel_times())

    if not all_results:
        # pd.concat raises ValueError on an empty list; return an empty result instead.
        return pd.DataFrame(columns=["from_id", "to_id", "travel_time"])

    return pd.concat(all_results, ignore_index=True)

In [114]:
%%time
# Restrict the OD matrix to daycares in the origin's own H3 cell (resolution 8)
# or one of its immediate neighbours (ring_size=1).
od_pairs, grouped_od_pairs = find_dest_h3(da_centroids, daycares, h3_res=8, ring_size=1)
print(f"Number of OD pairs: {len(od_pairs)}")

print(grouped_od_pairs)

# r5py set-up: network files, departure date/time, departure window and modes.
# Note: this builds a network object named `network`; a separate
# `transport_network` is built from the same files in a later cell.
network = TransportNetwork("data/data_CA/Calgary.osm.pbf", ["data/data_CA/cgy-gtfs-2023-03-03.zip"])
departure_time = datetime.datetime(2023, 3, 15, 7, 0)
time_window = datetime.timedelta(hours=2)
modes = [TransportMode.TRANSIT, TransportMode.WALK]

# Compute travel times for the filtered pairs only (one r5py run per origin).
travel_times_df = compute_tt(grouped_od_pairs, network, departure_time, time_window, modes)
travel_times_df


Number of OD pairs: 6639
     origin_id                   o_geometry                   dest_id  \
0     48060056  POINT (-114.09749 51.13788)   [38, 42, 188, 214, 290]   
1     48060057  POINT (-114.09353 51.13976)   [38, 42, 188, 214, 290]   
2     48060058  POINT (-114.09522 51.13583)   [38, 42, 188, 214, 290]   
3     48060059  POINT (-114.09047 51.13885)   [38, 42, 188, 214, 290]   
4     48060060  POINT (-114.08807 51.13915)   [38, 42, 188, 214, 290]   
...        ...                          ...                       ...   
1538  48062787  POINT (-114.13346 51.07331)  [88, 108, 237, 251, 339]   
1539  48062789  POINT (-114.04997 51.11896)           [126, 145, 154]   
1540  48062790  POINT (-114.06178 51.12943)                [126, 145]   
1541  48062791  POINT (-114.16624 51.04638)        [5, 297, 313, 324]   
1542  48062792     POINT (-114.16901 51.04)        [5, 297, 313, 324]   

                                             d_geometry  
0     [POINT (-114.098894 51.1334222), P

Processing origins: 100%|██████████| 1543/1543 [03:51<00:00,  6.67it/s]


CPU times: total: 5min 42s
Wall time: 4min 23s


Unnamed: 0,from_id,to_id,travel_time
0,48060056,38,11.0
1,48060056,42,24.0
2,48060056,188,22.0
3,48060056,214,25.0
4,48060056,290,16.0
...,...,...,...
6634,48062791,324,24.0
6635,48062792,5,26.0
6636,48062792,297,26.0
6637,48062792,313,29.0


In [115]:
# Persist the H3-filtered travel times. With the default arguments the
# DataFrame index is also written, as an unnamed first column.
travel_times_df.to_csv("H3_cal.csv")

## Set Up Your Transport Network

To calculate travel times, we need to set up a transport network (which happens to be called a `TransportNetwork` class). The transport network needs both an underlying OpenStreetMap PBF file as well as one or more GTFS feeds. So let's go ahead and set up our transport network, which takes as its first argument the path to our PBF file and as a second argument a list of paths to GTFS files (of which we only have one in Calgary).

In [6]:
from r5py import TransportNetwork

# Alternative (Guadalajara / Mi Transporte) network, kept for reference but disabled:
# transport_network = TransportNetwork(
#      "data/mi-transporte-osm-crop.osm.pbf",
#      ["data/improved-gtfs-mi-transporte.zip"]
#  )

# Calgary network: first argument is the OSM PBF extract, second is the list
# of GTFS feeds (a single feed here).
transport_network = TransportNetwork(
    "data/data_CA/Calgary.osm.pbf",
    ["data/data_CA/cgy-gtfs-2023-03-03.zip"]
)

could not delete C:\Users\ADMINI~1\AppData\Local\Temp\2\r5pylrv4op00\TransportNetwork_1521f9e0890_00322f9\Calgary.osm.pbf.mapdb.p, keeping in [WindowsPath('C:/Users/ADMINI~1/AppData/Local/Temp/2/r5pylrv4op00/TransportNetwork_1521f9e0890_00322f9/Calgary.osm.pbf.mapdb'), WindowsPath('C:/Users/ADMINI~1/AppData/Local/Temp/2/r5pylrv4op00/TransportNetwork_1521f9e0890_00322f9/Calgary.osm.pbf.mapdb.p'), WindowsPath('C:/Users/ADMINI~1/AppData/Local/Temp/2/r5pylrv4op00/TransportNetwork_1521f9e0890_00322f9/cgy-gtfs-2023-03-03.zip')]


This will build us a transport network which we can use to compute travel times. So let's go ahead and do that next!
## Computing Travel Times

We create a travel time matrix computer (`TravelTimeMatrixComputer`) which lets us specify a whole bunch of potential parameters, most importantly origins and destinations. Our origins are the DA centroids, and our (first) destination will be the hospital centroids.

**Note:** *The version of r5py used in this notebook requires that we use `TransportMode` objects (e.g. `TransportMode.TRANSIT`, `TransportMode.WALK`) to specify our modes of travel. Future versions will allow us to just pass along strings like `"WALK"` or `"TRANSIT"`.*

In [10]:
%%time
import datetime
from r5py import TravelTimeMatrixComputer, TransportMode

# Earlier configuration (different destinations, 10-minute window, 30-minute
# travel-time cap), kept for reference but disabled:
#travel_time_computer = TravelTimeMatrixComputer(
#    transport_network,
#    origins=da_centroids,
#    destinations=dest_po,
#    departure=datetime.datetime(2021, 1, 1, 7, 0),
#    departure_time_window=datetime.timedelta(minutes=10),
#    max_time=datetime.timedelta(minutes=30),
#    transport_modes=[TransportMode.TRANSIT, TransportMode.WALK]
#)

# Full matrix: every DA centroid to every daycare, departing within the
# two-hour window starting 2023-03-15 07:00, by transit and walking.
travel_time_computer = TravelTimeMatrixComputer(
   transport_network,
   origins=da_centroids,
   destinations=daycares,
   departure=datetime.datetime(2023, 3, 15, 7, 0),
   departure_time_window=datetime.timedelta(hours=2),
   transport_modes=[TransportMode.TRANSIT, TransportMode.WALK]
)

# The result is only displayed here; it is not assigned to a variable.
travel_time_computer.compute_travel_times()

CPU times: total: 4min 13s
Wall time: 4min 8s


Unnamed: 0,from_id,to_id,travel_time
0,48060056,0,110.0
1,48060056,1,101.0
2,48060056,2,118.0
3,48060056,3,89.0
4,48060056,4,80.0
...,...,...,...
616395,48062794,363,
616396,48062794,364,118.0
616397,48062794,365,112.0
616398,48062794,366,77.0


### Compute the Travel Times
Now we're ready to run our computation. This will take a little while to run, and we'll write the matrix directly to a file. In order to avoid writing over the redundant datasets, let's write it to a new file `mx_hospitals_am_workshop.csv`.

In [13]:
# MODULAR VERSION
from r5py import TravelTimeMatrixComputer, TransportMode
import os
import dill
import numpy as np
import math
import multiprocessing as mp
#import pickle
from functools import partial

def load_or_create_dataframe(csv_file_path):
    """
    Load the CSV file if it exists, otherwise create an empty DataFrame.

    A file that exists but holds no data at all (for example a zero-byte file)
    is treated like a missing file and also yields an empty DataFrame, rather
    than raising ``pandas.errors.EmptyDataError``.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty DataFrame.
    """
    if not os.path.exists(csv_file_path):
        return pd.DataFrame()
    try:
        return pd.read_csv(csv_file_path)
    except pd.errors.EmptyDataError:
        # Nothing to parse: start from an empty frame.
        return pd.DataFrame()

def calculate_num_batches(geodataframe, batch_size):
    """
    Return how many batches of ``batch_size`` rows are needed to cover
    ``geodataframe`` (the last batch may be partial).
    """
    row_count = len(geodataframe)
    batches_needed = math.ceil(row_count / batch_size)
    return batches_needed

def process_batch(args):
    """
    Compute the travel-time matrix for one (origins, destinations) chunk pair.

    ``args`` is a 2-tuple ``(origins_chunk, destinations_chunk)`` so the
    function can be handed directly to ``Pool.map``. The module-level
    ``transport_network`` is used for routing; the departure (2023-03-15
    07:00), the two-hour departure window and the transit + walk modes are
    fixed here.
    """
    origins_chunk, destinations_chunk = args

    computer = TravelTimeMatrixComputer(
        transport_network,
        origins=origins_chunk,
        destinations=destinations_chunk,
        departure=datetime.datetime(2023, 3, 15, 7, 0),
        departure_time_window=datetime.timedelta(hours=2),
        transport_modes=[TransportMode.TRANSIT, TransportMode.WALK],
    )
    matrix = computer.compute_travel_times()
    return matrix

def process_batches(origin, destinations, BS_O, BS_D, csv_file_path, n_processes=4):
    """
    Split origins and destinations into chunks and compute every chunk pair
    with ``process_batch`` in a pool of worker processes.

    Parameters
    ----------
    origin, destinations : geopandas.GeoDataFrame
        Full origin and destination layers; sliced positionally with ``iloc``.
    BS_O, BS_D : int
        Maximum number of rows per origin / destination chunk.
    csv_file_path : str
        Accepted for interface compatibility but not used: this function does
        not read from or write to the file. Persisting the results is left to
        the caller.
    n_processes : int, default 4
        Number of worker processes in the pool.

    Returns
    -------
    list
        One ``process_batch`` result per (origin chunk, destination chunk)
        pair, ordered by origin chunk and then destination chunk. Empty when
        either input has no rows.
    """
    # Calculate the number of batches
    num_b_o = calculate_num_batches(origin, BS_O)
    num_b_d = calculate_num_batches(destinations, BS_D)

    # Build the list of chunk pairs; iloc slices past the end are simply truncated.
    batch_args = []
    for i in range(num_b_o):
        chunk_gdf1 = origin.iloc[i * BS_O:(i + 1) * BS_O]
        for j in range(num_b_d):
            chunk_gdf2 = destinations.iloc[j * BS_D:(j + 1) * BS_D]
            batch_args.append((chunk_gdf1, chunk_gdf2))

    if not batch_args:
        # Nothing to compute: avoid starting worker processes at all.
        return []

    # NOTE(review): process_batch is defined in the notebook and reads the
    # module-level transport_network. Worker processes started with the
    # "spawn" method (the default on Windows) may not see either - confirm
    # this runs on the target platform before relying on it.
    # The context manager guarantees the workers are cleaned up even when a
    # batch raises.
    with mp.Pool(processes=n_processes) as pool:
        results = pool.map(process_batch, batch_args)

    return results




In [11]:
# (Re)build the Calgary network under the module-level name `transport_network`,
# which process_batch reads directly.
transport_network = TransportNetwork(
    "data/data_CA/Calgary.osm.pbf",
    ["data/data_CA/cgy-gtfs-2023-03-03.zip"])

In [4]:
%%time
# Define the batch size for processing (rows per chunk)
BS_O = 1000  # Adjust as needed - origin
BS_D = 1000     # Adjust as needed - destination

# Path handed to process_batches; note that process_batches does not write to it.
csv_file_path = 'data/full_matrix.csv'

# transport_network = TransportNetwork(
#     "data/data_CA/Calgary.osm.pbf",
#     ["data/data_CA/cgy-gtfs-2023-03-03.zip"])

# Process all chunk pairs. Despite the variable name, the return value is a
# list with one result per (origin chunk, destination chunk) pair, not a
# single concatenated DataFrame.
final_travel_times_df = process_batches(da_centroids, daycares, BS_O, BS_D, csv_file_path)

print("FIN!")


In [None]:
def load_last_processed_batch(path='data/guadalajara/last_processed_batch.pkl'):
    """
    Load the checkpoint describing the last processed batch.

    Parameters
    ----------
    path : str
        Location of the dill/pickle checkpoint file. Defaults to the
        Guadalajara checkpoint used in this notebook.

    Returns
    -------
    object
        Whatever object was stored in the checkpoint (the recorded run of this
        notebook shows a tuple), or the integer ``0`` when the file does not
        exist. Callers must therefore cope with both shapes.
    """
    try:
        with open(path, 'rb') as f:
            # Unpickling can execute arbitrary code: only load checkpoint
            # files that were produced by this workflow.
            return dill.load(f)
    except FileNotFoundError:
        return 0
last_processed_batch = load_last_processed_batch()
last_processed_batch

(27, 6)

In [None]:
import csv
# Read the walking travel-time CSV into a list of rows (each row a list of
# strings), then serialise that list with dill.
with open('data/guadalajara/travel_times_walking.csv', 'r') as f:
    reader = csv.reader(f)
    rows = list(reader)

# Open the output inside a context manager so the handle is flushed and
# closed once the dump has finished (or if it fails).
with open('data/guadalajara/guadalajara.pkl', 'wb') as pkl_file:
    dill.dump(rows, pkl_file)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001AA18288520>>
Traceback (most recent call last):
  File "C:\Users\heinl\anaconda3\envs\ghn\lib\site-packages\ipykernel\ipkernel.py", line 785, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
  File "C:\Users\heinl\anaconda3\envs\ghn\lib\site-packages\ipykernel\ipkernel.py", line 785, in <setcomp>
    active_threads = {thread.ident for thread in threading.enumerate()}
  File "C:\Users\heinl\anaconda3\envs\ghn\lib\threading.py", line 1154, in ident
    assert self._initialized, "Thread.__init__() not called"
KeyboardInterrupt: 
