In [1]:
import pandas as pd

# Load the Parquet file
df = pd.read_csv('gkne-dk5s.csv')

# Show basic info
print(df.shape)
print(df.columns)
print(df.head())


(10000000, 19)
Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount',
       'total_amount', 'imp_surcharge', 'extra', 'rate_code'],
      dtype='object')
  vendor_id          pickup_datetime         dropoff_datetime  \
0       VTS  2014-07-22T23:35:00.000  2014-07-22T23:36:00.000   
1       CMT  2014-08-07T22:31:24.000  2014-08-07T22:35:03.000   
2       VTS  2014-08-12T22:47:00.000  2014-08-12T22:53:00.000   
3       CMT  2014-07-05T02:09:22.000  2014-07-05T02:16:43.000   
4       CMT  2014-09-19T23:18:02.000  2014-09-19T23:32:59.000   

   passenger_count  trip_distance  pickup_longitude  pickup_latitude  \
0                1           0.71        -74.002725        40.728677   
1                2           0.90        -74.004349        40.723741   
2  

In [2]:
# remove null values at lat and lon
df = df.dropna(subset=['dropoff_longitude', 'dropoff_latitude', 'pickup_latitude', 'pickup_longitude'])

In [4]:
import osmnx as ox

# Download the road network for New York City
G = ox.graph_from_place('New York City, New York, USA', network_type='drive')
G_proj = ox.project_graph(G, to_latlong=True)

In [310]:
list(G_proj.edges(data=True))[0:5]

[(39076461,
  274283981,
  {'osmid': 25161349,
   'highway': 'motorway',
   'lanes': '2',
   'maxspeed': '50 mph',
   'name': 'Cross Island Parkway',
   'oneway': True,
   'ref': 'CI',
   'reversed': False,
   'length': 819.5016661477803,
   'geometry': <LINESTRING (-73.795 40.786, -73.795 40.786, -73.794 40.786, -73.794 40.786,...>,
   'speed_limit': 70,
   'travel_time': 42.14579997331442}),
 (39076461,
  42854803,
  {'osmid': 25161578,
   'highway': 'motorway_link',
   'oneway': True,
   'reversed': False,
   'length': 268.1440952459794,
   'geometry': <LINESTRING (-73.795 40.786, -73.793 40.787, -73.793 40.787, -73.793 40.787,...>,
   'speed_limit': 60,
   'travel_time': 16.088645714758762}),
 (274283981,
  3789687872,
  {'osmid': [39084897, 40944009, 40944010],
   'highway': 'motorway',
   'lanes': '3',
   'maxspeed': '50 mph',
   'name': 'Cross Island Parkway',
   'oneway': True,
   'ref': 'CI',
   'reversed': False,
   'length': 636.5460273480962,
   'geometry': <LINESTRING (-73

We have unclassified, residential, tertiary, secondary, primary, trunk, motorway, secondary_link,primary_link, trunk_link, motorway_link for ['highway']

In [151]:
#set a fixed speed limit for all edges based on the road type
dict_speed = {
    'motorway': 70,
    'trunk': 60,
    'primary': 50,
    'secondary': 40,
    'tertiary': 30,
    'residential': 20,
    'unclassified': 15,
    'secondary_link': 30,
    'primary_link': 40,
    'trunk_link': 50,
    'motorway_link': 60,
}

In [165]:
#get an estimated travel time for each edge at G_proj
for u, v, data in G_proj.edges(data=True):
    road_type = data.get('highway', 'residential')  # default to residential if not specified
    # in case it's a list, get the first type
    if isinstance(road_type, list):
        road_type = road_type[0]
    speed_limit = dict_speed.get(road_type, 20)  # default speed limit if road type not found
    data['speed_limit'] = speed_limit
    data['travel_time'] = data['length'] / (speed_limit * 1000 / 3600)  # convert speed to m/s and calculate time in seconds, add 5 seconds for stop time for each edge transition

In [166]:
# add edge weights based on travel time
print(G_proj[42445001][42444991][0])

{'osmid': [1025734022, 1025734023], 'highway': 'primary', 'lanes': '4', 'name': '9th Avenue', 'oneway': True, 'reversed': False, 'length': 87.41343891131758, 'geometry': <LINESTRING (-73.993 40.758, -73.993 40.758, -73.993 40.757, -73.994 40.757,...>, 'speed_limit': 50, 'travel_time': 6.293767601614866}


### Using travel_time (estimated) does give a 1% improvement at the end for xgboost

In [None]:
def recover_route(graph, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    orig_node = ox.distance.nearest_nodes(graph, pickup_lon, pickup_lat)
    dest_node = ox.distance.nearest_nodes(graph, dropoff_lon, dropoff_lat)

    #print(f"Pickup Node: {orig_node}, Dropoff Node: {dest_node}")
    #print(f"Pickup Coordinates: ({pickup_lat}, {pickup_lon}), Dropoff Coordinates: ({dropoff_lat}, {dropoff_lon})")
    # Compute the shortest path by length (distance)
    # route = ox.shortest_path(graph, orig_node, dest_node, weight='length')
    # Compute the shortest path by travel time
    route = ox.shortest_path(graph, orig_node, dest_node, weight='travel_time')
    return route

In [None]:
# sanity check for projection
#plot a histogram for lat distribution of nodes
import matplotlib.pyplot as plt
def plot_lat_distribution(graph):
    latitudes = [data['x'] for _, data in graph.nodes(data=True)]
    plt.hist(latitudes, bins=50, color='blue', alpha=0.7)
    plt.title('Latitude Distribution of Nodes')
    plt.xlabel('Latitude')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

# Plot the latitude distribution of nodes
plot_lat_distribution(G_proj)

In [270]:
recover_route(G_proj, df['pickup_latitude'].iloc[0], df['pickup_longitude'].iloc[0],
              df['dropoff_latitude'].iloc[0], df['dropoff_longitude'].iloc[0])

[4321748237,
 5854302557,
 42430187,
 42430205,
 42430217,
 42430231,
 42430233,
 42430235,
 42430237,
 42421889,
 42430241,
 42430247,
 42430249,
 42429374]

In [271]:
#scale down the data to 10000 rows
df_small = df.sample(n=10000, random_state=42)

In [272]:
pickup_lat = df_small['pickup_latitude']
pickup_lon = df_small['pickup_longitude']
dropoff_lat = df_small['dropoff_latitude']
dropoff_lon = df_small['dropoff_longitude']
df_small['route'] = recover_route(G_proj, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

#~70 seconds for 10,000 routes

In [273]:
import pickle
# Save the route to a pickle file
with open('routed_df_10000_shortest.pkl', 'wb') as f:
    pickle.dump(df_small, f)

In [None]:
import osmnx as ox
import matplotlib.pyplot as plt

# Example: plot a single route
def plot_route(graph, route):
    fig, ax = ox.plot_graph_route(graph, route, route_linewidth=1, node_size=0, bgcolor='white')
    plt.show()

plot_route(G_proj, df_small['route'].iloc[0])

# Naive modelling

In [306]:
len(G_proj.edges)

139291

In [307]:
len(G_proj.nodes)

55264

In [275]:
df_small['route'].iloc[0]

[np.int64(42437368),
 42435663,
 42437363,
 42439972,
 42439968,
 42439964,
 42433611,
 42428268,
 np.int64(42430317)]

In [276]:
G_proj.edges([42445001])

OutMultiEdgeDataView([(42445001, 42434434), (42445001, 42444991)])

In [305]:
print(G_proj[42445001][42444991][0])  # Length of the edge in meters

{'osmid': [1025734022, 1025734023], 'highway': 'primary', 'lanes': '4', 'name': '9th Avenue', 'oneway': True, 'reversed': False, 'length': 87.41343891131758, 'geometry': <LINESTRING (-73.993 40.758, -73.993 40.758, -73.993 40.757, -73.994 40.757,...>, 'speed_limit': 50, 'travel_time': 6.293767601614866}


In [278]:
#eliminate those with empty routes: disconnected? ~0.4%
df_small = df_small[df_small['route'].apply(lambda x: x is not None and len(x) > 0)]
print(f"Remaining rows after filtering: {df_small.shape[0]}")

Remaining rows after filtering: 9960


In [279]:
def get_edge_length_sum(graph, route):
    edge_length = 0
    if not route:
        print("Warning: The route is empty.")
        return edge_length
    for i in range(len(route) - 1):
        u, v = route[i], route[i + 1]
        if v not in graph[u]:
            print(f"Warning: Edge {u} to {v} not found in the graph.")
            continue
        length = graph[u][v][0]['length']
        edge_length += length
    return edge_length

In [280]:
import numpy as np

df_small['pickup_datetime'] = pd.to_datetime(df_small['pickup_datetime'])
df_small['dropoff_datetime'] = pd.to_datetime(df_small['dropoff_datetime'])

df_small['duration_seconds'] = (df_small['dropoff_datetime'] - df_small['pickup_datetime']).dt.total_seconds()
df_small['pickup_hour'] = df_small['pickup_datetime'].dt.hour
df_small['pickup_weekday'] = df_small['pickup_datetime'].dt.weekday

# Then for each trip get the route length
df_small['route_length'] = df_small['route'].apply(lambda route: get_edge_length_sum(G_proj, route))

In [281]:
df_small.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount',
       'total_amount', 'imp_surcharge', 'extra', 'rate_code',
       'duration_seconds', 'pickup_hour', 'pickup_weekday', 'route',
       'route_length'],
      dtype='object')

In [282]:
#print average duration
print(f"Average duration: {df_small['duration_seconds'].mean()} seconds")

Average duration: 786.2980923694779 seconds


In [283]:
from sklearn.model_selection import train_test_split

features = ['route_length','pickup_latitude','pickup_longitude', 'pickup_hour', 'pickup_weekday', 'passenger_count']
X = df_small[features]
#add lat squared and lon squared
# X['pickup_latitude_squared'] = X['pickup_latitude'] ** 2
# X['pickup_longitude_squared'] = X['pickup_longitude'] ** 2
y = df_small['duration_seconds']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [284]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Baseline MAE: {mae:.2f} seconds")


Baseline MAE: 276.58 seconds


In [285]:
# Install xgboost if not already installed
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost MAE:", mean_absolute_error(y_test, y_pred_xgb))


XGBoost MAE: 221.3101988719649


In [286]:
from sklearn.neural_network import MLPRegressor

mlp_model = MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)

print("MLP MAE:", mean_absolute_error(y_test, y_pred_mlp))


MLP MAE: 255.9247616635821


### Get features of each type length travelled and number of turns

In [293]:
def get_route_features(graph, route):
    if not route or len(route) < 2:
        print(route)
        return None
    return pd.Series({
        'num_edges': len(route) - 1,
        'motor_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'motorway' or graph[route[i]][route[i + 1]][0].get('highway') == 'motorway_link'),
        'trunk_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'trunk' or graph[route[i]][route[i + 1]][0].get('highway') == 'trunk_link'),
        'primary_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'primary'or graph[route[i]][route[i + 1]][0].get('highway') == 'primary_link'),
        'secondary_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'secondary' or graph[route[i]][route[i + 1]][0].get('highway') == 'secondary_link'),
        'tertiary_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'tertiary'),
        'residential_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'residential'),
        'unclassified_len': sum(graph[route[i]][route[i + 1]][0]['length'] for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'unclassified'),
        'motor_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'motorway' or graph[route[i]][route[i + 1]][0].get('highway') == 'motorway_link'),
        'trunk_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'trunk' or graph[route[i]][route[i + 1]][0].get('highway') == 'trunk_link'),
        'primary_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'primary' or graph[route[i]][route[i + 1]][0].get('highway') == 'primary_link'),
        'secondary_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'secondary' or graph[route[i]][route[i + 1]][0].get('highway') == 'secondary_link'),
        'tertiary_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'tertiary'),
        'residential_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'residential'),
        'unclassified_segments': sum(1 for i in range(len(route) - 1) if graph[route[i]][route[i + 1]][0].get('highway') == 'unclassified')
    })
    return features

In [294]:
get_route_features(G_proj, df_small['route'].iloc[0])

num_edges                  8.000000
motor_len                  0.000000
trunk_len                  0.000000
primary_len              548.881130
secondary_len            321.842014
tertiary_len               0.000000
residential_len          274.020227
unclassified_len           0.000000
motor_segments             0.000000
trunk_segments             0.000000
primary_segments           2.000000
secondary_segments         4.000000
tertiary_segments          0.000000
residential_segments       2.000000
unclassified_segments      0.000000
dtype: float64

In [297]:
# try XGboost again combining time and route features
#make sure all ['route'] are not None
df_small = df_small[df_small['route'].apply(lambda x: x is not None and len(x) >= 2)]
route_features = df_small['route'].apply(lambda route: get_route_features(G_proj, route))
# Combine route features with the main DataFrame
df_combined = pd.concat([df_small.reset_index(drop=True), route_features.reset_index(drop=True)], axis=1)
# Define features including route features
features_with_route = ['route_length', 'num_edges', 'motor_len', 'trunk_len', 'primary_len', 'secondary_len', 'tertiary_len', 'residential_len', 'unclassified_len',
                       'motor_segments', 'trunk_segments', 'primary_segments', 'secondary_segments', 'tertiary_segments', 'residential_segments','unclassified_segments',
                       'pickup_latitude', 'pickup_longitude', 'pickup_hour', 'pickup_weekday', 'passenger_count']
X_combined = df_combined[features_with_route]
y_combined = df_combined['duration_seconds']
print(X_combined.shape, y_combined.shape)
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

(9694, 21) (9694,)


In [299]:
# get the baseline mae again
model_combined = LinearRegression()
model_combined.fit(X_train_combined, y_train_combined)
y_pred_combined = model_combined.predict(X_test_combined)
mae_combined = mean_absolute_error(y_test_combined, y_pred_combined)
print(f"Baseline MAE with route features: {mae_combined:.2f} seconds")

Baseline MAE with route features: 241.58 seconds


In [298]:
# Train XGBoost model with route features
xgb_model_combined = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
xgb_model_combined.fit(X_train_combined, y_train_combined)
y_pred_combined = xgb_model_combined.predict(X_test_combined)
print("XGBoost with Route Features MAE:", mean_absolute_error(y_test_combined, y_pred_combined))

XGBoost with Route Features MAE: 208.64890873610452
