In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import geopandas as gpd
from shapely.geometry import shape, Point
import ast


In [2]:
zip = pd.read_csv('data/US.txt', sep='\t', header=None)
zip = zip.iloc[:, [4, 1, 9, 10]]
zip.columns = ['State', 'ZIP Code', 'lat', 'lng']

# Business data


In [3]:
bsns = pd.read_csv('data/businesses.csv')
bsns['ZIP Code'] = pd.to_numeric(bsns['ZIP Code'], errors='coerce')
bsns = bsns.dropna(subset=['ZIP Code'])
bsns['ZIP Code'] = bsns['ZIP Code'].astype(int)

bsns = bsns[['Business Name','ZIP Code', 'Latitude', 'Longitude']]
bsns = pd.merge(bsns, zip, how='left', on='ZIP Code')
bsns['Latitude'] = bsns['Latitude'].fillna(bsns['lat'])
bsns['Longitude'] = bsns['Longitude'].fillna(bsns['lng'])
bsns = bsns.drop(columns=['lat', 'lng', 'State', 'ZIP Code'])
bsns = bsns.dropna(subset=['Latitude', 'Longitude'])
bsns['Location'] = bsns.apply(lambda x: Point(x['Longitude'], x['Latitude']), axis=1)
bsns = gpd.GeoDataFrame(bsns, geometry='Location', crs="EPSG:4326").to_crs(epsg=3857)
bsns.drop(columns=['Latitude', 'Longitude'], inplace=True)
bsns['Business Name'] = bsns['Business Name'].astype(str)
display(bsns.head())

  bsns = pd.read_csv('data/businesses.csv')


Unnamed: 0,Business Name,Location
0,Denis Spedalieri,POINT (-8235727.623 4963332.949)
1,SANJAY'S VARIETY STORE INC.,POINT (-8211964.586 4969967.308)
2,Gayla Hibner,POINT (-8237453.076 4971936.9)
3,"FAMILY CARE REFERRAL, LLC",POINT (-8235658.16 4976635.563)
4,Donna Hill,POINT (-8228124.502 4962687.224)


# Weather

In [4]:
weather = pd.read_csv('data/weather_data.csv')
weather['time'] = pd.to_datetime(weather['time'])
weather['time'] = weather['time'].dt.tz_localize('GMT').dt.tz_convert('America/New_York')
weather['time'] = weather['time'].dt.tz_localize(None)
weather['Temperature'].astype(float)
weather['Weather Code'].astype(int)
display(weather.head())


Unnamed: 0,time,Temperature,Weather Code
0,2007-12-31 19:00:00,1.6,0
1,2007-12-31 20:00:00,0.3,0
2,2007-12-31 21:00:00,0.3,0
3,2007-12-31 22:00:00,-0.3,0
4,2007-12-31 23:00:00,-0.6,0


# Schools

In [5]:
schools = pd.read_csv('data/schools.csv')
schools['Name'] = schools['nta_name'].astype(str) + ' ' + schools['location_category_description'].astype(str)
schools = schools[['Name', 'latitude', 'longitude']]
schools['Location'] = schools.apply(lambda r: Point(r['longitude'], r['latitude']), axis=1)
schools = gpd.GeoDataFrame(schools, geometry='Location', crs="EPSG:4326").to_crs(epsg=3857)
schools.drop(columns=['longitude', 'latitude'], inplace=True)
schools['Name'] = schools['Name'].astype(str)
display(schools.head())

Unnamed: 0,Name,Location
0,Sunset Park West Elementary,POINT (-8238913.587 4960700.272)
1,Prospect Lefferts Gardens-Wingate Junior High-...,POINT (-8232251.672 4961795.46)
2,Clinton Hill Elementary,POINT (-8232657.321 4965594.938)
3,East New York Elementary,POINT (-8224203.384 4962100.238)
4,Stuyvesant Heights Elementary,POINT (-8228956.059 4966025.055)


# Events

In [6]:
events = pd.read_csv('data/events.csv', nrows=10000)
events = events[['Event Name', 'Start Date/Time', 'End Date/Time', 'Police Precinct']]
events['Police Precinct'] = events['Police Precinct'].apply(lambda x: str(x).split(',')[0])
events['Police Precinct'] = events['Police Precinct'].astype(int)
events['Event Name'] = events['Event Name'].astype(str)
events['Start Date/Time'] = pd.to_datetime(events['Start Date/Time'])
events['End Date/Time'] = pd.to_datetime(events['End Date/Time'])
display(events.head())


  events['Start Date/Time'] = pd.to_datetime(events['Start Date/Time'])
  events['End Date/Time'] = pd.to_datetime(events['End Date/Time'])


Unnamed: 0,Event Name,Start Date/Time,End Date/Time,Police Precinct
0,Big Apple Circus,2017-11-18 19:00:00,2017-11-18 20:00:00,20
1,Mt. Eden Farmer's Market,2017-11-16 08:00:00,2017-11-16 16:00:00,44
2,Columbia Greenmarket Thursday,2017-11-21 08:00:00,2017-11-21 17:00:00,26
3,Lawn Maintenance,2017-11-23 00:00:00,2017-11-23 23:58:00,13
4,"October, November December model aircraft flying",2017-11-22 09:00:00,2017-11-22 20:00:00,122


In [7]:
precincts = pd.read_csv('data/police_precincts.csv')
precincts['the_geom'] = precincts['the_geom'].apply(ast.literal_eval)
precincts = precincts[['precinct', 'the_geom']]


precincts['geometry'] = precincts['the_geom'].apply(shape)
precincts.drop(columns=['the_geom'], inplace=True)
gdf = gpd.GeoDataFrame(precincts, geometry='geometry')
gdf['centroid'] = gdf.geometry.centroid


gdf['lat'] = gdf.centroid.y
gdf['lng'] = gdf.centroid.x

gdf.drop(columns=['centroid', 'geometry'], inplace=True)

In [9]:
gdf['precinct'] = gdf['precinct'].astype(int)

In [8]:
from shapely.geometry import shape, Point
merged = pd.merge(events, gdf, how='left', left_on='Police Precinct', right_on='precinct')
merged = merged.dropna(subset=['lat', 'lng'])
merged = merged.drop(columns=['precinct', 'Police Precinct'])
merged['Location'] = merged.apply(lambda r: Point(r['lng'], r['lat']), axis=1)
merged = gpd.GeoDataFrame(merged, geometry='Location', crs="EPSG:4326").to_crs(epsg=3857)
merged.drop(columns=['lat', 'lng'], inplace=True)

In [None]:
merged.tail()

# Merging

In [23]:
df = pd.read_parquet('data/part.191.parquet')
df = df.head(100000)
# drop nan values from pickup and dropoff locations
df = df.dropna(subset=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])
df['PU Location'] = df.apply(lambda r: Point(r['pickup_longitude'], r['pickup_latitude']), axis=1)
df['DO Location'] = df.apply(lambda r: Point(r['dropoff_longitude'], r['dropoff_latitude']), axis=1)


pickup_gdf = gpd.GeoDataFrame(df, 
                               geometry=df['PU Location'], 
                               crs="EPSG:4326").to_crs(epsg=3857).copy()
dropoff_gdf = gpd.GeoDataFrame(df, 
                                geometry=df['DO Location'], 
                                crs="EPSG:4326").to_crs(epsg=3857).copy()
events_gdf_proj = gpd.GeoDataFrame(merged[['Start Date/Time', 'Event Name']], 
                                    geometry=merged['Location'], 
                                    crs="EPSG:3857")

In [24]:
enriched_pickup_gdf = gpd.sjoin_nearest(pickup_gdf, schools, how='left', distance_col='dist')
enriched_pickup_gdf = enriched_pickup_gdf.groupby(['pickup_datetime', 'geometry']).first().reset_index()

In [26]:
enriched_pickup_gdf['rounded_time'] = enriched_pickup_gdf['pickup_datetime'].dt.round('H')
weather_merge = pd.merge(enriched_pickup_gdf, weather, how='left', left_on='rounded_time', right_on='time')
display(weather_merge.head())
display(enriched_pickup_gdf.head())

  enriched_pickup_gdf['rounded_time'] = enriched_pickup_gdf['pickup_datetime'].dt.round('H')


Unnamed: 0,pickup_datetime,geometry,vendor_id,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,PU Location,DO Location,index_right,Name,dist,rounded_time,time,Temperature,Weather Code
0,2024-11-30 22:31:24,POINT (-8237100.465 4977922.898),2,2024-11-30 22:39:27,1,1.53,1,0,1,10.0,1.0,0.5,3.0,0.0,1.0,18.0,2.5,0.0,-73.995132,40.766239,-73.987648,40.775967,POINT (-73.99513244628906 40.766239166259766),POINT (-73.9876480102539 40.77596664428711),899,Clinton High school,293.204848,2024-11-30 23:00:00,2024-11-30 23:00:00,-2.1,0
1,2024-11-30 22:52:48,POINT (-8236511.9 4977337.001),2,2024-11-30 23:01:19,1,0.92,1,0,1,9.3,1.0,0.5,2.86,0.0,1.0,17.16,2.5,0.0,-73.989845,40.762253,-73.999916,40.748428,POINT (-73.9898452758789 40.76225280761719),POINT (-73.99991607666016 40.74842834472656),986,Clinton Secondary School,189.147142,2024-11-30 23:00:00,2024-11-30 23:00:00,-2.1,0
2,2024-11-30 23:03:00,POINT (-8236800.662 4975315.508),2,2024-11-30 23:46:45,1,2.96,1,0,1,35.900002,1.0,0.5,6.14,0.0,1.0,47.040001,2.5,0.0,-73.992439,40.748497,-73.959633,40.766949,POINT (-73.99243927001953 40.748497009277344),POINT (-73.95963287353516 40.76694869995117),1086,Midtown-Midtown South High school,261.694336,2024-11-30 23:00:00,2024-11-30 23:00:00,-2.1,0
3,2024-11-30 23:24:37,POINT (-8235883.418 4976979.313),2,2024-11-30 23:51:15,1,3.19,1,0,1,24.0,1.0,0.5,2.0,0.0,1.0,31.0,2.5,0.0,-73.9842,40.759819,-73.997383,40.72834,POINT (-73.98419952392578 40.75981903076172),POINT (-73.99738311767578 40.72834014892578),1061,Midtown-Midtown South High school,332.642377,2024-11-30 23:00:00,2024-11-30 23:00:00,-2.1,0
4,2024-11-30 23:30:58,POINT (-8236393.847 4976052.638),2,2024-11-30 23:52:32,1,3.93,1,0,1,24.0,1.0,0.5,5.8,0.0,1.0,34.799999,2.5,0.0,-73.988785,40.753513,-73.973045,40.791706,POINT (-73.98878479003906 40.75351333618164),POINT (-73.9730453491211 40.79170608520508),1063,Midtown-Midtown South High school,619.345908,2024-12-01 00:00:00,2024-12-01 00:00:00,-2.5,0


Unnamed: 0,pickup_datetime,geometry,vendor_id,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,PU Location,DO Location,index_right,Name,dist,rounded_time
0,2024-11-30 22:31:24,POINT (-8237100.465 4977922.898),2,2024-11-30 22:39:27,1,1.53,1,0,1,10.0,1.0,0.5,3.0,0.0,1.0,18.0,2.5,0.0,-73.995132,40.766239,-73.987648,40.775967,POINT (-73.99513244628906 40.766239166259766),POINT (-73.9876480102539 40.77596664428711),899,Clinton High school,293.204848,2024-11-30 23:00:00
1,2024-11-30 22:52:48,POINT (-8236511.9 4977337.001),2,2024-11-30 23:01:19,1,0.92,1,0,1,9.3,1.0,0.5,2.86,0.0,1.0,17.16,2.5,0.0,-73.989845,40.762253,-73.999916,40.748428,POINT (-73.9898452758789 40.76225280761719),POINT (-73.99991607666016 40.74842834472656),986,Clinton Secondary School,189.147142,2024-11-30 23:00:00
2,2024-11-30 23:03:00,POINT (-8236800.662 4975315.508),2,2024-11-30 23:46:45,1,2.96,1,0,1,35.900002,1.0,0.5,6.14,0.0,1.0,47.040001,2.5,0.0,-73.992439,40.748497,-73.959633,40.766949,POINT (-73.99243927001953 40.748497009277344),POINT (-73.95963287353516 40.76694869995117),1086,Midtown-Midtown South High school,261.694336,2024-11-30 23:00:00
3,2024-11-30 23:24:37,POINT (-8235883.418 4976979.313),2,2024-11-30 23:51:15,1,3.19,1,0,1,24.0,1.0,0.5,2.0,0.0,1.0,31.0,2.5,0.0,-73.9842,40.759819,-73.997383,40.72834,POINT (-73.98419952392578 40.75981903076172),POINT (-73.99738311767578 40.72834014892578),1061,Midtown-Midtown South High school,332.642377,2024-11-30 23:00:00
4,2024-11-30 23:30:58,POINT (-8236393.847 4976052.638),2,2024-11-30 23:52:32,1,3.93,1,0,1,24.0,1.0,0.5,5.8,0.0,1.0,34.799999,2.5,0.0,-73.988785,40.753513,-73.973045,40.791706,POINT (-73.98878479003906 40.75351333618164),POINT (-73.9730453491211 40.79170608520508),1063,Midtown-Midtown South High school,619.345908,2024-12-01 00:00:00


## Events join



In [None]:
import numpy as np
taxi_data = dropoff_gdf.copy()
time_window = pd.Timedelta(weeks=520)
# Initialize new columns
taxi_data['closest_event_id'] = None
taxi_data['closest_event_distance'] = None
taxi_data['closest_event_time_diff'] = None  # in hours

for idx, taxi in taxi_data.iterrows():
    # Filter events within time window
    time_mask = (merged['Start Date/Time'] >= taxi['dropoff_datetime'] - time_window) & \
                (merged['Start Date/Time'] <= taxi['dropoff_datetime'] + time_window)
    nearby_events = merged[time_mask].copy()
    
    if not nearby_events.empty:
        # Calculate distances
        nearby_events['distance_to_taxi'] = nearby_events.geometry.distance(taxi['geometry'])
        
        # Find closest event
        closest_idx = nearby_events['distance_to_taxi'].idxmin()
        closest_event = nearby_events.loc[closest_idx]
        
        # Update taxi row
        taxi_data.at[idx, 'closest_event_id'] = closest_idx
        taxi_data.at[idx, 'closest_event_distance'] = closest_event['distance_to_taxi']
        taxi_data.at[idx, 'closest_event_time_diff'] = (
            closest_event['Start Date/Time'] - taxi['dropoff_datetime']).total_seconds() / 3600

In [30]:
large = gpd.sjoin(pickup_gdf, events_gdf_proj, how='left', distance='1000', predicate='dwithin')

In [31]:
display(large.head())

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,PU Location,DO Location,geometry,index_right,Start Date/Time,Event Name
0,2,2024-12-01 00:12:27,2024-12-01 00:31:12,1,9.76,1,0,1,38.0,6.0,0.5,4.72,0.0,1.0,51.970001,0.0,1.75,-73.873627,40.774376,-73.995247,40.695797,POINT (-73.87362670898438 40.774375915527344),POINT (-73.99524688720703 40.695796966552734),POINT (-8223574.508 4979118.908),,NaT,
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),1520.0,2017-11-25 08:00:00,flea market
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),6954.0,2017-11-13 11:00:00,South Village Farmer's Market
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),2812.0,2017-11-14 01:00:00,Ai WeiWei Installation
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),3627.0,2017-11-19 08:00:00,flea market


In [32]:
import numpy as np
large['time_diff'] = np.abs((large['Start Date/Time'] - large['pickup_datetime']).dt.total_seconds() / 3600)
# large = large[large['time_diff'] <= 5]  # Filter for time difference within 1 hour

In [33]:
display(large.head())

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,PU Location,DO Location,geometry,index_right,Start Date/Time,Event Name,time_diff
0,2,2024-12-01 00:12:27,2024-12-01 00:31:12,1,9.76,1,0,1,38.0,6.0,0.5,4.72,0.0,1.0,51.970001,0.0,1.75,-73.873627,40.774376,-73.995247,40.695797,POINT (-73.87362670898438 40.774375915527344),POINT (-73.99524688720703 40.695796966552734),POINT (-8223574.508 4979118.908),,NaT,,
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),1520.0,2017-11-25 08:00:00,flea market,61503.934444
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),6954.0,2017-11-13 11:00:00,South Village Farmer's Market,61788.934444
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),2812.0,2017-11-14 01:00:00,Ai WeiWei Installation,61774.934444
1,2,2024-11-30 23:56:04,2024-12-01 00:28:15,1,7.62,1,0,1,37.299999,1.0,0.5,8.46,0.0,1.0,50.759998,2.5,0.0,-74.008987,40.735035,-73.940773,40.818256,POINT (-74.00898742675781 40.73503494262695),POINT (-73.9407730102539 40.81825637817383),POINT (-8238642.794 4973337.586),3627.0,2017-11-19 08:00:00,flea market,61647.934444
