In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import holidays
import dask.dataframe as dd

In [2]:
taxi_zones = gpd.read_file("data/taxi-zones/taxi_zones.shp")
schools = gpd.read_file("data/school-locations/SchoolPoints_APS_2024_08_28.shp").to_crs(taxi_zones.crs)

# aggregating the data about the number of schools for each taxi zone
schools_per_zone = gpd.sjoin(schools, taxi_zones, how="inner", predicate="within").groupby("LocationID").size().reset_index(name='number_of_schools')

In [3]:
def extract_coords(geom_str):
    try:
        coords_str = geom_str.split('(')[1].split(')')[0].split()
        lon = float(coords_str[0])
        lat = float(coords_str[1])
        return Point(lon, lat)
    except:
        return None

In [4]:
# aggregating the data about the number of points of interest for each taxi zone
poi_df = pd.read_csv("data/points-of-interest.csv")
poi_df["geometry"] =  poi_df['the_geom'].apply(extract_coords)
poi_gdf = gpd.GeoDataFrame(poi_df, geometry='geometry', crs="EPSG:4326").to_crs(taxi_zones.crs)
pois_per_zone = gpd.sjoin(poi_gdf, taxi_zones, how="inner", predicate="within").groupby("LocationID").size().reset_index(name='number_of_pois')

In [5]:
# aggregating the data about the number of universities for each taxi zone
university_df = pd.read_csv("data/universities.csv")
university_df["geometry"] =  university_df['the_geom'].apply(extract_coords)
university_gdf = gpd.GeoDataFrame(university_df, geometry='geometry', crs="EPSG:4326").to_crs(taxi_zones.crs)
universities_per_zone = gpd.sjoin(university_gdf, taxi_zones, how="inner", predicate="within").groupby("LocationID").size().reset_index(name='number_of_universities')

In [6]:
merged_df = taxi_zones[['LocationID', 'zone', 'borough']].copy()

merged_df = pd.merge(merged_df, schools_per_zone, on='LocationID', how='left')
merged_df = pd.merge(merged_df, pois_per_zone, on='LocationID', how='left')
merged_df = pd.merge(merged_df, universities_per_zone, on='LocationID', how='left')

merged_df['number_of_schools'] = merged_df['number_of_schools'].fillna(0).astype(int)
merged_df['number_of_pois'] = merged_df['number_of_pois'].fillna(0).astype(int)
merged_df['number_of_universities'] = merged_df['number_of_universities'].fillna(0).astype(int)

zones_final = merged_df[['LocationID', 'zone', 'borough', 'number_of_schools', 'number_of_pois', 'number_of_universities']]

zones_final.head()

Unnamed: 0,LocationID,zone,borough,number_of_schools,number_of_pois,number_of_universities
0,1,Newark Airport,EWR,0,0,0
1,2,Jamaica Bay,Queens,0,30,0
2,3,Allerton/Pelham Gardens,Bronx,6,46,0
3,4,Alphabet City,Manhattan,7,133,0
4,5,Arden Heights,Staten Island,2,8,0


In [7]:
zones_final.to_csv("zones_final.csv")

In [8]:
from dask.distributed import Client, LocalCluster

client = Client(n_workers=4, threads_per_worker=1, memory_limit='8GB')
print(f"Dask Dashboard link: {client.dashboard_link}")

Dask Dashboard link: http://127.0.0.1:8787/status


In [9]:
taxi_data = dd.read_parquet("data/taxi-data/*.parquet")
zones_data = dd.read_csv("zones_final.csv")
weather_data = dd.read_csv("data/weather-data.csv", skiprows = 3)

In [10]:
us_holidays = holidays.US() 

def check_holiday(date):
    if pd.isna(date):
        return False 
    return date in us_holidays

def is_holiday(date_series):
    return date_series.apply(check_holiday, meta=('tpep_pickup_datetime_rounded', 'bool'))

In [11]:
taxi_data['tpep_pickup_datetime_rounded'] = taxi_data['tpep_pickup_datetime'].dt.round('h')
weather_data['time'] = dd.to_datetime(weather_data['time']).dt.round('h')
taxi_data['is_holiday'] = taxi_data['tpep_pickup_datetime_rounded'].apply(
    check_holiday,
    meta=pd.Series([], dtype='bool', name='is_holiday')
)

merged_df = dd.merge(taxi_data, zones_data, left_on='PULocationID', right_on='LocationID', how='left')
merged_df = dd.merge(merged_df, zones_data, left_on='DOLocationID', right_on='LocationID', how='left', suffixes=('_pickup', '_dropoff'))
merged_df = dd.merge(merged_df, weather_data, left_on='tpep_pickup_datetime_rounded', right_on='time', how='left')
merged_df = merged_df.drop(columns=['LocationID_pickup', 'LocationID_dropoff', 'time', 'Unnamed: 0_pickup', 'Unnamed: 0_dropoff', 'tpep_pickup_datetime_rounded'])

merged_df["airport_fee"] = merged_df["airport_fee"].astype("float")

+------------------------------------------+----------------+----------------+
| Merge columns                            | left dtype     | right dtype    |
+------------------------------------------+----------------+----------------+
| ('tpep_pickup_datetime_rounded', 'time') | datetime64[us] | datetime64[ns] |
+------------------------------------------+----------------+----------------+
Cast dtypes explicitly to avoid unexpected results.


In [12]:
merged_df.to_parquet("data/merged_output_parquet/")

