In [2]:
import polars as pl

In [7]:
column_rename_mapping = {
    "VendorID": "vendor_id",
    "RatecodeID": "rate_code_id",
    "PULocationID": "pickup_location_id",
    "DOLocationID": "dropoff_location_id",
    "Airport_fee": "airport_fee",
}
march_yellow_df = pl.read_parquet("./../data/yellow_tripdata_2024-03.parquet").rename(
    column_rename_mapping
)

## Joining Dataframes

In [8]:
location_id_summary_df = (
    march_yellow_df.group_by(
        "pickup_location_id"
    ).agg(
        pl.len().alias("count_trips"),
        pl.col("trip_distance").min().name.suffix("_min"),
        pl.col("trip_distance").mean().name.suffix("_mean"),
        pl.col("trip_distance").max().name.suffix("_max")
    )
)
location_id_summary_df.head()

pickup_location_id,count_trips,trip_distance_min,trip_distance_mean,trip_distance_max
i32,u32,f64,f64,f64
25,1933,0.0,4.523895,63.16
87,19155,0.0,4.239439,833.2
69,548,0.0,5.367555,28.3
188,1190,0.0,5.240647,32.2
10,1304,0.0,10.631028,55.21


In [5]:
column_rename_mapping_zones = {
    "LocationID": "location_id",
    "Borough": "borough",
    "Zone": "zone",
}
zones_df = pl.read_parquet("./../data/taxi_zone_lookup.parquet").rename(column_rename_mapping_zones)
zones_df.head()

location_id,borough,zone,service_zone
i32,str,str,str
1,"""EWR""","""Newark Airport""","""EWR"""
2,"""Queens""","""Jamaica Bay""","""Boro Zone"""
3,"""Bronx""","""Allerton/Pelham Gardens""","""Boro Zone"""
4,"""Manhattan""","""Alphabet City""","""Yellow Zone"""
5,"""Staten Island""","""Arden Heights""","""Boro Zone"""


In [9]:
location_id_summary_df.join(
    zones_df,
    left_on="pickup_location_id",
    right_on="location_id"
)

pickup_location_id,count_trips,trip_distance_min,trip_distance_mean,trip_distance_max,borough,zone,service_zone
i32,u32,f64,f64,f64,str,str,str
1,372,0.0,1.34578,35.75,"""EWR""","""Newark Airport""","""EWR"""
2,8,0.0,11.98,20.23,"""Queens""","""Jamaica Bay""","""Boro Zone"""
3,151,0.0,7.159073,35.1,"""Bronx""","""Allerton/Pelham Gardens""","""Boro Zone"""
4,7586,0.0,2.742518,37.6,"""Manhattan""","""Alphabet City""","""Yellow Zone"""
5,1,0.0,0.0,0.0,"""Staten Island""","""Arden Heights""","""Boro Zone"""
…,…,…,…,…,…,…,…
261,18849,0.0,4.178514,49.06,"""Manhattan""","""World Trade Center""","""Yellow Zone"""
262,48890,0.0,8.245881,176329.23,"""Manhattan""","""Yorkville East""","""Yellow Zone"""
263,68956,0.0,8.095303,176836.3,"""Manhattan""","""Yorkville West""","""Yellow Zone"""
264,11567,0.0,3.261509,56.2,"""Unknown""","""N/A""","""N/A"""


In [11]:
march_yellow_df.select(
    "tpep_pickup_datetime",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "total_amount"
).join(
    zones_df,
    left_on="pickup_location_id",
    right_on="location_id"
)

tpep_pickup_datetime,pickup_location_id,dropoff_location_id,trip_distance,total_amount,borough,zone,service_zone
datetime[ns],i32,i32,f64,f64,str,str,str
2024-03-01 00:18:51,142,239,1.3,16.3,"""Manhattan""","""Lincoln Square East""","""Yellow Zone"""
2024-03-01 00:26:00,238,24,1.1,15.2,"""Manhattan""","""Upper West Side North""","""Yellow Zone"""
2024-03-01 00:09:22,263,75,0.86,10.4,"""Manhattan""","""Yorkville West""","""Yellow Zone"""
2024-03-01 00:33:45,164,162,0.82,14.19,"""Manhattan""","""Midtown South""","""Yellow Zone"""
2024-03-01 00:05:43,263,7,4.9,30.4,"""Manhattan""","""Yorkville West""","""Yellow Zone"""
…,…,…,…,…,…,…,…
2024-03-31 23:16:45,130,218,4.27,15.77,"""Queens""","""Jamaica""","""Boro Zone"""
2024-03-31 23:29:28,79,100,0.0,24.8,"""Manhattan""","""East Village""","""Yellow Zone"""
2024-03-31 23:15:00,63,181,6.44,31.5,"""Brooklyn""","""Cypress Hills""","""Boro Zone"""
2024-03-31 23:27:53,161,148,3.88,31.58,"""Manhattan""","""Midtown Center""","""Yellow Zone"""


In [12]:
zone_columns = [
    "borough",
    "zone",
    "service_zone",
]

march_yellow_df.select(
    "tpep_pickup_datetime",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "total_amount"
).join(
    zones_df,
    left_on="pickup_location_id",
    right_on="location_id"
).with_columns(
    pl.col(zone_columns).name.prefix("pickup_")
).drop(zone_columns)

tpep_pickup_datetime,pickup_location_id,dropoff_location_id,trip_distance,total_amount,pickup_borough,pickup_zone,pickup_service_zone
datetime[ns],i32,i32,f64,f64,str,str,str
2024-03-01 00:18:51,142,239,1.3,16.3,"""Manhattan""","""Lincoln Square East""","""Yellow Zone"""
2024-03-01 00:26:00,238,24,1.1,15.2,"""Manhattan""","""Upper West Side North""","""Yellow Zone"""
2024-03-01 00:09:22,263,75,0.86,10.4,"""Manhattan""","""Yorkville West""","""Yellow Zone"""
2024-03-01 00:33:45,164,162,0.82,14.19,"""Manhattan""","""Midtown South""","""Yellow Zone"""
2024-03-01 00:05:43,263,7,4.9,30.4,"""Manhattan""","""Yorkville West""","""Yellow Zone"""
…,…,…,…,…,…,…,…
2024-03-31 23:16:45,130,218,4.27,15.77,"""Queens""","""Jamaica""","""Boro Zone"""
2024-03-31 23:29:28,79,100,0.0,24.8,"""Manhattan""","""East Village""","""Yellow Zone"""
2024-03-31 23:15:00,63,181,6.44,31.5,"""Brooklyn""","""Cypress Hills""","""Boro Zone"""
2024-03-31 23:27:53,161,148,3.88,31.58,"""Manhattan""","""Midtown Center""","""Yellow Zone"""
