In [1]:
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd

In [2]:
# Start a local Dask cluster with one worker process running one thread,
# with a 25GB memory limit, and attach a client to it.
cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=1,
    memory_limit="25GB",
)
client = Client(cluster)

# Print the dashboard URL so the run can be monitored from a browser.
print("Dask client dashboard link:", client.dashboard_link)

Dask client dashboard link: http://127.0.0.1:8787/status


In [4]:
def fetch_yellow_taxi_data(columns):
    """Build a Dask DataFrame of yellow-taxi trips picked up in 2012-2024.

    Args:
        columns: Column selection forwarded to ``dd.read_parquet``. The
            "tpep_pickup_datetime" column is read afterwards to derive the
            year, so it has to be part of the selection.

    Returns:
        The Dask DataFrame restricted to pickup years 2012 through 2024
        (inclusive), with an added int32 "Year" column.
    """
    trips = dd.read_parquet(
        "data/trip_record_data/yellow_taxi/*.parquet",
        columns=columns,
        engine="pyarrow",
    )

    # Derive the pickup year once; it is both the filter key and an output column.
    trips["Year"] = trips["tpep_pickup_datetime"].dt.year.astype("int32")

    pickup_year = trips["Year"]
    in_range = (pickup_year >= 2012) & (pickup_year <= 2024)
    return trips[in_range]


def fetch_green_taxi_data(columns):
    """Build a Dask DataFrame of green-taxi trips picked up in 2014-2024.

    Args:
        columns: Column selection forwarded to ``dd.read_parquet``. The
            "lpep_pickup_datetime" column is read afterwards to derive the
            year, so it has to be part of the selection.

    Returns:
        The Dask DataFrame restricted to pickup years 2014 through 2024
        (inclusive), with an added int32 "Year" column.
    """
    trips = dd.read_parquet(
        "data/trip_record_data/green_taxi/*.parquet",
        columns=columns,
        engine="pyarrow",
    )

    # Derive the pickup year once; it is both the filter key and an output column.
    trips["Year"] = trips["lpep_pickup_datetime"].dt.year.astype("int32")

    pickup_year = trips["Year"]
    in_range = (pickup_year >= 2014) & (pickup_year <= 2024)
    return trips[in_range]


def fetch_for_hire_data(columns=None):
    """Build a Dask DataFrame of for-hire-vehicle trips picked up in 2015-2024.

    Args:
        columns: Optional column selection forwarded to ``dd.read_parquet``.
            Defaults to ``None``, which reads every column (the behaviour of
            the original zero-argument call). When a selection is given it
            has to include "pickup_datetime", which is read to derive the year.

    Returns:
        The Dask DataFrame restricted to pickup years 2015 through 2024
        (inclusive), with an added int32 "Year" column.
    """
    file_path = "data/trip_record_data/for_hire/*.parquet"

    # Accept the same `columns` argument as the other fetch_* loaders so
    # callers can skip columns they do not need.
    ddf = dd.read_parquet(file_path, columns=columns, engine="pyarrow")

    ddf["Year"] = ddf["pickup_datetime"].dt.year.astype("int32")
    ddf = ddf[(ddf["Year"] >= 2015) & (ddf["Year"] <= 2024)]

    return ddf


def fetch_High_volume_data(columns):
    """Build a Dask DataFrame of high-volume for-hire trips picked up in 2021-2024.

    Args:
        columns: Column selection forwarded to ``dd.read_parquet``. The
            "pickup_datetime" column is read afterwards to derive the year,
            so it has to be part of the selection.

    Returns:
        The Dask DataFrame restricted to pickup years 2021 through 2024
        (inclusive), with an added int32 "Year" column.
    """
    trips = dd.read_parquet(
        "data/trip_record_data/high_volume/*.parquet",
        columns=columns,
        engine="pyarrow",
    )

    # Derive the pickup year once; it is both the filter key and an output column.
    trips["Year"] = trips["pickup_datetime"].dt.year.astype("int32")

    pickup_year = trips["Year"]
    in_range = (pickup_year >= 2021) & (pickup_year <= 2024)
    return trips[in_range]

In [None]:
# Clean the yellow-taxi trips: load the needed columns, drop rows matching
# any of the rejection rules below, and write the remainder to parquet.
columns = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "fare_amount",
    "total_amount",
]

ddf = fetch_yellow_taxi_data(columns)

# Each caseN mask is True for rows that should be REJECTED.
case1 = ddf["tpep_pickup_datetime"] == ddf["tpep_dropoff_datetime"]  # zero duration
case2 = ddf["tpep_pickup_datetime"] > ddf["tpep_dropoff_datetime"]  # negative duration
case3 = ddf["trip_distance"] <= 0  # non-positive distance
case4 = ddf["trip_distance"] > 100  # distance above the upper cutoff
case5 = ddf["passenger_count"] == 0  # no passengers
# Upper cutoff on passengers. This was `<= 10`, which rejected every trip
# with 10 or fewer passengers; like case4/case8 it must flag values ABOVE
# the cutoff.
case6 = ddf["passenger_count"] > 10
case7 = ddf["fare_amount"] <= 0  # non-positive fare
case8 = ddf["fare_amount"] > 350  # fare above the upper cutoff

# Keep only rows that match none of the rejection masks.
selected_rows = ddf[~(case1 | case2 | case3 | case4 | case5 | case6 | case7 | case8)]

selected_rows.to_parquet("data/trip_record_data_filtered/yellow_taxi")

In [None]:
# Clean the green-taxi trips: load the needed columns, drop rows matching
# any of the rejection rules below, and write the remainder to parquet.
columns = [
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "PULocationID",
    "DOLocationID",
    "passenger_count",
    "trip_distance",
    "payment_type",
    "fare_amount",
    "total_amount",
    "trip_type",
]

ddf = fetch_green_taxi_data(columns)

# Each caseN mask is True for rows that should be REJECTED.
case1 = ddf["lpep_pickup_datetime"] == ddf["lpep_dropoff_datetime"]  # zero duration
case2 = ddf["lpep_pickup_datetime"] > ddf["lpep_dropoff_datetime"]  # negative duration
case3 = ddf["trip_distance"] <= 0  # non-positive distance
case4 = ddf["trip_distance"] > 100  # distance above the upper cutoff
case5 = ddf["passenger_count"] == 0  # no passengers
# Upper cutoff on passengers. This was `<= 10`, which rejected every trip
# with 10 or fewer passengers; like case4/case8 it must flag values ABOVE
# the cutoff.
case6 = ddf["passenger_count"] > 10
case7 = ddf["fare_amount"] <= 0  # non-positive fare
case8 = ddf["fare_amount"] > 350  # fare above the upper cutoff

# Keep only rows that match none of the rejection masks.
selected_rows = ddf[~(case1 | case2 | case3 | case4 | case5 | case6 | case7 | case8)]

selected_rows.to_parquet("data/trip_record_data_filtered/green_taxi")

In [5]:
# Clean the for-hire trips: drop rows whose pickup is not strictly before
# the drop-off, then write the remainder to parquet.
ddf = fetch_for_hire_data()

# Cast SR_Flag to pandas' nullable integer dtype.
ddf["SR_Flag"] = ddf["SR_Flag"].astype("Int64")

# Rejection masks: True marks a row to discard.
case1 = ddf["pickup_datetime"] == ddf["dropOff_datetime"]  # zero duration
case2 = ddf["pickup_datetime"] > ddf["dropOff_datetime"]  # negative duration
invalid_trips = case1 | case2

selected_rows = ddf[~invalid_trips]

selected_rows.to_parquet("data/trip_record_data_filtered/for_hire")

In [5]:
# Clean the high-volume for-hire trips: load the needed columns, drop rows
# matching any of the rejection rules below, and write the remainder to parquet.
columns = [
    "hvfhs_license_num",
    "request_datetime",
    "on_scene_datetime",
    "pickup_datetime",
    "dropoff_datetime",
    "PULocationID",
    "DOLocationID",
    "trip_miles",
    "trip_time",
    "tolls",
    "bcf",
    "sales_tax",
    "congestion_surcharge",
    "airport_fee",
    "base_passenger_fare",
    "tips",
]

ddf = fetch_High_volume_data(columns)

# airport_fee is stored as a string in the output.
# NOTE(review): presumably a workaround for the column's type differing
# between input files - confirm before relying on it numerically.
ddf["airport_fee"] = ddf["airport_fee"].astype("str")

# Rejection masks: True marks a row to discard.
case1 = ddf["pickup_datetime"] == ddf["dropoff_datetime"]  # zero duration
case2 = ddf["pickup_datetime"] > ddf["dropoff_datetime"]  # negative duration
case3 = ddf["trip_miles"] <= 0  # non-positive distance
case4 = ddf["trip_miles"] > 100  # distance above the upper cutoff
case5 = ddf["trip_time"] <= 0  # non-positive trip time
case6 = ddf["base_passenger_fare"] <= 0  # non-positive fare
case7 = ddf["base_passenger_fare"] > 350  # fare above the upper cutoff
invalid_trips = case1 | case2 | case3 | case4 | case5 | case6 | case7

selected_rows = ddf[~invalid_trips]
selected_rows.to_parquet("data/trip_record_data_filtered/high_volume")