Part 1: Data Ingestion

In [None]:
import requests
from pathlib import Path
import polars as pl
import duckdb
import plotly.express as px

# Remote sources: January 2024 yellow-taxi trips and the taxi zone lookup table
taxi_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

# Local layout: raw downloads live in <working directory>/data/raw
BASE_DIR = Path.cwd().resolve()
data_dir = BASE_DIR.joinpath("data", "raw")
taxi_path = data_dir.joinpath("yellow_tripdata_2024-01.parquet")
zone_path = data_dir.joinpath("taxi_zone_lookup.csv")

# Create the raw-data directory (and any missing parents) if it is not there yet
data_dir.mkdir(parents=True, exist_ok=True)

# Download Files and write to specified paths
def download_file(url, path):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The response body is streamed to a sibling ``<name>.part`` file and only
    renamed onto ``path`` once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at ``path`` that a later
    run would skip as "already downloaded".

    Args:
        url: HTTP(S) address of the file to fetch.
        path: Destination file; a ``pathlib.Path`` or anything ``Path()`` accepts.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    path = Path(path)
    if path.exists():
        return

    partial_path = path.with_name(path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after every chunk has been written.
        partial_path.replace(path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial_path.unlink(missing_ok=True)

# Fetch both source files; download_file skips any path that already exists.
for source_url, target_path in ((taxi_url, taxi_path), (zone_url, zone_path)):
    download_file(source_url, target_path)
print("\nFiles downloaded successfully!")

In [None]:
# Timestamp columns whose dtype is checked by validate_data
datetime_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

# Every column the notebook reads from the trip file, timestamps first
expected_columns = [
    *datetime_columns,
    "PULocationID",
    "DOLocationID",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "payment_type",
]

# Load only the expected columns from the downloaded parquet file with Polars
df = pl.read_parquet(taxi_path, columns=expected_columns)

def validate_data(df):
    """Check that ``df`` has the expected schema and return it unchanged.

    Uses the module-level ``expected_columns`` and ``datetime_columns`` lists.
    Only column presence and dtypes are checked; individual values are not
    inspected.

    Args:
        df: Polars DataFrame holding the loaded trip data.

    Returns:
        The same ``df`` that was passed in.

    Raises:
        ValueError: If any name in ``expected_columns`` is absent from ``df``.
        TypeError: If a column in ``datetime_columns`` is not a Datetime dtype.
    """
    # Sorted so the error message is stable from run to run.
    missing_cols = sorted(set(expected_columns) - set(df.columns))
    if missing_cols:
        raise ValueError(f"Missing expected columns: {missing_cols}")

    for col in datetime_columns:
        dtype = df[col].dtype
        if dtype != pl.Datetime:
            raise TypeError(f"Column {col!r} has dtype {dtype}; expected Datetime")
    return df

# Print Row Count and Summary
def print_summary(df):
    """Print the row count and shape of ``df`` followed by a success banner."""
    row_count = len(df)
    report = [
        "\n=== Dataset Summary ===",
        f"Total rows: {row_count:,}",
        f"Shape: {df.shape}",
        "\nData Validated Successfully!",
    ]
    print("\n".join(report))

# Validate first: validate_data raises on a schema problem, so the summary
# (which ends with a "validated" banner) is only printed for a checked frame.
df = validate_data(df)
print_summary(df)

Part 2: Data Transformation & Analysis

In [None]:
# Remove rows with nulls
def remove_nulls(df):
    """Drop rows that have a null in any of the critical columns.

    Returns:
        A ``(cleaned_df, dropped)`` tuple, where ``dropped`` is the number of
        rows that were removed.
    """
    required = [
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "PULocationID",
        "DOLocationID",
        "fare_amount",
    ]
    cleaned = df.drop_nulls(required)
    return cleaned, df.height - cleaned.height

# Filter out invalid trips tracking reasons for removal
def filter_trips(df):
    """Apply the distance and fare sanity filters one after another.

    The filters run in a fixed order, so each count only covers rows that
    survived the previous filter:

    1. keep ``trip_distance > 0``
    2. keep ``fare_amount > 0`` (zero fares are dropped along with negatives)
    3. keep ``fare_amount <= 500``

    Returns:
        ``(filtered_df, invalid_distance, negative_fare, exceeding_max)`` with
        one removal count per filter, in the order listed above.
    """
    rules = (
        pl.col("trip_distance") > 0,
        pl.col("fare_amount") > 0,
        pl.col("fare_amount") <= 500,
    )

    dropped = []
    for rule in rules:
        rows_before = df.height
        df = df.filter(rule)
        dropped.append(rows_before - df.height)

    invalid_distance, negative_fare, exceeding_max = dropped
    return df, invalid_distance, negative_fare, exceeding_max

# Filter out trips with dropoff before pickup
def filter_time(df):
    """Keep only trips whose dropoff is at or after the pickup.

    Trips with identical pickup and dropoff timestamps are kept.

    Returns:
        A ``(filtered_df, dropped)`` tuple, where ``dropped`` is the number of
        rows that were removed.
    """
    dropoff_not_before_pickup = (
        pl.col("tpep_dropoff_datetime") >= pl.col("tpep_pickup_datetime")
    )
    kept = df.filter(dropoff_not_before_pickup)
    return kept, df.height - kept.height

# Print summary of removals
def save_and_print(df, total_removed, removed_nulls, invalid_distance, negative_fare, exceeding_max, removed_time):
    """Print how many rows each cleaning step removed.

    Despite the name, nothing is written to disk and ``df`` is not used; the
    parameter is kept so existing call sites continue to work.
    """
    report = (
        ("Total rows removed", total_removed),
        ("Removed null values", removed_nulls),
        ("Removed invalid distances", invalid_distance),
        ("Removed negative fares", negative_fare),
        ("Removed exceeding $500", exceeding_max),
        ("Removed invalid times", removed_time),
    )

    print("\n=== Cleaned Dataset Summary ===")
    for label, count in report:
        print(f"{label}: {count:,}")

# Row count before any cleaning; the baseline for total_removed below.
original_rows = df.height

# The cleaning steps run in sequence and each one rebinds df, so every
# per-step count is measured against the rows left by the previous step.
df, removed_nulls = remove_nulls(df)
df, invalid_distance, negative_fare, exceeding_max = filter_trips(df)
df, removed_time = filter_time(df)

# Overall reduction across all cleaning steps.
total_removed = original_rows - df.height

save_and_print(df, total_removed, removed_nulls, invalid_distance, negative_fare, exceeding_max, removed_time)

The data cleaning summary above shows that the critical columns of the dataset contained no null values. Invalid (non-positive) trip distances accounted for around 63.9% of the rows removed from the dataset. The fare amount was the second most error-prone column, accounting for 36.1% of the rows removed, mostly because of zero or negative amounts, with a small number of fares exceeding $500. Finally, only 56 of the 94,522 removed rows were dropped because the dropoff time preceded the pickup time.

In [None]:
# Create derived columns for trip duration, pickup hour, day of week, and trip speed
def create_derived_columns(df):
    """Add trip duration, pickup hour, pickup weekday and average speed columns.

    New columns:
        trip_duration_minutes: dropoff minus pickup, in minutes.
        pickup_hour: hour component of the pickup timestamp.
        pickup_day_of_week: weekday name of the pickup timestamp (``%A``).
        trip_speed_mph: trip_distance divided by the duration in hours, or 0
            when the duration is not positive.
    """
    pickup = pl.col("tpep_pickup_datetime")
    dropoff = pl.col("tpep_dropoff_datetime")

    duration_minutes = ((dropoff - pickup).dt.total_seconds() / 60).alias("trip_duration_minutes")
    hour_of_day = pickup.dt.hour().alias("pickup_hour")
    weekday_name = pickup.dt.strftime("%A").alias("pickup_day_of_week")

    # The speed reads trip_duration_minutes, so it has to be computed in a
    # second with_columns call, after that column exists.
    duration_hours = pl.col("trip_duration_minutes") / 60
    speed_mph = (
        pl.when(pl.col("trip_duration_minutes") > 0)
        .then(pl.col("trip_distance") / duration_hours)
        .otherwise(0)
        .alias("trip_speed_mph")
    )

    with_time_features = df.with_columns([duration_minutes, hour_of_day, weekday_name])
    return with_time_features.with_columns([speed_mph])
        
# Add the derived columns to the cleaned frame, then print the schema so the
# new column names and dtypes are visible.
df = create_derived_columns(df)
print("\nDerived columns created successfully!\n")
print(df.schema)

In [None]:
# Cleaned output goes to <working directory>/data/clean
BASE_DIR = Path.cwd().resolve()
data_dir = BASE_DIR.joinpath("data", "clean")
output_path = data_dir.joinpath("yellow_tripdata_2024-01_clean.parquet")

# Make sure the target directory exists, then write the cleaned frame as parquet
data_dir.mkdir(parents=True, exist_ok=True)
df.write_parquet(output_path)

print("\nCleaned data saved successfully!")

In [None]:
# Create a DuckDB connection (no database file is given, so it is in-memory)
con = duckdb.connect()

# Load the zone lookup from the exact path the ingestion step downloaded it to,
# instead of a relative string that only resolves while the working directory
# is unchanged
zones = pl.read_csv(zone_path)

# Register the Polars DataFrames (as Arrow tables) under the names used in the SQL below
con.register("trips", df.to_arrow())
con.register("zones", zones.to_arrow())

The following query shows the top 10 busiest pickup zones, ranked in descending order by the total number of trips that started in each zone.

In [None]:
# Top 10 pickup zones by trip count.
# - The inner join keeps only trips whose PULocationID has a row in zones.
# - Rows are grouped by zone name, so LocationIDs that share a name are
#   counted together.
# - fetchdf() returns the result as a pandas DataFrame.
busiest_pickup_zones = con.execute("""
    SELECT
        z.Zone,
        COUNT(*) AS total_trips
    FROM trips t
    JOIN zones z
    ON t.PULocationID = z.LocationID
    GROUP BY z.Zone
    ORDER BY total_trips DESC
    LIMIT 10;
    """).fetchdf()

print(busiest_pickup_zones)

The above query shows that the busiest pickup zone was Midtown Center with a total of 140,141 trips, while the tenth busiest was Upper West Side South with a total of 86,468 trips. The zones ranked second to ninth range from 140,121 down to 87,894 total trips.

The following query shows the average fare amount paid for trips at every pickup hour of the day ordered by the hour.

In [None]:
# Average fare_amount per pickup hour, ordered by hour.
# pickup_hour is the derived column added by create_derived_columns.
avg_fare_hourly = con.execute("""
    SELECT
        pickup_hour,
        AVG(fare_amount) AS avg_fare
    FROM trips
    GROUP BY pickup_hour
    ORDER BY pickup_hour;
    """).fetchdf()

print(avg_fare_hourly)

The above query shows that the average fare at every hour of the day stays between approximately $16 and $28. The largest average fare, approximately $27.50, occurred between 5:00 and 5:59 AM, while the lowest average fare, approximately $16.63, occurred between 2:00 and 2:59 AM.

The following query shows the percentage of total trips that were paid for by each payment type available descending by percentage.

In [None]:
# Share of trips per payment_type, as a percentage, largest first.
# SUM(COUNT(*)) OVER () is a window over the grouped rows: it adds up the
# per-group counts, giving the overall trip total used as the denominator.
# Multiplying by 100.0 keeps the division in floating point.
trips_by_payment = con.execute("""
    SELECT
        payment_type,
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () AS percentage
    FROM trips
    GROUP BY payment_type
    ORDER BY percentage DESC;
    """).fetchdf()

print(trips_by_payment)

The above query shows that payment type 1 (Credit Card) was the most used payment method, accounting for approximately 80.1% of total trips, followed by payment type 2 (Cash) with approximately 14.7% of total trips. Payment type 0 (Unknown) accounted for approximately 4.0% of total trips, while payment types 4 (Dispute) and 3 (No Charge) accounted for 0.8% and 0.4% of total trips respectively.

The following query shows the average tip as a percentage of the fare amount for each day of the week, restricted to trips paid by credit card and ordered from Monday to Sunday.

In [None]:
# Average tip as a percentage of fare_amount per pickup weekday, restricted
# to payment_type = 1 (described as credit card in this notebook).
# - The value is the mean of per-trip tip/fare ratios, not total tips divided
#   by total fares.
# - The CASE guard yields NULL when fare_amount is not positive; AVG skips
#   NULLs, so such rows cannot cause a division by zero.
# - The ORDER BY CASE sorts Monday..Sunday; ordering by the name column
#   itself would sort alphabetically.
tip_percentage_card = con.execute("""
    SELECT
        pickup_day_of_week,
        AVG(CASE WHEN fare_amount > 0 THEN tip_amount / fare_amount END) * 100 AS avg_tip_percentage
    FROM trips
    WHERE payment_type = 1
    GROUP BY pickup_day_of_week
    ORDER BY CASE pickup_day_of_week
        WHEN 'Monday' THEN 1
        WHEN 'Tuesday' THEN 2
        WHEN 'Wednesday' THEN 3
        WHEN 'Thursday' THEN 4
        WHEN 'Friday' THEN 5
        WHEN 'Saturday' THEN 6
        WHEN 'Sunday' THEN 7
    END;
    """).fetchdf()

print(tip_percentage_card)

The above query shows that the average tip percentage for trips paid by credit card remained fairly consistent across the week, ranging from approximately 25.1% to 29.7% of the fare amount. Credit-card trips therefore received a substantial average tip on every day of the week, with the highest value on Thursdays at approximately 29.7% and a very slight increase on Saturdays at approximately 26.3%.

The following query shows the top 5 most common pickup and dropoff zone pairs, ranked in descending order by the number of trips made from the pickup zone to the dropoff zone.

In [None]:
# Top 5 (pickup zone, dropoff zone) pairs by trip count.
# zones is joined twice under different aliases: zp resolves the pickup ID
# and zd the dropoff ID. Both are inner joins, so a trip is counted only if
# both of its location IDs have a row in zones. Pairs are directional:
# A -> B and B -> A are separate groups.
common_trip_routes = con.execute("""
    SELECT
        zp.Zone AS pickup_zone,
        zd.Zone AS dropoff_zone,
        COUNT(*) AS trip_count
    FROM trips t
    JOIN zones zp
        ON t.PULocationID = zp.LocationID
    JOIN zones zd
        ON t.DOLocationID = zd.LocationID
    GROUP BY pickup_zone, dropoff_zone
    ORDER BY trip_count DESC
    LIMIT 5;
    """).fetchdf()

print(common_trip_routes)

The above query shows that the most common pickup and dropoff zone pairs were trips from Upper East Side South to Upper East Side North and vice versa, with 21,641 and 19,199 total trips respectively. These are followed closely by trips that start and end within Upper East Side North and within Upper East Side South, with 15,193 and 14,115 trips respectively. Finally, the fifth most common zone pair consisted of trips from Midtown Center to Upper East Side South. These results show that travel between and within Upper East Side North and South made up the most common routes recorded in the dataset.

Part 3: Dashboard Development

In [None]:
# Convert the Polars DataFrame to pandas for the visualizations.
# The guard keeps this cell re-runnable: after the first run df is already a
# pandas DataFrame, which has no to_pandas() method.
if isinstance(df, pl.DataFrame):
    df = df.to_pandas()

# value_counts() sorts by count in descending order, so head(10) is the top 10
top_zones = (
    df["PULocationID"]
    .value_counts()
    .head(10)
    .reset_index()
)
top_zones.columns = ["PULocationID", "count"]

fig1 = px.bar(
    top_zones,
    x="PULocationID",
    y="count",
    title="Top 10 Pickup Zones by Trip Count"
)
# Zone IDs are labels, not quantities. A category axis draws the ten bars side
# by side in descending-count order instead of scattering them along a
# continuous numeric scale.
fig1.update_xaxes(type="category")
fig1.show()