Part 1: Data Ingestion

In [458]:
import requests
from pathlib import Path
import polars as pl
import duckdb
import plotly.express as px

# Source URLs: the trip-record parquet file and the taxi zone lookup CSV
taxi_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

# Create <current working directory>/data/raw if it doesn't exist.
# All paths are anchored to the directory the kernel was started in.
BASE_DIR = Path.cwd().resolve()
data_dir = BASE_DIR / "data" / "raw"
data_dir.mkdir(parents=True, exist_ok=True)

# Local file paths the two downloads are written to
taxi_path = data_dir / "yellow_tripdata_2024-01.parquet"
zone_path = data_dir / "taxi_zone_lookup.csv"

# Download Files and write to specified paths
def download_file(url, path):
    """Download `url` to `path` unless `path` already exists.

    The response is streamed into a sibling ``<name>.part`` file and only
    renamed to `path` once every chunk has been written. A failed or
    interrupted download therefore never leaves a truncated file at `path`
    that the existence check would treat as a finished download next time.

    Args:
        url: HTTP(S) address of the file to fetch.
        path: ``pathlib.Path`` of the destination file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        Any other exception raised while connecting, streaming or writing is
        re-raised after the partial file has been removed.
    """
    if path.exists():
        return

    tmp_path = path.with_name(path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Rename only after the whole body was written successfully.
        tmp_path.replace(path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt in a notebook).
        tmp_path.unlink(missing_ok=True)
        raise

# Fetch both inputs. download_file returns early when the target file already
# exists, so the message below is also printed when nothing was re-downloaded.
download_file(taxi_url, taxi_path)
download_file(zone_url, zone_path)
print("\nFiles downloaded successfully!")


Files downloaded successfully!


In [459]:
# Columns this analysis needs; also used by validate_data to detect missing columns
expected_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "DOLocationID", 
           "passenger_count", "trip_distance", "fare_amount", "tip_amount", "total_amount",
           "payment_type"]

# Subset of expected_columns whose dtype validate_data checks against pl.Datetime
datetime_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

# Load only the expected columns from the downloaded parquet file with Polars
df = pl.read_parquet(taxi_path, columns=expected_columns)

def validate_data(df):
    """Check that `df` has every expected column and Datetime-typed timestamps.

    Reads the module-level `expected_columns` and `datetime_columns` lists.

    Args:
        df: frame exposing `.columns` and `df[col].dtype`.

    Returns:
        The same `df`, unchanged, when all checks pass.

    Raises:
        ValueError: if any name in `expected_columns` is absent from `df`.
        TypeError: if a column in `datetime_columns` is not of Datetime dtype.
        Both are `Exception` subclasses, so callers catching `Exception`
        keep working.
    """
    # Sorted so the error message is deterministic (set order is not).
    missing_cols = sorted(set(expected_columns) - set(df.columns))
    if missing_cols:
        raise ValueError(f"Missing expected columns: {missing_cols}")

    # This is a dtype check, not a per-value check, and the message says so.
    for col in datetime_columns:
        dtype = df[col].dtype
        if dtype != pl.Datetime:
            raise TypeError(f"Column {col!r} has dtype {dtype}, expected Datetime")
    return df

# Print Row Count and Summary
def print_summary(df):
    """Print the row count and shape of `df`, followed by a success banner.

    Args:
        df: any object supporting `len()` and exposing a `.shape` attribute.
    """
    report = (
        "\n=== Dataset Summary ===",
        f"Total rows: {len(df):,}",
        f"Shape: {df.shape}",
        "\nData Validated Successfully!",
    )
    print("\n".join(report))

# validate_data raises on a schema problem and otherwise returns the frame it was given
df = validate_data(df)
print_summary(df)


=== Dataset Summary ===
Total rows: 2,964,624
Shape: (2964624, 10)

Data Validated Successfully!


Part 2: Data Transformation & Analysis

In [460]:
# Remove rows with nulls
def remove_nulls(df):
    """Drop rows that have a null in any critical column.

    Args:
        df: frame exposing `.height` and `.drop_nulls(columns)`.

    Returns:
        Tuple `(cleaned_df, removed_nulls)` where `removed_nulls` is the
        number of rows dropped.
    """
    required = [
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "PULocationID",
        "DOLocationID",
        "fare_amount",
    ]
    cleaned = df.drop_nulls(required)
    return cleaned, df.height - cleaned.height

# Filter out invalid trips tracking reasons for removal
def filter_trips(df):
    """Apply the distance and fare sanity filters one after another.

    Rules, in order: trip_distance > 0, fare_amount > 0, fare_amount <= 500.
    Each count covers only rows that survived the earlier rules, so a row is
    attributed to the first rule it fails.

    Returns:
        Tuple `(filtered_df, invalid_distance, negative_fare, exceeding_max)`.
        `negative_fare` counts every fare that is not strictly positive,
        which includes fares of exactly zero.
    """
    rules = (
        pl.col("trip_distance") > 0,
        pl.col("fare_amount") > 0,
        pl.col("fare_amount") <= 500,
    )

    dropped = []
    for rule in rules:
        before = df.height
        df = df.filter(rule)
        dropped.append(before - df.height)

    invalid_distance, negative_fare, exceeding_max = dropped
    return df, invalid_distance, negative_fare, exceeding_max

# Filter out trips with dropoff before pickup
def filter_time(df):
    """Keep only trips whose dropoff is not earlier than their pickup.

    Trips with identical pickup and dropoff timestamps are kept.

    Returns:
        Tuple `(filtered_df, removed_time)` where `removed_time` is the
        number of rows dropped.
    """
    in_order = pl.col("tpep_dropoff_datetime") >= pl.col("tpep_pickup_datetime")
    kept = df.filter(in_order)
    return kept, df.height - kept.height

# Print summary of removals
def save_and_print(df, total_removed, removed_nulls, invalid_distance, negative_fare, exceeding_max, removed_time):
    """Print the per-reason breakdown of rows removed during cleaning.

    Despite its name this function only prints; it saves nothing, and `df`
    is accepted but not used.

    Args:
        df: unused.
        total_removed: total number of rows removed.
        removed_nulls: rows removed for nulls.
        invalid_distance: rows removed for invalid distance.
        negative_fare: rows removed for the "negative fares" rule.
        exceeding_max: rows removed for fares exceeding $500.
        removed_time: rows removed for invalid times.
    """
    breakdown = [
        ("Total rows removed", total_removed),
        ("Removed null values", removed_nulls),
        ("Removed invalid distances", invalid_distance),
        ("Removed negative fares", negative_fare),
        ("Removed exceeding $500", exceeding_max),
        ("Removed invalid times", removed_time),
    ]

    print("\n=== Cleaned Dataset Summary ===")
    for label, count in breakdown:
        print(f"{label}: {count:,}")

# Row count before any cleaning step, used for the overall removal total
original_rows = df.height

# Each step rebinds `df` to its filtered result and reports how many rows it dropped.
# The steps run in sequence, so each count only covers rows that survived the previous steps.
df, removed_nulls = remove_nulls(df)
df, invalid_distance, negative_fare, exceeding_max = filter_trips(df)
df, removed_time = filter_time(df)

total_removed = original_rows - df.height

# Prints the breakdown only; nothing is written to disk here
save_and_print(df, total_removed, removed_nulls, invalid_distance, negative_fare, exceeding_max, removed_time)

# Per-column null counts of the cleaned frame
print("\n=== Null Value Counts After Cleaning ===")
print(df.null_count())


=== Cleaned Dataset Summary ===
Total rows removed: 94,996
Removed null values: 0
Removed invalid distances: 60,371
Removed negative fares: 34,539
Removed exceeding $500: 30
Removed invalid times: 56

=== Null Value Counts After Cleaning ===
shape: (1, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ tpep_pick ┆ tpep_drop ┆ PULocatio ┆ DOLocatio ┆ … ┆ fare_amou ┆ tip_amoun ┆ total_amo ┆ payment_ │
│ up_dateti ┆ off_datet ┆ nID       ┆ nID       ┆   ┆ nt        ┆ t         ┆ unt       ┆ type     │
│ me        ┆ ime       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ ---       ┆ u32       ┆ u32       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ u32      │
│ u32       ┆ u32       ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0        

From the data cleaning summary above, the critical columns of the dataset contained no null values. Invalid (non-positive) trip distances accounted for about 63.6% of the 94,996 rows removed from the dataset. The fare amount was the second most error-prone column, responsible for about 36.4% of the removed rows: mostly non-positive fares (34,539 rows), with a small number of fares exceeding $500 (30 rows). Finally, only 56 of the 94,996 removed rows were dropped because the dropoff time preceded the pickup time.

In [461]:
# Create derived columns for trip duration, pickup hour, day of week, and trip speed
def create_derived_columns(df):
    """Add trip duration, pickup hour, pickup weekday and trip speed columns.

    New columns:
        trip_duration_minutes: (dropoff - pickup) in seconds, divided by 60.
        pickup_hour: hour component of the pickup timestamp.
        pickup_day_of_week: pickup timestamp formatted with "%A".
        trip_speed_mph: trip_distance / (duration in hours); 0 when the
            duration is not strictly positive, which avoids dividing by zero.

    Returns:
        A frame with the four columns added.
    """
    pickup = pl.col("tpep_pickup_datetime")
    dropoff = pl.col("tpep_dropoff_datetime")

    duration_minutes = ((dropoff - pickup).dt.total_seconds() / 60).alias("trip_duration_minutes")
    pickup_hour = pickup.dt.hour().alias("pickup_hour")
    pickup_day = pickup.dt.strftime("%A").alias("pickup_day_of_week")
    with_time_features = df.with_columns([duration_minutes, pickup_hour, pickup_day])

    # The speed expression reads trip_duration_minutes, so it needs a second with_columns pass.
    minutes = pl.col("trip_duration_minutes")
    speed = (
        pl.when(minutes > 0)
        .then(pl.col("trip_distance") / (minutes / 60))
        .otherwise(0)
    ).alias("trip_speed_mph")

    return with_time_features.with_columns([speed])
        
# Rebind `df` to the frame that includes the derived columns, then show the resulting schema
df = create_derived_columns(df)
print("\nDerived columns created successfully!\n")
print(df.schema)


Derived columns created successfully!

Schema([('tpep_pickup_datetime', Datetime(time_unit='ns', time_zone=None)), ('tpep_dropoff_datetime', Datetime(time_unit='ns', time_zone=None)), ('PULocationID', Int32), ('DOLocationID', Int32), ('passenger_count', Int64), ('trip_distance', Float64), ('fare_amount', Float64), ('tip_amount', Float64), ('total_amount', Float64), ('payment_type', Int64), ('trip_duration_minutes', Float64), ('pickup_hour', Int8), ('pickup_day_of_week', String), ('trip_speed_mph', Float64)])


In [None]:
# Create data/clean directory and define output path.
# A dedicated name is used so that `data_dir` keeps pointing at data/raw,
# and the BASE_DIR defined in the ingestion cell is reused instead of being
# recomputed from the current working directory.
clean_dir = BASE_DIR / "data" / "clean"
clean_dir.mkdir(parents=True, exist_ok=True)

output_path = clean_dir / "yellow_tripdata_2024-01_clean.parquet"

# Save cleaned data file (includes the derived columns)
df.write_parquet(output_path)

print("\nCleaned data saved successfully!")

In [463]:
# Create an in-memory DuckDB connection
con = duckdb.connect()

# Load the zones data into a Polars DataFrame.
# `zone_path` is the exact location the lookup CSV was downloaded to, so the
# read no longer depends on a hard-coded path relative to the working directory.
zones = pl.read_csv(zone_path)

# Register the Polars DataFrames (as Arrow tables) under the names used by the SQL queries.
# The trailing semicolon keeps the connection repr out of the cell output.
con.register("trips", df.to_arrow())
con.register("zones", zones.to_arrow());

<_duckdb.DuckDBPyConnection at 0x2052b305530>

The following query shows the top 10 busiest pickup zones, ranked in descending order by the number of trips that started in each zone.

In [464]:
# Top 10 pickup zones by number of trips.
# The inner JOIN keeps only trips whose PULocationID matches a LocationID in `zones`.
# Grouping is by zone name, so zone IDs that share a name are counted together.
busiest_pickup_zones = con.execute("""
    SELECT
        z.Zone,
        COUNT(*) AS total_trips
    FROM trips t
    JOIN zones z
    ON t.PULocationID = z.LocationID
    GROUP BY z.Zone
    ORDER BY total_trips DESC
    LIMIT 10;
    """).fetchdf()

print(busiest_pickup_zones)

                           Zone  total_trips
0                Midtown Center       140141
1         Upper East Side South       140121
2                   JFK Airport       138431
3         Upper East Side North       133962
4                  Midtown East       104345
5     Times Sq/Theatre District       102958
6  Penn Station/Madison Sq West       102153
7           Lincoln Square East       101794
8             LaGuardia Airport        87694
9         Upper West Side South        86468


The above query shows that the busiest pickup zone was Midtown Center with 140,141 trips, while the tenth busiest was Upper West Side South with 86,468 trips. The zones ranked second to ninth range from 140,121 down to 87,694 trips.

The following query shows the average fare amount paid for trips at every pickup hour of the day ordered by the hour.

In [465]:
# Mean fare_amount per pickup hour (the derived pickup_hour column), ordered by hour
avg_fare_hourly = con.execute("""
    SELECT
        pickup_hour,
        AVG(fare_amount) AS avg_fare
    FROM trips
    GROUP BY pickup_hour
    ORDER BY pickup_hour;
    """).fetchdf()

print(avg_fare_hourly)

    pickup_hour   avg_fare
0             0  19.683174
1             1  17.735544
2             2  16.629330
3             3  18.535687
4             4  23.450572
5             5  27.499000
6             6  22.027144
7             7  18.753589
8             8  17.827180
9             9  17.947561
10           10  18.050722
11           11  17.631596
12           12  17.799636
13           13  18.422035
14           14  19.274121
15           15  19.113794
16           16  19.459499
17           17  18.121259
18           18  17.015774
19           19  17.629029
20           20  18.052724
21           21  18.295795
22           22  19.112263
23           23  20.246207


The above query shows that the average fare at every hour of the day stays between approximately $16.60 and $27.50. The highest average fare, approximately $27.50, occurred between 5:00 and 5:59 AM, while the lowest average fare, approximately $16.63, occurred between 2:00 and 2:59 AM.

The following query shows the percentage of total trips that were paid for by each payment type available descending by percentage.

In [466]:
# Share of trips per payment_type code.
# SUM(COUNT(*)) OVER () is a window over the grouped rows, i.e. the total trip count,
# so each group's count is expressed as a percentage of all trips.
trips_by_payment = con.execute("""
    SELECT
        payment_type,
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () AS percentage
    FROM trips
    GROUP BY payment_type
    ORDER BY percentage DESC;
    """).fetchdf()

print(trips_by_payment)

   payment_type  percentage
0             1   80.093308
1             2   14.731143
2             0    4.014492
3             4    0.792995
4             3    0.368062


The above query shows that payment type 1 (Credit Card) was the most used payment method, accounting for approximately 80.1% of all trips, followed by payment type 2 (Cash) with approximately 14.7%. Payment type 0 (Unknown) accounted for approximately 4.0% of trips, while payment types 4 (Dispute) and 3 (No Charge) accounted for approximately 0.8% and 0.4% respectively.

The following query shows the average tip amount for each day of the week as percentage for only payments made with a credit card ordered by the day of the week.

In [467]:
# Average tip as a percentage of fare_amount, per pickup weekday, for payment_type = 1 only.
# This is the mean of per-trip tip/fare ratios, not total tips divided by total fares.
# The CASE in the SELECT skips rows with a non-positive fare to avoid dividing by zero;
# the CASE in ORDER BY sorts the weekday names Monday through Sunday instead of alphabetically.
tip_percentage_card = con.execute("""
    SELECT
        pickup_day_of_week,
        AVG(CASE WHEN fare_amount > 0 THEN tip_amount / fare_amount END) * 100 AS avg_tip_percentage
    FROM trips
    WHERE payment_type = 1
    GROUP BY pickup_day_of_week
    ORDER BY CASE pickup_day_of_week
        WHEN 'Monday' THEN 1
        WHEN 'Tuesday' THEN 2
        WHEN 'Wednesday' THEN 3
        WHEN 'Thursday' THEN 4
        WHEN 'Friday' THEN 5
        WHEN 'Saturday' THEN 6
        WHEN 'Sunday' THEN 7
    END;
    """).fetchdf()

print(tip_percentage_card)

  pickup_day_of_week  avg_tip_percentage
0             Monday           25.513977
1            Tuesday           25.729989
2          Wednesday           25.706582
3           Thursday           29.734458
4             Friday           25.595719
5           Saturday           26.293897
6             Sunday           25.100984


The above query shows that the average tip on credit card trips, expressed as a percentage of the fare amount, remained fairly consistent across the week, ranging from approximately 25.1% to 29.7%. Credit card tips were therefore similar on every day of the week, with a noticeable increase on Thursdays (approximately 29.7%) and a very slight increase on Saturdays (approximately 26.3%).

The following query shows the top 5 most common pickup and dropoff zone pairs by the total amount of trips including both zones in descending order of trips.

In [468]:
# Top 5 (pickup zone, dropoff zone) pairs by trip count.
# `zones` is joined twice: once on the pickup ID (zp) and once on the dropoff ID (zd).
# Pairs are directional, so A -> B and B -> A are counted separately.
common_trip_routes = con.execute("""
    SELECT
        zp.Zone AS pickup_zone,
        zd.Zone AS dropoff_zone,
        COUNT(*) AS trip_count
    FROM trips t
    JOIN zones zp
        ON t.PULocationID = zp.LocationID
    JOIN zones zd
        ON t.DOLocationID = zd.LocationID
    GROUP BY pickup_zone, dropoff_zone
    ORDER BY trip_count DESC
    LIMIT 5;
    """).fetchdf()

print(common_trip_routes)

             pickup_zone           dropoff_zone  trip_count
0  Upper East Side South  Upper East Side North       21641
1  Upper East Side North  Upper East Side South       19199
2  Upper East Side North  Upper East Side North       15193
3  Upper East Side South  Upper East Side South       14115
4         Midtown Center  Upper East Side South       10139


The above query shows that the two most common routes were trips from Upper East Side South to Upper East Side North (21,641 trips) and the reverse direction (19,199 trips). These are followed by trips that stayed within a single zone: 15,193 trips within Upper East Side North and 14,115 trips within Upper East Side South. Finally, the fifth most common route was from Midtown Center to Upper East Side South (10,139 trips). From these results, we can see that travel within and between the two Upper East Side zones made up the most common routes in the dataset.

Part 3: Dashboard Development

In [469]:
# Convert Polars DataFrames to Pandas for visualizations.
# The isinstance guards make the cell safe to re-run: once `df` / `zones` are
# pandas frames they have no .to_pandas() and the original cell would fail.
if isinstance(df, pl.DataFrame):
    df = df.to_pandas()
if isinstance(zones, pl.DataFrame):
    zones = zones.to_pandas()

# Sample up to 100,000 rows for manageable analysis.
# Capping at len(df) avoids the ValueError pandas raises when asked for more
# rows than exist; the fixed seed keeps the sample reproducible.
df = df.sample(n=min(100000, len(df)), random_state=42)

# Remove outliers (used by the trip-distance histogram)
df_filtered = df[
    (df['fare_amount'] > 0) & (df['fare_amount'] < 200) &
    (df['trip_distance'] > 0) & (df['trip_distance'] < 50)
]

In [470]:
# Tally pickups per zone ID in the sampled frame and keep the 10 most frequent
top_zones = (
    df["PULocationID"]
    .value_counts()
    .head(10)
    .rename_axis("PULocationID")
    .reset_index(name="Count")
)

# Attach zone names, then sort ascending by count for the horizontal bar chart
top_zones = top_zones.merge(
    zones[["LocationID", "Zone"]],
    left_on="PULocationID",
    right_on="LocationID",
    how="left",
).sort_values("Count", ascending=True)

# Horizontal bar chart: bar length, label and colour all encode the trip count
fig1 = px.bar(
    top_zones,
    x="Count",
    y="Zone",
    orientation="h",
    text="Count",
    color="Count",
    color_continuous_scale="Greens",
    hover_data=dict(PULocationID=True, Count=True),
)

# Formatting: value labels outside the bars, centred title, wide left margin for zone names
fig1.update_traces(textposition="outside")
fig1.update_layout(
    title=dict(text="Top 10 Pickup Zones by Trip Count", x=0.5),
    xaxis_title="Trip Count",
    yaxis_title="Pickup Zone",
    template="plotly_dark",
    margin={"l": 250},
)
fig1.show()

In [471]:
# Mean fare per pickup hour, computed on the sampled frame
hourly_fare = df.groupby("pickup_hour", as_index=False)["fare_amount"].mean()

# Line chart of average fare by hour of day
axis_labels = {
    "pickup_hour": "Hour of Day",
    "fare_amount": "Average Fare ($)",
}
fig2 = px.line(
    hourly_fare,
    x="pickup_hour",
    y="fare_amount",
    markers=True,
    labels=axis_labels,
)

# One tick per hour on x; dollar-formatted ticks on y
fig2.update_xaxes(title="Hour of Day", tickmode="linear", tick0=0, dtick=1)
fig2.update_yaxes(title="Average Fare ($)", tickprefix="$", tickformat=".2f")

fig2.update_layout(
    title=dict(text="Average Fare by Hour of Day", x=0.5),
    template="plotly_dark",
)
fig2.show()

In [472]:
# Histogram of trip distances on the outlier-filtered sample
hist_labels = {"trip_distance": "Trip Distance (miles)", "count": "Number of Trips"}
fig3 = px.histogram(
    df_filtered,
    x="trip_distance",
    nbins=50,
    title="Distribution of Trip Distances",
    labels=hist_labels,
    template="plotly_white",
    color_discrete_sequence=["green"],
    opacity=0.8,
    hover_data={"trip_distance": True},
)

# Thin white bar outlines and a custom hover text
fig3.update_traces(
    marker=dict(line=dict(width=0.5, color="white")),
    hovertemplate='Trip Distance: %{x}<br>Count: %{y}<extra></extra>',
)

# The layout title set here replaces the one passed to px.histogram above.
# The x-axis is limited to 0-20, so bars beyond 20 miles are outside the initial view.
fig3.update_layout(
    title={
        "text": "Distribution of Trip Distances (Trips ≤ 50 miles)",
        "x": 0.5,
        "font": {"size": 20},
    },
    xaxis={"range": [0, 20], "title": "Trip Distance (miles)", "tick0": 0, "dtick": 2},
    yaxis={"title": "Number of Trips", "showgrid": True},
    font={"family": "Arial", "size": 12},
    bargap=0.05,
)
fig3.show()

In [473]:
# Define payment type labels
payment_labels = {
    1: "Credit Card",
    2: "Cash",
    3: "No Charge",
    4: "Dispute",
    0: "Unknown",
}

color_sequence = ["#2ca02c", "#1f77b4", "#ff7f0e", "#d62728", "#7f7f7f"]

# Tie each colour to a specific label (same order as payment_labels) instead of
# letting colours follow the frequency ranking of the slices. Otherwise the
# colour of a payment type changes whenever the ranking changes, and the grey
# entry does not end up on "Unknown".
payment_color_map = dict(zip(payment_labels.values(), color_sequence))

# Count trips by payment label. Codes missing from payment_labels would map to
# NaN, so they are collected under an explicit "Other" label instead.
payment_counts = (
    df["payment_type"]
    .map(payment_labels)
    .fillna("Other")
    .value_counts()
    .rename_axis("payment_type")
    .reset_index(name="count")
)

# Create pie chart
fig4 = px.pie(
    payment_counts,
    names="payment_type",
    values="count",
    color="payment_type",
    color_discrete_map=payment_color_map,
    title="Payment Type Breakdown",
    hole=0.4,
    template="plotly_white",
)

# Format pie chart to show percentages and labels
fig4.update_traces(
    textinfo="percent+label",
    textposition="inside",
    hovertemplate='%{label}: %{percent:.1%} (%{value} trips)<extra></extra>'
)

fig4.update_layout(
    title=dict(text="Payment Type Breakdown", x=0.5, font=dict(size=20)),
    font=dict(family="Arial", size=12)
)
fig4.show()

In [474]:
# Create heatmap data by counting trips for each combination of pickup day of week and hour
heatmap_data = (
    df.groupby(["pickup_day_of_week", "pickup_hour"])
    .size()
    .reset_index(name="trip_count")
)

# Define weekday order for consistent plotting
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Create heatmap for trips by day of week and hour.
# histfunc="sum" is spelled out: each cell shows the summed trip_count of its bin.
fig5 = px.density_heatmap(
    heatmap_data,
    x="pickup_hour",
    y="pickup_day_of_week",
    z="trip_count",
    histfunc="sum",
    category_orders={"pickup_day_of_week": weekday_order},
    title="Trips by Day of Week and Hour",
    labels={
        "pickup_hour": "Hour of Day",
        "pickup_day_of_week": "Day of Week",
        "trip_count": "Number of Trips"
    },
    template="plotly_white"
)

# pickup_hour is numeric, so the underlying 2D histogram picks its own bin
# width and can merge several hours into one column. Pin one bin per hour,
# centred on the integer hour values 0..23.
fig5.update_traces(xbins=dict(start=-0.5, end=23.5, size=1))

fig5.update_yaxes(categoryorder="array", categoryarray=weekday_order[::-1])
fig5.update_xaxes(tick0=0, dtick=1)

# The trace is coloured through the shared layout coloraxis, so the colour
# range is set there (cmin/cmax) rather than via trace-level zmin/zmax.
fig5.update_layout(
    title=dict(text="Trips by Day of Week and Hour", x=0.5, font=dict(size=20)),
    font=dict(family="Arial", size=12),
    width=1000,
    height=600,
    margin=dict(l=80, r=80, t=100, b=80),
    coloraxis_cmin=0,
    coloraxis_cmax=heatmap_data["trip_count"].max(),
    coloraxis_colorbar=dict(
        title="Number of Trips",
        tickformat=",",
        thickness=15,
        lenmode="fraction",
        len=0.75
    )
)
fig5.show()