# Preprocessing TLC data

This notebook cleans the following datasets: 
1. Yellow taxi data from 2023-12 to 2024-05
2. Green taxi data from 2023-12 to 2024-05


In [10]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [11]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)


In [12]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [13]:
# Read 2023-2024 TLC data
df = spark.read.parquet('../data/landing/tlc_data/*.parquet')


In [14]:
# Read 2023-6 to 2024-5 yellow data
path = "../data/landing/tlc_data"
ydf_2023_12 = spark.read.parquet(path + "/Y-2023-12.parquet")
ydf_2024_1 = spark.read.parquet(path + "/Y-2024-01.parquet")
ydf_2024_2 = spark.read.parquet(path + "/Y-2024-02.parquet")
ydf_2024_3 = spark.read.parquet(path + "/Y-2024-03.parquet")
ydf_2024_4 = spark.read.parquet(path + "/Y-2024-04.parquet")
ydf_2024_5 = spark.read.parquet(path + "/Y-2024-05.parquet")

In [15]:
# Read 2023-12 to 2024-5 green data
path = "../data/landing/tlc_data"
gdf_2023_12 = spark.read.parquet(path + "/G-2023-12.parquet")
gdf_2024_1 = spark.read.parquet(path + "/G-2024-01.parquet")
gdf_2024_2 = spark.read.parquet(path + "/G-2024-02.parquet")
gdf_2024_3 = spark.read.parquet(path + "/G-2024-03.parquet")
gdf_2024_4 = spark.read.parquet(path + "/G-2024-04.parquet")
gdf_2024_5 = spark.read.parquet(path + "/G-2024-05.parquet")

## TLC datasets inspection

In [16]:
# Calculate the total row count for yellow taxi data from 2023-6 to 2024-5
yellow_count = (
    ydf_2023_12.count() + 
    ydf_2024_1.count() + 
    ydf_2024_2.count() + 
    ydf_2024_3.count() + 
    ydf_2024_4.count() + 
    ydf_2024_5.count()
)

# Calculate the total row count for green taxi data from 2023-6 to 2024-5
green_count = (
    gdf_2023_12.count() + 
    gdf_2024_1.count() + 
    gdf_2024_2.count() + 
    gdf_2024_3.count() + 
    gdf_2024_4.count() + 
    gdf_2024_5.count()
)

# Display the green count
print(f"The total green count is {green_count}.")

# Display the yellow count
print(f"The total yellow count is {yellow_count}.")

# Calculate the total row count for all taxi data from 2023-6 to 2024-5
total_count = yellow_count + green_count
# Display the total count
print(f"The total count is {total_count}.")

The total green count is 349274.
The total yellow count is 20169467.
The total count is 20518741.


In [17]:
# Get columns of each DataFrame
columns_ydf = set(ydf_2024_5.columns)
columns_gdf = set(gdf_2024_5.columns)

# Find differences in columns
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: {'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'Airport_fee'}
Columns only in greenDF: {'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'trip_type'}


In [18]:
# Report number of features in each DataFrame
print(f"Number of features in yellowDF: {len(ydf_2024_5.columns)}")
print(f"Number of features in greenDF: {len(gdf_2024_5.columns)}")

Number of features in yellowDF: 19
Number of features in greenDF: 20


## Data Cleaning

In [19]:
# combine the datasets 
ydfs = [ydf_2023_12, 
       ydf_2024_1, ydf_2024_2, ydf_2024_3, ydf_2024_4, ydf_2024_5]
gdfs = [gdf_2023_12, 
       gdf_2024_1, gdf_2024_2, gdf_2024_3, gdf_2024_4, gdf_2024_5]


In [20]:
from pyspark.sql import DataFrame
from functools import reduce

# Function to union two DataFrames
def union_dfs(df1, df2):
    return df1.unionByName(df2)

# Combine all yellow taxi data
yellow_combined = reduce(union_dfs, ydfs)

# Combine all green taxi data
green_combined = reduce(union_dfs, gdfs) 

In [21]:
# Show descriptive statistics for taxi data
yellow_combined.describe().show()
green_combined.describe().show()

24/08/26 04:10:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CodeCache: size=131072Kb used=32806Kb max_used=32823Kb free=98265Kb
 bounds [0x00000001069e8000, 0x0000000108a28000, 0x000000010e9e8000]
 total_blobs=12399 nmethods=11424 adapters=885
 compilation: disabled (not enough contiguous free space left)


                                                                                

+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-----------------+------------------+---------------------+------------------+--------------------+------------------+
|summary|          VendorID|   passenger_count|     trip_distance|        RatecodeID|store_and_fwd_flag|     PULocationID|      DOLocationID|      payment_type|       fare_amount|             extra|            mta_tax|       tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|       Airport_fee|
+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-----------------+------------------+---------------------+------------------+--------------------+---------



+-------+-------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-------------------+---------+---------------------+------------------+------------------+------------------+--------------------+
|summary|           VendorID|store_and_fwd_flag|        RatecodeID|      PULocationID|      DOLocationID|   passenger_count|    trip_distance|      fare_amount|             extra|           mta_tax|        tip_amount|       tolls_amount|ehail_fee|improvement_surcharge|      total_amount|      payment_type|         trip_type|congestion_surcharge|
+-------+-------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-------------------+---------+---------------------+------------------+----------------

                                                                                

### Unify the columns of the two dataframes

In [22]:
# Drop Airport_fee and trip_type columns from the combined data
yellow_combined = yellow_combined.drop("Airport_fee")
green_combined = green_combined.drop("trip_type")

# Set ehail_fee to 0 for yellow taxi data
yellow_combined = yellow_combined.withColumn("ehail_fee", lit(0))

tpep: Taxicab Passenger Enhancement Program for yellow taxi <br> 
lpep: Livery Passenger Enhancement Program for green taxi

In [23]:
# Rename datetime columns to be consistent
from pyspark.sql.functions import col

yellow_combined = yellow_combined.withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
                               .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')

green_combined = green_combined.withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \
                             .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')


### Combine yellow and green taxi data

In [24]:
# Combine yellow and green taxi data
combined = yellow_combined.unionByName(green_combined)

### Anomaly handling
Filter out anomaly with business logic

In [25]:
# print the schema of the combined data
combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)



In [26]:
from pyspark.sql.functions import min, max

columns_to_check = [
    'passenger_count', 
    'trip_distance', 
    'fare_amount', 
    'extra', 
    'mta_tax', 
    'tip_amount', 
    'tolls_amount', 
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'ehail_fee'
]

# Create a dictionary to store min and max for each column
min_max_dict = {col: df.agg(min(col).alias(f"min_{col}"), max(col).alias(f"max_{col}")).collect()[0] for col in columns_to_check}

# Print the results
for col, values in min_max_dict.items():
    print(f"{col}: Min = {values[f'min_{col}']}, Max = {values[f'max_{col}']}")


passenger_count: Min = 0, Max = 9
trip_distance: Min = 0.0, Max = 345729.44
fare_amount: Min = -1087.3, Max = 386983.63
extra: Min = -39.17, Max = 10002.5
mta_tax: Min = -0.5, Max = 52.09
tip_amount: Min = -330.88, Max = 4174.0
tolls_amount: Min = -91.3, Max = 1702.88
improvement_surcharge: Min = -1.0, Max = 1.0
total_amount: Min = -1094.05, Max = 386987.63
congestion_surcharge: Min = -2.75, Max = 2.75
ehail_fee: Min = None, Max = None


In [27]:
from pyspark.sql.functions import col

# Apply all filters in a single chain
combined = combined.filter(
    # Filter out rows with passenger count greater than 6 or less than 1
    (col("passenger_count").between(1, 6)) &
    # Filter out rows with fare amount less than 3
    (col("fare_amount") >= 3) &
    # Filter out rows with trip distance less than 0.5
    (col("trip_distance") >= 0.5) &
    # Filter out rows with tip amount less than 0 
    (col("tip_amount") >= 0) &
    # Filter out rows with tolls amount less than 0
    (col("tolls_amount") >= 0) &
    # Filter out rows with extra amount less than 0
    (col("extra") >= 0) &
    # Filter out mtax_tax less than 0
    (col("mta_tax") >= 0) &
    # Filter out rows with improvement surcharge less than 0
    (col("improvement_surcharge") >= 0) &
    # Filter out rows with total amount less than 3
    (col("total_amount") >= 3) &
    # Filter out rows with congestion surcharge less than 0
    (col("congestion_surcharge") >= 0) &
    # Filter the pick up datetime to between 2023-06 to 2024-05
    (col("pickup_datetime").between("2023-06-01 00:00:00", "2024-05-31 00:00:00")) &
    # Filter the drop off datetime to between 2023-06 to 2024-05
    (col("dropoff_datetime").between("2023-06-01 00:00:00", "2024-05-31 00:00:00"))

)


In [28]:
combined.describe().show()



+-------+-------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+--------------------+---------+
|summary|           VendorID|   passenger_count|    trip_distance|        RatecodeID|store_and_fwd_flag|      PULocationID|     DOLocationID|       payment_type|       fare_amount|             extra|            mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|ehail_fee|
+-------+-------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+--------------------+---------+
|  count| 

                                                                                

In [29]:
# show range for pickup_datetime and dropoff_datetime
combined.select(
    min("pickup_datetime").alias("min_pickup_datetime"), 
    max("pickup_datetime").alias("max_pickup_datetime"),
    min("dropoff_datetime").alias("min_dropoff_datetime"), 
    max("dropoff_datetime").alias("max_dropoff_datetime")
).show()



+-------------------+-------------------+--------------------+--------------------+
|min_pickup_datetime|max_pickup_datetime|min_dropoff_datetime|max_dropoff_datetime|
+-------------------+-------------------+--------------------+--------------------+
|2023-11-24 21:03:34|2024-05-30 23:56:52| 2023-11-24 21:22:29| 2024-05-31 00:00:00|
+-------------------+-------------------+--------------------+--------------------+



                                                                                

In [31]:
# Step 1: Convert pickup and dropoff timestamps to string format
combined = combined.withColumn("pickup_str", F.col("pickup_datetime").cast("string"))
combined = combined.withColumn("dropoff_str", F.col("dropoff_datetime").cast("string"))

# Step 2: Convert the strings back to timestamps
combined = combined.withColumn("pickup_ts", F.to_timestamp("pickup_str"))
combined = combined.withColumn("dropoff_ts", F.to_timestamp("dropoff_str"))

# Step 3: Convert the timestamps to long (Unix epoch seconds)
combined = combined.withColumn("pickup_long", F.col("pickup_ts").cast("long"))
combined = combined.withColumn("dropoff_long", F.col("dropoff_ts").cast("long"))

# Step 4: Calculate the trip duration in minutes
combined = combined.withColumn("trip_duration", F.round((F.col("dropoff_long") - F.col("pickup_long")) / 60))

# Drop intermediate columns if no longer needed
combined = combined.drop("pickup_str", "dropoff_str", "pickup_ts", "dropoff_ts", "pickup_long", "dropoff_long")

# Show the result
combined.show(5)


+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+
|       1|2023-12-01 00:59:44|2023-12-01 01:13:22|              2|          2.2|         1|                 N|         114|         186|           1|       13.5

In [32]:
from pyspark.sql.functions import col as pyspark_col

# Filter out rows with trip duration less than 1 minute or greater than 180 minutes
combined = combined.filter(
    (pyspark_col("trip_duration") >= 1) & (pyspark_col("trip_duration") <= 180)
)

In [34]:
# export the cleaned data to raw folder
combined.write.mode('overwrite').parquet("../data/raw/tlc_data")

24/08/26 04:14:44 WARN MemoryManager: Total allocation exceeds 95.00% (928,409,179 bytes) of heap memory
Scaling row group sizes to 98.82% for 7 writers
24/08/26 04:14:44 WARN BasicWriteTaskStatsTracker: Expected 1 files, but only saw 0. This could be due to the output format not writing empty files, or files being not immediately visible in the filesystem.
24/08/26 04:14:44 WARN MemoryManager: Total allocation exceeds 95.00% (928,409,179 bytes) of heap memory
Scaling row group sizes to 86.46% for 8 writers
24/08/26 04:15:22 WARN MemoryManager: Total allocation exceeds 95.00% (928,409,179 bytes) of heap memory
Scaling row group sizes to 98.82% for 7 writers
24/08/26 04:15:22 WARN BasicWriteTaskStatsTracker: Expected 1 files, but only saw 0. This could be due to the output format not writing empty files, or files being not immediately visible in the filesystem.
24/08/26 04:15:22 WARN MemoryManager: Total allocation exceeds 95.00% (928,409,179 bytes) of heap memory
Scaling row group size

 ## Datatype Conversion

## Handling duplicates

## Standardization 