# Preprocessing TLC data

This notebook cleans the following datasets: 
1. Yellow taxi data from 2023-12 to 2024-05
2. Green taxi data from 2023-12 to 2024-05


In [31]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [32]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)


In [33]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [34]:
# Read 2023-2024 TLC data
df = spark.read.parquet('../data/landing/tlc_data/*.parquet')


In [35]:
# Read 2023-6 to 2024-5 yellow data
path = "../data/landing/tlc_data"
ydf_2023_12 = spark.read.parquet(path + "/Y-2023-12.parquet")
ydf_2024_1 = spark.read.parquet(path + "/Y-2024-01.parquet")
ydf_2024_2 = spark.read.parquet(path + "/Y-2024-02.parquet")
ydf_2024_3 = spark.read.parquet(path + "/Y-2024-03.parquet")
ydf_2024_4 = spark.read.parquet(path + "/Y-2024-04.parquet")
ydf_2024_5 = spark.read.parquet(path + "/Y-2024-05.parquet")

In [36]:
# Read 2023-12 to 2024-5 green data
path = "../data/landing/tlc_data"
gdf_2023_12 = spark.read.parquet(path + "/G-2023-12.parquet")
gdf_2024_1 = spark.read.parquet(path + "/G-2024-01.parquet")
gdf_2024_2 = spark.read.parquet(path + "/G-2024-02.parquet")
gdf_2024_3 = spark.read.parquet(path + "/G-2024-03.parquet")
gdf_2024_4 = spark.read.parquet(path + "/G-2024-04.parquet")
gdf_2024_5 = spark.read.parquet(path + "/G-2024-05.parquet")

## TLC datasets inspection

In [37]:
# Calculate the total row count for yellow taxi data from 2023-6 to 2024-5
yellow_count = (
    ydf_2023_12.count() + 
    ydf_2024_1.count() + 
    ydf_2024_2.count() + 
    ydf_2024_3.count() + 
    ydf_2024_4.count() + 
    ydf_2024_5.count()
)

# Calculate the total row count for green taxi data from 2023-6 to 2024-5
green_count = (
    gdf_2023_12.count() + 
    gdf_2024_1.count() + 
    gdf_2024_2.count() + 
    gdf_2024_3.count() + 
    gdf_2024_4.count() + 
    gdf_2024_5.count()
)

# Display the green count
print(f"The total green count is {green_count}.")

# Display the yellow count
print(f"The total yellow count is {yellow_count}.")

# Calculate the total row count for all taxi data from 2023-6 to 2024-5
total_count = yellow_count + green_count
# Display the total count
print(f"The total count is {total_count}.")

The total green count is 349274.
The total yellow count is 20169467.
The total count is 20518741.


In [38]:
# Get columns of each DataFrame
columns_ydf = set(ydf_2024_5.columns)
columns_gdf = set(gdf_2024_5.columns)

# Find differences in columns
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: {'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'Airport_fee'}
Columns only in greenDF: {'ehail_fee', 'lpep_dropoff_datetime', 'trip_type', 'lpep_pickup_datetime'}


In [39]:
# Report number of features in each DataFrame
print(f"Number of features in yellowDF: {len(ydf_2024_5.columns)}")
print(f"Number of features in greenDF: {len(gdf_2024_5.columns)}")

Number of features in yellowDF: 19
Number of features in greenDF: 20


## Data Cleaning

In [40]:
# combine the datasets 
ydfs = [ydf_2023_12, 
       ydf_2024_1, ydf_2024_2, ydf_2024_3, ydf_2024_4, ydf_2024_5]
gdfs = [gdf_2023_12, 
       gdf_2024_1, gdf_2024_2, gdf_2024_3, gdf_2024_4, gdf_2024_5]


In [41]:
from pyspark.sql import DataFrame
from functools import reduce

# Function to union two DataFrames
def union_dfs(df1, df2):
    return df1.unionByName(df2)

# Combine all yellow taxi data
yellow_combined = reduce(union_dfs, ydfs)

# Combine all green taxi data
green_combined = reduce(union_dfs, gdfs) 

### Unify the columns of the two dataframes

In [42]:
# Drop Airport_fee and trip_type columns from the combined data
yellow_combined = yellow_combined.drop("Airport_fee")
green_combined = green_combined.drop("trip_type")

# Set ehail_fee to 0 for yellow taxi data
yellow_combined = yellow_combined.withColumn("ehail_fee", lit(0))

tpep: Taxicab Passenger Enhancement Program for yellow taxi <br> 
lpep: Livery Passenger Enhancement Program for green taxi

In [43]:
# Rename datetime columns to be consistent
from pyspark.sql.functions import col

yellow_combined = yellow_combined.withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
                               .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')

green_combined = green_combined.withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \
                             .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')


### Combine yellow and green taxi data

In [44]:
# Combine yellow and green taxi data
combined = yellow_combined.unionByName(green_combined)

### Anomaly handling 1
Filter out anomaly with business logic

In [45]:
# print the schema of the combined data
combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)



In [46]:
from pyspark.sql.functions import min, max

columns_to_check = [
    'passenger_count', 
    'trip_distance', 
    'fare_amount', 
    'extra', 
    'mta_tax', 
    'tip_amount', 
    'tolls_amount', 
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'ehail_fee'
]

# Create a dictionary to store min and max for each column
min_max_dict = {col: df.agg(min(col).alias(f"min_{col}"), max(col).alias(f"max_{col}")).collect()[0] for col in columns_to_check}

# Print the results
for col, values in min_max_dict.items():
    print(f"{col}: Min = {values[f'min_{col}']}, Max = {values[f'max_{col}']}")


passenger_count: Min = 0, Max = 9
trip_distance: Min = 0.0, Max = 345729.44
fare_amount: Min = -1087.3, Max = 386983.63
extra: Min = -39.17, Max = 10002.5
mta_tax: Min = -0.5, Max = 52.09
tip_amount: Min = -330.88, Max = 4174.0
tolls_amount: Min = -91.3, Max = 1702.88
improvement_surcharge: Min = -1.0, Max = 1.0
total_amount: Min = -1094.05, Max = 386987.63
congestion_surcharge: Min = -2.75, Max = 2.75
ehail_fee: Min = None, Max = None


In [47]:
from pyspark.sql.functions import col

# Apply all filters in a single chain
combined = combined.filter(
    # Filter out rows with passenger count greater than 6 or less than 1
    (col("passenger_count").between(1, 6)) &
    # Filter out rows with fare amount less than 3
    (col("fare_amount") >= 3) &
    # Filter out rows with trip distance less than 0.5 miles 
    (col("trip_distance") >= 0.5) &
    # Filter out rows with tip amount less than 0 
    (col("tip_amount") >= 0) &
    # Filter out rows with tolls amount less than 0
    (col("tolls_amount") >= 0) &
    # Filter out rows with extra amount less than 0
    (col("extra") >= 0) &
    # Filter out mtax_tax less than 0
    (col("mta_tax") >= 0) &
    # Filter out rows with improvement surcharge less than 0
    (col("improvement_surcharge") >= 0) &
    # Filter out rows with total amount less than 3
    (col("total_amount") >= 3) &
    # Filter out rows with congestion surcharge less than 0
    (col("congestion_surcharge") >= 0) &
    # Filter the pick up datetime to between 2023-06 to 2024-05
    (col("pickup_datetime").between("2023-06-01 00:00:00", "2024-05-31 00:00:00")) &
    # Filter the drop off datetime to between 2023-06 to 2024-05
    (col("dropoff_datetime").between("2023-06-01 00:00:00", "2024-05-31 00:00:00"))

)


### Data type conversion

In [48]:
# Convert 'store_and_fwd_flag' to boolean
combined = combined.withColumn("store_and_fwd_flag", when(col("store_and_fwd_flag") == "Y", True).otherwise(False))

In [49]:
# Step 1: Convert pickup and dropoff timestamps to string format
combined = combined.withColumn("pickup_str", F.col("pickup_datetime").cast("string"))
combined = combined.withColumn("dropoff_str", F.col("dropoff_datetime").cast("string"))

# Step 2: Convert the strings back to timestamps
combined = combined.withColumn("pickup_ts", F.to_timestamp("pickup_str"))
combined = combined.withColumn("dropoff_ts", F.to_timestamp("dropoff_str"))

# Step 3: Convert the timestamps to long (Unix epoch seconds)
combined = combined.withColumn("pickup_long", F.col("pickup_ts").cast("long"))
combined = combined.withColumn("dropoff_long", F.col("dropoff_ts").cast("long"))

# Step 4: Calculate the trip duration in minutes
combined = combined.withColumn("trip_duration", F.round((F.col("dropoff_long") - F.col("pickup_long")) / 60))

# Drop intermediate columns if no longer needed
combined = combined.drop("pickup_str", "dropoff_str", "pickup_ts", "dropoff_ts", "pickup_long", "dropoff_long")

# Show the result
combined.show(5)


+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+
|       1|2023-12-01 00:59:44|2023-12-01 01:13:22|              2|          2.2|         1|             false|         114|         186|           1|       13.5

In [50]:
# check datatypes
combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = false)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- trip_duration: double (nullable = true)



From Northern end of bronx to southern end of staten island is below 50 miles </br>
2 hours should be sufficient for travel 

In [51]:
from pyspark.sql.functions import col as pyspark_col

# Filter out rows with trip duration greater than 120 minutes
combined = combined.filter(
    (pyspark_col("trip_duration") <= 120)
)

# Filter out rows with trip distance greater than 50 miles 
combined = combined.filter(
    pyspark_col("trip_distance") <= 50
) 

### Handling missing values & duplicates

In [52]:
# Drop dulicate rows
combined = combined.dropDuplicates()

In [53]:
# Drop rows with missing values
combined = combined.dropna() 

### Anomaly handling by statistics

In [54]:
# keep 99% quantile of total_amount
total_amount_quantile = combined.approxQuantile("total_amount", [0.99], 0.01)[0]
combined = combined.filter(pyspark_col("total_amount") <= total_amount_quantile)



## Data cleaning outcomes

In [None]:
# check the percentage of rows that remain after filtering
original_count = total_count
filtered_count = combined.count()
percentage_remaining = (filtered_count / original_count) * 100
print(f"Percentage of rows remaining after filtering: {percentage_remaining:.2f}%")



Percentage of rows remaining after filtering: 82.31%


                                                                                

## Feature Engineering

In [None]:
# round time to hour 
combined = combined.withColumn("pickup_hour", F.hour("pickup_datetime"))

In [30]:
# extract date from pickup_datetime and dropoff_datetime
combined = combined.withColumn("pickup_date", F.to_date("pickup_datetime"))
combined = combined.withColumn("dropoff_date", F.to_date("dropoff_datetime"))

# extract time from pickup_datetime and dropoff_datetime
combined = combined.withColumn("pickup_time", F.date_format("pickup_datetime", "HH:mm:ss"))
combined = combined.withColumn("dropoff_time", F.date_format("dropoff_datetime", "HH:mm:ss"))

# drop pickup_datetime and dropoff_datetime
combined = combined.drop("pickup_datetime", "dropoff_datetime")

24/08/27 16:28:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `pickup_datetime` cannot be resolved. Did you mean one of the following? [`pickup_time`, `pickup_day`, `pickup_hour`, `dropoff_time`, `trip_duration`].;
'Project [VendorID#40, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, pickup_day#2390, dropoff_day#2413, pickup_time#2437, dropoff_time#2462, to_date('pickup_datetime, None, Some(Etc/UTC), false) AS pickup_date#2651]
+- Project [VendorID#40, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, pickup_day#2390, dropoff_day#2413, pickup_time#2437, dropoff_time#2462]
   +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, pickup_day#2390, dropoff_day#2413, pickup_time#2437, date_format(cast(dropoff_datetime#1084 as timestamp), HH:mm:ss, Some(Etc/UTC)) AS dropoff_time#2462]
      +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, pickup_day#2390, dropoff_day#2413, date_format(cast(pickup_datetime#1064 as timestamp), HH:mm:ss, Some(Etc/UTC)) AS pickup_time#2437]
         +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, pickup_day#2390, dayofweek(cast(dropoff_datetime#1084 as date)) AS dropoff_day#2413]
            +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, pickup_hour#2236, dayofweek(cast(pickup_datetime#1064 as date)) AS pickup_day#2390]
               +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700, hour(pickup_datetime#1064, Some(Etc/UTC)) AS pickup_hour#2236]
                  +- Filter (total_amount#56 <= 2372.79)
                     +- Filter atleastnnonnulls(20, VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700)
                        +- Deduplicate [DOLocationID#48, improvement_surcharge#55, PULocationID#47, trip_distance#44, tolls_amount#54, RatecodeID#45L, VendorID#40, tip_amount#53, payment_type#49L, fare_amount#50, passenger_count#43L, store_and_fwd_flag#1539, extra#51, dropoff_datetime#1084, ehail_fee#1144, congestion_surcharge#57, trip_duration#1700, total_amount#56, pickup_datetime#1064, mta_tax#52]
                           +- Filter (trip_distance#44 <= cast(50 as double))
                              +- Filter (trip_duration#1700 <= cast(120 as double))
                                 +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, trip_duration#1700]
                                    +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, dropoff_str#1580, pickup_ts#1602, dropoff_ts#1625, pickup_long#1649L, ... 2 more fields]
                                       +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, dropoff_str#1580, pickup_ts#1602, dropoff_ts#1625, pickup_long#1649L, cast(dropoff_ts#1625 as bigint) AS dropoff_long#1674L]
                                          +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, dropoff_str#1580, pickup_ts#1602, dropoff_ts#1625, cast(pickup_ts#1602 as bigint) AS pickup_long#1649L]
                                             +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, dropoff_str#1580, pickup_ts#1602, to_timestamp(dropoff_str#1580, None, TimestampType, Some(Etc/UTC), false) AS dropoff_ts#1625]
                                                +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, dropoff_str#1580, to_timestamp(pickup_str#1559, None, TimestampType, Some(Etc/UTC), false) AS pickup_ts#1602]
                                                   +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, pickup_str#1559, cast(dropoff_datetime#1084 as string) AS dropoff_str#1580]
                                                      +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144, cast(pickup_datetime#1064 as string) AS pickup_str#1559]
                                                         +- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, CASE WHEN (store_and_fwd_flag#46 = Y) THEN true ELSE false END AS store_and_fwd_flag#1539, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1144]
                                                            +- Filter (((((((((((((passenger_count#43L >= cast(1 as bigint)) AND (passenger_count#43L <= cast(6 as bigint))) AND (fare_amount#50 >= cast(3 as double))) AND (trip_distance#44 >= 0.5)) AND (tip_amount#53 >= cast(0 as double))) AND (tolls_amount#54 >= cast(0 as double))) AND (extra#51 >= cast(0 as double))) AND (mta_tax#52 >= cast(0 as double))) AND (improvement_surcharge#55 >= cast(0 as double))) AND (total_amount#56 >= cast(3 as double))) AND (congestion_surcharge#57 >= cast(0 as double))) AND ((pickup_datetime#1064 >= cast(2023-06-01 00:00:00 as timestamp_ntz)) AND (pickup_datetime#1064 <= cast(2024-05-31 00:00:00 as timestamp_ntz)))) AND ((dropoff_datetime#1084 >= cast(2023-06-01 00:00:00 as timestamp_ntz)) AND (dropoff_datetime#1084 <= cast(2024-05-31 00:00:00 as timestamp_ntz))))
                                                               +- Union false, false
                                                                  :- Project [VendorID#40, pickup_datetime#1064, dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, cast(ehail_fee#1044 as double) AS ehail_fee#1144]
                                                                  :  +- Project [VendorID#40, pickup_datetime#1064, tpep_dropoff_datetime#42 AS dropoff_datetime#1084, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1044]
                                                                  :     +- Project [VendorID#40, tpep_pickup_datetime#41 AS pickup_datetime#1064, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#1044]
                                                                  :        +- Project [VendorID#40, tpep_pickup_datetime#41, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, 0 AS ehail_fee#1044]
                                                                  :           +- Project [VendorID#40, tpep_pickup_datetime#41, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57]
                                                                  :              +- Union false, false
                                                                  :                 :- Relation [VendorID#40,tpep_pickup_datetime#41,tpep_dropoff_datetime#42,passenger_count#43L,trip_distance#44,RatecodeID#45L,store_and_fwd_flag#46,PULocationID#47,DOLocationID#48,payment_type#49L,fare_amount#50,extra#51,mta_tax#52,tip_amount#53,tolls_amount#54,improvement_surcharge#55,total_amount#56,congestion_surcharge#57,Airport_fee#58] parquet
                                                                  :                 :- Project [VendorID#78, tpep_pickup_datetime#79, tpep_dropoff_datetime#80, passenger_count#81L, trip_distance#82, RatecodeID#83L, store_and_fwd_flag#84, PULocationID#85, DOLocationID#86, payment_type#87L, fare_amount#88, extra#89, mta_tax#90, tip_amount#91, tolls_amount#92, improvement_surcharge#93, total_amount#94, congestion_surcharge#95, Airport_fee#96]
                                                                  :                 :  +- Relation [VendorID#78,tpep_pickup_datetime#79,tpep_dropoff_datetime#80,passenger_count#81L,trip_distance#82,RatecodeID#83L,store_and_fwd_flag#84,PULocationID#85,DOLocationID#86,payment_type#87L,fare_amount#88,extra#89,mta_tax#90,tip_amount#91,tolls_amount#92,improvement_surcharge#93,total_amount#94,congestion_surcharge#95,Airport_fee#96] parquet
                                                                  :                 :- Project [VendorID#116, tpep_pickup_datetime#117, tpep_dropoff_datetime#118, passenger_count#119L, trip_distance#120, RatecodeID#121L, store_and_fwd_flag#122, PULocationID#123, DOLocationID#124, payment_type#125L, fare_amount#126, extra#127, mta_tax#128, tip_amount#129, tolls_amount#130, improvement_surcharge#131, total_amount#132, congestion_surcharge#133, Airport_fee#134]
                                                                  :                 :  +- Relation [VendorID#116,tpep_pickup_datetime#117,tpep_dropoff_datetime#118,passenger_count#119L,trip_distance#120,RatecodeID#121L,store_and_fwd_flag#122,PULocationID#123,DOLocationID#124,payment_type#125L,fare_amount#126,extra#127,mta_tax#128,tip_amount#129,tolls_amount#130,improvement_surcharge#131,total_amount#132,congestion_surcharge#133,Airport_fee#134] parquet
                                                                  :                 :- Project [VendorID#154, tpep_pickup_datetime#155, tpep_dropoff_datetime#156, passenger_count#157L, trip_distance#158, RatecodeID#159L, store_and_fwd_flag#160, PULocationID#161, DOLocationID#162, payment_type#163L, fare_amount#164, extra#165, mta_tax#166, tip_amount#167, tolls_amount#168, improvement_surcharge#169, total_amount#170, congestion_surcharge#171, Airport_fee#172]
                                                                  :                 :  +- Relation [VendorID#154,tpep_pickup_datetime#155,tpep_dropoff_datetime#156,passenger_count#157L,trip_distance#158,RatecodeID#159L,store_and_fwd_flag#160,PULocationID#161,DOLocationID#162,payment_type#163L,fare_amount#164,extra#165,mta_tax#166,tip_amount#167,tolls_amount#168,improvement_surcharge#169,total_amount#170,congestion_surcharge#171,Airport_fee#172] parquet
                                                                  :                 :- Project [VendorID#192, tpep_pickup_datetime#193, tpep_dropoff_datetime#194, passenger_count#195L, trip_distance#196, RatecodeID#197L, store_and_fwd_flag#198, PULocationID#199, DOLocationID#200, payment_type#201L, fare_amount#202, extra#203, mta_tax#204, tip_amount#205, tolls_amount#206, improvement_surcharge#207, total_amount#208, congestion_surcharge#209, Airport_fee#210]
                                                                  :                 :  +- Relation [VendorID#192,tpep_pickup_datetime#193,tpep_dropoff_datetime#194,passenger_count#195L,trip_distance#196,RatecodeID#197L,store_and_fwd_flag#198,PULocationID#199,DOLocationID#200,payment_type#201L,fare_amount#202,extra#203,mta_tax#204,tip_amount#205,tolls_amount#206,improvement_surcharge#207,total_amount#208,congestion_surcharge#209,Airport_fee#210] parquet
                                                                  :                 +- Project [VendorID#230, tpep_pickup_datetime#231, tpep_dropoff_datetime#232, passenger_count#233L, trip_distance#234, RatecodeID#235L, store_and_fwd_flag#236, PULocationID#237, DOLocationID#238, payment_type#239L, fare_amount#240, extra#241, mta_tax#242, tip_amount#243, tolls_amount#244, improvement_surcharge#245, total_amount#246, congestion_surcharge#247, Airport_fee#248]
                                                                  :                    +- Relation [VendorID#230,tpep_pickup_datetime#231,tpep_dropoff_datetime#232,passenger_count#233L,trip_distance#234,RatecodeID#235L,store_and_fwd_flag#236,PULocationID#237,DOLocationID#238,payment_type#239L,fare_amount#240,extra#241,mta_tax#242,tip_amount#243,tolls_amount#244,improvement_surcharge#245,total_amount#246,congestion_surcharge#247,Airport_fee#248] parquet
                                                                  +- Project [VendorID#268, pickup_datetime#1104, dropoff_datetime#1124, passenger_count#275L, trip_distance#276, RatecodeID#272L, store_and_fwd_flag#271, PULocationID#273, DOLocationID#274, payment_type#285L, fare_amount#277, extra#278, mta_tax#279, tip_amount#280, tolls_amount#281, improvement_surcharge#283, total_amount#284, congestion_surcharge#287, ehail_fee#282]
                                                                     +- Project [VendorID#268, pickup_datetime#1104, lpep_dropoff_datetime#270 AS dropoff_datetime#1124, store_and_fwd_flag#271, RatecodeID#272L, PULocationID#273, DOLocationID#274, passenger_count#275L, trip_distance#276, fare_amount#277, extra#278, mta_tax#279, tip_amount#280, tolls_amount#281, ehail_fee#282, improvement_surcharge#283, total_amount#284, payment_type#285L, congestion_surcharge#287]
                                                                        +- Project [VendorID#268, lpep_pickup_datetime#269 AS pickup_datetime#1104, lpep_dropoff_datetime#270, store_and_fwd_flag#271, RatecodeID#272L, PULocationID#273, DOLocationID#274, passenger_count#275L, trip_distance#276, fare_amount#277, extra#278, mta_tax#279, tip_amount#280, tolls_amount#281, ehail_fee#282, improvement_surcharge#283, total_amount#284, payment_type#285L, congestion_surcharge#287]
                                                                           +- Project [VendorID#268, lpep_pickup_datetime#269, lpep_dropoff_datetime#270, store_and_fwd_flag#271, RatecodeID#272L, PULocationID#273, DOLocationID#274, passenger_count#275L, trip_distance#276, fare_amount#277, extra#278, mta_tax#279, tip_amount#280, tolls_amount#281, ehail_fee#282, improvement_surcharge#283, total_amount#284, payment_type#285L, congestion_surcharge#287]
                                                                              +- Union false, false
                                                                                 :- Relation [VendorID#268,lpep_pickup_datetime#269,lpep_dropoff_datetime#270,store_and_fwd_flag#271,RatecodeID#272L,PULocationID#273,DOLocationID#274,passenger_count#275L,trip_distance#276,fare_amount#277,extra#278,mta_tax#279,tip_amount#280,tolls_amount#281,ehail_fee#282,improvement_surcharge#283,total_amount#284,payment_type#285L,trip_type#286L,congestion_surcharge#287] parquet
                                                                                 :- Project [VendorID#308, lpep_pickup_datetime#309, lpep_dropoff_datetime#310, store_and_fwd_flag#311, RatecodeID#312L, PULocationID#313, DOLocationID#314, passenger_count#315L, trip_distance#316, fare_amount#317, extra#318, mta_tax#319, tip_amount#320, tolls_amount#321, ehail_fee#322, improvement_surcharge#323, total_amount#324, payment_type#325L, trip_type#326L, congestion_surcharge#327]
                                                                                 :  +- Relation [VendorID#308,lpep_pickup_datetime#309,lpep_dropoff_datetime#310,store_and_fwd_flag#311,RatecodeID#312L,PULocationID#313,DOLocationID#314,passenger_count#315L,trip_distance#316,fare_amount#317,extra#318,mta_tax#319,tip_amount#320,tolls_amount#321,ehail_fee#322,improvement_surcharge#323,total_amount#324,payment_type#325L,trip_type#326L,congestion_surcharge#327] parquet
                                                                                 :- Project [VendorID#348, lpep_pickup_datetime#349, lpep_dropoff_datetime#350, store_and_fwd_flag#351, RatecodeID#352L, PULocationID#353, DOLocationID#354, passenger_count#355L, trip_distance#356, fare_amount#357, extra#358, mta_tax#359, tip_amount#360, tolls_amount#361, ehail_fee#362, improvement_surcharge#363, total_amount#364, payment_type#365L, trip_type#366L, congestion_surcharge#367]
                                                                                 :  +- Relation [VendorID#348,lpep_pickup_datetime#349,lpep_dropoff_datetime#350,store_and_fwd_flag#351,RatecodeID#352L,PULocationID#353,DOLocationID#354,passenger_count#355L,trip_distance#356,fare_amount#357,extra#358,mta_tax#359,tip_amount#360,tolls_amount#361,ehail_fee#362,improvement_surcharge#363,total_amount#364,payment_type#365L,trip_type#366L,congestion_surcharge#367] parquet
                                                                                 :- Project [VendorID#388, lpep_pickup_datetime#389, lpep_dropoff_datetime#390, store_and_fwd_flag#391, RatecodeID#392L, PULocationID#393, DOLocationID#394, passenger_count#395L, trip_distance#396, fare_amount#397, extra#398, mta_tax#399, tip_amount#400, tolls_amount#401, ehail_fee#402, improvement_surcharge#403, total_amount#404, payment_type#405L, trip_type#406L, congestion_surcharge#407]
                                                                                 :  +- Relation [VendorID#388,lpep_pickup_datetime#389,lpep_dropoff_datetime#390,store_and_fwd_flag#391,RatecodeID#392L,PULocationID#393,DOLocationID#394,passenger_count#395L,trip_distance#396,fare_amount#397,extra#398,mta_tax#399,tip_amount#400,tolls_amount#401,ehail_fee#402,improvement_surcharge#403,total_amount#404,payment_type#405L,trip_type#406L,congestion_surcharge#407] parquet
                                                                                 :- Project [VendorID#428, lpep_pickup_datetime#429, lpep_dropoff_datetime#430, store_and_fwd_flag#431, RatecodeID#432L, PULocationID#433, DOLocationID#434, passenger_count#435L, trip_distance#436, fare_amount#437, extra#438, mta_tax#439, tip_amount#440, tolls_amount#441, ehail_fee#442, improvement_surcharge#443, total_amount#444, payment_type#445L, trip_type#446L, congestion_surcharge#447]
                                                                                 :  +- Relation [VendorID#428,lpep_pickup_datetime#429,lpep_dropoff_datetime#430,store_and_fwd_flag#431,RatecodeID#432L,PULocationID#433,DOLocationID#434,passenger_count#435L,trip_distance#436,fare_amount#437,extra#438,mta_tax#439,tip_amount#440,tolls_amount#441,ehail_fee#442,improvement_surcharge#443,total_amount#444,payment_type#445L,trip_type#446L,congestion_surcharge#447] parquet
                                                                                 +- Project [VendorID#468, lpep_pickup_datetime#469, lpep_dropoff_datetime#470, store_and_fwd_flag#471, RatecodeID#472L, PULocationID#473, DOLocationID#474, passenger_count#475L, trip_distance#476, fare_amount#477, extra#478, mta_tax#479, tip_amount#480, tolls_amount#481, ehail_fee#482, improvement_surcharge#483, total_amount#484, payment_type#485L, trip_type#486L, congestion_surcharge#487]
                                                                                    +- Relation [VendorID#468,lpep_pickup_datetime#469,lpep_dropoff_datetime#470,store_and_fwd_flag#471,RatecodeID#472L,PULocationID#473,DOLocationID#474,passenger_count#475L,trip_distance#476,fare_amount#477,extra#478,mta_tax#479,tip_amount#480,tolls_amount#481,ehail_fee#482,improvement_surcharge#483,total_amount#484,payment_type#485L,trip_type#486L,congestion_surcharge#487] parquet


In [29]:
# show the result
combined.show(5)

[Stage 99:>                                                         (0 + 1) / 1]

+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+----------+-----------+-----------+------------+
|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_hour|pickup_day|dropoff_day|pickup_time|dropoff_time|
+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+----------+-----------+-----------+------------+
|       2|              2|         3.04|         1|             false|         137|         263|    

                                                                                

## Export file to raw folder

In [27]:
# export the combined data to parquet
combined.write.mode("overwrite").parquet("../data/raw/tlc_data/combined.parquet")

                                                                                