# Preprocessing TLC data

This notebook cleans the following datasets: 
1. Yellow taxi data from 2023-06 to 2024-05
2. Green taxi data from 2023-06 to 2024-05


In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [2]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)


24/08/24 15:06:38 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 100.94.176.147 instead (on interface en0)
24/08/24 15:06:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 15:06:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [4]:
# Read 2023-2024 TLC data
df = spark.read.parquet('../data/landing/tlc_data/*.parquet')


                                                                                

In [5]:
# Read 2023-6 to 2024-5 yellow data
path = "../data/landing/tlc_data"
ydf_2023_6 = spark.read.parquet(path + "/Y-2023-06.parquet")
ydf_2023_7 = spark.read.parquet(path + "/Y-2023-07.parquet")
ydf_2023_8 = spark.read.parquet(path + "/Y-2023-08.parquet")
ydf_2023_9 = spark.read.parquet(path + "/Y-2023-09.parquet")
ydf_2023_10 = spark.read.parquet(path + "/Y-2023-10.parquet")
ydf_2023_11 = spark.read.parquet(path + "/Y-2023-11.parquet")
ydf_2023_12 = spark.read.parquet(path + "/Y-2023-12.parquet")
ydf_2024_1 = spark.read.parquet(path + "/Y-2024-01.parquet")
ydf_2024_2 = spark.read.parquet(path + "/Y-2024-02.parquet")
ydf_2024_3 = spark.read.parquet(path + "/Y-2024-03.parquet")
ydf_2024_4 = spark.read.parquet(path + "/Y-2024-04.parquet")
ydf_2024_5 = spark.read.parquet(path + "/Y-2024-05.parquet")

In [6]:
# Read 2023-6 to 2024-5 green data
path = "../data/landing/tlc_data"
gdf_2023_6 = spark.read.parquet(path + "/G-2023-06.parquet")
gdf_2023_7 = spark.read.parquet(path + "/G-2023-07.parquet")
gdf_2023_8 = spark.read.parquet(path + "/G-2023-08.parquet")
gdf_2023_9 = spark.read.parquet(path + "/G-2023-09.parquet")
gdf_2023_10 = spark.read.parquet(path + "/G-2023-10.parquet")
gdf_2023_11 = spark.read.parquet(path + "/G-2023-11.parquet")
gdf_2023_12 = spark.read.parquet(path + "/G-2023-12.parquet")
gdf_2024_1 = spark.read.parquet(path + "/G-2024-01.parquet")
gdf_2024_2 = spark.read.parquet(path + "/G-2024-02.parquet")
gdf_2024_3 = spark.read.parquet(path + "/G-2024-03.parquet")
gdf_2024_4 = spark.read.parquet(path + "/G-2024-04.parquet")
gdf_2024_5 = spark.read.parquet(path + "/G-2024-05.parquet")

In [7]:
# Show 2023-6 yellow data 
ydf_2023_6.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-06-01 00:08:48|  2023-06-01 00:29:41|              1|          3.4|         1|                 N|         140|         238|           1|       21.9|  3.5|    0.5|       6.

In [8]:
# Show 2023-6 green data
gdf_2023_6.show(10)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2023-06-01 00:32:25|  2023-06-01 00:39:06|                 N|         1|          74|          42|              1|         0.84|        7.9|  1.0|    0.

## TLC datasets inspection

In [9]:
# Calculate the total row count for yellow taxi data from 2023-6 to 2024-5
yellow_count = (
    ydf_2023_6.count() + 
    ydf_2023_7.count() + 
    ydf_2023_8.count() + 
    ydf_2023_9.count() + 
    ydf_2023_10.count() + 
    ydf_2023_11.count() + 
    ydf_2023_12.count() + 
    ydf_2024_1.count() + 
    ydf_2024_2.count() + 
    ydf_2024_3.count() + 
    ydf_2024_4.count() + 
    ydf_2024_5.count()
)

# Display the total count
yellow_count

38916740

In [10]:
# Calculate the total row count for green taxi data from 2023-6 to 2024-5
green_count = (
    gdf_2023_6.count() + 
    gdf_2023_7.count() + 
    gdf_2023_8.count() + 
    gdf_2023_9.count() + 
    gdf_2023_10.count() + 
    gdf_2023_11.count() + 
    gdf_2023_12.count() + 
    gdf_2024_1.count() + 
    gdf_2024_2.count() + 
    gdf_2024_3.count() + 
    gdf_2024_4.count() + 
    gdf_2024_5.count()
)

# Display the total count
green_count

CodeCache: size=131072Kb used=33751Kb max_used=33751Kb free=97320Kb
 bounds [0x000000010a1e8000, 0x000000010c318000, 0x00000001121e8000]
 total_blobs=12830 nmethods=11838 adapters=903
 compilation: disabled (not enough contiguous free space left)




732489

In [11]:
# Calculate the total row count for all taxi data from 2023-6 to 2024-5
total_count = yellow_count + green_count
total_count

39649229

In [12]:
# Show the schema of 2024-5 yellow data
ydf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [13]:
# Show the schema of 2024-5 green data
gdf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [14]:
# Get columns of each DataFrame
columns_ydf = set(ydf_2024_5.columns)
columns_gdf = set(gdf_2024_5.columns)

# Find differences in columns
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: {'tpep_dropoff_datetime', 'Airport_fee', 'tpep_pickup_datetime'}
Columns only in greenDF: {'trip_type', 'lpep_dropoff_datetime', 'ehail_fee', 'lpep_pickup_datetime'}


## Data Cleaning

In [18]:
ydfs = [ydf_2023_6, ydf_2023_7, ydf_2023_8, ydf_2023_9, ydf_2023_10, ydf_2023_11, ydf_2023_12, 
       ydf_2024_1, ydf_2024_2, ydf_2024_3, ydf_2024_4, ydf_2024_5]
gdfs = [gdf_2023_6, gdf_2023_7, gdf_2023_8, gdf_2023_9, gdf_2023_10, gdf_2023_11, gdf_2023_12, 
       gdf_2024_1, gdf_2024_2, gdf_2024_3, gdf_2024_4, gdf_2024_5]


In [19]:
# Combine all yellow taxi data and green taxi data
from pyspark.sql import DataFrame

# Combine all yellow taxi data
yellow_combined = ydfs[0]
for df in ydfs[1:]:
    yellow_combined = yellow_combined.unionByName(df)

# Combine all green taxi data
green_combined = gdfs[0]
for df in gdfs[1:]:
    green_combined = green_combined.unionByName(df)

In [20]:
# Drop Airport_fee and trip_type columns from the combined data
yellow_combined = yellow_combined.drop("Airport_fee")
green_combined = green_combined.drop("trip_type")

In [21]:
# ehail fee is 0 for all yellow taxi data
yellow_combined = yellow_combined.withColumn("ehail_fee", lit(0))

In [22]:
# Drop fare_amount less than $3 initial price 
yellow_combined = yellow_combined.filter(yellow_combined.fare_amount >= 3)
green_combined = green_combined.filter(green_combined.fare_amount >= 3)

In [29]:
# Passenger count should be fewer than 6 and non-negative
yellow_combined = yellow_combined.filter(yellow_combined.passenger_count <= 6)
green_combined = green_combined.filter(green_combined.passenger_count <= 6)
yellow_combined = yellow_combined.filter(yellow_combined.passenger_count >= 0)
green_combined = green_combined.filter(green_combined.passenger_count >= 0)

In [70]:
# Drop negative trip distances
yellow_combined = yellow_combined.filter(yellow_combined.trip_distance >= 0)
green_combined = green_combined.filter(green_combined.trip_distance >= 0)

### Keeping feature names consistent 
tpep: Taxicab Passenger Enhancement Program for yellow taxi <br> 
lpep: Livery Passenger Enhancement Program for green taxi

In [25]:
# Rename datetime columns to be consistent
from pyspark.sql.functions import col

yellow_combined = yellow_combined.withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
                               .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')

green_combined = green_combined.withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \
                             .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')


In [26]:
# Drop total_amount less than $3
yellow_combined = yellow_combined.filter(yellow_combined.total_amount >= 3)
green_combined = green_combined.filter(green_combined.total_amount >= 3)

In [27]:
# schema of yellow_combined
yellow_combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: integer (nullable = false)



In [28]:
# schema of green_combined
green_combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [30]:
# Verify no difference in columns
columns_ydf = set(yellow_combined.columns)
columns_gdf = set(green_combined.columns)
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: set()
Columns only in greenDF: set()


### Outliers Detection

In [36]:
# combine yellow and green data
combined = yellow_combined.unionByName(green_combined)

In [41]:
# Show the schema of the combined data
combined.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)



In [45]:
# Show the first 5 rows of the combined data
combined.show(5)

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+
|       1|2023-06-01 00:08:48|2023-06-01 00:29:41|              1|          3.4|         1|                 N|         140|         238|           1|       21.9|  3.5|    0.5|       6.7|         0.0|   

In [46]:
# Drop rows with null values in pickup and dropoff datetime columns
combined = combined.filter(F.col("dropoff_datetime").isNotNull() & F.col("pickup_datetime").isNotNull())

In [37]:
# check minimum and maximum datetime
combined.select(F.min("pickup_datetime"), F.max("pickup_datetime")).show()




+--------------------+--------------------+
|min(pickup_datetime)|max(pickup_datetime)|
+--------------------+--------------------+
| 2002-12-31 16:46:07| 2024-06-01 23:54:14|
+--------------------+--------------------+



                                                                                

In [52]:
# Filter the pick up datetime to between 2023-06 to 2024-05
combined = combined.filter(
    (F.col("pickup_datetime") >= "2023-06-01 00:00:00") & 
    (F.col("pickup_datetime") < "2024-06-01 00:00:00")
)
# Filter the drop off datetime to between 2023-06 to 2024-05
combined = combined.filter(
    (F.col("dropoff_datetime") >= "2023-06-01 00:00:00") & 
    (F.col("dropoff_datetime") < "2024-06-01 00:00:00")
)

In [53]:
# check minimum and maximum datetime
combined.select(F.min("pickup_datetime"), F.max("pickup_datetime")).show()



+--------------------+--------------------+
|min(pickup_datetime)|max(pickup_datetime)|
+--------------------+--------------------+
| 2023-06-01 00:00:00| 2024-05-31 23:58:59|
+--------------------+--------------------+



                                                                                

In [61]:
# Add time duration column to the combined data in minutes
combined = combined.withColumn(
    "duration", 
    F.round((F.col("dropoff_datetime").cast("long") - F.col("pickup_datetime").cast("long")) / 60)
)

AnalysisException: [DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION] Cannot resolve "CAST(dropoff_datetime AS BIGINT)" due to data type mismatch: cannot cast "TIMESTAMP_NTZ" to "BIGINT".;
'Project [VendorID#40, pickup_datetime#2232, dropoff_datetime#2252, passenger_count#43L, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2767, trip_distance_km#4032, round(((cast(dropoff_datetime#2252 as bigint) - cast(pickup_datetime#2232 as bigint)) / 60), 0) AS duration#4597]
+- Project [VendorID#40, pickup_datetime#2232, dropoff_datetime#2252, passenger_count#43L, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2767, trip_distance_km#4032]
   +- Project [VendorID#40, pickup_datetime#2232, dropoff_datetime#2252, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2767, (trip_distance#44 * 1.60934) AS trip_distance_km#4032]
      +- Project [VendorID#40, pickup_datetime#2232, dropoff_datetime#2252, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2767, (trip_distance#44 * 1.60934) AS trip_distance_km#3459]
         +- Filter ((dropoff_datetime#2252 >= cast(2023-06-01 00:00:00 as timestamp_ntz)) AND (dropoff_datetime#2252 < cast(2024-06-01 00:00:00 as timestamp_ntz)))
            +- Filter ((pickup_datetime#2232 >= cast(2023-06-01 00:00:00 as timestamp_ntz)) AND (pickup_datetime#2232 < cast(2024-06-01 00:00:00 as timestamp_ntz)))
               +- Filter (isnotnull(dropoff_datetime#2252) AND isnotnull(pickup_datetime#2232))
                  +- Filter (isnotnull(dropoff_datetime#2252) AND isnotnull(pickup_datetime#2232))
                     +- Union false, false
                        :- Project [VendorID#40, pickup_datetime#2232, dropoff_datetime#2252, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, cast(ehail_fee#2212 as double) AS ehail_fee#2767]
                        :  +- Filter (passenger_count#43L >= cast(0 as bigint))
                        :     +- Filter (passenger_count#43L <= cast(6 as bigint))
                        :        +- Filter (total_amount#56 >= cast(3 as double))
                        :           +- Project [VendorID#40, pickup_datetime#2232, tpep_dropoff_datetime#42 AS dropoff_datetime#2252, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2212]
                        :              +- Project [VendorID#40, tpep_pickup_datetime#41 AS pickup_datetime#2232, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, ehail_fee#2212]
                        :                 +- Filter (passenger_count#43L <= cast(6 as bigint))
                        :                    +- Filter (fare_amount#50 >= cast(3 as double))
                        :                       +- Project [VendorID#40, tpep_pickup_datetime#41, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57, 0 AS ehail_fee#2212]
                        :                          +- Project [VendorID#40, tpep_pickup_datetime#41, tpep_dropoff_datetime#42, passenger_count#43L, trip_distance#44, RatecodeID#45L, store_and_fwd_flag#46, PULocationID#47, DOLocationID#48, payment_type#49L, fare_amount#50, extra#51, mta_tax#52, tip_amount#53, tolls_amount#54, improvement_surcharge#55, total_amount#56, congestion_surcharge#57]
                        :                             +- Union false, false
                        :                                :- Relation [VendorID#40,tpep_pickup_datetime#41,tpep_dropoff_datetime#42,passenger_count#43L,trip_distance#44,RatecodeID#45L,store_and_fwd_flag#46,PULocationID#47,DOLocationID#48,payment_type#49L,fare_amount#50,extra#51,mta_tax#52,tip_amount#53,tolls_amount#54,improvement_surcharge#55,total_amount#56,congestion_surcharge#57,Airport_fee#58] parquet
                        :                                :- Project [VendorID#78, tpep_pickup_datetime#79, tpep_dropoff_datetime#80, passenger_count#81L, trip_distance#82, RatecodeID#83L, store_and_fwd_flag#84, PULocationID#85, DOLocationID#86, payment_type#87L, fare_amount#88, extra#89, mta_tax#90, tip_amount#91, tolls_amount#92, improvement_surcharge#93, total_amount#94, congestion_surcharge#95, Airport_fee#96]
                        :                                :  +- Relation [VendorID#78,tpep_pickup_datetime#79,tpep_dropoff_datetime#80,passenger_count#81L,trip_distance#82,RatecodeID#83L,store_and_fwd_flag#84,PULocationID#85,DOLocationID#86,payment_type#87L,fare_amount#88,extra#89,mta_tax#90,tip_amount#91,tolls_amount#92,improvement_surcharge#93,total_amount#94,congestion_surcharge#95,Airport_fee#96] parquet
                        :                                :- Project [VendorID#116, tpep_pickup_datetime#117, tpep_dropoff_datetime#118, passenger_count#119L, trip_distance#120, RatecodeID#121L, store_and_fwd_flag#122, PULocationID#123, DOLocationID#124, payment_type#125L, fare_amount#126, extra#127, mta_tax#128, tip_amount#129, tolls_amount#130, improvement_surcharge#131, total_amount#132, congestion_surcharge#133, Airport_fee#134]
                        :                                :  +- Relation [VendorID#116,tpep_pickup_datetime#117,tpep_dropoff_datetime#118,passenger_count#119L,trip_distance#120,RatecodeID#121L,store_and_fwd_flag#122,PULocationID#123,DOLocationID#124,payment_type#125L,fare_amount#126,extra#127,mta_tax#128,tip_amount#129,tolls_amount#130,improvement_surcharge#131,total_amount#132,congestion_surcharge#133,Airport_fee#134] parquet
                        :                                :- Project [VendorID#154, tpep_pickup_datetime#155, tpep_dropoff_datetime#156, passenger_count#157L, trip_distance#158, RatecodeID#159L, store_and_fwd_flag#160, PULocationID#161, DOLocationID#162, payment_type#163L, fare_amount#164, extra#165, mta_tax#166, tip_amount#167, tolls_amount#168, improvement_surcharge#169, total_amount#170, congestion_surcharge#171, Airport_fee#172]
                        :                                :  +- Relation [VendorID#154,tpep_pickup_datetime#155,tpep_dropoff_datetime#156,passenger_count#157L,trip_distance#158,RatecodeID#159L,store_and_fwd_flag#160,PULocationID#161,DOLocationID#162,payment_type#163L,fare_amount#164,extra#165,mta_tax#166,tip_amount#167,tolls_amount#168,improvement_surcharge#169,total_amount#170,congestion_surcharge#171,Airport_fee#172] parquet
                        :                                :- Project [VendorID#192, tpep_pickup_datetime#193, tpep_dropoff_datetime#194, passenger_count#195L, trip_distance#196, RatecodeID#197L, store_and_fwd_flag#198, PULocationID#199, DOLocationID#200, payment_type#201L, fare_amount#202, extra#203, mta_tax#204, tip_amount#205, tolls_amount#206, improvement_surcharge#207, total_amount#208, congestion_surcharge#209, Airport_fee#210]
                        :                                :  +- Relation [VendorID#192,tpep_pickup_datetime#193,tpep_dropoff_datetime#194,passenger_count#195L,trip_distance#196,RatecodeID#197L,store_and_fwd_flag#198,PULocationID#199,DOLocationID#200,payment_type#201L,fare_amount#202,extra#203,mta_tax#204,tip_amount#205,tolls_amount#206,improvement_surcharge#207,total_amount#208,congestion_surcharge#209,Airport_fee#210] parquet
                        :                                :- Project [VendorID#230, tpep_pickup_datetime#231, tpep_dropoff_datetime#232, passenger_count#233L, trip_distance#234, RatecodeID#235L, store_and_fwd_flag#236, PULocationID#237, DOLocationID#238, payment_type#239L, fare_amount#240, extra#241, mta_tax#242, tip_amount#243, tolls_amount#244, improvement_surcharge#245, total_amount#246, congestion_surcharge#247, Airport_fee#248]
                        :                                :  +- Relation [VendorID#230,tpep_pickup_datetime#231,tpep_dropoff_datetime#232,passenger_count#233L,trip_distance#234,RatecodeID#235L,store_and_fwd_flag#236,PULocationID#237,DOLocationID#238,payment_type#239L,fare_amount#240,extra#241,mta_tax#242,tip_amount#243,tolls_amount#244,improvement_surcharge#245,total_amount#246,congestion_surcharge#247,Airport_fee#248] parquet
                        :                                :- Project [VendorID#268, tpep_pickup_datetime#269, tpep_dropoff_datetime#270, passenger_count#271L, trip_distance#272, RatecodeID#273L, store_and_fwd_flag#274, PULocationID#275, DOLocationID#276, payment_type#277L, fare_amount#278, extra#279, mta_tax#280, tip_amount#281, tolls_amount#282, improvement_surcharge#283, total_amount#284, congestion_surcharge#285, Airport_fee#286]
                        :                                :  +- Relation [VendorID#268,tpep_pickup_datetime#269,tpep_dropoff_datetime#270,passenger_count#271L,trip_distance#272,RatecodeID#273L,store_and_fwd_flag#274,PULocationID#275,DOLocationID#276,payment_type#277L,fare_amount#278,extra#279,mta_tax#280,tip_amount#281,tolls_amount#282,improvement_surcharge#283,total_amount#284,congestion_surcharge#285,Airport_fee#286] parquet
                        :                                :- Project [VendorID#306, tpep_pickup_datetime#307, tpep_dropoff_datetime#308, passenger_count#309L, trip_distance#310, RatecodeID#311L, store_and_fwd_flag#312, PULocationID#313, DOLocationID#314, payment_type#315L, fare_amount#316, extra#317, mta_tax#318, tip_amount#319, tolls_amount#320, improvement_surcharge#321, total_amount#322, congestion_surcharge#323, Airport_fee#324]
                        :                                :  +- Relation [VendorID#306,tpep_pickup_datetime#307,tpep_dropoff_datetime#308,passenger_count#309L,trip_distance#310,RatecodeID#311L,store_and_fwd_flag#312,PULocationID#313,DOLocationID#314,payment_type#315L,fare_amount#316,extra#317,mta_tax#318,tip_amount#319,tolls_amount#320,improvement_surcharge#321,total_amount#322,congestion_surcharge#323,Airport_fee#324] parquet
                        :                                :- Project [VendorID#344, tpep_pickup_datetime#345, tpep_dropoff_datetime#346, passenger_count#347L, trip_distance#348, RatecodeID#349L, store_and_fwd_flag#350, PULocationID#351, DOLocationID#352, payment_type#353L, fare_amount#354, extra#355, mta_tax#356, tip_amount#357, tolls_amount#358, improvement_surcharge#359, total_amount#360, congestion_surcharge#361, Airport_fee#362]
                        :                                :  +- Relation [VendorID#344,tpep_pickup_datetime#345,tpep_dropoff_datetime#346,passenger_count#347L,trip_distance#348,RatecodeID#349L,store_and_fwd_flag#350,PULocationID#351,DOLocationID#352,payment_type#353L,fare_amount#354,extra#355,mta_tax#356,tip_amount#357,tolls_amount#358,improvement_surcharge#359,total_amount#360,congestion_surcharge#361,Airport_fee#362] parquet
                        :                                :- Project [VendorID#382, tpep_pickup_datetime#383, tpep_dropoff_datetime#384, passenger_count#385L, trip_distance#386, RatecodeID#387L, store_and_fwd_flag#388, PULocationID#389, DOLocationID#390, payment_type#391L, fare_amount#392, extra#393, mta_tax#394, tip_amount#395, tolls_amount#396, improvement_surcharge#397, total_amount#398, congestion_surcharge#399, Airport_fee#400]
                        :                                :  +- Relation [VendorID#382,tpep_pickup_datetime#383,tpep_dropoff_datetime#384,passenger_count#385L,trip_distance#386,RatecodeID#387L,store_and_fwd_flag#388,PULocationID#389,DOLocationID#390,payment_type#391L,fare_amount#392,extra#393,mta_tax#394,tip_amount#395,tolls_amount#396,improvement_surcharge#397,total_amount#398,congestion_surcharge#399,Airport_fee#400] parquet
                        :                                :- Project [VendorID#420, tpep_pickup_datetime#421, tpep_dropoff_datetime#422, passenger_count#423L, trip_distance#424, RatecodeID#425L, store_and_fwd_flag#426, PULocationID#427, DOLocationID#428, payment_type#429L, fare_amount#430, extra#431, mta_tax#432, tip_amount#433, tolls_amount#434, improvement_surcharge#435, total_amount#436, congestion_surcharge#437, Airport_fee#438]
                        :                                :  +- Relation [VendorID#420,tpep_pickup_datetime#421,tpep_dropoff_datetime#422,passenger_count#423L,trip_distance#424,RatecodeID#425L,store_and_fwd_flag#426,PULocationID#427,DOLocationID#428,payment_type#429L,fare_amount#430,extra#431,mta_tax#432,tip_amount#433,tolls_amount#434,improvement_surcharge#435,total_amount#436,congestion_surcharge#437,Airport_fee#438] parquet
                        :                                +- Project [VendorID#458, tpep_pickup_datetime#459, tpep_dropoff_datetime#460, passenger_count#461L, trip_distance#462, RatecodeID#463L, store_and_fwd_flag#464, PULocationID#465, DOLocationID#466, payment_type#467L, fare_amount#468, extra#469, mta_tax#470, tip_amount#471, tolls_amount#472, improvement_surcharge#473, total_amount#474, congestion_surcharge#475, Airport_fee#476]
                        :                                   +- Relation [VendorID#458,tpep_pickup_datetime#459,tpep_dropoff_datetime#460,passenger_count#461L,trip_distance#462,RatecodeID#463L,store_and_fwd_flag#464,PULocationID#465,DOLocationID#466,payment_type#467L,fare_amount#468,extra#469,mta_tax#470,tip_amount#471,tolls_amount#472,improvement_surcharge#473,total_amount#474,congestion_surcharge#475,Airport_fee#476] parquet
                        +- Project [VendorID#496, pickup_datetime#2272, dropoff_datetime#2292, passenger_count#503L, trip_distance#504, RatecodeID#500L, store_and_fwd_flag#499, PULocationID#501, DOLocationID#502, payment_type#513L, fare_amount#505, extra#506, mta_tax#507, tip_amount#508, tolls_amount#509, improvement_surcharge#511, total_amount#512, congestion_surcharge#515, ehail_fee#510]
                           +- Filter (passenger_count#503L >= cast(0 as bigint))
                              +- Filter (passenger_count#503L <= cast(6 as bigint))
                                 +- Filter (total_amount#512 >= cast(3 as double))
                                    +- Project [VendorID#496, pickup_datetime#2272, lpep_dropoff_datetime#498 AS dropoff_datetime#2292, store_and_fwd_flag#499, RatecodeID#500L, PULocationID#501, DOLocationID#502, passenger_count#503L, trip_distance#504, fare_amount#505, extra#506, mta_tax#507, tip_amount#508, tolls_amount#509, ehail_fee#510, improvement_surcharge#511, total_amount#512, payment_type#513L, congestion_surcharge#515]
                                       +- Project [VendorID#496, lpep_pickup_datetime#497 AS pickup_datetime#2272, lpep_dropoff_datetime#498, store_and_fwd_flag#499, RatecodeID#500L, PULocationID#501, DOLocationID#502, passenger_count#503L, trip_distance#504, fare_amount#505, extra#506, mta_tax#507, tip_amount#508, tolls_amount#509, ehail_fee#510, improvement_surcharge#511, total_amount#512, payment_type#513L, congestion_surcharge#515]
                                          +- Filter (passenger_count#503L <= cast(6 as bigint))
                                             +- Filter (fare_amount#505 >= cast(3 as double))
                                                +- Project [VendorID#496, lpep_pickup_datetime#497, lpep_dropoff_datetime#498, store_and_fwd_flag#499, RatecodeID#500L, PULocationID#501, DOLocationID#502, passenger_count#503L, trip_distance#504, fare_amount#505, extra#506, mta_tax#507, tip_amount#508, tolls_amount#509, ehail_fee#510, improvement_surcharge#511, total_amount#512, payment_type#513L, congestion_surcharge#515]
                                                   +- Union false, false
                                                      :- Relation [VendorID#496,lpep_pickup_datetime#497,lpep_dropoff_datetime#498,store_and_fwd_flag#499,RatecodeID#500L,PULocationID#501,DOLocationID#502,passenger_count#503L,trip_distance#504,fare_amount#505,extra#506,mta_tax#507,tip_amount#508,tolls_amount#509,ehail_fee#510,improvement_surcharge#511,total_amount#512,payment_type#513L,trip_type#514L,congestion_surcharge#515] parquet
                                                      :- Project [VendorID#536, lpep_pickup_datetime#537, lpep_dropoff_datetime#538, store_and_fwd_flag#539, RatecodeID#540L, PULocationID#541, DOLocationID#542, passenger_count#543L, trip_distance#544, fare_amount#545, extra#546, mta_tax#547, tip_amount#548, tolls_amount#549, ehail_fee#550, improvement_surcharge#551, total_amount#552, payment_type#553L, trip_type#554L, congestion_surcharge#555]
                                                      :  +- Relation [VendorID#536,lpep_pickup_datetime#537,lpep_dropoff_datetime#538,store_and_fwd_flag#539,RatecodeID#540L,PULocationID#541,DOLocationID#542,passenger_count#543L,trip_distance#544,fare_amount#545,extra#546,mta_tax#547,tip_amount#548,tolls_amount#549,ehail_fee#550,improvement_surcharge#551,total_amount#552,payment_type#553L,trip_type#554L,congestion_surcharge#555] parquet
                                                      :- Project [VendorID#576, lpep_pickup_datetime#577, lpep_dropoff_datetime#578, store_and_fwd_flag#579, RatecodeID#580L, PULocationID#581, DOLocationID#582, passenger_count#583L, trip_distance#584, fare_amount#585, extra#586, mta_tax#587, tip_amount#588, tolls_amount#589, ehail_fee#590, improvement_surcharge#591, total_amount#592, payment_type#593L, trip_type#594L, congestion_surcharge#595]
                                                      :  +- Relation [VendorID#576,lpep_pickup_datetime#577,lpep_dropoff_datetime#578,store_and_fwd_flag#579,RatecodeID#580L,PULocationID#581,DOLocationID#582,passenger_count#583L,trip_distance#584,fare_amount#585,extra#586,mta_tax#587,tip_amount#588,tolls_amount#589,ehail_fee#590,improvement_surcharge#591,total_amount#592,payment_type#593L,trip_type#594L,congestion_surcharge#595] parquet
                                                      :- Project [VendorID#616, lpep_pickup_datetime#617, lpep_dropoff_datetime#618, store_and_fwd_flag#619, RatecodeID#620L, PULocationID#621, DOLocationID#622, passenger_count#623L, trip_distance#624, fare_amount#625, extra#626, mta_tax#627, tip_amount#628, tolls_amount#629, ehail_fee#630, improvement_surcharge#631, total_amount#632, payment_type#633L, trip_type#634L, congestion_surcharge#635]
                                                      :  +- Relation [VendorID#616,lpep_pickup_datetime#617,lpep_dropoff_datetime#618,store_and_fwd_flag#619,RatecodeID#620L,PULocationID#621,DOLocationID#622,passenger_count#623L,trip_distance#624,fare_amount#625,extra#626,mta_tax#627,tip_amount#628,tolls_amount#629,ehail_fee#630,improvement_surcharge#631,total_amount#632,payment_type#633L,trip_type#634L,congestion_surcharge#635] parquet
                                                      :- Project [VendorID#656, lpep_pickup_datetime#657, lpep_dropoff_datetime#658, store_and_fwd_flag#659, RatecodeID#660L, PULocationID#661, DOLocationID#662, passenger_count#663L, trip_distance#664, fare_amount#665, extra#666, mta_tax#667, tip_amount#668, tolls_amount#669, ehail_fee#670, improvement_surcharge#671, total_amount#672, payment_type#673L, trip_type#674L, congestion_surcharge#675]
                                                      :  +- Relation [VendorID#656,lpep_pickup_datetime#657,lpep_dropoff_datetime#658,store_and_fwd_flag#659,RatecodeID#660L,PULocationID#661,DOLocationID#662,passenger_count#663L,trip_distance#664,fare_amount#665,extra#666,mta_tax#667,tip_amount#668,tolls_amount#669,ehail_fee#670,improvement_surcharge#671,total_amount#672,payment_type#673L,trip_type#674L,congestion_surcharge#675] parquet
                                                      :- Project [VendorID#696, lpep_pickup_datetime#697, lpep_dropoff_datetime#698, store_and_fwd_flag#699, RatecodeID#700L, PULocationID#701, DOLocationID#702, passenger_count#703L, trip_distance#704, fare_amount#705, extra#706, mta_tax#707, tip_amount#708, tolls_amount#709, ehail_fee#710, improvement_surcharge#711, total_amount#712, payment_type#713L, trip_type#714L, congestion_surcharge#715]
                                                      :  +- Relation [VendorID#696,lpep_pickup_datetime#697,lpep_dropoff_datetime#698,store_and_fwd_flag#699,RatecodeID#700L,PULocationID#701,DOLocationID#702,passenger_count#703L,trip_distance#704,fare_amount#705,extra#706,mta_tax#707,tip_amount#708,tolls_amount#709,ehail_fee#710,improvement_surcharge#711,total_amount#712,payment_type#713L,trip_type#714L,congestion_surcharge#715] parquet
                                                      :- Project [VendorID#736, lpep_pickup_datetime#737, lpep_dropoff_datetime#738, store_and_fwd_flag#739, RatecodeID#740L, PULocationID#741, DOLocationID#742, passenger_count#743L, trip_distance#744, fare_amount#745, extra#746, mta_tax#747, tip_amount#748, tolls_amount#749, ehail_fee#750, improvement_surcharge#751, total_amount#752, payment_type#753L, trip_type#754L, congestion_surcharge#755]
                                                      :  +- Relation [VendorID#736,lpep_pickup_datetime#737,lpep_dropoff_datetime#738,store_and_fwd_flag#739,RatecodeID#740L,PULocationID#741,DOLocationID#742,passenger_count#743L,trip_distance#744,fare_amount#745,extra#746,mta_tax#747,tip_amount#748,tolls_amount#749,ehail_fee#750,improvement_surcharge#751,total_amount#752,payment_type#753L,trip_type#754L,congestion_surcharge#755] parquet
                                                      :- Project [VendorID#776, lpep_pickup_datetime#777, lpep_dropoff_datetime#778, store_and_fwd_flag#779, RatecodeID#780L, PULocationID#781, DOLocationID#782, passenger_count#783L, trip_distance#784, fare_amount#785, extra#786, mta_tax#787, tip_amount#788, tolls_amount#789, ehail_fee#790, improvement_surcharge#791, total_amount#792, payment_type#793L, trip_type#794L, congestion_surcharge#795]
                                                      :  +- Relation [VendorID#776,lpep_pickup_datetime#777,lpep_dropoff_datetime#778,store_and_fwd_flag#779,RatecodeID#780L,PULocationID#781,DOLocationID#782,passenger_count#783L,trip_distance#784,fare_amount#785,extra#786,mta_tax#787,tip_amount#788,tolls_amount#789,ehail_fee#790,improvement_surcharge#791,total_amount#792,payment_type#793L,trip_type#794L,congestion_surcharge#795] parquet
                                                      :- Project [VendorID#816, lpep_pickup_datetime#817, lpep_dropoff_datetime#818, store_and_fwd_flag#819, RatecodeID#820L, PULocationID#821, DOLocationID#822, passenger_count#823L, trip_distance#824, fare_amount#825, extra#826, mta_tax#827, tip_amount#828, tolls_amount#829, ehail_fee#830, improvement_surcharge#831, total_amount#832, payment_type#833L, trip_type#834L, congestion_surcharge#835]
                                                      :  +- Relation [VendorID#816,lpep_pickup_datetime#817,lpep_dropoff_datetime#818,store_and_fwd_flag#819,RatecodeID#820L,PULocationID#821,DOLocationID#822,passenger_count#823L,trip_distance#824,fare_amount#825,extra#826,mta_tax#827,tip_amount#828,tolls_amount#829,ehail_fee#830,improvement_surcharge#831,total_amount#832,payment_type#833L,trip_type#834L,congestion_surcharge#835] parquet
                                                      :- Project [VendorID#856, lpep_pickup_datetime#857, lpep_dropoff_datetime#858, store_and_fwd_flag#859, RatecodeID#860L, PULocationID#861, DOLocationID#862, passenger_count#863L, trip_distance#864, fare_amount#865, extra#866, mta_tax#867, tip_amount#868, tolls_amount#869, ehail_fee#870, improvement_surcharge#871, total_amount#872, payment_type#873L, trip_type#874L, congestion_surcharge#875]
                                                      :  +- Relation [VendorID#856,lpep_pickup_datetime#857,lpep_dropoff_datetime#858,store_and_fwd_flag#859,RatecodeID#860L,PULocationID#861,DOLocationID#862,passenger_count#863L,trip_distance#864,fare_amount#865,extra#866,mta_tax#867,tip_amount#868,tolls_amount#869,ehail_fee#870,improvement_surcharge#871,total_amount#872,payment_type#873L,trip_type#874L,congestion_surcharge#875] parquet
                                                      :- Project [VendorID#896, lpep_pickup_datetime#897, lpep_dropoff_datetime#898, store_and_fwd_flag#899, RatecodeID#900L, PULocationID#901, DOLocationID#902, passenger_count#903L, trip_distance#904, fare_amount#905, extra#906, mta_tax#907, tip_amount#908, tolls_amount#909, ehail_fee#910, improvement_surcharge#911, total_amount#912, payment_type#913L, trip_type#914L, congestion_surcharge#915]
                                                      :  +- Relation [VendorID#896,lpep_pickup_datetime#897,lpep_dropoff_datetime#898,store_and_fwd_flag#899,RatecodeID#900L,PULocationID#901,DOLocationID#902,passenger_count#903L,trip_distance#904,fare_amount#905,extra#906,mta_tax#907,tip_amount#908,tolls_amount#909,ehail_fee#910,improvement_surcharge#911,total_amount#912,payment_type#913L,trip_type#914L,congestion_surcharge#915] parquet
                                                      +- Project [VendorID#936, lpep_pickup_datetime#937, lpep_dropoff_datetime#938, store_and_fwd_flag#939, RatecodeID#940L, PULocationID#941, DOLocationID#942, passenger_count#943L, trip_distance#944, fare_amount#945, extra#946, mta_tax#947, tip_amount#948, tolls_amount#949, ehail_fee#950, improvement_surcharge#951, total_amount#952, payment_type#953L, trip_type#954L, congestion_surcharge#955]
                                                         +- Relation [VendorID#936,lpep_pickup_datetime#937,lpep_dropoff_datetime#938,store_and_fwd_flag#939,RatecodeID#940L,PULocationID#941,DOLocationID#942,passenger_count#943L,trip_distance#944,fare_amount#945,extra#946,mta_tax#947,tip_amount#948,tolls_amount#949,ehail_fee#950,improvement_surcharge#951,total_amount#952,payment_type#953L,trip_type#954L,congestion_surcharge#955] parquet


In [59]:
# Convert to kilometers for the trip distance
combined = combined.withColumn("trip_distance_km", F.col("trip_distance") * 1.60934)
# drop the trip_distance column
combined = combined.drop("trip_distance")

In [60]:
# Show the first 5 rows of the combined data
combined.show(5)

+--------+-------------------+-------------------+---------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+------------------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|  trip_distance_km|
+--------+-------------------+-------------------+---------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+------------------+
|       1|2023-06-01 00:08:48|2023-06-01 00:29:41|              1|         1|                 N|         140|         238|           1|       21.9|  3.5|    0.5|       6.7|         0.0|  

## Add new column ##

## Sampling Data

## Handling missing data 

 ## Datatype Conversion

## Handling duplicates

## Anomaly 

## Standardization 