# Preprocessing TLC data

This notebook cleans the following datasets: 
1. Yellow taxi data from 2023-06 to 2024-05
2. Green taxi data from 2023-06 to 2024-05


In [22]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [12]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)


In [13]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [14]:
# Read 2023-2024 TLC data
df = spark.read.parquet('../data/landing/tlc_data/*.parquet')


In [15]:
# Read 2023-6 to 2024-5 yellow data
path = "../data/landing/tlc_data"
ydf_2023_6 = spark.read.parquet(path + "/Y-2023-06.parquet")
ydf_2023_7 = spark.read.parquet(path + "/Y-2023-07.parquet")
ydf_2023_8 = spark.read.parquet(path + "/Y-2023-08.parquet")
ydf_2023_9 = spark.read.parquet(path + "/Y-2023-09.parquet")
ydf_2023_10 = spark.read.parquet(path + "/Y-2023-10.parquet")
ydf_2023_11 = spark.read.parquet(path + "/Y-2023-11.parquet")
ydf_2023_12 = spark.read.parquet(path + "/Y-2023-12.parquet")
ydf_2024_1 = spark.read.parquet(path + "/Y-2024-01.parquet")
ydf_2024_2 = spark.read.parquet(path + "/Y-2024-02.parquet")
ydf_2024_3 = spark.read.parquet(path + "/Y-2024-03.parquet")
ydf_2024_4 = spark.read.parquet(path + "/Y-2024-04.parquet")
ydf_2024_5 = spark.read.parquet(path + "/Y-2024-05.parquet")

In [16]:
# Read 2023-6 to 2024-5 green data
path = "../data/landing/tlc_data"
gdf_2023_6 = spark.read.parquet(path + "/G-2023-06.parquet")
gdf_2023_7 = spark.read.parquet(path + "/G-2023-07.parquet")
gdf_2023_8 = spark.read.parquet(path + "/G-2023-08.parquet")
gdf_2023_9 = spark.read.parquet(path + "/G-2023-09.parquet")
gdf_2023_10 = spark.read.parquet(path + "/G-2023-10.parquet")
gdf_2023_11 = spark.read.parquet(path + "/G-2023-11.parquet")
gdf_2023_12 = spark.read.parquet(path + "/G-2023-12.parquet")
gdf_2024_1 = spark.read.parquet(path + "/G-2024-01.parquet")
gdf_2024_2 = spark.read.parquet(path + "/G-2024-02.parquet")
gdf_2024_3 = spark.read.parquet(path + "/G-2024-03.parquet")
gdf_2024_4 = spark.read.parquet(path + "/G-2024-04.parquet")
gdf_2024_5 = spark.read.parquet(path + "/G-2024-05.parquet")

In [17]:
# Show 2023-6 yellow data 
ydf_2023_6.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-06-01 00:08:48|  2023-06-01 00:29:41|              1|          3.4|         1|                 N|         140|         238|           1|       21.9|  3.5|    0.5|       6.

In [18]:
# Show 2023-6 green data
gdf_2023_6.show(10)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2023-06-01 00:32:25|  2023-06-01 00:39:06|                 N|         1|          74|          42|              1|         0.84|        7.9|  1.0|    0.

## Inspection of TLC datasets

In [20]:
# Calculate the total row count for yellow taxi data from 2023-6 to 2024-5
yellow_count = (
    ydf_2023_6.count() + 
    ydf_2023_7.count() + 
    ydf_2023_8.count() + 
    ydf_2023_9.count() + 
    ydf_2023_10.count() + 
    ydf_2023_11.count() + 
    ydf_2023_12.count() + 
    ydf_2024_1.count() + 
    ydf_2024_2.count() + 
    ydf_2024_3.count() + 
    ydf_2024_4.count() + 
    ydf_2024_5.count()
)

# Display the total count
yellow_count

38916740

In [21]:
# Calculate the total row count for green taxi data from 2023-6 to 2024-5
green_count = (
    gdf_2023_6.count() + 
    gdf_2023_7.count() + 
    gdf_2023_8.count() + 
    gdf_2023_9.count() + 
    gdf_2023_10.count() + 
    gdf_2023_11.count() + 
    gdf_2023_12.count() + 
    gdf_2024_1.count() + 
    gdf_2024_2.count() + 
    gdf_2024_3.count() + 
    gdf_2024_4.count() + 
    gdf_2024_5.count()
)

# Display the total count
green_count

732489

In [63]:
# Show the schema of 2024-5 yellow data
ydf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [64]:
# Show the schema of 2024-5 green data
gdf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [65]:
# Get columns of each DataFrame
columns_ydf = set(ydf_2024_5.columns)
columns_gdf = set(gdf_2024_5.columns)

# Find differences in columns
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: {'Airport_fee', 'tpep_dropoff_datetime', 'tpep_pickup_datetime'}
Columns only in greenDF: {'trip_type', 'ehail_fee', 'lpep_pickup_datetime', 'lpep_dropoff_datetime'}


## Preprocessing

In [66]:
# Drop fare_amount less than $3 initial price 
df_filtered = df.filter(F.col('fare_amount') >= 3)

In [68]:
# Max passenger count should be 6 
df_filtered = df_filtered.filter(F.col('passenger_count') <= 6)

In [70]:
# Drop negative trip distances
df_filtered = df_filtered.filter(F.col('trip_distance') >= 0)

In [71]:
df_filtered

VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
1,,,N,1,138,145,1,6.1,28.2,7.75,0.5,5.0,0.0,,1.0,42.45,1,,0.0
2,,,N,1,138,249,1,11.23,46.4,6.0,0.5,8.72,0.0,,1.0,66.87,1,,2.5
2,,,N,1,138,170,1,9.02,35.9,6.0,0.5,10.57,6.94,,1.0,65.16,1,,2.5
2,,,N,1,87,133,1,6.53,30.3,1.0,0.5,7.06,0.0,,1.0,42.36,1,,2.5
2,,,N,1,161,165,1,14.38,61.8,1.0,0.5,0.0,0.0,,1.0,66.8,1,,2.5
2,,,N,1,138,151,1,9.19,37.3,6.0,0.5,10.7,6.94,,1.0,64.19,1,,0.0
2,,,N,1,186,33,1,4.52,23.3,1.0,0.5,5.09,0.0,,1.0,33.39,1,,2.5
1,,,N,1,237,166,1,2.7,12.8,3.5,0.5,3.2,0.0,,1.0,21.0,1,,2.5
1,,,N,1,237,116,1,4.3,20.5,3.5,0.5,0.0,0.0,,1.0,25.5,2,,2.5
2,,,N,1,114,142,1,4.28,25.4,1.0,0.5,6.08,0.0,,1.0,36.48,1,,2.5


In [52]:
# Convert the columns to timestamp if they are not already
df_filtered = df_filtered.withColumn("lpep_pickup_datetime", F.col("lpep_pickup_datetime").cast("timestamp"))
df_filtered = df_filtered.withColumn("lpep_dropoff_datetime", F.col("lpep_dropoff_datetime").cast("timestamp"))

# Calculate the trip duration in seconds
df_filtered = df_filtered.withColumn("trip_duration", 
                   (F.col("lpep_dropoff_datetime").cast("long") - F.col("lpep_pickup_datetime").cast("long")))

# Show the resulting DataFrame
df_filtered.show()



+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_duration|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-------------+
|       1|                NULL|                 NULL|                 N|         1|         138|         145|            

In [None]:
# Drop negative trip durations
df_filtered = df_filtered.filter(F.col('trip_duration') >= 0)

In [16]:
ydf_2024_5.filter(F.col('trip_distance') > 100).limit(10)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
2,2024-05-01 12:10:15,2024-05-01 17:00:14,1,116.76,5,N,265,230,1,300.0,0.0,0.0,5.0,15.38,1.0,321.38,0.0,0.0
2,2024-05-02 15:40:24,2024-05-02 18:16:00,1,107.52,5,N,132,265,1,350.0,0.0,0.0,30.0,37.26,1.0,420.01,0.0,1.75
1,2024-05-03 12:09:20,2024-05-03 12:54:56,1,324.0,99,N,130,188,1,33.5,0.0,0.5,0.0,0.0,1.0,35.0,0.0,0.0
2,2024-05-05 00:57:11,2024-05-05 01:13:28,1,4003.25,4,N,68,231,1,20.0,0.0,0.5,0.0,0.0,1.0,24.0,2.5,0.0
2,2024-05-05 02:24:58,2024-05-05 04:09:48,2,101.7,5,N,216,265,2,370.0,0.0,0.0,0.0,0.0,1.0,371.0,0.0,0.0
1,2024-05-06 15:47:24,2024-05-06 20:21:44,2,205.4,5,N,132,265,1,800.0,0.0,0.0,50.0,0.0,1.0,851.0,0.0,0.0
2,2024-05-06 17:28:00,2024-05-06 19:50:04,4,102.64,4,N,132,265,2,621.1,2.5,0.0,0.0,29.26,1.0,653.86,0.0,0.0
2,2024-05-07 17:41:49,2024-05-07 20:28:57,1,116.79,5,N,132,265,1,400.0,0.0,0.0,5.0,0.0,1.0,406.0,0.0,0.0
2,2024-05-10 02:49:19,2024-05-10 05:03:14,2,128.07,5,N,155,265,3,-350.0,0.0,0.0,0.0,-13.38,-1.0,-364.38,0.0,0.0
2,2024-05-10 02:49:19,2024-05-10 05:03:14,2,128.07,5,N,155,265,3,350.0,0.0,0.0,0.0,13.38,1.0,364.38,0.0,0.0


In [14]:
# Filtering yellow taxi data
df_2023_filtered = df_2019_1.filter(df_2019_1["trip_distance"] > 10) 
df_2023_filtered.show(5)

NameError: name 'df_2019_1' is not defined

In [None]:
df_2019_filtered.where(F.col('passenger_count') > 3).limit(10)

In [None]:
# 合并2019年和2021年的数据
 df_all = df_2019_1.union(df_2021_1)
 df_all.show(10)

IndentationError: unexpected indent (3649347377.py, line 2)

## Add new column ##

In [None]:
df_renamed = df_all.withColumnRenamed("RatecodeID", "Rate_codeID") 
df_renamed.show(5)

NameError: name 'df_all' is not defined

In [None]:
df_dropped = df_all.drop("passenger_count_plus_10")
df_dropped.show(5) 

NameError: name 'df_all' is not defined

In [None]:
df_all.groupBy("VendorID").agg({"passenger_count": "avg", "extra": "max"}).show()

NameError: name 'df_all' is not defined

## Sampling Data

In [None]:
SAMPLE_SIZE = 0.01

In [None]:
df = df_all.sample(SAMPLE_SIZE, seed = 20020223).toPandas() 
df

NameError: name 'df_all' is not defined

In [None]:
df_2019_1.printSchema() 
print(f"数据总量: {df_2019_1.count()}") 
df_2019_1.describe().show()

NameError: name 'df_2019_1' is not defined

## Handling missing data 

In [None]:
df_2019_1 = df_2019_1.dropna()
df_2019_1 = df_2019_1.fillna({'passenger_count': 1, 'trip_distance': 0.0})

NameError: name 'df_2019_1' is not defined

 ## Datatype Conversion

In [None]:
df_2019_1 = df_2019_1.withColumn("passenger_count", col("passenger_count").cast("integer"))
df_2019_1 = df_2019_1.withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime")) 
df_2019_1 = df_2019_1.withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime"))  

NameError: name 'df_2019_1' is not defined

## Handling duplicates

In [None]:
df_2019_1 = df_2019_1.dropDuplicates()

NameError: name 'df_2019_1' is not defined

## Anomaly 

In [None]:
trip_distance_min = 0
trip_distance_max = 100
df_2019_1 = df_2019_1.filter((df_2019_1["trip_distance"] > trip_distance_min) & (df_2019_1["trip_distance"] > trip_distance_max))

NameError: name 'df_2019_1' is not defined

## Standardization 

In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
assembler = VectorAssembler(inputCols=["trip_distance", "trip_duration"], outputCol= "features")
df_features = assembler.transform(df_2019_1)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features") 
scaler_model = scaler.fit(df_features)
df_scaled = scaler_model.transform(df_features)

AssertionError: 