# Preprocessing TLC data

This notebook cleans the following datasets: 
1. Yellow taxi data from 2023-06 to 2024-05
2. Green taxi data from 2023-06 to 2024-05


In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [2]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)


24/08/24 15:06:38 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 100.94.176.147 instead (on interface en0)
24/08/24 15:06:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 15:06:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [4]:
# Read 2023-2024 TLC data
df = spark.read.parquet('../data/landing/tlc_data/*.parquet')


                                                                                

In [5]:
# Read 2023-6 to 2024-5 yellow data
path = "../data/landing/tlc_data"
ydf_2023_6 = spark.read.parquet(path + "/Y-2023-06.parquet")
ydf_2023_7 = spark.read.parquet(path + "/Y-2023-07.parquet")
ydf_2023_8 = spark.read.parquet(path + "/Y-2023-08.parquet")
ydf_2023_9 = spark.read.parquet(path + "/Y-2023-09.parquet")
ydf_2023_10 = spark.read.parquet(path + "/Y-2023-10.parquet")
ydf_2023_11 = spark.read.parquet(path + "/Y-2023-11.parquet")
ydf_2023_12 = spark.read.parquet(path + "/Y-2023-12.parquet")
ydf_2024_1 = spark.read.parquet(path + "/Y-2024-01.parquet")
ydf_2024_2 = spark.read.parquet(path + "/Y-2024-02.parquet")
ydf_2024_3 = spark.read.parquet(path + "/Y-2024-03.parquet")
ydf_2024_4 = spark.read.parquet(path + "/Y-2024-04.parquet")
ydf_2024_5 = spark.read.parquet(path + "/Y-2024-05.parquet")

In [6]:
# Read 2023-6 to 2024-5 green data
path = "../data/landing/tlc_data"
gdf_2023_6 = spark.read.parquet(path + "/G-2023-06.parquet")
gdf_2023_7 = spark.read.parquet(path + "/G-2023-07.parquet")
gdf_2023_8 = spark.read.parquet(path + "/G-2023-08.parquet")
gdf_2023_9 = spark.read.parquet(path + "/G-2023-09.parquet")
gdf_2023_10 = spark.read.parquet(path + "/G-2023-10.parquet")
gdf_2023_11 = spark.read.parquet(path + "/G-2023-11.parquet")
gdf_2023_12 = spark.read.parquet(path + "/G-2023-12.parquet")
gdf_2024_1 = spark.read.parquet(path + "/G-2024-01.parquet")
gdf_2024_2 = spark.read.parquet(path + "/G-2024-02.parquet")
gdf_2024_3 = spark.read.parquet(path + "/G-2024-03.parquet")
gdf_2024_4 = spark.read.parquet(path + "/G-2024-04.parquet")
gdf_2024_5 = spark.read.parquet(path + "/G-2024-05.parquet")

In [7]:
# Show 2023-6 yellow data 
ydf_2023_6.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-06-01 00:08:48|  2023-06-01 00:29:41|              1|          3.4|         1|                 N|         140|         238|           1|       21.9|  3.5|    0.5|       6.

In [8]:
# Show 2023-6 green data
gdf_2023_6.show(10)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2023-06-01 00:32:25|  2023-06-01 00:39:06|                 N|         1|          74|          42|              1|         0.84|        7.9|  1.0|    0.

## Inspection of TLC datasets

In [9]:
# Calculate the total row count for yellow taxi data from 2023-6 to 2024-5
yellow_count = (
    ydf_2023_6.count() + 
    ydf_2023_7.count() + 
    ydf_2023_8.count() + 
    ydf_2023_9.count() + 
    ydf_2023_10.count() + 
    ydf_2023_11.count() + 
    ydf_2023_12.count() + 
    ydf_2024_1.count() + 
    ydf_2024_2.count() + 
    ydf_2024_3.count() + 
    ydf_2024_4.count() + 
    ydf_2024_5.count()
)

# Display the total count
yellow_count

38916740

In [10]:
# Calculate the total row count for green taxi data from 2023-6 to 2024-5
green_count = (
    gdf_2023_6.count() + 
    gdf_2023_7.count() + 
    gdf_2023_8.count() + 
    gdf_2023_9.count() + 
    gdf_2023_10.count() + 
    gdf_2023_11.count() + 
    gdf_2023_12.count() + 
    gdf_2024_1.count() + 
    gdf_2024_2.count() + 
    gdf_2024_3.count() + 
    gdf_2024_4.count() + 
    gdf_2024_5.count()
)

# Display the total count
green_count

CodeCache: size=131072Kb used=33751Kb max_used=33751Kb free=97320Kb
 bounds [0x000000010a1e8000, 0x000000010c318000, 0x00000001121e8000]
 total_blobs=12830 nmethods=11838 adapters=903
 compilation: disabled (not enough contiguous free space left)




732489

In [11]:
# Calculate the total row count for all taxi data from 2023-6 to 2024-5
total_count = yellow_count + green_count
total_count

39649229

In [12]:
# Show the schema of 2024-5 yellow data
ydf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [13]:
# Show the schema of 2024-5 green data
gdf_2024_5.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [14]:
# Get columns of each DataFrame
columns_ydf = set(ydf_2024_5.columns)
columns_gdf = set(gdf_2024_5.columns)

# Find differences in columns
columns_only_in_df1 = columns_ydf - columns_gdf
columns_only_in_df2 = columns_gdf - columns_ydf

print(f"Columns only in yellowDF: {columns_only_in_df1}")
print(f"Columns only in greenDF: {columns_only_in_df2}")


Columns only in yellowDF: {'tpep_dropoff_datetime', 'Airport_fee', 'tpep_pickup_datetime'}
Columns only in greenDF: {'trip_type', 'lpep_dropoff_datetime', 'ehail_fee', 'lpep_pickup_datetime'}


## Data Cleaning

In [18]:
ydfs = [ydf_2023_6, ydf_2023_7, ydf_2023_8, ydf_2023_9, ydf_2023_10, ydf_2023_11, ydf_2023_12, 
       ydf_2024_1, ydf_2024_2, ydf_2024_3, ydf_2024_4, ydf_2024_5]
gdfs = [gdf_2023_6, gdf_2023_7, gdf_2023_8, gdf_2023_9, gdf_2023_10, gdf_2023_11, gdf_2023_12, 
       gdf_2024_1, gdf_2024_2, gdf_2024_3, gdf_2024_4, gdf_2024_5]


In [19]:
# Combine all yellow taxi data and green taxi data
from pyspark.sql import DataFrame

# Combine all yellow taxi data
yellow_combined = ydfs[0]
for df in ydfs[1:]:
    yellow_combined = yellow_combined.unionByName(df)

# Combine all green taxi data
green_combined = gdfs[0]
for df in gdfs[1:]:
    green_combined = green_combined.unionByName(df)

In [20]:
# Drop Airport_fee and trip_type columns from the combined data
yellow_combined = yellow_combined.drop("Airport_fee")
green_combined = green_combined.drop("trip_type")

In [21]:
# ehail fee is 0 for all yellow taxi data
yellow_combined = yellow_combined.withColumn("ehail_fee", lit(0))

In [22]:
# Drop fare_amount less than $3 initial price 
yellow_combined = yellow_combined.filter(yellow_combined.fare_amount >= 3)
green_combined = green_combined.filter(green_combined.fare_amount >= 3)

In [23]:
# Max passenger count should be 6 
yellow_combined = yellow_combined.filter(yellow_combined.passenger_count <= 6)
green_combined = green_combined.filter(green_combined.passenger_count <= 6)

In [70]:
# Drop negative trip distances
yellow_combined = yellow_combined.filter(yellow_combined.trip_distance >= 0)
green_combined = green_combined.filter(green_combined.trip_distance >= 0)

In [25]:
# Rename datetime columns to be consistent
from pyspark.sql.functions import col

yellow_combined = yellow_combined.withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
                               .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')

green_combined = green_combined.withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \
                             .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')


In [26]:
# Drop total_amount less than $3
yellow_combined = yellow_combined.filter(yellow_combined.total_amount >= 3)
green_combined = green_combined.filter(green_combined.total_amount >= 3)

## Add new column ##

In [None]:
df_renamed = df_all.withColumnRenamed("RatecodeID", "Rate_codeID") 
df_renamed.show(5)

NameError: name 'df_all' is not defined

In [None]:
df_dropped = df_all.drop("passenger_count_plus_10")
df_dropped.show(5) 

NameError: name 'df_all' is not defined

In [None]:
df_all.groupBy("VendorID").agg({"passenger_count": "avg", "extra": "max"}).show()

NameError: name 'df_all' is not defined

## Sampling Data

In [None]:
SAMPLE_SIZE = 0.01

In [None]:
df = df_all.sample(SAMPLE_SIZE, seed = 20020223).toPandas() 
df

NameError: name 'df_all' is not defined

In [None]:
df_2019_1.printSchema() 
print(f"数据总量: {df_2019_1.count()}") 
df_2019_1.describe().show()

NameError: name 'df_2019_1' is not defined

## Handling missing data 

In [None]:
df_2019_1 = df_2019_1.dropna()
df_2019_1 = df_2019_1.fillna({'passenger_count': 1, 'trip_distance': 0.0})

NameError: name 'df_2019_1' is not defined

 ## Datatype Conversion

In [None]:
df_2019_1 = df_2019_1.withColumn("passenger_count", col("passenger_count").cast("integer"))
df_2019_1 = df_2019_1.withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime")) 
df_2019_1 = df_2019_1.withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime"))  

NameError: name 'df_2019_1' is not defined

## Handling duplicates

In [None]:
df_2019_1 = df_2019_1.dropDuplicates()

NameError: name 'df_2019_1' is not defined

## Anomaly 

In [None]:
trip_distance_min = 0
trip_distance_max = 100
df_2019_1 = df_2019_1.filter((df_2019_1["trip_distance"] > trip_distance_min) & (df_2019_1["trip_distance"] > trip_distance_max))

NameError: name 'df_2019_1' is not defined

## Standardization 

In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
assembler = VectorAssembler(inputCols=["trip_distance", "trip_duration"], outputCol= "features")
df_features = assembler.transform(df_2019_1)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features") 
scaler_model = scaler.fit(df_features)
df_scaled = scaler_model.transform(df_features)

AssertionError: 