# Pre-processing_final
This notebook aggregates 3 preprocessed datasets

In [41]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 

In [42]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook.
# The settings are kept in one table so they are easy to scan and tune.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "8g",    # driver memory: 8GB
    "spark.executor.memory": "8g",  # executor memory: 8GB
}

builder = SparkSession.builder.appName("ADS Project1")
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

spark = builder.getOrCreate()

In [43]:
# Set the session's Parquet compression codec to gzip
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

## Datasets 

In [44]:
# Load the TLC trip data from its Parquet file
tdf = spark.read.format("parquet").load("../data/raw/tlc_df.parquet")

In [45]:
# Load the hourly weather data; the CSV has a header row and column types are inferred
wdf = spark.read.options(header=True, inferSchema=True).csv("../data/raw/NYC_weather_raw.csv")


In [46]:
# Load the permitted-event data from its Parquet file
edf = spark.read.format("parquet").load("../data/raw/NYC_Permitted_Event_Information_Historical.parquet")

In [47]:
# Preview the first 5 rows of the TLC trip data
tdf.show(n=5)

+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_hour|dropoff_hour|pickup_date|dropoff_date|
+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|       2|              1|         1.75|         1|             false|          68|          50|           2|       12.1|  1.0|   

In [48]:
# Print the column names, types and nullability of the TLC trip data
tdf.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- dropoff_hour: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)



In [49]:
# Preview the first 5 rows of the weather data
wdf.show(n=5)

+----------+----+-------+---+------+---+----+------+
|      DATE|HOUR|    CIG|WND|   VIS|TMP| DEW|   SLP|
+----------+----+-------+---+------+---+----+------+
|2023-12-01|   0|22000.0|4.1|1609.3|9.4|-2.8|1020.1|
|2023-12-01|   1|22000.0|2.1|1609.3|8.9|-2.2|1020.0|
|2023-12-01|   2|22000.0|3.1|1609.3|8.9|-2.2|1020.4|
|2023-12-01|   3|22000.0|3.1|1609.3|8.3|-1.7|1020.7|
|2023-12-01|   4|22000.0|3.1|1609.3|7.8|-1.7|1020.8|
+----------+----+-------+---+------+---+----+------+
only showing top 5 rows



In [50]:
# Preview the first 5 rows of the event data
edf.show(n=5)

+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|      Event Location|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  684438|2023-12-09|         7|2023-12-09|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  693693|2023-12-03|        12|2023-12-03|      14|Special Event|    Manhattan|Central Park: Wag...|
|  686564|2023-12-03|        17|2023-12-03|      18|Special Event|    Manhattan|Carl Schurz Park:...|
|  684416|2023-12-09|         9|2023-12-09|      11|Special Event|    Manhattan|Washington Square...|
|  687023|2023-12-01|        11|2023-12-01|      12|Special Event|    Manhattan|Central Park: Lad...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



## Aggregation

### tlc aggregation for number of trips

In [51]:
# Hourly pickup timestamp built from pickup_date + pickup_hour.
# The pattern is given explicitly (the same one the merge step uses) so the
# parse does not depend on Spark's default string-to-timestamp casting.
pickup_time = F.to_timestamp(
    F.concat_ws(" ", F.col("pickup_date"), F.col("pickup_hour")),
    "yyyy-MM-dd H",
)

# Number of trips per pickup hour. Grouping on the expression directly means
# tdf itself is never given (and then stripped of) a temporary 'Time' column.
hourly_trip_counts = (
    tdf.groupBy(pickup_time.alias("Time"))
    .agg(F.count("*").alias("hourly_trip_count"))
)

# Number of trips per pickup date
daily_trip_counts = (
    tdf.groupBy("pickup_date")
    .agg(F.count("*").alias("daily_trip_count"))
)

In [52]:
# Preview the trip data after the trip-count aggregation step
tdf.show(n=5)

+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_hour|dropoff_hour|pickup_date|dropoff_date|
+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|       2|              1|         1.75|         1|             false|          68|          50|           2|       12.1|  1.0|   

### Map taxi zones to boroughs 

In [53]:
# Read the taxi-zone lookup table (CSV with header row, inferred column types)
zones = spark.read.options(header=True, inferSchema=True).csv("../data/landing/external/taxi_zones.csv")
# Preview the first 5 zones
zones.show(n=5)


+--------+---------------+--------------------+----------------+--------------------+----------+-------------+
|OBJECTID|     Shape_Leng|            the_geom|      Shape_Area|                zone|LocationID|      borough|
+--------+---------------+--------------------+----------------+--------------------+----------+-------------+
|       1| 0.116357453189|MULTIPOLYGON (((-...|  7.823067885E-4|      Newark Airport|         1|          EWR|
|       2|  0.43346966679|MULTIPOLYGON (((-...|0.00486634037837|         Jamaica Bay|         2|       Queens|
|       3|0.0843411059012|MULTIPOLYGON (((-...|3.14414156821E-4|Allerton/Pelham G...|         3|        Bronx|
|       4|0.0435665270921|MULTIPOLYGON (((-...|1.11871946192E-4|       Alphabet City|         4|    Manhattan|
|       5|0.0921464898574|MULTIPOLYGON (((-...|4.97957489363E-4|       Arden Heights|         5|Staten Island|
+--------+---------------+--------------------+----------------+--------------------+----------+-------------+
o

In [54]:
# Attach a borough name to each trip for both the pickup and the drop-off zone.
# Each pass left-joins the zone lookup on the trip's location ID, exposes the
# zone's borough under the target column name, and drops the lookup key again.
for location_id_col, borough_col in (
    ('PULocationID', 'PUBorough'),
    ('DOLocationID', 'DOBorough'),
):
    borough_lookup = zones.select('LocationID', 'borough').withColumnRenamed('borough', borough_col)
    tdf = tdf.join(
        borough_lookup,
        tdf[location_id_col] == zones['LocationID'],
        'left',
    ).drop('LocationID')


In [55]:
# Preview the trip data with the PUBorough / DOBorough columns added
tdf.show(n=5)

+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+---------+---------+
|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_hour|dropoff_hour|pickup_date|dropoff_date|PUBorough|DOBorough|
+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+---------+---------+
|       2|              1|         1.75|         1|             false|

### Aggregation of number of events

In [56]:
# Count events per start date, start hour and borough.
# (No intermediate 'Start Time' column is built: the aggregation only keeps the
# grouping keys and the count, so such a column would be discarded immediately.)
# NOTE: this reassigns edf, so re-running the cell without reloading the event
# data fails because 'Event ID' is no longer present.
edf = (
    edf.groupBy('Start Date', 'Start Hour', 'Event Borough')
    .agg(F.count('Event ID').alias('Number of Events'))
    # Sort for readable display
    .orderBy('Start Date', 'Start Hour', 'Event Borough')
)

# Show the aggregated results
edf.show()

+----------+----------+-------------+----------------+
|Start Date|Start Hour|Event Borough|Number of Events|
+----------+----------+-------------+----------------+
|2023-12-01|         0|    Manhattan|              36|
|2023-12-01|         1|     Brooklyn|               1|
|2023-12-01|         7|    Manhattan|               2|
|2023-12-01|         7|       Queens|               1|
|2023-12-01|         8|        Bronx|               7|
|2023-12-01|         8|     Brooklyn|              10|
|2023-12-01|         8|    Manhattan|              12|
|2023-12-01|         8|       Queens|               6|
|2023-12-01|         8|Staten Island|               1|
|2023-12-01|         9|        Bronx|               2|
|2023-12-01|         9|     Brooklyn|               4|
|2023-12-01|         9|    Manhattan|               5|
|2023-12-01|         9|       Queens|               3|
|2023-12-01|         9|Staten Island|               4|
|2023-12-01|        10|        Bronx|               2|
|2023-12-0

## Merge datasets 

In [57]:
def _add_hourly_datetime(frame, date_col, hour_col):
    """Return `frame` with a `datetime` column parsed from a date column and an hour column."""
    stamp_text = concat_ws(" ", col(date_col), col(hour_col))
    return frame.withColumn("datetime", to_timestamp(stamp_text, "yyyy-MM-dd H"))


# Step 1: give every dataset the same hourly `datetime` key
tdf = _add_hourly_datetime(tdf, "pickup_date", "pickup_hour")  # taxi trips
edf = _add_hourly_datetime(edf, "Start Date", "Start Hour")    # events
wdf = _add_hourly_datetime(wdf, "DATE", "HOUR")                # weather

# Step 2: attach event counts to trips by hour and pickup borough,
# then remove the event-side datetime so only the trip-side one remains
event_match = (
    (tdf["datetime"] == edf["datetime"])
    & (tdf["PUBorough"] == edf["Event Borough"])
)
tdf_edf = tdf.join(edf, event_match, "left").drop(edf["datetime"])

# Step 3: attach weather by hour only (weather data is from one station)
weather_match = tdf_edf["datetime"] == wdf["datetime"]

# Step 4: remove the right-hand datetime columns after the final join
final_df = (
    tdf_edf.join(wdf, weather_match, "left")
    .drop(wdf["datetime"])
    .drop(edf["datetime"])  # already removed from tdf_edf in step 2
)


In [58]:
# Remove the join helper columns that are no longer needed after merging
join_helper_cols = [
    "weather_datetime", 'event_datetime', 'datetime',
    'DATE', 'HOUR',
    'Start Date', 'Start Hour', 'Event Borough',
]
final_df = final_df.drop(*join_helper_cols)

In [59]:
# Column groups, in the order they should appear in the merged table
date_time_cols = ["pickup_date", "pickup_hour", "dropoff_date", "dropoff_hour"]
trip_info_cols = ["VendorID", "passenger_count", "trip_distance", "RatecodeID"]
location_cols = ["PULocationID", "PUBorough", "DOLocationID", "DOBorough"]
payment_cols = [
    "payment_type", "fare_amount", "extra", "mta_tax",
    "tip_amount", "tolls_amount", "improvement_surcharge",
]
charge_cols = ["congestion_surcharge", "total_amount", "ehail_fee"]
duration_cols = ["trip_duration"]
event_cols = ["Number of Events"]
weather_cols = ["CIG", "WND", "VIS", "TMP", "DEW", "SLP"]

final_df = final_df.select(
    *date_time_cols,
    *trip_info_cols,
    *location_cols,
    *payment_cols,
    *charge_cols,
    *duration_cols,
    *event_cols,
    *weather_cols,
)


## Preprocessing

### Handling missing data
The assumption is that no events occurred if the number of events is null.


In [60]:
# Trips with no matching event row get an event count of 0
final_df = final_df.na.fill(0, subset=['Number of Events'])

In [62]:
def _fill_borough_from_location(frame, location_col, borough_col):
    """Fill missing values of `borough_col` with the most frequent borough seen for the same `location_col`.

    Parameters:
        frame: DataFrame containing `location_col` and `borough_col`.
        location_col: name of the location-ID column to group by.
        borough_col: name of the borough column to impute.

    Returns:
        A DataFrame with the same columns as `frame`, with `location_col`
        moved to the front by the join. A missing `borough_col` is replaced
        by the per-location mode when one exists and stays null otherwise.
    """
    mode_col = f"{borough_col}_mode"

    # Rank boroughs within each location by frequency. The borough name is a
    # secondary sort key so ties are broken deterministically.
    ranking = Window.partitionBy(location_col).orderBy(
        F.col("count").desc(), F.col(borough_col).asc()
    )

    borough_mode = (
        frame
        # Only known boroughs may compete for the mode; otherwise NULL could
        # be the "most frequent" value and nothing would ever be filled.
        .filter(F.col(borough_col).isNotNull())
        .groupBy(location_col, borough_col)
        .agg(F.count("*").alias("count"))
        .withColumn("row", F.row_number().over(ranking))
        .filter(F.col("row") == 1)
        .select(location_col, F.col(borough_col).alias(mode_col))
    )

    return (
        frame.join(borough_mode, location_col, "left")
        .withColumn(borough_col, F.coalesce(F.col(borough_col), F.col(mode_col)))
        .drop(mode_col)
    )


# Fill missing pickup boroughs from PULocationID, then drop-off boroughs from DOLocationID
final_df = _fill_borough_from_location(final_df, "PULocationID", "PUBorough")
final_df = _fill_borough_from_location(final_df, "DOLocationID", "DOBorough")


In [63]:
# Forward-fill missing weather readings in time order.
#
# Running an unpartitioned window over the trip-level frame makes Spark move
# every trip into a single partition (the "No Partition Defined for Window
# operation" warning). Weather was joined per pickup hour, so the fill is done
# on a one-row-per-hour table instead and the result is joined back.
# Assumes the weather values are identical for all trips in the same pickup
# hour (i.e. one weather row per DATE/HOUR) - TODO confirm.
weather_cols = ["CIG", "WND", "VIS", "TMP", "DEW", "SLP"]
hour_keys = ["pickup_date", "pickup_hour"]
column_order = final_df.columns  # restored after the join below

# One row per pickup hour holding that hour's weather readings (null if missing)
hourly_weather = final_df.groupBy(*hour_keys).agg(
    *[F.first(F.col(name), ignorenulls=True).alias(name) for name in weather_cols]
)

# Every row from the start of the data up to and including the current hour.
# The window is still unpartitioned, but it now runs over the small hourly table.
window_spec = Window.orderBy(*hour_keys).rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

# Carry the last non-null reading forward into hours that have none
for name in weather_cols:
    hourly_weather = hourly_weather.withColumn(
        name, F.last(F.col(name), ignorenulls=True).over(window_spec)
    )

# Swap the original weather columns for the filled ones, keeping column order
final_df = (
    final_df.drop(*weather_cols)
    .join(hourly_weather, on=hour_keys, how="left")
    .select(*column_order)
)

### Handling outliers

In [68]:
# Only keep rows within the date range of the taxi data (both ends inclusive).
# The bounds are explicit ISO-formatted date literals so the comparison is
# date-to-date and does not rely on implicit casting of a non-zero-padded string.
range_start = F.lit("2023-12-01").cast("date")
range_end = F.lit("2024-05-31").cast("date")
final_df = final_df.filter(F.col("pickup_date").between(range_start, range_end))

In [70]:
# Preview the first 5 rows of the merged, cleaned data
final_df.show(n=5)

24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 0

KeyboardInterrupt: 

## Export file to curated folder

In [69]:
# Save the merged data to the curated folder.
# "overwrite" lets the notebook be re-run from a fresh kernel; the default
# save mode raises an error when the target path already exists.
final_df.write.mode("overwrite").parquet("../data/curated/tlc_data/first_cleaned.parquet")

24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 03:33:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/28 0