# Pre-processing_final_1
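This notebook merges three preprocessed datasets (TLC taxi trips, weather, and permitted events) into a single hourly table per pickup borough, which is then used for the hourly EDA (`EDA_hourly`) and model analysis.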

In [42]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 

In [43]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook.
# Settings are applied in order; driver and executor each get 8 GB of memory.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
}

builder = SparkSession.builder.appName("ADS Project1")
for setting, value in spark_settings.items():
    builder = builder.config(setting, value)

spark = builder.getOrCreate()

In [44]:
# Write parquet output with gzip compression for the rest of this session
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

## Datasets 

In [45]:
# Read the preprocessed TLC trip records (one row per trip).
# NOTE(review): the file is loaded from data/raw although it is described as
# preprocessed — confirm this is the intended location.
tdf = spark.read.parquet("../data/raw/tlc_df.parquet") 

In [46]:
# Read the preprocessed weather data from CSV; the first row is the header and
# column types are inferred by Spark.
wdf = spark.read.csv("../data/raw/NYC_weather_raw.csv", header=True, inferSchema=True)


In [47]:
# Read the preprocessed NYC permitted-event records (one row per event)
edf = spark.read.parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet") 

In [48]:
# Preview the first 5 rows of the TLC trip data
tdf.show(5)

+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_hour|dropoff_hour|pickup_date|dropoff_date|
+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+-----------+------------+-----------+------------+
|       1|              1|          7.1|         1|             false|         249|         179|           1|       32.4|  3.5|   

In [49]:
# Print column names and types of the TLC trip data
tdf.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- dropoff_hour: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)



In [50]:
# Preview the first 5 rows of the weather data (one row per DATE and HOUR)
wdf.show(5)

+----------+----+-------+------------------+-----+----+----+------+
|      DATE|HOUR|    CIG|               WND|  VIS| TMP| DEW|   SLP|
+----------+----+-------+------------------+-----+----+----+------+
|2023-07-01|   0|22000.0|2.6319672131147542|965.6|23.9|13.3|1017.1|
|2023-07-01|   1|22000.0|2.6319672131147542|965.6|23.3|13.3|1017.6|
|2023-07-01|   2|22000.0|2.6319672131147542|965.6|23.3|12.8|1017.8|
|2023-07-01|   3|22000.0|               3.1|965.6|22.8|12.8|1017.7|
|2023-07-01|   4|22000.0|               1.5|965.6|22.8|11.7|1017.4|
+----------+----+-------+------------------+-----+----+----+------+
only showing top 5 rows



In [51]:
# Preview the first 5 rows of the event data
edf.show(5)

+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|      Event Location|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  683185|2023-11-17|         8|2023-11-17|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  683046|2023-10-13|         8|2023-10-13|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  682144|2023-07-14|         8|2023-07-14|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  681104|2023-09-17|        15|2023-09-17|      20|Special Event|     Brooklyn|  Fulton Park: Plaza|
|  683192|2023-11-25|         8|2023-11-25|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



In [52]:
# Earliest and latest event start dates, to confirm the period edf covers
edf.agg(
    F.min("Start Date"),
    F.max("Start Date"),
).show()

+---------------+---------------+
|min(Start Date)|max(Start Date)|
+---------------+---------------+
|     2023-07-01|     2023-12-30|
+---------------+---------------+



## Aggregation

### Map taxi zones to boroughs 

In [53]:
# Load the taxi zone lookup table (used below to map LocationID to borough);
# the first row is the header and column types are inferred by Spark.
zones = spark.read.csv("../data/landing/external/taxi_zones.csv", header=True, inferSchema=True)

In [54]:
# Borough lookups keyed by taxi zone ID, one per trip endpoint
pickup_zones = zones.select('LocationID', F.col('borough').alias('PUBorough'))
dropoff_zones = zones.select('LocationID', F.col('borough').alias('DOBorough'))

# Left joins keep every trip even when its zone ID has no match in the lookup;
# the lookup key is dropped after each join so only the borough column is added.
tdf = tdf.join(
    pickup_zones,
    tdf['PULocationID'] == zones['LocationID'],
    'left',
).drop('LocationID')

tdf = tdf.join(
    dropoff_zones,
    tdf['DOLocationID'] == zones['LocationID'],
    'left',
).drop('LocationID')


In [55]:
# Attach an hourly pickup timestamp to every trip. The explicit pattern is the
# same one used when the datasets are merged, so "<date> <hour>" strings with a
# single-digit hour are parsed the same way in both places.
tdf = tdf.withColumn(
    'Time',
    F.to_timestamp(
        F.concat_ws(' ', F.col('pickup_date'), F.col('pickup_hour')),
        'yyyy-MM-dd H',
    ),
)

# Number of trips per pickup date, pickup hour and pickup borough.
# The count is aliased directly instead of renaming Spark's auto-generated
# 'count(1)' column afterwards, so the name no longer depends on that default.
df_hourly_agg = (
    tdf.groupBy('pickup_date', 'pickup_hour', 'PUBorough')
    .agg(F.count('*').alias('hourly_trip_count'))
)

# Number of trips per pickup date and pickup borough
# (the count column keeps Spark's default name).
df_daily_agg = tdf.groupBy('pickup_date', 'PUBorough').agg({'*': 'count'})

### Aggregation of number of events

In [56]:
# Count events by start date, start hour and borough.
# Each event is counted once, at its start hour.
edf_hourly = (
    edf.groupBy('Start Date', 'Start Hour', 'Event Borough')
    .agg(F.count('Event ID').alias('Number of Events'))
)

In [57]:
# Preview the hourly event counts per borough
edf_hourly.show(5)

+----------+----------+-------------+----------------+
|Start Date|Start Hour|Event Borough|Number of Events|
+----------+----------+-------------+----------------+
|2023-08-14|        10|    Manhattan|              41|
|2023-08-19|         8|       Queens|              87|
|2023-07-28|         8|        Bronx|              46|
|2023-07-09|        10|        Bronx|              33|
|2023-07-29|        14|    Manhattan|              26|
+----------+----------+-------------+----------------+
only showing top 5 rows



## Merge datasets 

In [59]:
# Step 1: Create a unified hourly `datetime` column in each dataset.
# All three frames build it the same way, from "<date> <hour>", so the join
# keys are directly comparable.
HOURLY_FORMAT = "yyyy-MM-dd H"

# For tdf (taxi dataset)
df_hourly_agg = df_hourly_agg.withColumn(
    "datetime",
    F.to_timestamp(F.concat_ws(" ", F.col("pickup_date"), F.col("pickup_hour")), HOURLY_FORMAT),
)

# For edf (event dataset); the borough column is renamed to match the taxi frame
edf_hourly = edf_hourly.withColumn(
    "datetime",
    F.to_timestamp(F.concat_ws(" ", F.col("Start Date"), F.col("Start Hour")), HOURLY_FORMAT),
)
edf_hourly = edf_hourly.withColumnRenamed("Event Borough", "PUBorough")

# For wdf (weather dataset): DATE holds only the calendar day, so the hour has
# to come from HOUR. Deriving `datetime` from DATE alone cannot carry the hour
# of day, which prevents the hourly weather rows from lining up with the
# hourly taxi rows in the join below.
wdf = wdf.withColumn(
    "datetime",
    F.to_timestamp(F.concat_ws(" ", F.col("DATE"), F.col("HOUR")), HOURLY_FORMAT),
)

# Step 2: Join tdf (taxi dataset) with edf (event dataset) on datetime and borough.
# Left join: hours without any event keep their taxi row with null event columns.
tdf_edf = df_hourly_agg.join(edf_hourly, on=["datetime", "PUBorough"], how="left")

# Step 3: Join the result with the weather dataset on datetime only
# (the weather data comes from a single station, so it is shared by all boroughs)
final_df = tdf_edf.join(wdf, on="datetime", how="left")

In [60]:
# Print the schema of the merged dataframe to verify the joined columns
final_df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- hourly_trip_count: long (nullable = false)
 |-- Start Date: date (nullable = true)
 |-- Start Hour: integer (nullable = true)
 |-- Number of Events: long (nullable = true)
 |-- DATE: date (nullable = true)
 |-- HOUR: integer (nullable = true)
 |-- CIG: double (nullable = true)
 |-- WND: double (nullable = true)
 |-- VIS: double (nullable = true)
 |-- TMP: double (nullable = true)
 |-- DEW: double (nullable = true)
 |-- SLP: double (nullable = true)



In [61]:
# The date/hour columns carried over from the event and weather tables
# duplicate the information in `datetime`, so remove them.
redundant_cols = ['Start Date', 'Start Hour', 'DATE', 'HOUR']
final_df = final_df.drop(*redundant_cols)

## Preprocessing

### Handling missing data
We assume that no events occurred when the number of events is null.


In [62]:
# A missing event count means no event matched in the join, so treat it as 0
final_df = final_df.na.fill({'Number of Events': 0})

In [63]:
# Forward-fill missing weather readings within each day.
# The frame runs from the first row of the day up to the current row, ordered
# by hour, so each null takes the most recent non-null value seen earlier that
# day. Window.unboundedPreceding / Window.currentRow replace the previous
# `-sys.maxsize` bound, which relied on `sys` without importing it.
# Values are not carried across days: a null before the day's first reading
# stays null.
WEATHER_COLS = ["CIG", "WND", "VIS", "TMP", "DEW", "SLP"]

window_spec = (
    Window.partitionBy("pickup_date")
    .orderBy("pickup_hour")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Apply the same forward fill to every weather column
for weather_col in WEATHER_COLS:
    final_df = final_df.withColumn(
        weather_col,
        F.last(F.col(weather_col), ignorenulls=True).over(window_spec),
    )

### Handling error values

In [64]:
# Inspect the pickup date range to spot dates outside the study period
final_df.agg(
    F.min('pickup_date'),
    F.max('pickup_date'),
).show()

+----------------+----------------+
|min(pickup_date)|max(pickup_date)|
+----------------+----------------+
|      2023-06-30|      2024-01-03|
+----------------+----------------+



In [65]:
# Keep only pickups from 2023-07-01 to 2023-12-31 (both bounds inclusive)
pickup_day = F.col("pickup_date")
final_df = final_df.where(
    (pickup_day >= "2023-07-01") & (pickup_day <= "2023-12-31")
)

In [66]:
# Confirm the date filter: min and max pickup date should now lie inside the study period
final_df.agg(
    F.min('pickup_date'),
    F.max('pickup_date'),
).show()

+----------------+----------------+
|min(pickup_date)|max(pickup_date)|
+----------------+----------------+
|      2023-07-01|      2023-12-31|
+----------------+----------------+



In [67]:
# Check the schema of the cleaned dataframe before exporting it
final_df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- hourly_trip_count: long (nullable = false)
 |-- Number of Events: long (nullable = true)
 |-- CIG: double (nullable = true)
 |-- WND: double (nullable = true)
 |-- VIS: double (nullable = true)
 |-- TMP: double (nullable = true)
 |-- DEW: double (nullable = true)
 |-- SLP: double (nullable = true)



## Export file to curated folder

In [None]:
# Save the merged data to the curated folder, replacing any previous export.
# Without an explicit save mode Spark raises an error when the target path
# already exists, so re-running the notebook would fail here.
final_df.write.mode("overwrite").parquet("../data/curated/tlc_data/first_cleaned.parquet")