# Pre-processing_final_2
This notebook merges 3 preprocessed datasets and prepares for EDA_daily

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 

In [3]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)

24/08/30 21:06:29 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 100.94.176.147 instead (on interface en0)
24/08/30 21:06:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 21:06:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/30 21:06:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Datasets

In [4]:
# read the data from preprocessed tlc data
tdf = spark.read.parquet("../data/raw/tlc_df.parquet") 

                                                                                

In [5]:
# read the preprocessed weather data
wdf = spark.read.csv("../data/raw/NYC_weather_raw.csv", header=True, inferSchema=True)

In [6]:
# read the preprocessed event data
edf = spark.read.parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet") 

## Aggregation

In [7]:
# Load the zones DataFrame
zones = spark.read.csv("../data/landing/external/taxi_zones.csv", header=True, inferSchema=True)

In [8]:
# Join the tdf DataFrame with zones to get the borough for PULocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'PUBorough'),
               tdf['PULocationID'] == zones['LocationID'], 'left').drop('LocationID')

# Join the tdf DataFrame with zones to get the borough for DOLocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'DOBorough'),
               tdf['DOLocationID'] == zones['LocationID'], 'left').drop('LocationID')

In [9]:
# Aggregating the data for daily analysis
tdf_daily_agg = tdf.groupBy(['pickup_date', "PUBorough"]).agg({
    '*': 'count',
}).withColumnRenamed('count(1)', 'daily_trip_count')

In [10]:
# Convert DATE to Date format if it's not already
wdf = wdf.withColumn('date', col('DATE').cast('date'))

# Aggregate weather data by date (averaging the values across the day)
wdf_daily_agg = wdf.groupBy('date').agg(
    avg("CIG").alias("avg_CIG"),
    avg("WND").alias("avg_WND"),
    avg("VIS").alias("avg_VIS"),
    avg("TMP").alias("avg_TMP"),
    avg("DEW").alias("avg_DEW"),
    avg("SLP").alias("avg_SLP")
)

In [11]:
# Aggregate the number of events per date and borough
edf_daily = edf.groupBy('Start Date', 'Event Borough').agg(count('Event ID').alias('Number of Events'))

## Merge

In [12]:
# Rename the date columns for consistency
tdf_daily_agg = tdf_daily_agg.withColumnRenamed("pickup_date", "date")
edf_daily = edf_daily.withColumnRenamed("Start Date", "date")

In [13]:
# Merge tdf_daily_agg with edf_daily on date and PUBorough
tdf_edf_daily = tdf_daily_agg.join(
    edf_daily.withColumnRenamed("Event Borough", "PUBorough"),
    on=["date", "PUBorough"], 
    how="left"
)

# Merge the result with wdf_daily_agg on date only (since weather data is borough-independent)
final_daily_df = tdf_edf_daily.join(
    wdf_daily_agg,
    on="date",
    how="left"
)

## Preprocessing

In [14]:
# Impute missing values in Number of Events column with 0
final_df = final_daily_df.fillna(0, subset=['Number of Events'])

In [15]:
# Define a window specification with partitioning by date and PUBorough
window_spec = Window.partitionBy("date", "PUBorough").orderBy("date").rowsBetween(-sys.maxsize, 0)

In [16]:
# Apply forward fill to the missing weather columns for daily analysis
final_df = final_df.withColumn("avg_CIG", last(col("avg_CIG"), ignorenulls=True).over(window_spec))
final_df = final_df.withColumn("avg_WND", last(col("avg_WND"), ignorenulls=True).over(window_spec))
final_df = final_df.withColumn("avg_VIS", last(col("avg_VIS"), ignorenulls=True).over(window_spec))
final_df = final_df.withColumn("avg_TMP", last(col("avg_TMP"), ignorenulls=True).over(window_spec))
final_df = final_df.withColumn("avg_DEW", last(col("avg_DEW"), ignorenulls=True).over(window_spec))
final_df = final_df.withColumn("avg_SLP", last(col("avg_SLP"), ignorenulls=True).over(window_spec))

In [17]:
# Only keep rows within the date range of the taxi data
final_df = final_df.filter(col("date").between("2023-07-01", "2023-12-31"))

In [18]:
# drop the null values and record % of remaining rows
total_rows = final_df.count()
final_df = final_df.dropna()
remaining_rows = final_df.count()

print(f"Remaining rows after dropping missing values: {remaining_rows} out of {total_rows}")

# Print percentage of rows remaining
print(f"Percentage of rows remaining: {remaining_rows / total_rows * 100:.2f}%")



CodeCache: size=131072Kb used=27591Kb max_used=27598Kb free=103480Kb
 bounds [0x000000010a9e8000, 0x000000010c508000, 0x00000001129e8000]
 total_blobs=10650 nmethods=9651 adapters=910
 compilation: disabled (not enough contiguous free space left)


                                                                                

Remaining rows after dropping missing values: 949 out of 1136
Percentage of rows remaining: 83.54%


In [19]:
# check schema
final_df.printSchema()

root
 |-- date: date (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- daily_trip_count: long (nullable = false)
 |-- Number of Events: long (nullable = true)
 |-- avg_CIG: double (nullable = true)
 |-- avg_WND: double (nullable = true)
 |-- avg_VIS: double (nullable = true)
 |-- avg_TMP: double (nullable = true)
 |-- avg_DEW: double (nullable = true)
 |-- avg_SLP: double (nullable = true)



In [21]:
# save the merged data and overwrite the previous one
final_df.write.parquet("../data/curated/merged_data/second_cleaned.parquet")

                                                                                

24/08/31 01:50:37 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 403025 ms exceeds timeout 120000 ms
24/08/31 01:50:37 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/31 01:50:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$