# Preprocessing Event Data
This notebook performs preprocessing for NYC event dataset

In [52]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [53]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)


In [54]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [55]:
# Path to your CSV file
csv_file_path = "../data/landing/external/NYC_Permitted_Event_Information_-_Historical.csv"

# Read the CSV file into a DataFrame
edf = spark.read.csv(csv_file_path, header=True, inferSchema=True)

                                                                                

In [56]:
# Show the DataFrame schema
edf.printSchema()

root
 |-- Event ID: integer (nullable = true)
 |-- Event Name: string (nullable = true)
 |-- Start Date/Time: string (nullable = true)
 |-- End Date/Time: string (nullable = true)
 |-- Event Agency: string (nullable = true)
 |-- Event Type: string (nullable = true)
 |-- Event Borough: string (nullable = true)
 |-- Event Location: string (nullable = true)
 |-- Event Street Side: string (nullable = true)
 |-- Street Closure Type: string (nullable = true)
 |-- Community Board: string (nullable = true)
 |-- Police Precinct: string (nullable = true)



In [57]:
# Select the columns: Event ID, Event Type, Start Date/Time, End Date/Time, Event Borough, Event Location
edf = edf.select("Event ID", "Event Type", "Start Date/Time", "End Date/Time", "Event Borough", "Event Location")

In [66]:
# Convert start_date and end_date strings to timestamp literals
start_timestamp = lit(start_date).cast("timestamp")
end_timestamp = lit(end_date).cast("timestamp")

# Filter the DataFrame to include only rows within the specified date range
edf_filtered = edf.filter(
    (col("Start Date/Time") >= start_timestamp) &
    (col("End Date/Time") <= end_timestamp)
)

# Display the number of rows before and after filtering
original_count = edf.count()
filtered_count = edf_filtered.count()

print(f"Original count: {original_count}")
print(f"Filtered count: {filtered_count}")




Original count: 23827581
Filtered count: 620618


                                                                                

In [67]:
edf.select("Start Date/Time", "End Date/Time").agg({"Start Date/Time": "min", "Start Date/Time": "max", "End Date/Time": "min", "End Date/Time": "max"}).show()




+-------------------+--------------------+
| max(End Date/Time)|max(Start Date/Time)|
+-------------------+--------------------+
|2026-12-31 23:00:00| 2024-12-31 13:30:00|
+-------------------+--------------------+



                                                                                

In [68]:
# check min start date
edf.select("Start Date/Time").agg({"Start Date/Time": "min"}).show()



+--------------------+
|min(Start Date/Time)|
+--------------------+
| 2008-01-02 00:00:00|
+--------------------+



                                                                                

## Missing values and duplicates

In [None]:
# Drop missing values and duplicates 
edf_filtered = edf_filtered.dropna()
edf_filtered = edf_filtered.dropDuplicates()

In [None]:
edf_filtered.count()

ERROR:root:KeyboardInterrupt while sending command.               (10 + 8) / 39]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

In [None]:
# check the percetge of data left
print(f"Percentage of data left: {edf_filtered.count()/edf.count()*100:.2f}%")

ERROR:root:KeyboardInterrupt while sending command.               (16 + 8) / 39]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 



## Feature Engineering

### Temporal Features

In [None]:
# Extract hour and date from Start Date/Time
edf_filtered = edf_filtered.withColumn("Start Hour", hour("Start Date/Time")) \
                           .withColumn("Start Date", to_date("Start Date/Time"))
# Extract hour and date from End Date/Time
edf_filtered = edf_filtered.withColumn("End Hour", hour("End Date/Time")) \
                           .withColumn("End Date", to_date("End Date/Time"))
# Drop the Start Date/Time and End Date/Time columns
edf_filtered = edf_filtered.drop("Start Date/Time", "End Date/Time")

In [None]:
# Rearrange the columns
edf_filtered = edf_filtered.select("Event ID",  "Start Date", "Start Hour", "End Date", "End Hour", "Event Type", "Event Borough", "Event Location"
                                  )

In [None]:
# Show the DataFrame 
edf_filtered.show(5)



+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|      Event Location|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  684437|2023-12-02|         7|2023-12-02|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  707751|2023-12-11|        12|2023-12-11|      13|Special Event|    Manhattan|Central Park: Lad...|
|  684680|2023-12-29|        13|2023-12-29|      15|Special Event|    Manhattan|Central Park: Wag...|
|  691714|2023-12-05|        13|2023-12-05|      15|Special Event|    Manhattan|Dag Hammarskjold ...|
|  694374|2023-12-05|        14|2023-12-05|      16|Special Event|    Manhattan|Central Park: Wag...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



                                                                                

## Export to raw folder

In [None]:
# Export the filtered DataFrame to a Parquet file in the raw zone
edf_filtered.write.mode("overwrite").parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet")

ERROR:root:KeyboardInterrupt while sending command.                (0 + 8) / 39]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

