# Preprocessing Event Data
This notebook performs preprocessing for NYC event dataset

In [13]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

[Stage 26:=>                                                       (1 + 8) / 39]

In [14]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)


In [15]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [16]:
# Path to your CSV file
csv_file_path = "../data/landing/external/NYC_Permitted_Event_Information_-_Historical.csv"

# Read the CSV file into a DataFrame
edf = spark.read.csv(csv_file_path, header=True, inferSchema=True)

                                                                                

In [17]:
# Show the DataFrame schema
edf.printSchema()

root
 |-- Event ID: integer (nullable = true)
 |-- Event Name: string (nullable = true)
 |-- Start Date/Time: string (nullable = true)
 |-- End Date/Time: string (nullable = true)
 |-- Event Agency: string (nullable = true)
 |-- Event Type: string (nullable = true)
 |-- Event Borough: string (nullable = true)
 |-- Event Location: string (nullable = true)
 |-- Event Street Side: string (nullable = true)
 |-- Street Closure Type: string (nullable = true)
 |-- Community Board: string (nullable = true)
 |-- Police Precinct: string (nullable = true)



In [18]:
# Select the columns: Event ID, Event Type, Start Date/Time, End Date/Time, Event Borough, Event Location
edf = edf.select("Event ID", "Event Type", "Start Date/Time", "End Date/Time", "Event Borough", "Event Location")

In [19]:
# convert the date columns to datetime
edf = edf.withColumn("Start Date/Time", to_timestamp("Start Date/Time", "MM/dd/yyyy hh:mm:ss a"))
edf = edf.withColumn("End Date/Time", to_timestamp("End Date/Time", "MM/dd/yyyy hh:mm:ss a"))

In [20]:
from pyspark.sql.functions import lit, col

# Define the start and end dates as strings
START_DATE = "2023-07-01"
END_DATE = "2023-12-31"

# Convert start_date and end_date strings to timestamp literals
start_timestamp = lit(START_DATE).cast("timestamp")
end_timestamp = lit(END_DATE).cast("timestamp")

# Filter the DataFrame to include only rows within the specified date range
edf_filtered = edf.filter(
    (col("Start Date/Time") >= start_timestamp) &
    (col("End Date/Time") <= end_timestamp)
)

# Display the number of rows before and after filtering
original_count = edf.count()
filtered_count = edf_filtered.count()

print(f"Original count: {original_count}")
print(f"Filtered count: {filtered_count}")




Original count: 23827581
Filtered count: 3340814


                                                                                

In [25]:
# check the percetge of data left
print(f"Percentage of data left: {edf_filtered.count()/edf.count()*100:.2f}%")



Percentage of data left: 0.66%


                                                                                

## Feature Engineering

### Temporal Features

In [26]:
# Extract hour and date from Start Date/Time
edf_filtered = edf_filtered.withColumn("Start Hour", hour("Start Date/Time")) \
                           .withColumn("Start Date", to_date("Start Date/Time"))
# Extract hour and date from End Date/Time
edf_filtered = edf_filtered.withColumn("End Hour", hour("End Date/Time")) \
                           .withColumn("End Date", to_date("End Date/Time"))
# Drop the Start Date/Time and End Date/Time columns
edf_filtered = edf_filtered.drop("Start Date/Time", "End Date/Time")

In [27]:
# Rearrange the columns
edf_filtered = edf_filtered.select("Event ID",  "Start Date", "Start Hour", "End Date", "End Hour", "Event Type", "Event Borough", "Event Location"
                                  )

In [28]:
# Show the DataFrame 
edf_filtered.show(5)



+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|      Event Location|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  682142|2023-07-08|         8|2023-07-08|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  679970|2023-07-29|        12|2023-07-29|      19|Special Event|     Brooklyn|Herbert Von King ...|
|  682137|2023-07-01|         8|2023-07-01|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  683055|2023-10-28|         8|2023-10-28|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
|  683191|2023-11-24|         8|2023-11-24|      23|Special Event|     Brooklyn|Prospect Park: Pi...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



                                                                                

## Export to raw folder

In [29]:
# Export the filtered DataFrame to a Parquet file in the raw zone
edf_filtered.write.mode("overwrite").parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet")

                                                                                