# Preprocessing Event Data
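This notebook preprocesses the NYC Permitted Event Information (historical) dataset: it keeps the relevant columns, restricts events to 1 December 2023 – 31 May 2024, removes missing values and duplicates, derives date and hour features, and exports the result as Parquet.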

In [29]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [30]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session.
# Settings are collected in one mapping so they can be read and tuned in one place.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,      # eager evaluation in the REPL/notebook
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",       # session time zone is UTC
    "spark.driver.memory": "8g",                   # 8 GB for the driver
    "spark.executor.memory": "8g",                 # 8 GB for each executor
}

session_builder = SparkSession.builder.appName("ADS Project1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


In [31]:
# Use gzip as the compression codec for Parquet files written by this session.
parquet_codec = "gzip"
spark.conf.set("spark.sql.parquet.compression.codec", parquet_codec)

In [32]:
# Location of the raw event CSV in the landing zone
csv_file_path = "../data/landing/external/NYC_Permitted_Event_Information_-_Historical.csv"

# Load the CSV: the first row holds the column names, and column types are inferred from the data
edf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_file_path)
)

                                                                                

In [33]:
# Print the schema of the loaded events DataFrame (column names, types, nullability)
# as a sanity check before any columns are selected or converted.
edf.printSchema()

root
 |-- Event ID: integer (nullable = true)
 |-- Event Name: string (nullable = true)
 |-- Start Date/Time: string (nullable = true)
 |-- End Date/Time: string (nullable = true)
 |-- Event Agency: string (nullable = true)
 |-- Event Type: string (nullable = true)
 |-- Event Borough: string (nullable = true)
 |-- Event Location: string (nullable = true)
 |-- Event Street Side: string (nullable = true)
 |-- Street Closure Type: string (nullable = true)
 |-- Community Board: string (nullable = true)
 |-- Police Precinct: string (nullable = true)



In [34]:
# Keep only the columns needed downstream: identifier, type, time span, and place
kept_columns = [
    "Event ID",
    "Event Type",
    "Start Date/Time",
    "End Date/Time",
    "Event Borough",
    "Event Location",
]
edf = edf.select(*kept_columns)

In [35]:
from pyspark.sql.functions import col, to_timestamp, lit

# Parse the start/end columns into timestamps.
# The pattern describes month/day/year followed by a 12-hour clock time with an AM/PM marker.
raw_timestamp_format = "MM/dd/yyyy hh:mm:ss a"
for timestamp_column in ("Start Date/Time", "End Date/Time"):
    edf = edf.withColumn(
        timestamp_column,
        F.to_timestamp(F.col(timestamp_column), raw_timestamp_format),
    )

# Study window boundaries (inclusive), written as "yyyy-MM-dd HH:mm:ss" strings
start_date = "2023-12-01 00:00:00"
end_date = "2024-05-31 23:59:59"

# Turn the boundary strings into timestamp column expressions for comparison
start_timestamp = F.to_timestamp(F.lit(start_date), "yyyy-MM-dd HH:mm:ss")
end_timestamp = F.to_timestamp(F.lit(end_date), "yyyy-MM-dd HH:mm:ss")

# Keep events that start on/after the window start AND end on/before the window end
starts_in_window = F.col("Start Date/Time") >= start_timestamp
ends_in_window = F.col("End Date/Time") <= end_timestamp
edf_filtered = edf.filter(starts_in_window & ends_in_window)

## Missing values and duplicates

In [36]:
# Remove rows containing any null value, then remove fully duplicated rows.
# A prior check found no missing values, so dropna() acts as a safeguard here.
edf_filtered = (
    edf_filtered
    .dropna()
    .dropDuplicates()
)

## Feature Engineering

### Temporal Features

In [37]:
# For each timestamp column, derive an hour-of-day column and a calendar-date column.
# Each entry is (source timestamp column, new hour column, new date column).
timestamp_parts = [
    ("Start Date/Time", "Start Hour", "Start Date"),
    ("End Date/Time", "End Hour", "End Date"),
]
for source_column, hour_column, date_column in timestamp_parts:
    edf_filtered = (
        edf_filtered
        .withColumn(hour_column, F.hour(source_column))
        .withColumn(date_column, F.to_date(source_column))
    )

# The original timestamp columns are no longer needed once the parts are extracted
edf_filtered = edf_filtered.drop(*(source for source, _, _ in timestamp_parts))

In [38]:
# Put the columns in their final order: identifier, time span, then descriptive fields
final_column_order = [
    "Event ID",
    "Start Date",
    "Start Hour",
    "End Date",
    "End Hour",
    "Event Type",
    "Event Borough",
    "Event Location",
]
edf_filtered = edf_filtered.select(*final_column_order)

In [27]:
# Preview the first 5 rows of the preprocessed events.
# Long string values (e.g. Event Location) are truncated in the printed table.
edf_filtered.show(5)



CodeCache: size=131072Kb used=35542Kb max_used=35542Kb free=95529Kb
 bounds [0x00000001081e8000, 0x000000010a4e8000, 0x00000001101e8000]
 total_blobs=13201 nmethods=12261 adapters=851
 compilation: disabled (not enough contiguous free space left)




+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|      Event Location|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  684438|2023-12-09|         7|2023-12-09|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  684437|2023-12-02|         7|2023-12-02|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  684438|2023-12-09|         7|2023-12-09|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  684437|2023-12-02|         7|2023-12-02|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
|  684438|2023-12-09|         7|2023-12-09|      10|Special Event|     Brooklyn|Prospect Park: Pi...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



                                                                                

## Export to raw folder

In [39]:
# Write the preprocessed events to the raw zone as Parquet, replacing any existing output
raw_output_path = "../data/raw/NYC_Permitted_Event_Information_Historical.parquet"
edf_filtered.write.parquet(raw_output_path, mode="overwrite")

                                                                                