# Preprocessing Event Data

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session, applying one setting per step.
session_builder = SparkSession.builder.appName("ADS Project1")

# Eager evaluation so DataFrames are rendered when they are a cell's last expression.
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)
# Parquet metadata caching flag (passed as the string "true", as in the original setup).
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")
# Use UTC as the session time zone for timestamp handling.
session_builder = session_builder.config("spark.sql.session.timeZone", "Etc/UTC")

# getOrCreate() returns an existing session if one is already running in this kernel.
spark = session_builder.getOrCreate()

24/08/26 18:05:58 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.12.200.32 instead (on interface en0)
24/08/26 18:05:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 18:05:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Set the Spark SQL Parquet compression codec to gzip on the existing `spark` session.
# NOTE(review): no Parquet write is visible in this part of the notebook; presumably
# this is for output saved by a later cell — confirm.
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [4]:
# Location of the raw permitted-events CSV, relative to this notebook
csv_file_path = "../data/landing/external/NYC_Permitted_Event_Information.csv"

# Load the CSV: the first row supplies the column names and column types are
# inferred from the data rather than declared up front.
edf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)

                                                                                

In [8]:
# Show the DataFrame schema
# The column types come from inferSchema=True on the CSV read; in the recorded output
# the two date/time columns are still plain strings at this stage.
edf.printSchema()

root
 |-- Event ID: integer (nullable = true)
 |-- Event Type: string (nullable = true)
 |-- Start Date/Time: string (nullable = true)
 |-- End Date/Time: string (nullable = true)
 |-- Event Borough: string (nullable = true)
 |-- Event Location: string (nullable = true)



In [10]:
# Show the first 5 rows
# show() truncates long cell values by default, hence the "..." in the printed table.
edf.show(5)

+--------+--------------+--------------------+--------------------+-------------+--------------------+
|Event ID|    Event Type|     Start Date/Time|       End Date/Time|Event Borough|      Event Location|
+--------+--------------+--------------------+--------------------+-------------+--------------------+
|  368421| Special Event|11/18/2017 07:00:...|11/18/2017 08:00:...|    Manhattan|Damrosch Park: Da...|
|  330050| Special Event|11/16/2017 08:00:...|11/16/2017 04:00:...|        Bronx|Mount Eden Malls:...|
|  314111|Farmers Market|11/21/2017 08:00:...|11/21/2017 05:00:...|    Manhattan| BROADWAY between...|
|  369850|  Construction|11/23/2017 12:00:...|11/23/2017 11:58:...|    Manhattan|Madison Square Pa...|
|  335783| Special Event|11/22/2017 09:00:...|11/22/2017 08:00:...|Staten Island|LaTourette Park &...|
+--------+--------------+--------------------+--------------------+-------------+--------------------+
only showing top 5 rows



In [11]:
# Keep only the event columns this notebook works with, in a fixed order.
# (Per the schema printed earlier these are all of the columns in the CSV.)
event_columns = [
    "Event ID",
    "Event Type",
    "Start Date/Time",
    "End Date/Time",
    "Event Borough",
    "Event Location",
]
edf = edf.select(*event_columns)

# Preview the first 5 rows of the selection
edf.show(n=5)


+--------+--------------+--------------------+--------------------+-------------+--------------------+
|Event ID|    Event Type|     Start Date/Time|       End Date/Time|Event Borough|      Event Location|
+--------+--------------+--------------------+--------------------+-------------+--------------------+
|  368421| Special Event|11/18/2017 07:00:...|11/18/2017 08:00:...|    Manhattan|Damrosch Park: Da...|
|  330050| Special Event|11/16/2017 08:00:...|11/16/2017 04:00:...|        Bronx|Mount Eden Malls:...|
|  314111|Farmers Market|11/21/2017 08:00:...|11/21/2017 05:00:...|    Manhattan| BROADWAY between...|
|  369850|  Construction|11/23/2017 12:00:...|11/23/2017 11:58:...|    Manhattan|Madison Square Pa...|
|  335783| Special Event|11/22/2017 09:00:...|11/22/2017 08:00:...|Staten Island|LaTourette Park &...|
+--------+--------------+--------------------+--------------------+-------------+--------------------+
only showing top 5 rows



In [12]:
from pyspark.sql.functions import col, to_timestamp, lit

# Parse both date/time columns from their raw string form into timestamps.
# The pattern is month/day/year followed by a 12-hour clock time with an AM/PM marker.
raw_event_format = "MM/dd/yyyy hh:mm:ss a"
for ts_column in ("Start Date/Time", "End Date/Time"):
    edf = edf.withColumn(ts_column, to_timestamp(col(ts_column), raw_event_format))

# Study window (inclusive on both ends), written as 24-hour timestamps
start_date = "2023-12-01 00:00:00"
end_date = "2024-05-31 23:59:59"

# Turn the window bounds into timestamp expressions for comparison
bound_format = "yyyy-MM-dd HH:mm:ss"
start_timestamp = to_timestamp(lit(start_date), bound_format)
end_timestamp = to_timestamp(lit(end_date), bound_format)

# Keep events that start on/after the window start AND end on/before the window end
starts_in_window = col("Start Date/Time") >= start_timestamp
ends_in_window = col("End Date/Time") <= end_timestamp
edf_filtered = edf.where(starts_in_window & ends_in_window)

# Preview the filtered events
edf_filtered.show(n=5)




+--------+-------------+-------------------+-------------------+-------------+--------------------+
|Event ID|   Event Type|    Start Date/Time|      End Date/Time|Event Borough|      Event Location|
+--------+-------------+-------------------+-------------------+-------------+--------------------+
|  684438|Special Event|2023-12-09 07:00:00|2023-12-09 10:00:00|     Brooklyn|Prospect Park: Pi...|
|  684437|Special Event|2023-12-02 07:00:00|2023-12-02 10:00:00|     Brooklyn|Prospect Park: Pi...|
|  684438|Special Event|2023-12-09 07:00:00|2023-12-09 10:00:00|     Brooklyn|Prospect Park: Pi...|
|  684437|Special Event|2023-12-02 07:00:00|2023-12-02 10:00:00|     Brooklyn|Prospect Park: Pi...|
|  684438|Special Event|2023-12-09 07:00:00|2023-12-09 10:00:00|     Brooklyn|Prospect Park: Pi...|
+--------+-------------+-------------------+-------------------+-------------+--------------------+
only showing top 5 rows



                                                                                

## Missing values 

In [13]:
# Check missing values in the date-filtered events (one NULL count per column)
edf_filtered.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in edf_filtered.columns]).show()

# The check above can never report NULLs in the two timestamp columns: edf_filtered
# was built with >= / <= comparisons on them, and a comparison against NULL is never
# true, so any row with a missing or unparseable timestamp was already dropped by
# the date filter. Count those rows on `edf` (timestamps parsed, not yet
# date-filtered) so they are surfaced instead of silently lost.
timestamp_columns = ["Start Date/Time", "End Date/Time"]
edf.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in timestamp_columns]).show()



+--------+----------+---------------+-------------+-------------+--------------+
|Event ID|Event Type|Start Date/Time|End Date/Time|Event Borough|Event Location|
+--------+----------+---------------+-------------+-------------+--------------+
|       0|         0|              0|            0|            0|             0|
+--------+----------+---------------+-------------+-------------+--------------+



                                                                                