# NYC Major Events Dataset (2022-2024)

Creating a clean dataset with 5 major NYC events for analysis.

In [43]:
# Import libraries
from sodapy import Socrata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [44]:
# Initialize API client
client = Socrata("data.cityofnewyork.us", None, timeout=120)

# Load Events Data for 2022-2024
print("ðŸ“¥ Loading Events Data (2022-2024)...")

events_results = client.get(
    "bkfu-528j",  # NYC Events dataset
    where="start_date_time >= '2022-01-01T00:00:00.000' AND start_date_time < '2025-01-01T00:00:00.000'",
    limit=100000
)



ðŸ“¥ Loading Events Data (2022-2024)...


In [45]:
import pandas as pd

df_events = pd.DataFrame.from_records(events_results)
print(f"âœ… Loaded {len(df_events):,} event records")

# Process dates
if "start_date_time" in df_events.columns:
    df_events["start_date_time"] = pd.to_datetime(df_events["start_date_time"], errors="coerce")
    
    # Show date range and year distribution
    print(f"ðŸ“… Date Range: {df_events['start_date_time'].min()} to {df_events['start_date_time'].max()}")
    print(f"ðŸ“Š Year Distribution:")
    event_year_counts = df_events['start_date_time'].dt.year.value_counts().sort_index()
    for year, count in event_year_counts.items():
        print(f"   â€¢ {year}: {count:,} events")

if "end_date_time" in df_events.columns:
    df_events["end_date_time"] = pd.to_datetime(df_events["end_date_time"], errors="coerce")

print(f"\nðŸŽ¯ Final Datasets:")
print(f"   â€¢ Arrests (Target Crimes): {len(df_arrests_filtered):,} records")
print(f"   â€¢ Events (All Types): {len(df_events):,} records")

âœ… Loaded 100,000 event records
ðŸ“… Date Range: 2022-01-01 00:00:00 to 2022-02-02 11:30:00
ðŸ“Š Year Distribution:
   â€¢ 2022: 100,000 events

ðŸŽ¯ Final Datasets:
   â€¢ Arrests (Target Crimes): 191,409 records
   â€¢ Events (All Types): 100,000 records


In [46]:
# Filter for 5 Major NYC Events
print("ðŸŽ¯ Filtering for 5 Major NYC Events...")

# Define 5 major events to track
major_events_keywords = {
    'US Open Tennis': ['us open', 'tennis'],
    'NYC Marathon': ['marathon', 'new york city marathon', 'tcs new york'],
    'Pride March': ['pride', 'lgbt', 'lgbtq'],
    'Thanksgiving Parade': ['thanksgiving', 'macy'],
    'New Year\'s Eve': ['new year', 'times square', 'ball drop']
}

# Function to check if event matches any major event
def is_major_event(event_name):
    if pd.isna(event_name):
        return None
    event_name_lower = event_name.lower()
    
    for event_category, keywords in major_events_keywords.items():
        if any(keyword in event_name_lower for keyword in keywords):
            return event_category
    return None

# Apply filter
df_events['major_event_category'] = df_events['event_name'].apply(is_major_event)
df_events_major = df_events[df_events['major_event_category'].notna()].copy()

print(f"âœ… Filtered to {len(df_events_major):,} events (from {len(df_events):,} total)")
print(f"\nðŸ“Š Events by Category:")
for event, count in df_events_major['major_event_category'].value_counts().items():
    print(f"   â€¢ {event}: {count:,} events")

ðŸŽ¯ Filtering for 5 Major NYC Events...
âœ… Filtered to 163 events (from 100,000 total)

ðŸ“Š Events by Category:
   â€¢ New Year's Eve: 115 events
   â€¢ NYC Marathon: 48 events
âœ… Filtered to 163 events (from 100,000 total)

ðŸ“Š Events by Category:
   â€¢ New Year's Eve: 115 events
   â€¢ NYC Marathon: 48 events


In [47]:
# Deduplicate: Keep only one event per category per day
print(f"\nðŸ”§ Deduplicating events...")
print(f"   Before: {len(df_events_major):,} events")

# Create date field (without time)
df_events_major['event_date'] = df_events_major['start_date_time'].dt.date

# Keep only first event per category per day
df_events_major = df_events_major.drop_duplicates(
    subset=['major_event_category', 'event_date'],
    keep='first'
).reset_index(drop=True)

print(f"   After: {len(df_events_major):,} unique events")
print(f"\nâœ… Final Event Counts:")
for event, count in df_events_major['major_event_category'].value_counts().items():
    print(f"   â€¢ {event}: {count:,} events")


ðŸ”§ Deduplicating events...
   Before: 163 events
   After: 16 unique events

âœ… Final Event Counts:
   â€¢ New Year's Eve: 12 events
   â€¢ NYC Marathon: 4 events


In [48]:
# Display Final DataFrame
print("\n" + "="*80)
print("ðŸ“‹ FINAL MAJOR EVENTS DATAFRAME")
print("="*80)
print(f"\nShape: {df_events_major.shape}")
print(f"Columns: {list(df_events_major.columns)}\n")

df_events_major


ðŸ“‹ FINAL MAJOR EVENTS DATAFRAME

Shape: (16, 14)
Columns: ['event_id', 'event_name', 'start_date_time', 'end_date_time', 'event_agency', 'event_type', 'event_borough', 'event_location', 'street_closure_type', 'community_board', 'police_precinct', 'event_street_side', 'major_event_category', 'event_date']



Unnamed: 0,event_id,event_name,start_date_time,end_date_time,event_agency,event_type,event_borough,event_location,street_closure_type,community_board,police_precinct,event_street_side,major_event_category,event_date
0,609882,New Year's Eve 2021-2022,2022-01-01 00:00:00,2022-01-01 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-01
1,609641,Sri Chinmoy New Year's Half Marathon,2022-01-01 08:00:00,2022-01-01 12:00:00,Parks Department,Special Event,Queens,Flushing Meadows Corona Park: Ederle Terrace,,81,110,,NYC Marathon,2022-01-01
2,609882,New Year's Eve 2021-2022,2022-01-02 00:00:00,2022-01-02 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-02
3,609882,New Year's Eve 2021-2022,2022-01-03 00:00:00,2022-01-03 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-03
4,609882,New Year's Eve 2021-2022,2022-01-04 00:00:00,2022-01-04 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-04
5,611955,CDP Marathon Protests,2022-01-04 13:30:00,2022-01-04 15:00:00,Parks Department,Special Event,Manhattan,Dag Hammarskjold Plaza: First Avenue Plaza,,6,17,,NYC Marathon,2022-01-04
6,609882,New Year's Eve 2021-2022,2022-01-05 00:00:00,2022-01-05 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-05
7,609882,New Year's Eve 2021-2022,2022-01-06 00:00:00,2022-01-06 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-06
8,609882,New Year's Eve 2021-2022,2022-01-07 00:00:00,2022-01-07 23:59:00,Parks Department,Special Event,Manhattan,Father Duffy Square: Father Duffy Square,,5,18,,New Year's Eve,2022-01-07
9,610003,NYCRUNS Frozen Penguin Half Marathon and 5K,2022-01-09 05:00:00,2022-01-09 12:30:00,Parks Department,Special Event,Manhattan,"Central Park: 102nd Street Transverse ,Central...",,64,22,,NYC Marathon,2022-01-09


In [49]:
# Save to CSV
output_file = "major_events_final.csv"
df_events_major.to_csv(output_file, index=False)
print(f"âœ… DataFrame saved to: {output_file}")
print(f"   Total events: {len(df_events_major):,}")

âœ… DataFrame saved to: major_events_final.csv
   Total events: 16
