In [14]:
import pandas as pd 
import glob
import os 
# Load the most recently written LA events CSV from the raw-data folder.
# NOTE(review): the original comment said these CSVs are generated by the
# Flight_ETL notebook, yet the folder holds events data -- confirm which
# notebook actually writes them.
data= "events_data/LA_events/Raw/"
csv_files = glob.glob(os.path.join(data, "events_LA_*.csv"))

# max() on an empty list fails with "max() arg is an empty sequence", which
# says nothing about the cause; fail early and name the folder instead.
if not csv_files:
    raise FileNotFoundError(
        f"No files matching 'events_LA_*.csv' found in {data!r}; "
        "run the upstream ETL notebook first."
    )

# "Latest" is decided by file modification time, not by any date in the name.
latest_file = max(csv_files, key=os.path.getmtime)

Events_df = pd.read_csv(latest_file)
Events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 77 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   access_method                    0 non-null      float64
 1   announce_date                    68 non-null     object 
 2   conditional                      68 non-null     bool   
 3   contingent                       68 non-null     bool   
 4   created_at                       68 non-null     object 
 5   date_tbd                         68 non-null     bool   
 6   datetime_local                   68 non-null     object 
 7   datetime_tbd                     68 non-null     bool   
 8   datetime_utc                     68 non-null     object 
 9   description                      0 non-null      float64
 10  onsale                           0 non-null      float64
 11  enddatetime_utc                  68 non-null     object 
 12  game_number             

In [15]:
# Keep only the id, timing, popularity/score, type and venue columns that are
# carried through the rest of the notebook.
# .copy() makes the subset an independent frame, so the column assignments in
# the following cells cannot trigger SettingWithCopyWarning.
Events_df = Events_df[[
    'id',
    'datetime_utc',
    'enddatetime_utc',
    'short_title',
    'popularity',
    'score',
    'type',
    'venue.name',
    'venue.postal_code',
    'venue.address',
    'venue.location.lat',
    'venue.location.lon',
]].copy()
Events_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  68 non-null     int64  
 1   datetime_utc        68 non-null     object 
 2   enddatetime_utc     68 non-null     object 
 3   short_title         68 non-null     object 
 4   popularity          68 non-null     float64
 5   score               68 non-null     float64
 6   type                68 non-null     object 
 7   venue.name          68 non-null     object 
 8   venue.postal_code   66 non-null     float64
 9   venue.address       68 non-null     object 
 10  venue.location.lat  68 non-null     float64
 11  venue.location.lon  68 non-null     float64
dtypes: float64(5), int64(1), object(6)
memory usage: 6.5+ KB


In [16]:
# Parse the start and end timestamp columns into timezone-aware UTC datetimes.
for utc_column in ('datetime_utc', 'enddatetime_utc'):
    Events_df[utc_column] = pd.to_datetime(Events_df[utc_column], utc=True)

In [17]:
# Express the event start and end in Los Angeles local time.
start_utc = Events_df['datetime_utc']
end_utc = Events_df['enddatetime_utc']

Events_df['pacific_datetime'] = start_utc.dt.tz_convert('America/Los_Angeles')
Events_df['pacific_endtime'] = end_utc.dt.tz_convert('America/Los_Angeles')

In [18]:
# Local (Pacific) calendar date as a timezone-naive datetime at midnight.
# Taken straight from the timestamp: dropping the timezone keeps the local
# wall-clock time and normalize() zeroes the time of day. This replaces the
# previous format-to-'MM/DD/YYYY'-then-reparse round trip, which depended on
# format inference and silently coerced any parse failure to NaT.
Events_df['date'] = (
    Events_df['pacific_datetime'].dt.tz_localize(None).dt.normalize()
)

# Human-readable 12-hour clock strings, e.g. "07:10 PM".
Events_df['Start Time'] = Events_df['pacific_datetime'].dt.strftime('%I:%M %p')
Events_df['End Time'] = Events_df['pacific_endtime'].dt.strftime('%I:%M %p')

# Weekday name of the local start date, e.g. "Friday".
Events_df['Day of the week'] = Events_df['date'].dt.day_name()

In [19]:
# Drop the timestamp columns that have already been broken out into
# date / Start Time / End Time, then give the remaining columns readable names.
# (A stray `Events_df.head()` used to sit at the top of this cell; because it
# was not the last expression its result was discarded and nothing was shown.)
# NOTE(review): 'pacific_endtime' is kept while 'pacific_datetime' is dropped --
# confirm that this asymmetry is intended.
Events_df = (
    Events_df
    .drop(columns=['datetime_utc', 'enddatetime_utc', 'pacific_datetime'])
    .rename(columns={
        'venue.name': 'Name of Venue',
        'venue.postal_code': 'Zipcode',
        'venue.address': 'address',
        'venue.location.lat': 'latitude',
        'venue.location.lon': 'longitude',
        'short_title': 'Event Name',
    })
)

In [20]:
import numpy as np
# Add radian versions of the degree coordinates; these are the columns fed to
# the haversine-metric clustering further down.
for degree_col, radian_col in (('latitude', 'lat_rad'), ('longitude', 'lon_rad')):
    Events_df[radian_col] = np.deg2rad(Events_df[degree_col])


In [21]:
from sklearn.cluster import DBSCAN


# Mean Earth radius (km), used to turn a ground distance into an angle.
EARTH_RADIUS_KM = 6371.0088

# Clustering knobs.
eps_km = 3        # neighbourhood radius in kilometres
min_samples = 2   # points required to form a cluster

# Feature matrix: one row per event, columns are (latitude, longitude) in radians.
X = Events_df[['lat_rad', 'lon_rad']].to_numpy()

# The haversine metric measures distance as an angle on the unit sphere, so the
# kilometre radius is divided by the Earth radius to express eps in radians.
eps_rad = eps_km / EARTH_RADIUS_KM
db = DBSCAN(eps=eps_rad, min_samples=min_samples, metric='haversine')

# fit_predict fits `db` in place and returns its labels_ array.
Events_df['Cluster'] = db.fit_predict(X)

In [22]:
# Summary table: number of events per (cluster, weekday) pair. Not needed for
# the relabelling below, but still defined in case later cells refer to it.
cluster_day_counts = (
    Events_df.groupby(['Cluster', 'Day of the week']).size().reset_index(name='count')
)

# Same count, aligned row-for-row with Events_df. Using transform('size')
# replaces the merge / row-wise apply / in-place drop sequence: no temporary
# 'count' column is added to the frame (so an existing column of that name
# cannot collide), and the comparison is vectorised.
events_per_cluster_day = (
    Events_df.groupby(['Cluster', 'Day of the week'])['Cluster'].transform('size')
)

# An event that is the only one in its cluster on a given weekday is relabelled
# -1; every other row keeps its existing cluster label.
Events_df['Cluster'] = Events_df['Cluster'].mask(events_per_cluster_day == 1, -1)

In [23]:
# Distinct cluster labels, in order of first appearance.
cluster_labels = Events_df['Cluster']
unique_clusters = cluster_labels.unique()
print(unique_clusters)

# Number of events per label, ordered by label value.
cluster_counts = cluster_labels.value_counts().sort_index()
print("\nCluster distribution:", cluster_counts, sep="\n")



[ 0  1 -1  3]

Cluster distribution:
Cluster
-1    10
 0    35
 1    21
 3     2
Name: count, dtype: int64


In [24]:
# Preview the first rows of the frame after the cluster labels were adjusted.
Events_df.head()

Unnamed: 0,id,Event Name,popularity,score,type,Name of Venue,Zipcode,address,latitude,longitude,pacific_endtime,date,Start Time,End Time,Day of the week,lat_rad,lon_rad,Cluster
0,17019632,Blue Jays at Dodgers,0.916764,0.766696,mlb,Dodger Stadium,90012.0,1000 Elysian Park Avenue,34.0718,-118.246,2025-08-08 20:40:00-07:00,2025-08-08,07:10 PM,08:40 PM,Friday,0.594665,-2.063782,0
1,17019635,Blue Jays at Dodgers,0.920552,0.782294,mlb,Dodger Stadium,90012.0,1000 Elysian Park Avenue,34.0718,-118.246,2025-08-09 19:40:00-07:00,2025-08-09,06:10 PM,07:40 PM,Saturday,0.594665,-2.063782,0
2,17019637,Blue Jays at Dodgers,0.902473,0.742398,mlb,Dodger Stadium,90012.0,1000 Elysian Park Avenue,34.0718,-118.246,2025-08-10 14:40:00-07:00,2025-08-10,01:10 PM,02:40 PM,Sunday,0.594665,-2.063782,0
3,17290017,Connecticut Sun at Los Angeles Sparks,0.76963,0.509183,wnba,Crypto.com Arena,90015.0,1111 South Figueroa Street,34.043,-118.267,2025-08-07 20:00:00-07:00,2025-08-07,07:00 PM,08:00 PM,Thursday,0.594162,-2.064149,0
4,17122657,Anuel AA (Rescheduled from 12/13/24),0.826789,0.608715,concert,Crypto.com Arena,90015.0,1111 South Figueroa Street,34.043,-118.267,2025-08-09 21:30:00-07:00,2025-08-09,08:00 PM,09:30 PM,Saturday,0.594162,-2.064149,0


In [25]:
# Show column names, non-null counts and dtypes of the frame before it is saved.
Events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype                              
---  ------           --------------  -----                              
 0   id               68 non-null     int64                              
 1   Event Name       68 non-null     object                             
 2   popularity       68 non-null     float64                            
 3   score            68 non-null     float64                            
 4   type             68 non-null     object                             
 5   Name of Venue    68 non-null     object                             
 6   Zipcode          66 non-null     float64                            
 7   address          68 non-null     object                             
 8   latitude         68 non-null     float64                            
 9   longitude        68 non-null     float64                            
 10  paci

In [26]:
from datetime import datetime, timedelta 
# Date stamp (YYYYMMDD) for the output file name; a second run on the same day
# resolves to the same path and overwrites the earlier file.
timestamp = datetime.now().strftime('%Y%m%d')

# NOTE(review): the variable is called raw_path but points at the "modeled"
# folder; the name is kept in case other cells refer to it.
raw_path = f"events_data/LA_events/modeled/event_clustering_LA{timestamp}.parquet"

# to_parquet does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
Events_df.to_parquet(raw_path, index=False)