## Creating dataframes 

In [None]:
import pandas as pd

# Ride wait-time readings.
# Columns: Land, Ride, Wait Time, Local Time, Day of Week
wait_times_df = pd.read_csv(
    'disney_wait_times.csv',
    parse_dates=['Local Time'],
)

# Daily park operating hours, stored as text ranges.
# Columns: date, hours_DL, hours_DCA
hours_df = pd.read_csv(
    'disneyland_hours_2024_2025.csv',
    parse_dates=['date'],
)

# Per-date tier.
# Columns: date, tier
tiers_df = pd.read_csv(
    'disneyland_tiers_2024_2025.csv',
    parse_dates=['date'],
)

# Per-date holiday indicator.
# Columns: date, holiday_flag, holiday_name
holidays_df = pd.read_csv(
    'holidays_2024_2025.csv',
    parse_dates=['date'],
)

# Weather observations; the first two lines of the file are skipped
# (presumably a metadata preamble before the header row — verify against the file).
# Columns: time, temperature_2m (°C), relative_humidity_2m (%), dew_point_2m (°C),
#          apparent_temperature (°C), precipitation (mm), rain (mm), wind_speed_10m (km/h),
#          wind_speed_80m (km/h), wind_gusts_10m (km/h), is_day (), lightning_potential (J/kg)
weather_df = pd.read_csv(
    'weather_data.csv',
    skiprows=2,
    parse_dates=['time'],
)

# Quick sanity report: one (rows, columns) shape per loaded frame.
print("Data loaded successfully!")
print(f"\nWait times shape: {wait_times_df.shape}")
print(f"Hours shape: {hours_df.shape}")
print(f"Tiers shape: {tiers_df.shape}")
print(f"Holidays shape: {holidays_df.shape}")
print(f"Weather shape: {weather_df.shape}")


Data loaded successfully!

Wait times shape: (653200, 5)
Hours shape: (399, 3)
Tiers shape: (365, 2)
Holidays shape: (366, 3)
Weather shape: (32160, 12)


## Taking a look at the data before removing closed hours

In [5]:
# Visual divider printed between the sections of this report.
section_break = "\n" + "="*80

# Park hours: first rows plus column dtypes.
print("Hours Data Sample:")
print(hours_df.head())
print(f"\nHours Data Types:\n{hours_df.dtypes}")

# Wait times: first rows plus column dtypes.
print(section_break)
print("\nWait Times Data Sample:")
print(wait_times_df.head())
print(f"\nWait Times Data Types:\n{wait_times_df.dtypes}")

# Distinct values of the Land column.
print(section_break)
print(f"\nUnique lands in wait times: {wait_times_df['Land'].unique()}")

Hours Data Sample:
                       date          hours_DL         hours_DCA
0 2024-06-30 00:00:00+00:00  8:00am - 12:00am  8:00am - 10:00pm
1 2024-07-01 00:00:00+00:00  8:00am - 12:00am  8:00am - 10:00pm
2 2024-07-02 00:00:00+00:00  8:00am - 12:00am  8:00am - 10:00pm
3 2024-07-03 00:00:00+00:00  8:00am - 12:00am  8:00am - 10:00pm
4 2024-07-04 00:00:00+00:00  8:00am - 12:00am  8:00am - 10:00pm

Hours Data Types:
date         datetime64[ns, UTC]
hours_DL                  object
hours_DCA                 object
dtype: object


Wait Times Data Sample:
                 Land                                               Ride  \
0     Avengers Campus       Guardians of the Galaxy - Mission: BREAKOUT!   
1     Avengers Campus      Guardians of the Galaxy - Monsters After Dark   
2     Avengers Campus               WEB SLINGERS: A Spider-Man Adventure   
3     Avengers Campus  WEB SLINGERS: A Spider-Man Adventure Single Rider   
4  Buena Vista Street                                    Re

## Removing rows recorded outside Disney California Adventure (DCA) operating hours

In [18]:
import re
from datetime import datetime, time

# Re-parse the timestamps as timezone-aware UTC values.
# NOTE(review): this treats the "Local Time" column as UTC instants — confirm
# that matches how the CSV was written.
wait_times_df['Local Time'] = pd.to_datetime(wait_times_df['Local Time'], utc=True)

# Calendar date of each reading in Los Angeles time, as a naive midnight
# timestamp; used further down as the key for joining the park hours.
wait_times_df['date'] = (
    wait_times_df['Local Time']
    .dt.tz_convert('America/Los_Angeles')
    .dt.tz_localize(None)
    .dt.normalize()
)

def _to_24_hour(hour, period):
    """Convert a 12-hour clock hour and its 'am'/'pm' marker to a 0-23 hour."""
    if period == 'pm' and hour != 12:
        return hour + 12
    if period == 'am' and hour == 12:
        return 0
    return hour


def parse_hours(hours_str):
    """Parse an operating-hours string such as '8:00am - 10:00pm'.

    Parameters
    ----------
    hours_str : str or missing value
        Text containing an opening and a closing time in 12-hour
        'H:MMam - H:MMpm' form. The am/pm markers are matched
        case-insensitively.

    Returns
    -------
    tuple
        ``(open_time, close_time)`` as ``datetime.time`` objects, or
        ``(None, None)`` when the input is missing, empty, or does not
        contain a recognisable time range.

    Notes
    -----
    A close at or shortly after midnight ('12:00am', '12:30am', '1:00am')
    is returned as the corresponding early-morning time, e.g. ``time(0, 0)``.
    ``datetime.time`` only accepts hours 0-23, so hour 24 cannot be used to
    mean "end of day". Callers should treat a closing time with hour 0, or
    one earlier than the opening time, as falling on the following day;
    ``is_within_hours`` in this notebook does exactly that.
    """
    if pd.isna(hours_str) or hours_str == '':
        return None, None

    match = re.search(
        r'(\d+):(\d+)(am|pm)\s*-\s*(\d+):(\d+)(am|pm)',
        hours_str,
        flags=re.IGNORECASE,
    )
    if not match:
        return None, None

    open_hour = _to_24_hour(int(match.group(1)), match.group(3).lower())
    open_min = int(match.group(2))

    close_hour = _to_24_hour(int(match.group(4)), match.group(6).lower())
    close_min = int(match.group(5))

    return time(open_hour, open_min), time(close_hour, close_min)

# Split each DCA hours string into separate opening / closing time columns.
hours_df[['dca_open', 'dca_close']] = hours_df['hours_DCA'].apply(
    lambda raw_hours: pd.Series(parse_hours(raw_hours))
)

print("Sample of parsed hours:")
print(hours_df[['date', 'hours_DCA', 'dca_open', 'dca_close']].head(10))

# Timezone-naive copy of the date so it can be matched against the naive
# 'date' column on the wait-times frame.
hours_df['date_merge'] = hours_df['date'].dt.tz_localize(None)

# Left join: every wait-time reading is kept and gains that date's DCA hours
# (missing where the hours table has no row for the date).
dca_hours_by_date = hours_df[['date_merge', 'dca_open', 'dca_close']]
wait_times_with_hours = wait_times_df.merge(
    dca_hours_by_date,
    how='left',
    left_on='date',
    right_on='date_merge',
)

# Row counts before and after the join, plus how many rows found hours.
print(f"\nBefore merge: {len(wait_times_df)} records")
print(f"After merge: {len(wait_times_with_hours)} records")
print(f"Records with hours data: {wait_times_with_hours['dca_open'].notna().sum()}")

def is_within_hours(row):
    """Return True when a reading's Los Angeles clock time is inside park hours.

    ``row`` must provide 'Local Time' (a timezone-aware timestamp) together
    with 'dca_open' and 'dca_close' (``datetime.time`` values or missing).
    Rows lacking either bound are reported as outside operating hours.

    A closing time whose hour is 0, or that is earlier than the opening time,
    is treated as running past midnight: the reading counts as inside when it
    is at/after opening or at/before closing.

    NOTE(review): in this notebook each reading is joined to the hours of its
    own calendar date, so an after-midnight reading is compared against that
    day's hours rather than the previous evening's — confirm this is intended.
    """
    opening = row['dca_open']
    if pd.isna(opening):
        return False

    closing = row['dca_close']
    if pd.isna(closing):
        return False

    clock = row['Local Time'].tz_convert('America/Los_Angeles').time()

    runs_past_midnight = closing.hour == 0 or closing < opening
    if not runs_past_midnight:
        return opening <= clock <= closing

    return clock >= opening or clock <= closing

# Row-wise check of every reading against its date's DCA hours.
within_hours_mask = wait_times_with_hours.apply(is_within_hours, axis=1)

# Keep only in-hours readings, then discard the helper columns that were
# added solely for the join and the comparison.
wait_times_filtered = (
    wait_times_with_hours[within_hours_mask]
    .copy()
    .drop(columns=['dca_open', 'dca_close', 'date', 'date_merge'])
)

# Summary of how much data the filter removed.
print(f"\nOriginal wait times records: {len(wait_times_df):,}")
print(f"After filtering to DCA operating hours: {len(wait_times_filtered):,}")
print(f"Records removed: {len(wait_times_df) - len(wait_times_filtered):,}")
print(f"Percentage kept: {100 * len(wait_times_filtered) / len(wait_times_df):.1f}%")

# NOTE: this rebinds wait_times_df to the filtered frame, so re-running the
# cell without reloading the CSV starts from already-filtered data and the
# "Original" count above no longer refers to the raw file.
wait_times_df = wait_times_filtered

wait_times_df.tail()

Sample of parsed hours:
                       date         hours_DCA  dca_open dca_close
0 2024-06-30 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
1 2024-07-01 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
2 2024-07-02 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
3 2024-07-03 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
4 2024-07-04 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
5 2024-07-05 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
6 2024-07-06 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
7 2024-07-07 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
8 2024-07-08 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00
9 2024-07-09 00:00:00+00:00  8:00am - 10:00pm  08:00:00  22:00:00

Before merge: 589985 records
After merge: 589985 records
Records with hours data: 589985

Original wait times records: 589,985
After filtering to DCA operating hours: 589,985
Records removed: 0
Percentage kept: 100.0%

Original wait times records: 5

Unnamed: 0,Land,Ride,Wait Time,Local Time,Day of Week
589980,Pixar Pier,Jumpin' Jellyfish,20,2025-06-11 23:15:03.929766+00:00,Wednesday
589981,Pixar Pier,Pixar Pal-A-Round - Swinging,30,2025-06-11 23:15:03.929766+00:00,Wednesday
589982,Pixar Pier,Pixar Pal-A-Round – Non-Swinging,35,2025-06-11 23:15:03.929766+00:00,Wednesday
589983,Pixar Pier,Toy Story Midway Mania!,65,2025-06-11 23:15:03.929766+00:00,Wednesday
589984,San Fransokyo Square,The Bakery Tour,0,2025-06-11 23:15:03.929766+00:00,Wednesday
