# Get and Flag Holidays

In [16]:
# Cell: generate a daily date range and flag US federal holidays
import pandas as pd
import holidays

dates = pd.date_range(start='2024-07-01', end='2025-07-01', freq='D')
df = pd.DataFrame({'date': dates})

us_holidays = holidays.US(years=range(2024, 2026))

df['holiday_flag'] = df['date'].isin(us_holidays).astype(int)
df['holiday_name'] = df['date'].map(us_holidays).fillna('')

print(df.head(15))


         date  holiday_flag      holiday_name
0  2024-07-01             0                  
1  2024-07-02             0                  
2  2024-07-03             0                  
3  2024-07-04             1  Independence Day
4  2024-07-05             0                  
5  2024-07-06             0                  
6  2024-07-07             0                  
7  2024-07-08             0                  
8  2024-07-09             0                  
9  2024-07-10             0                  
10 2024-07-11             0                  
11 2024-07-12             0                  
12 2024-07-13             0                  
13 2024-07-14             0                  
14 2024-07-15             0                  


  df['holiday_flag'] = df['date'].isin(us_holidays).astype(int)


In [11]:
# Cell: save the holidays dataframe to CSV
output_path = "../holidays_2024_2025.csv"
try:
    df.to_csv(output_path, index=False)
    print(f"Wrote CSV to: {output_path}")
except Exception as e:
    print("Failed to write CSV:", e)


Wrote CSV to: ../holidays_2024_2025.csv


# Get Disneyland Hours and Ticket Tiers

In [17]:
# Cell: download ThemeParkIQ Disneyland park hours page and save HTML
import requests

url = "https://www.themeparkiq.com/disneyland/park-hours"
headers = {"User-Agent": "ResearchBot/1.0 (email: your_email@example.com)"}
r = requests.get(url, headers=headers, timeout=15)
r.raise_for_status()
with open("themeparkiq_park_hours.html", "wb") as f:
    f.write(r.content)
print("Wrote themeparkiq_park_hours.html (size:", len(r.content), "bytes )")
print(r.text)


Wrote themeparkiq_park_hours.html (size: 808932 bytes )
<!doctype html>
<html lang="en">
<head>
    <!-- Load Inter font immediately to prevent font swapping -->
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=block">
    
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta charset="utf-8" />
<meta name="image" content="https://images.themeparkiq.com/pwa/themepark_og_icon.png" />
<meta property="og:image" content="https://images.themeparkiq.com/pwa/themepark_og_icon.png" />
<meta name="embedx:image" content="https://images.themeparkiq.com/pwa/themepark_og_icon.png" />
<meta name="copyright" content="Theme Park IQ" />
<meta name="mobile-web-app-capable" content="yes" />
<meta name="theme-color" content="#615fff" />
<meta property="og:site_name" content="Theme 

In [18]:
# Cell: parse the saved ThemeParkIQ HTML to extract day numbers and park hours
from bs4 import BeautifulSoup
import pandas as pd

with open("themeparkiq_park_hours.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

days = []
day_divs = soup.select("div.min-h-\\[80px\\].sm\\:min-h-\\[100px\\]")

for div in day_divs:
    date_tag = div.select_one("div.text-xs")
    if not date_tag:
        continue
    day_number = date_tag.text.strip()

    dl_tag = div.find("span", string=lambda t: t and "DL:" in t)
    dca_tag = div.find("span", string=lambda t: t and "DCA:" in t)

    dl_hours = None
    dca_hours = None

    if dl_tag and dl_tag.find_next("span"):
        dl_hours = dl_tag.find_next("span").text.strip()
    if dca_tag and dca_tag.find_next("span"):
        dca_hours = dca_tag.find_next("span").text.strip()

    days.append({
        "day": day_number,
        "dl_hours": dl_hours,
        "dca_hours": dca_hours
    })

df = pd.DataFrame(days)
df.to_csv("disneyland_hours_parsed.csv", index=False)
print(df)


   day  dl_hours dca_hours
0   30  8am-12am  8am-10pm
1    1  8am-11pm   8am-9pm
2    2  8am-11pm   8am-9pm
3    3  8am-11pm   8am-9pm
4    4  8am-11pm   8am-9pm
5    5  8am-12am  8am-10pm
6    6  8am-12am  8am-10pm
7    7  8am-12am  8am-10pm
8    8  8am-12am  8am-10pm
9    9  8am-11pm   8am-9pm
10  10  8am-11pm   8am-9pm
11  11  8am-11pm   8am-9pm
12  12  8am-12am  8am-10pm
13  13  8am-12am  8am-10pm
14  14  8am-11pm   8am-9pm
15  15  8am-11pm   8am-9pm
16  16  8am-11pm   8am-9pm
17  17  8am-11pm   8am-9pm
18  18  8am-11pm   8am-9pm
19  19  8am-12am  8am-10pm
20  20  8am-12am  8am-10pm
21  21  8am-12am  8am-10pm
22  22  8am-12am  8am-10pm
23  23  8am-12am  8am-10pm
24  24  8am-12am  8am-10pm
25  25  8am-12am  8am-10pm
26  26  8am-12am  8am-10pm
27  27  8am-12am  8am-10pm
28  28  8am-12am  8am-10pm
29  29  8am-12am  8am-10pm
30  30  8am-12am  8am-10pm
31  31   8am-2am   8am-1am
32   1  8am-12am  8am-10pm
33   2  8am-12am  8am-10pm
34   3  8am-12am  8am-10pm


In [21]:
# Cell: parse saved monthly 'hours' HTML files and extract operating hours per date
import os
import json
import html
import pandas as pd
from bs4 import BeautifulSoup

HTML_DIR = "../hours"
all_data = []

def parse_park_hours(html_file):
    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Find divs that include a wire:snapshot JSON payload
    snapshot_divs = soup.find_all("div", attrs={"wire:snapshot": True})
    if not snapshot_divs:
        print(f"    No wire:snapshot found in {html_file}")
        return pd.DataFrame()

    data = None
    for snapshot_div in snapshot_divs:
        snapshot_json = snapshot_div.get("wire:snapshot")
        snapshot_json = html.unescape(snapshot_json)
        try:
            temp_data = json.loads(snapshot_json)
            if "calendarDays" in temp_data.get("data", {}):
                data = temp_data
                break
        except Exception:
            continue

    if not data:
        print(f"    No calendarDays found in {html_file}")
        return pd.DataFrame()

    calendar_days_raw = data.get("data", {}).get("calendarDays", [])
    if not calendar_days_raw:
        print(f"    Empty calendarDays in {html_file}")
        return pd.DataFrame()

    rows = []
    # Flatten the nested calendarDays structure
    for outer_array in calendar_days_raw:
        if not isinstance(outer_array, list):
            continue
        for inner_array in outer_array:
            if not isinstance(inner_array, list):
                continue
            for day_entry in inner_array:
                if not isinstance(day_entry, dict):
                    continue

                date_field = day_entry.get("date", [])
                if isinstance(date_field, list) and len(date_field) > 0:
                    date_str = date_field[0]
                else:
                    continue

                date_obj = pd.to_datetime(date_str)
                row = {
                    "date": date_obj,
                    "year": date_obj.year,
                    "month": date_obj.month,
                    "day": date_obj.day
                }

                events_raw = day_entry.get("events", [])
                for events_outer in events_raw:
                    if not isinstance(events_outer, list):
                        continue
                    for events_inner in events_outer:
                        if not isinstance(events_inner, list):
                            continue
                        for event in events_inner:
                            if not isinstance(event, dict):
                                continue
                            park_name = event.get("park_name", "")
                            event_type = event.get("type", "")
                            if event_type == "OPERATING":
                                title = event.get("title", "")
                                if "Disneyland Park" in park_name:
                                    row["hours_DL"] = title
                                elif "California Adventure" in park_name:
                                    row["hours_DCA"] = title
                rows.append(row)

    if not rows:
        return pd.DataFrame()
    df = pd.DataFrame(rows)
    df = df.groupby(["date", "year", "month", "day"], as_index=False).first()
    return df

# Ensure directory exists and iterate over saved month files
if not os.path.exists(HTML_DIR):
    print(f"❌ Directory '{HTML_DIR}' does not exist!")
    print(f"   Current working directory: {os.getcwd()}")
else:
    all_files = os.listdir(HTML_DIR)
    html_files = [f for f in all_files if f.endswith(".html")]
    print(f"📁 Found {len(html_files)} HTML file(s) in '{HTML_DIR}':")
    for file in sorted(html_files):
        full_path = os.path.join(HTML_DIR, file)
        try:
            df_month = parse_park_hours(full_path)
            if len(df_month) > 0:
                all_data.append(df_month)
                print(f"✅ Parsed {file}: {len(df_month)} days")
            else:
                print(f"⚠️  Skipped {file}: no data extracted")
        except Exception as e:
            print(f"❌ Failed {file}: {e}")

if not all_data:
    print("\n❌ No data was parsed! Check:")
    print("   1. Does the 'hours' directory exist and contain .html files?")
    print("   2. Are the HTML files in the expected format?")
else:
    combined = pd.concat(all_data, ignore_index=True)
    combined = combined.sort_values(["year", "month", "day"])
    for col in ["hours_DL", "hours_DCA"]:
        if col not in combined.columns:
            combined[col] = None
    combined = combined[["date", "hours_DL", "hours_DCA"]]
    combined = combined.drop_duplicates(subset=["date"], keep="first")
    combined.to_csv("disneyland_hours_2024_2025.csv", index=False)
    print(f"\n🎢 Saved {len(combined)} rows to disneyland_hours_2024_2025.csv")
    print(f"   Date range: {combined['date'].min().date()} to {combined['date'].max().date()}")
    print(f"   Columns: {', '.join(combined.columns)}")


📁 Found 13 HTML file(s) in '../hours':
✅ Parsed april2025.html: 35 days
✅ Parsed august2024.html: 35 days
✅ Parsed dec2024.html: 35 days
✅ Parsed feb2025.html: 35 days
✅ Parsed jan2025.html: 35 days
✅ Parsed july2024.html: 35 days
✅ Parsed july2025.html: 35 days
✅ Parsed june2025.html: 35 days
✅ Parsed march2025.html: 42 days
✅ Parsed may2025.html: 35 days
✅ Parsed nov2024.html: 35 days
✅ Parsed oct2024.html: 35 days
✅ Parsed sep2024.html: 35 days

🎢 Saved 399 rows to disneyland_hours_2024_2025.csv
   Date range: 2024-06-30 to 2025-08-02
   Columns: date, hours_DL, hours_DCA


In [23]:
# Cell: parse saved 'tiers' HTML files to extract ticket tier per date
import os
import re
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

TIERS_DIR = '../tiers'
tier_files = sorted([f for f in os.listdir(TIERS_DIR) if f.endswith('.html')])
print(f"Found {len(tier_files)} tier HTML files")
all_tier_data = []

for file in tier_files:
    full_path = os.path.join(TIERS_DIR, file)
    print(f"Processing {file}...")
    with open(full_path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the snapshot JSON that contains year/month
    all_snapshots = soup.find_all('div', {'wire:snapshot': True})
    snapshot_div_found = None
    for snap in all_snapshots:
        snap_str = snap.get('wire:snapshot', '')
        if '"year"' in snap_str and '"month"' in snap_str:
            snapshot_div_found = snap
            break
    if not snapshot_div_found:
        print(f"  Warning: No snapshot div found in {file}")
        continue
    snapshot_str = snapshot_div_found.get('wire:snapshot', '')
    import json
    import html as html_module
    snapshot_str = html_module.unescape(snapshot_str)
    try:
        snapshot_data = json.loads(snapshot_str)
        year = snapshot_data['data']['year']
        month = snapshot_data['data']['month']
        print(f"  Year: {year}, Month: {month}")
    except Exception as e:
        print(f"  Error parsing snapshot: {e}")
        continue

    tier_headers = soup.find_all('h4', class_='text-xs font-semibold text-gray-500 dark:text-gray-400', string='Ticket Tier')
    print(f"  Found {len(tier_headers)} tier tooltips")
    for tier_header in tier_headers:
        tooltip_div = tier_header.find_parent('div', class_='w-48 rounded-lg bg-white p-3 shadow-lg border border-zinc-200 dark:border-gray-700 dark:bg-zinc-800 space-y-2')
        if not tooltip_div:
            continue
        date_h4 = tooltip_div.find('h4', class_='text-xs font-semibold text-gray-500 dark:text-gray-400')
        if not date_h4 or date_h4.get_text(strip=True) == 'Ticket Tier':
            all_h4s = tooltip_div.find_all('h4', class_='text-xs font-semibold text-gray-500 dark:text-gray-400')
            date_h4 = None
            for h4 in all_h4s:
                text = h4.get_text(strip=True)
                if 'Ticket Tier' not in text and 'Seasonal Event' not in text:
                    date_h4 = h4
                    break
        if not date_h4:
            continue
        date_text = date_h4.get_text(strip=True)
        try:
            date_match = re.search(r'(\w+),\s+(\w+)\s+(\d+)\s+(\d+)', date_text)
            if not date_match:
                continue
            month_str = date_match.group(2)
            day = int(date_match.group(3))
            year_parsed = int(date_match.group(4))
            date_obj = datetime.strptime(f"{month_str} {day} {year_parsed}", "%b %d %Y")
        except:
            continue
        tier_div = tier_header.find_parent('div')
        tier_span = tier_div.find('span', class_=re.compile(r'rounded-sm.*text-xs.*font-medium'))
        tier = None
        if tier_span:
            tier_text = tier_span.get_text(strip=True)
            tier_match = re.search(r'Tier\s+(\d)', tier_text)
            if tier_match:
                tier = int(tier_match.group(1))
        if tier is not None:
            all_tier_data.append({
                'date': date_obj,
                'tier': tier
            })

tier_df = pd.DataFrame(all_tier_data)
if not tier_df.empty:
    tier_df = tier_df.sort_values('date').reset_index(drop=True)
    print(f"\nTotal records extracted: {len(tier_df)}")
    print(f"Date range: {tier_df['date'].min()} to {tier_df['date'].max()}")
    print(f"Tier values: {sorted(tier_df['tier'].unique())}")
    tier_0_count = (tier_df['tier'] == 0).sum()
    if tier_0_count > 0:
        print(f"\n⚠️  Warning: Found {tier_0_count} days with tier 0")
        print("Tier 0 dates:")
        print(tier_df[tier_df['tier'] == 0][['date', 'tier']])
    print(f"\nFirst few records:")
    print(tier_df.head(10))
    print(f"\nLast few records:")
    print(tier_df.tail(10))
    print(f"\nTier distribution:")
    print(tier_df['tier'].value_counts().sort_index())
    output_path = 'disneyland_tiers_2024_2025.csv'
    tier_df.to_csv(output_path, index=False)
    print(f"\n✅ Tier data exported to {output_path}")
else:
    print("\n❌ No tier data was extracted!")


Found 12 tier HTML files
Processing april2025.html...
  Year: 2025, Month: 4
  Found 30 tier tooltips
Processing august2024.html...
  Year: 2024, Month: 8
  Found 31 tier tooltips
Processing dec2024.html...
  Year: 2024, Month: 12
  Found 31 tier tooltips
Processing feb2025.html...
  Year: 2025, Month: 2
  Found 28 tier tooltips
Processing jan2025.html...
  Year: 2025, Month: 1
  Found 31 tier tooltips
Processing july2024.html...
  Year: 2024, Month: 7
  Found 31 tier tooltips
Processing june2025.html...
  Year: 2025, Month: 6
  Found 30 tier tooltips
Processing march2025.html...
  Year: 2025, Month: 3
  Found 31 tier tooltips
Processing may2025.html...
  Year: 2025, Month: 5
  Found 31 tier tooltips
Processing nov2024.html...
  Year: 2024, Month: 11
  Found 30 tier tooltips
Processing oct2024.html...
  Year: 2024, Month: 10
  Found 31 tier tooltips
Processing sep2024.html...
  Year: 2024, Month: 9
  Found 30 tier tooltips

Total records extracted: 365
Date range: 2024-07-01 00:00:00 t