## Scraping Holiday (ไว้โม้ว่าเขาไม่ทำงานวันหยุด info to LLM)

In [13]:
import time
import re
import redis
import requests
import json
import os
from datetime import datetime
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def fetch_holiday(r):
    """Scrape the holiday list from myhora.com and store it in Redis.

    For every holiday row that parses cleanly, two writes are made:
    ``holiday:<YYYY-MM-DD>`` is set to the holiday name, and the ISO date is
    added to the set ``holidays:<YYYY>``.

    Args:
        r: Redis client; only ``set`` and ``sadd`` are called on it.

    Returns:
        None. If the page cannot be fetched (network error, timeout, or a
        non-2xx status) a warning is printed and nothing is written.
        Rows whose date cannot be parsed are skipped.
    """
    url = "https://www.myhora.com/calendar/ical/holiday.aspx?latest.txt"
    # Thai month abbreviations as they appear on the page -> month number.
    thai_months = {
        "ม.ค.": 1, "ก.พ.": 2, "มี.ค.": 3, "เม.ย.": 4,
        "พ.ค.": 5, "มิ.ย.": 6, "ก.ค.": 7, "ส.ค.": 8,
        "ก.ย.": 9, "ต.ค.": 10, "พ.ย.": 11, "ธ.ค.": 12,
    }

    def convert(date_str):
        """Return 'YYYY-MM-DD' for a 'D <Thai month> YYYY' string, else None."""
        match = re.match(r"(\d{1,2})\s+(\S+)\s+(\d{4})", date_str.strip())
        if not match:
            return None
        day, month_abbr, thai_year = match.groups()
        month = thai_months.get(month_abbr)
        if month is None:
            # Unknown abbreviation: skip the row rather than build a bogus date.
            return None
        try:
            # The year on the page is offset by 543 from the Gregorian year
            # (presumably Buddhist Era).
            return datetime(int(thai_year) - 543, month, int(day)).date().isoformat()
        except ValueError:
            # Impossible calendar date (e.g. day 31 in a 30-day month).
            return None

    try:
        # A timeout keeps a stalled server from blocking the caller forever.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        print("⚠️ Error fetching holiday page:", e)
        return

    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    count = 0
    for div in soup.find_all("div", class_="mb-5"):
        cols = div.find_all("div")
        if len(cols) < 2:
            continue
        # First inner div holds the date text, second the holiday name.
        iso = convert(cols[0].text)
        if iso is None:
            continue
        r.set(f"holiday:{iso}", cols[1].text.strip())
        r.sadd(f"holidays:{iso[:4]}", iso)
        count += 1
    print(f"✅ fetched {count} holidays")


def fetch_air_quality(r, driver):
    """Scrape the airbkk.com air-quality table and store PM2.5 values in Redis.

    Each usable table row is written as ``aqi:<location>`` -> PM2.5 text.

    Args:
        r: Redis client; only ``set`` is called on it.
        driver: Selenium WebDriver used to load the page and read its
            rendered source.

    Returns:
        None. Prints an error and returns early when the table (or its
        body) is not present in the page source.
    """
    url = "https://airquality.airbkk.com/PublicWebClient/#/Modules/Aqs/HomePage"
    driver.get(url)
    # Fixed pause before reading page_source, presumably so the
    # client-side rendered table has time to appear.
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find("div", class_="table-responsive")
    # find() returns None when nothing matches; check both levels so a
    # half-rendered page is reported instead of raising AttributeError.
    tbody = None
    if table is not None:
        tbody = table.find("tbody", class_="table-bordered")
    if tbody is None:
        print("❌ Could not find air quality table")
        return

    count = 0
    for row in tbody.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 6:
            continue
        # Column 1 carries the location name inside a <b> tag; column 5 is
        # read as the PM2.5 value.
        b_tag = cols[1].find("b")
        pm25 = cols[5].text.strip()
        if b_tag and pm25:
            location = b_tag.text.strip()
            r.set(f"aqi:{location}", pm25)
            count += 1
    print(f"✅ fetched PM2.5 for {count} locations")


def fetch_weather_forecast(r, driver):
    """Scrape the TMD 7-day forecast for a fixed set of provinces into Redis.

    For every forecast card that parses, a JSON document with the keys
    ``province``, ``date``, ``weather``, ``rain``, ``max_temp``, ``min_temp``
    and ``wind_speed`` is stored under ``weather:<YYYY-MM-DD>:<province>``.

    Args:
        r: Redis client; only ``set`` is called on it.
        driver: Selenium WebDriver used to load each province page.

    Returns:
        None. Provinces whose forecast container is missing and cards that
        fail to parse are reported with a printed message and skipped.
        Errors raised by the Redis write itself are not swallowed.
    """
    # Local import: only this function needs URL quoting.
    from urllib.parse import quote

    provinces = [
        "Bangkok", "Nakhon Pathom", "Pathum Thani",
        "Nonthaburi", "Samut Prakan", "Samut Sakhon"
    ]

    total = 0
    # One reference "now" for the whole run keeps every card consistent.
    today = datetime.now()

    for province in provinces:
        # Province names contain spaces, so percent-encode the query value.
        url = (
            "https://www.tmd.go.th/en/weatherForecast7Days"
            f"?province={quote(province)}&culture=en-US"
        )
        driver.get(url)
        # Fixed pause before reading page_source, presumably to let the page render.
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        container = soup.select_one("div.d-flex.gap-3.h-100")
        if not container:
            print(f"❌ Cannot find forecast container for {province}")
            continue

        for card in container.select(".card"):
            try:
                date_text = card.select_one(".today-header .text-dark2").text.strip()
                # The card shows day and month only, so a year must be supplied.
                date_obj = datetime.strptime(f"{date_text} {today.year}", "%d %b %Y")
                # Around New Year a forecast for early January parsed with the
                # current year lands almost a year in the past; a forecast is
                # never that old, so it belongs to next year.
                if (today - date_obj).days > 180:
                    date_obj = date_obj.replace(year=date_obj.year + 1)
                date_iso = date_obj.strftime("%Y-%m-%d")

                # The first two ".font-tiny.text-center" nodes are read as the
                # weather description and the rain figure, in that order.
                labels = card.select(".font-tiny.text-center")
                weather = labels[0].text.strip()
                rain = labels[1].text.strip()

                temps = card.select(".sub-heading div")
                max_temp = temps[0].text.strip() if len(temps) > 0 else ""
                min_temp = temps[2].text.strip() if len(temps) > 2 else ""
                wind = card.select_one("span.ps-1").text.strip()
            except (AttributeError, IndexError, ValueError) as e:
                # AttributeError: a selector matched nothing (None.text);
                # IndexError: fewer label nodes than expected;
                # ValueError: date text not in "%d %b" form.
                print(f"⚠️ Error parsing weather card for {province}: {e}")
                continue

            redis_key = f"weather:{date_iso}:{province}"
            data = {
                "province": province,
                "date": date_iso,
                "weather": weather,
                "rain": rain,
                "max_temp": max_temp.replace("°", ""),
                "min_temp": min_temp.replace("°", ""),
                "wind_speed": wind.replace(" km./hr.", "")
            }

            r.set(redis_key, json.dumps(data))
            total += 1

    print(f"✅ fetched {total} weather forecast entries")



In [14]:

# MAIN LOOP
# Runs all three scrapers, then sleeps, forever; stop with Ctrl-C.

# 23 hours between runs to avoid rate limits (use e.g. 60 when testing).
REFRESH_INTERVAL_SECONDS = 3600 * 23

# Bound before the try so the finally block is safe even if setup fails
# (or Ctrl-C arrives) before Chrome has been started.
driver = None
try:
    r = redis.Redis(host="localhost", port=6379, decode_responses=True)
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)

    while True:
        print(f"\n⏰ Running at {datetime.now().strftime('%H:%M:%S')}")
        fetch_weather_forecast(r, driver)
        fetch_air_quality(r, driver)
        fetch_holiday(r)
        time.sleep(REFRESH_INTERVAL_SECONDS)

except KeyboardInterrupt:
    print("\n⛔ Stopped manually.")
finally:
    # Only quit a browser that was actually launched.
    if driver is not None:
        driver.quit()



⏰ Running at 15:22:10
✅ fetched 42 weather forecast entries
✅ fetched PM2.5 for 83 locations
✅ fetched 55 holidays

⛔ Stopped manually.
