In [None]:
import re, time, json, urllib.parse
from typing import Dict, Optional, List
from bs4 import BeautifulSoup, Tag, NavigableString

from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Try Chrome first (Selenium Manager), then Edge (usually present on Windows)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.edge.options import Options as EdgeOptions

# Maps each Thai section heading, matched by its visible text on a disease
# page, to the output field the section's text is stored under. Several
# headings may share one field.
HEADINGS_MAP = {
    "ข้อมูลและสาเหตุของโรค": "info_and_causes",
    "ข้อมูลโรคและสาเหตุการเกิดโรค": "info_and_causes",  # alternate wording, same output field
    "อาการของโรค": "symptoms",
    "แนวทางการตรวจวินิจฉัยโรค": "diagnosis",
    "แนวทางการดูแลรักษา": "treatment",
    "แพทย์เฉพาะทางแนะนำ": "specialist_doctor_recommended",
    "ข้อควรระวัง": "precautions",
    "ข้อมูลเพิ่มเติม": "additional_info",
}

def _clean(s: Optional[str]) -> Optional[str]:
    if s is None: return None
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

def _collect_by_siblings(start_el: Tag, heading_texts: set) -> Optional[str]:
    """Primary strategy: gather section text from the siblings after a heading.

    Visits the nodes that follow ``start_el`` at the same tree level and stops
    at the first element whose entire cleaned text is a recognized heading.

    Args:
        start_el: The heading element whose section body is wanted.
        heading_texts: Cleaned heading strings; a sibling whose full text is
            one of these ends the section.

    Returns:
        The collected pieces joined with single spaces (each list item is
        prefixed with "• "), or None when nothing was collected.
    """
    parts: List[str] = []
    # Iterate the sibling generator rather than looping on ``while cur:``.
    # Text and comment nodes are ``str`` subclasses, so an empty one is falsy
    # and a truthiness test would end the walk before the section is finished.
    for sibling in start_el.next_siblings:
        if not isinstance(sibling, Tag):
            # Bare text/comment nodes between elements carry no section text.
            continue
        text = _clean(sibling.get_text(" ", strip=True))
        if text in heading_texts:
            break
        if sibling.name in ("p", "div"):
            if text:
                parts.append(text)
        elif sibling.name in ("ul", "ol"):
            for li in sibling.find_all("li"):
                item_text = _clean(li.get_text(" ", strip=True))
                if item_text:
                    parts.append("• " + item_text)
    return _clean(" ".join(parts)) if parts else None

def _collect_by_walkdown(start_el: Tag, heading_texts: set, max_nodes: int = 300) -> Optional[str]:
    """Fallback strategy: walk forward in document order until the next heading.

    Used when the heading has no useful siblings (for example when it sits
    inside its own wrapper element). Every element after ``start_el`` is
    visited in document order, up to ``max_nodes`` elements.

    Args:
        start_el: The heading element whose section body is wanted.
        heading_texts: Cleaned heading strings; an element whose full text is
            one of these ends the section.
        max_nodes: Upper bound on the number of elements visited.

    Returns:
        The collected pieces joined with single spaces (each list item is
        prefixed with "• "), or None when nothing was collected.
    """

    def _is_within(node: Tag, container: Tag) -> bool:
        # Compare by identity: bs4 tags compare equal when they merely look
        # alike, which is not the same as being this exact ancestor.
        return any(ancestor is container for ancestor in node.parents)

    def _holds_heading(node: Tag) -> bool:
        # True when some element inside ``node`` is itself a section heading.
        return any(
            _clean(inner.get_text(" ", strip=True)) in heading_texts
            for inner in node.find_all(True)
        )

    parts: List[str] = []
    covered: Optional[Tag] = None  # last container whose text was already taken
    for count, node in enumerate(start_el.find_all_next(True), 1):
        if count > max_nodes:
            break
        # The forward walk starts with the heading's own children. Their text
        # is the heading itself, so they must not end (or feed) the section.
        if _is_within(node, start_el):
            continue
        t_all = _clean(node.get_text(" ", strip=True))
        if not t_all:
            continue
        if t_all in heading_texts:
            break
        # Anything nested in a container we already captured would only
        # repeat text that is in ``parts`` already.
        if covered is not None and _is_within(node, covered):
            continue
        if node.name in ("p", "div"):
            # A wrapper that contains the next heading would drag the
            # following sections in; skip it and let the walk descend so the
            # heading inside it terminates the section.
            if _holds_heading(node):
                continue
            parts.append(t_all)
            covered = node
        elif node.name in ("ul", "ol"):
            for li in node.find_all("li"):
                lt = _clean(li.get_text(" ", strip=True))
                if lt:
                    parts.append("• " + lt)
            covered = node
    return _clean(" ".join(parts)) if parts else None

def _open_driver(headless: bool = True):
    """Start a browser driver, preferring Chrome and falling back to Edge.

    Both browsers receive the same command-line flags. Edge is only tried
    when starting Chrome raises ``WebDriverException``; a failure to start
    Edge propagates to the caller.

    Args:
        headless: When true, add the ``--headless=new`` flag.

    Returns:
        The started ``webdriver.Chrome`` or ``webdriver.Edge`` instance.
    """
    flags: List[str] = ["--headless=new"] if headless else []
    flags += ["--disable-gpu", "--lang=th-TH", "--window-size=1200,2000"]

    try:
        chrome_opts = ChromeOptions()
        for flag in flags:
            chrome_opts.add_argument(flag)
        return webdriver.Chrome(options=chrome_opts)
    except WebDriverException:
        # Chrome could not be started; try Edge with identical flags.
        edge_opts = EdgeOptions()
        for flag in flags:
            edge_opts.add_argument(flag)
        return webdriver.Edge(options=edge_opts)

def parse_disease_page_selenium(url: str, wait_sec: int = 20, headless: bool = True, debug: bool=False) -> Dict:
    """Render one disease page in a browser and split it into named sections.

    A fresh driver is opened for the call and always quit afterwards. The
    page is considered ready once any heading from ``HEADINGS_MAP`` is
    present; the rendered HTML is then parsed with BeautifulSoup.

    Args:
        url: Address of the disease page.
        wait_sec: Seconds to wait for a known heading on the first attempt.
        headless: Passed through to ``_open_driver``.
        debug: When true, print the result as indented JSON.

    Returns:
        A dict with ``url``, ``slug`` (last path segment of ``url``),
        ``title`` (text of the first ``<h1>``, if any) and one entry per
        section field; sections that were not found stay None.

    Raises:
        TimeoutException: If no known heading appears even after the
            scroll-and-retry wait.
    """
    # XPath union: any element whose own text equals one of the known headings.
    heading_xpath = " | ".join(
        f"//*[normalize-space(text())='{heading}']" for heading in HEADINGS_MAP
    )
    heading_present = EC.presence_of_element_located((By.XPATH, heading_xpath))

    browser = _open_driver(headless=headless)
    try:
        browser.get(url)
        try:
            WebDriverWait(browser, wait_sec).until(heading_present)
        except TimeoutException:
            # Jump to the bottom once, then give the page a shorter second chance.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.0)
            WebDriverWait(browser, 8).until(heading_present)

        # Step down the page so lazily loaded content gets a chance to render.
        for _ in range(6):
            browser.execute_script("window.scrollBy(0, 800);")
            time.sleep(0.25)

        page_html = browser.page_source
    finally:
        browser.quit()

    soup = BeautifulSoup(page_html, "lxml")

    # Best-effort title from the first <h1>.
    first_h1 = soup.find("h1")
    title = _clean(first_h1.get_text(" ", strip=True)) if first_h1 is not None else None

    # Locate headings by their visible text; the first match per heading wins,
    # with tag types tried in the order listed.
    wanted = set(HEADINGS_MAP)
    heading_nodes: Dict[str, Tag] = {}
    for tag_name in ("h2", "h3", "p", "div", "span"):
        for candidate in soup.find_all(tag_name):
            label = _clean(candidate.get_text(" ", strip=True))
            if label in wanted:
                heading_nodes.setdefault(label, candidate)

    out = {
        "url": url,
        "slug": urllib.parse.urlparse(url).path.rsplit("/", 1)[-1],
        "title": title,
    }
    for field in (
        "info_and_causes",
        "symptoms",
        "diagnosis",
        "treatment",
        "specialist_doctor_recommended",
        "precautions",
        "additional_info",
    ):
        out[field] = None

    found_headings = set(heading_nodes)
    for thai_heading, key in HEADINGS_MAP.items():
        heading_el = heading_nodes.get(thai_heading)
        if heading_el is None:
            continue
        # Sibling walk first; forward walk only when that yields nothing.
        section = (
            _collect_by_siblings(heading_el, found_headings)
            or _collect_by_walkdown(heading_el, found_headings)
        )
        if section:
            out[key] = section

    if debug:
        print(json.dumps(out, ensure_ascii=False, indent=2))
    return out


In [None]:
# ---- Discover diseases grouped by initial letter, then batch-scrape ----
import re, json, csv, time, urllib.parse
from typing import List, Dict, Optional
import httpx
from bs4 import BeautifulSoup, Tag, NavigableString

# Site root; relative hrefs and constructed disease URLs are resolved against it.
FORUM_BASE = "https://www.agnoshealth.com"
# Alphabetical disease index page that link discovery fetches and parses.
INDEX_URL  = f"{FORUM_BASE}/diseases-and-symptoms?tab=diseases&group=alphabet"
# Request headers sent with the plain-HTTP fetch of the index page.
UA = {"User-Agent":"AgnosRAGBot/1.1 (+demo)", "Accept-Language":"th,en;q=0.8"}

def _clean(s: Optional[str]) -> Optional[str]:
    if s is None: return None
    return re.sub(r"\s+"," ", s).strip() or None

def _slugify_en(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s)
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s

def _build_url(thai_name: str, english_name: Optional[str]) -> Optional[str]:
    """Construct a disease page URL from the Thai and English names.

    The Thai name is percent-encoded in full and the English name is
    slugified; both become path segments under ``/diseases/``.

    Returns:
        The constructed URL, or None when either name is missing or empty.
    """
    if not (thai_name and english_name):
        return None
    slug_part = _slugify_en(english_name)
    thai_part = urllib.parse.quote(thai_name, safe="")
    return f"{FORUM_BASE}/diseases/{thai_part}/{slug_part}"

def _is_h2_letter(tag: Tag) -> bool:
    """Return True for an ``<h2>`` carrying both ``font-bold`` and ``text-2xl`` classes.

    Anything that is not a ``Tag`` (for example a text node) gives False.
    """
    if not isinstance(tag, Tag) or tag.name != "h2":
        return False
    return {"font-bold", "text-2xl"}.issubset(tag.get("class", []))

def _extract_grouped_from_html(html: str) -> List[Dict]:
    """Extract disease entries grouped by their initial-letter heading.

    For every letter heading (an ``<h2>`` accepted by ``_is_h2_letter``) the
    following siblings are scanned, up to the next letter heading, for
    ``<p class="text-primary_blue-600">`` rows. A row's text is expected to
    look like ``Thai name (English name)``; when there are no parentheses the
    whole text is taken as the Thai name.

    Args:
        html: Markup of the index page.

    Returns:
        A list of ``{"letter", "thai", "english", "url"}`` dicts in page
        order, de-duplicated by URL (or by the name triple when no URL could
        be determined). ``url`` prefers the href of an enclosing ``<a>`` and
        otherwise falls back to ``_build_url``.
    """
    soup = BeautifulSoup(html, "lxml")
    row_class = "text-primary_blue-600"
    name_pattern = re.compile(r"^(.*?)\s*\((.*?)\)\s*$")  # pattern: Thai (English)
    items: List[Dict] = []

    letters = [h for h in soup.find_all("h2") if _is_h2_letter(h)]
    for h2 in letters:
        letter = _clean(h2.get_text(" ", strip=True))
        # Use the sibling generator instead of ``while sib and ...``: text and
        # comment nodes are ``str`` subclasses, so an empty one is falsy and
        # would cut the group short.
        for sib in h2.next_siblings:
            if not isinstance(sib, Tag):
                continue
            if _is_h2_letter(sib):
                break  # next letter group starts here

            rows = list(sib.find_all("p", class_=row_class))
            # find_all() only looks at descendants, so a row that is itself a
            # direct sibling of the heading has to be picked up explicitly.
            if sib.name == "p" and row_class in (sib.get("class") or []):
                rows.insert(0, sib)

            for p in rows:
                text = _clean(p.get_text(" ", strip=True))
                if not text:
                    continue
                m = name_pattern.match(text)
                thai = m.group(1) if m else text
                english = m.group(2) if m else None

                # Prefer the real link if the <p> sits inside an <a> row.
                href = None
                a = p.find_parent("a")
                if a is not None and a.get("href"):
                    href = urllib.parse.urljoin(FORUM_BASE, a["href"])
                url = href or _build_url(thai, english)

                items.append({"letter": letter, "thai": thai, "english": english, "url": url})

    # De-duplicate by URL, or by (letter, thai, english) when the URL is missing.
    seen = set()
    unique: List[Dict] = []
    for it in items:
        key = it["url"] or (it["letter"], it["thai"], it.get("english"))
        if key in seen:
            continue
        seen.add(key)
        unique.append(it)
    return unique

def _print_letter_stats(found: List[Dict], via: str) -> None:
    """Print the total number of entries and a per-letter count, labelled with ``via``."""
    by_letter: Dict = {}
    for entry in found:
        by_letter[entry["letter"]] = by_letter.get(entry["letter"], 0) + 1
    print(f"[discover] via {via}: total {len(found)}")
    print("  letters:", ", ".join(f"{k}:{v}" for k, v in by_letter.items()))


def discover_disease_links_grouped(headless: bool = True) -> List[Dict]:
    """Discover disease links from the alphabetical index page.

    Tries a plain HTTP fetch first. If that fails, returns a non-200 status,
    or yields no entries (for example because the page is rendered by
    JavaScript), the page is rendered with Selenium and parsed again.

    Args:
        headless: Passed through to ``_open_driver`` for the Selenium fallback.

    Returns:
        ``[{letter, thai, english, url}]`` as produced by
        ``_extract_grouped_from_html``.
    """
    # Try plain HTTP
    try:
        with httpx.Client(headers=UA, timeout=30, follow_redirects=True) as c:
            r = c.get(INDEX_URL)
            if r.status_code == 200:
                out = _extract_grouped_from_html(r.text)
                if out:
                    _print_letter_stats(out, "HTTP")
                    return out
    except Exception as exc:
        # Best-effort attempt: never abort discovery here, but do say why the
        # HTTP path failed instead of swallowing the error silently.
        print(f"[discover] HTTP attempt failed: {exc!r}")

    # Fallback: Selenium render (reuses _open_driver from the page parser)
    print("[discover] HTTP yielded 0; rendering with Selenium …")
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = _open_driver(headless=headless)
    try:
        driver.get(INDEX_URL)
        # Wait for a letter heading and for at least one disease row.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h2.font-bold.text-2xl"))
        )
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "p.text-primary_blue-600"))
        )
        # Scroll down in steps so every group gets a chance to load.
        for _ in range(10):
            driver.execute_script("window.scrollBy(0, 1600);")
            time.sleep(0.25)

        html = driver.page_source
    finally:
        driver.quit()

    out = _extract_grouped_from_html(html)
    _print_letter_stats(out, "Selenium")
    return out

def scrape_diseases_to_files_grouped(limit: int = 0,
                                     headless: bool = True,
                                     json_path: str = "agnos_diseases.json",
                                     csv_path: str = "agnos_diseases.csv") -> List[Dict]:
    """Discover disease pages, scrape each one, and save the results.

    Entries without a URL are skipped with a message. A page that raises
    while being scraped is reported and skipped; the run continues.

    Args:
        limit: Maximum number of discovered links to process; values that
            are falsy or not positive mean no limit.
        headless: Passed through to discovery and to the page parser.
        json_path: Destination of the JSON dump of all rows.
        csv_path: Destination of the CSV file (fixed column set).

    Returns:
        The list of scraped rows, each extended with ``letter``,
        ``thai_name`` and ``english_name`` from the index.
    """
    links = discover_disease_links_grouped(headless=headless)
    if limit and limit > 0:
        links = links[:limit]
    total = len(links)
    print(f"[scrape] will fetch {total} disease pages")

    rows: List[Dict] = []
    for position, entry in enumerate(links, 1):
        page_url = entry.get("url")
        if not page_url:
            print(f"[skip] no URL for: {entry['letter']} / {entry['thai']} ({entry.get('english')})")
            continue
        try:
            record = parse_disease_page_selenium(page_url, headless=headless, debug=False)
            # Carry the names found on the index page over to the record.
            record.update(
                letter=entry["letter"],
                thai_name=entry["thai"],
                english_name=entry.get("english"),
            )
            rows.append(record)
            print(f"[{position}/{total}] {entry['letter']} • {entry['thai']} ✓")
        except Exception as err:
            print(f"[{position}/{total}] FAIL {page_url}: {err}")

    # Persist everything that was scraped successfully.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(rows, json_file, ensure_ascii=False, indent=2)

    fields = ["letter","thai_name","english_name","url","slug","title",
              "info_and_causes","symptoms","diagnosis","treatment",
              "specialist_doctor_recommended","precautions","additional_info"]
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()
        writer.writerows({name: record.get(name) for name in fields} for record in rows)

    print(f"[done] saved {len(rows)} diseases → {json_path}, {csv_path}")
    if rows:
        print("Sample:")
        print(json.dumps(rows[0], ensure_ascii=False, indent=2))
    return rows


In [None]:
# Run the full scrape: limit=0 applies no cap on the number of pages, and the
# results go to the default JSON and CSV paths.
rows = scrape_diseases_to_files_grouped(limit=0, headless=True)
