In [8]:
# -*- coding: utf-8 -*-
"""
서울교통공사 지연증명서 - 최소검증 + 30분 슬롯(05:30~00:30) + StaleElement 회피
- 표 병합셀(rowspan/colspan) 전개
- 최종 CSV: [날짜, 노선, 방향, 05:30, 06:00, …, 23:30, 00:00, 00:30]
"""

import time
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager


# ------------------------------
# Selenium 드라이버 (최소 설정)
# ------------------------------
def build_driver(headless: bool = True) -> webdriver.Chrome:
    """Create a Chrome WebDriver with a minimal set of command-line flags.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance whose chromedriver path comes from
        ``ChromeDriverManager().install()``.
    """
    chrome_flags = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1400,2200",
    ]
    if headless:
        # Keep the headless flag first, as in the original argument order.
        chrome_flags.insert(0, "--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)


# ---------------------------------------------
# rowspan/colspan 전개 (tbody만)
# ---------------------------------------------
def _clean(s: str) -> str:
    """Collapse whitespace runs (including non-breaking spaces) to single spaces.

    A falsy input such as ``None`` or ``""`` yields the empty string; leading
    and trailing whitespace is removed.
    """
    text = s if s else ""
    tokens = text.replace("\xa0", " ").split()
    return " ".join(tokens)

def _build_grid(tr_nodes) -> list[list[str]]:
    """Expand rowspan/colspan-merged cells into a rectangular grid of strings.

    Args:
        tr_nodes: Iterable of row nodes exposing ``find_all``; each direct
            ``td``/``th`` child must expose ``get_text`` and ``get``.

    Returns:
        One list of cleaned cell texts per row. A merged cell's text is
        repeated in every column/row it spans, and shorter rows are
        right-padded with ``""`` to the widest row.
    """
    pending = {}    # column index -> (text, number of following rows still to fill)
    expanded = []

    for tr in tr_nodes:
        cells_out = []
        pos = 0

        def drain_pending():
            # Emit every value carried down from earlier rows that lands on
            # the current column, advancing past each one.
            nonlocal pos
            while pos in pending:
                text, rows_left = pending[pos]
                cells_out.append(text)
                if rows_left - 1 > 0:
                    pending[pos] = (text, rows_left - 1)
                else:
                    del pending[pos]
                pos += 1

        drain_pending()

        for cell in tr.find_all(["td", "th"], recursive=False):
            cell_text = _clean(cell.get_text(separator=" "))
            col_span = int(cell.get("colspan", 1) or 1)
            row_span = int(cell.get("rowspan", 1) or 1)

            drain_pending()

            for _ in range(col_span):
                cells_out.append(cell_text)
                if row_span > 1:
                    # Remember the text so the rows below repeat it here.
                    pending[pos] = (cell_text, row_span - 1)
                pos += 1

        drain_pending()
        expanded.append(cells_out)

    width = max(map(len, expanded), default=0)
    return [cells + [""] * (width - len(cells)) for cells in expanded]


# ---------------------------------------------
# 30분 슬롯 생성(05:30 ~ 00:30, 자정 넘김 지원)
# ---------------------------------------------
def make_time_slots(start="05:30", end="00:30", step_min=30):
    """Return "HH:MM" labels from ``start`` to ``end`` (inclusive) every ``step_min`` minutes.

    Args:
        start: First slot as "HH:MM".
        end: Last permitted slot as "HH:MM". If it is earlier in the day than
            ``start``, the range is treated as running past midnight into the
            next day.
        step_min: Distance between consecutive slots, in minutes.

    Returns:
        List of zero-padded "HH:MM" strings; labels wrap modulo 24 hours.

    Raises:
        ValueError: If ``step_min`` is not positive. Without this check the
            loop below would never terminate and the list would grow forever.
    """
    if step_min <= 0:
        raise ValueError(f"step_min must be positive, got {step_min!r}")

    def to_min(tstr):
        h, m = map(int, tstr.split(":"))
        return h * 60 + m

    def label(total_min):
        total_min %= 1440
        return f"{total_min // 60:02d}:{total_min % 60:02d}"

    s = to_min(start)
    e = to_min(end)
    if e < s:
        e += 24 * 60  # end belongs to the next day (e.g. 00:30)

    slots = []
    cur = s
    while cur <= e:
        slots.append(label(cur))
        cur += step_min
    return slots

SLOTS = make_time_slots("05:30", "00:30", 30)


# ---------------------------------------------
# 표 HTML → 30분 슬롯 DF
#  - 본문 1행을 [노선, 방향, 첫차~09시, 09시~18시, 18시~막차]로 간주
#  - 버킷 텍스트를 슬롯에 그대로 채움(연산/분배 없음)
# ---------------------------------------------
def parse_tbl_to_slot_df(table_outer_html: str, date_label: str) -> pd.DataFrame:
    """Convert the delay table's HTML into one row per table row, one column per slot.

    Each expanded ``tbody`` row is read as ``[line, direction, bucket1,
    bucket2, bucket3]``; a 4-column row is read as ``[line, bucket1, bucket2,
    bucket3]`` with an empty direction, and shorter rows are padded with
    empty strings. The bucket text is copied unchanged into every slot in
    ``SLOTS`` that falls in that bucket (no arithmetic or splitting).

    Args:
        table_outer_html: HTML of the table element.
        date_label: Value stored in the "날짜" column of every row.

    Returns:
        DataFrame with columns "날짜", "노선", "방향" followed by the slot
        labels. If no ``tbody tr`` is found, a single row with blank values
        is returned.
    """
    body_rows = BeautifulSoup(table_outer_html, "html.parser").select("tbody tr")
    if not body_rows:
        blank = {"날짜": date_label, "노선": "", "방향": "", **dict.fromkeys(SLOTS, "")}
        return pd.DataFrame([blank])

    # Bucket boundaries, in minutes since midnight.
    first_start = 5 * 60 + 30   # 05:30
    bound_09 = 9 * 60           # 09:00
    bound_18 = 18 * 60          # 18:00

    def bucket_index(slot_label: str) -> int:
        hour, minute = map(int, slot_label.split(":"))
        minute_of_day = (hour * 60 + minute) % 1440
        # Anything before the first train or from 18:00 on (including the
        # slots after midnight) belongs to the last bucket.
        if minute_of_day < first_start or minute_of_day >= bound_18:
            return 2
        return 0 if minute_of_day < bound_09 else 1

    records = []
    for cells in _build_grid(body_rows):
        if len(cells) == 4:
            line, b1, b2, b3 = cells
            kind = ""
        else:
            # Longer rows are truncated to five cells, shorter ones padded.
            line, kind, b1, b2, b3 = (cells + [""] * 5)[:5]

        record = {"날짜": date_label, "노선": _clean(line), "방향": _clean(kind)}
        bucket_texts = (_clean(b1), _clean(b2), _clean(b3))
        record.update((slot, bucket_texts[bucket_index(slot)]) for slot in SLOTS)
        records.append(record)

    return pd.DataFrame(records)


# ------------------------------
# 메인: 최소크롤링 파이프라인 (Stale 회피)
# ------------------------------
def scrape_simple_slots(
    url="http://www.seoulmetro.co.kr/kr/delayProofList.do?",
    output_csv="delay_proof_30min.csv",
    headless=True,
    sleep_after_click=1.4,   # fixed wait (seconds) after submitting the search form
):
    """Scrape the delay table for every date option and save them as one CSV.

    The function opens ``url``, reads the options of ``select#view_date``,
    and for each option selects it, submits ``document.searchForm`` via
    JavaScript, sleeps ``sleep_after_click`` seconds, and parses
    ``table.tbl-type1`` with ``parse_tbl_to_slot_df``. Results are
    concatenated and written to ``output_csv`` (UTF-8 with BOM, no index).

    Args:
        url: Page to open.
        output_csv: Destination CSV path. Missing parent directories are
            created before writing.
        headless: Forwarded to ``build_driver``.
        sleep_after_click: Seconds to sleep after each form submit; the
            page state is not otherwise verified.

    Notes:
        A WebDriver error while handling one date is reported and that date
        is skipped, so tables already collected are still written.
    """
    import os  # function-scope: only needed to prepare the output directory
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    driver = build_driver(headless=headless)
    all_df = []
    try:
        driver.get(url)
        time.sleep(1.0)

        # 1) Copy each option's text/value as plain strings up front. The
        #    WebElements themselves must not be kept: they go stale once the
        #    page is replaced by a submit.
        sel = driver.find_element(By.CSS_SELECTOR, "select#view_date")
        option_meta = []
        for i, opt in enumerate(Select(sel).options):
            text = (opt.text or "").strip()
            value = (opt.get_attribute("value") or "").strip()
            if not value or "선택" in text:
                continue  # skip the placeholder ("select ...") entry
            option_meta.append({"text": text, "value": value})
            print("option -",i,")",text)

        # 2) Walk the copied metadata; the <select> is looked up again on
        #    every iteration because the DOM is refreshed by each submit.
        for meta in option_meta:
            date_label = meta["text"]
            value = meta["value"]

            try:
                sel = driver.find_element(By.CSS_SELECTOR, "select#view_date")
                Select(sel).select_by_value(value)

                # Submit through JS instead of locating the button again.
                driver.execute_script("document.searchForm && document.searchForm.submit();")

                # Plain fixed wait; no verification of the new page.
                time.sleep(sleep_after_click)

                # Read the table, retrying once if it is not there yet.
                try:
                    tbl = driver.find_element(By.CSS_SELECTOR, "table.tbl-type1")
                except NoSuchElementException:
                    time.sleep(0.6)
                    tbl = driver.find_element(By.CSS_SELECTOR, "table.tbl-type1")

                html = tbl.get_attribute("outerHTML")
            except WebDriverException as exc:
                # One flaky date must not discard everything collected so far.
                print(f"[경고] {date_label}: 표 수집 실패 → 건너뜀 ({exc})")
                continue

            df_slots = parse_tbl_to_slot_df(html, date_label=date_label)
            all_df.append(df_slots)
            print("data_label :", date_label)
            print("data_value :", value)

        # 3) Save
        if all_df:
            final = pd.concat(all_df, ignore_index=True)
        else:
            cols = ["날짜", "노선", "방향"] + SLOTS
            final = pd.DataFrame(columns=cols)

        # to_csv does not create missing directories (e.g. "csv/..."), which
        # would otherwise fail only after the whole scrape has finished.
        if isinstance(output_csv, (str, os.PathLike)):
            out_dir = os.path.dirname(os.fspath(output_csv))
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

        final.to_csv(output_csv, index=False, encoding="utf-8-sig")
        print(f"[저장] {output_csv}  (총 행수={len(final)})")

    finally:
        # Best-effort shutdown: a failing quit() must not mask the real error.
        try:
            driver.quit()
        except Exception:
            pass

if __name__ == "__main__":
    # Set headless=False while debugging to watch the browser window.
    run_kwargs = {
        "url": "http://www.seoulmetro.co.kr/kr/delayProofList.do?",
        "output_csv": "csv/delay_proof_30min.csv",
        "headless": True,
        "sleep_after_click": 1.4,
    }
    scrape_simple_slots(**run_kwargs)

option - 0 ) 금일 (2025-09-17)
option - 1 ) 1일전 (2025-09-16)
option - 2 ) 2일전 (2025-09-15)
option - 3 ) 3일전 (2025-09-14)
option - 4 ) 4일전 (2025-09-13)
option - 5 ) 5일전 (2025-09-12)
option - 6 ) 6일전 (2025-09-11)
option - 7 ) 7일전 (2025-09-10)
option - 8 ) 8일전 (2025-09-09)
option - 9 ) 9일전 (2025-09-08)
option - 10 ) 10일전 (2025-09-07)
option - 11 ) 11일전 (2025-09-06)
option - 12 ) 12일전 (2025-09-05)
option - 13 ) 13일전 (2025-09-04)
option - 14 ) 14일전 (2025-09-03)
option - 15 ) 15일전 (2025-09-02)
option - 16 ) 16일전 (2025-09-01)
option - 17 ) 17일전 (2025-08-31)
option - 18 ) 18일전 (2025-08-30)
option - 19 ) 19일전 (2025-08-29)
option - 20 ) 20일전 (2025-08-28)
option - 21 ) 21일전 (2025-08-27)
option - 22 ) 22일전 (2025-08-26)
option - 23 ) 23일전 (2025-08-25)
option - 24 ) 24일전 (2025-08-24)
option - 25 ) 25일전 (2025-08-23)
option - 26 ) 26일전 (2025-08-22)
option - 27 ) 27일전 (2025-08-21)
option - 28 ) 28일전 (2025-08-20)
option - 29 ) 29일전 (2025-08-19)
option - 30 ) 30일전 (2025-08-18)
data_label : 금일 (2025-09-17)


In [5]:
# -*- coding: utf-8 -*-
"""
코레일 간편지연증명서(https://info.korail.com/mbs/www/neo/delay/delaylist.jsp)
- 날짜 이동 안정화: ?indate=YYYY-MM-DD 로 직접 진입(우선), 실패 시 텍스트 클릭 보조
- 검증 최소(클릭/이동 후 sleep)
- 표 병합셀 전개 + 30분 슬롯(05:30~00:30)만 저장
- 최종 컬럼: [날짜, 노선, 방향, 05:30, 06:00, ..., 23:30, 00:00, 00:30]
"""

import re, time
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


# ------------------------------
# 드라이버
# ------------------------------
def build_driver(headless=True):
    """Return a Chrome WebDriver configured with a fixed set of flags.

    ``--headless=new`` is added first when ``headless`` is truthy; the
    sandbox, shared-memory and window-size flags are always added. The
    chromedriver path is obtained from ``ChromeDriverManager().install()``.
    """
    options = webdriver.ChromeOptions()
    headless_flag = ("--headless=new",) if headless else ()
    fixed_flags = ("--no-sandbox", "--disable-dev-shm-usage", "--window-size=1400,2200")
    for arg in headless_flag + fixed_flags:
        options.add_argument(arg)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


# ------------------------------
# 유틸: 텍스트 정리 & 표 전개
# ------------------------------
def _clean(s: str) -> str:
    """Normalize text: non-breaking spaces become spaces, whitespace runs collapse to one space.

    Falsy input (``None`` or ``""``) produces ``""``.
    """
    if not s:
        s = ""
    words = s.replace("\xa0", " ").split()
    return " ".join(words)

def _build_grid(tr_nodes) -> list[list[str]]:
    """Expand rowspan/colspan cells of the given rows into a rectangular text grid.

    Each direct ``td``/``th`` child of a row contributes its cleaned text
    once per spanned column, and the text is repeated in the same columns of
    the following rows for the remainder of its rowspan. Rows are finally
    right-padded with ``""`` to the width of the widest row. No validation
    of the table structure is performed.
    """
    spans = {}   # column index -> (text, number of following rows still to fill)
    table = []

    def take_spans(out, idx):
        # Append texts carried down from earlier rows starting at column
        # ``idx`` and return the next free column index.
        while idx in spans:
            text, remaining = spans[idx]
            out.append(text)
            if remaining > 1:
                spans[idx] = (text, remaining - 1)
            else:
                del spans[idx]
            idx += 1
        return idx

    for tr in tr_nodes:
        out = []
        idx = take_spans(out, 0)
        for cell in tr.find_all(["td", "th"], recursive=False):
            text = _clean(cell.get_text(separator=" "))
            col_span = int(cell.get("colspan", 1) or 1)
            row_span = int(cell.get("rowspan", 1) or 1)
            idx = take_spans(out, idx)
            for _ in range(col_span):
                out.append(text)
                if row_span > 1:
                    spans[idx] = (text, row_span - 1)
                idx += 1
        take_spans(out, idx)
        table.append(out)

    widest = max(map(len, table), default=0)
    return [cells + [""] * (widest - len(cells)) for cells in table]


# ------------------------------
# 30분 슬롯 (05:30~00:30)
# ------------------------------
def make_time_slots(start="05:30", end="00:30", step_min=30):
    """Return "HH:MM" labels from ``start`` to ``end`` (inclusive) every ``step_min`` minutes.

    Args:
        start: First slot as "HH:MM".
        end: Last permitted slot as "HH:MM"; when earlier in the day than
            ``start`` it is taken to be on the following day.
        step_min: Minutes between consecutive slots.

    Returns:
        List of zero-padded "HH:MM" strings; labels wrap modulo 24 hours.

    Raises:
        ValueError: If ``step_min`` is not positive, since the generating
            loop would otherwise never finish.
    """
    if step_min <= 0:
        raise ValueError(f"step_min must be positive, got {step_min!r}")

    def to_min(hhmm):
        h, m = map(int, hhmm.split(":"))
        return h * 60 + m

    def label(m):
        m %= 1440
        return f"{m // 60:02d}:{m % 60:02d}"

    s, e = to_min(start), to_min(end)
    if e < s:
        e += 1440  # end is past midnight

    slots, cur = [], s
    while cur <= e:
        slots.append(label(cur))
        cur += step_min
    return slots

SLOTS = make_time_slots("05:30", "00:30", 30)

# Time-bucket boundaries, in minutes since midnight.
BOUND_08     = 8 * 60
BOUND_10     = 10 * 60
BOUND_18     = 18 * 60
BOUND_22     = 22 * 60
FIRST_START  = 5 * 60 + 30  # 05:30

def bucket_for_slot(tstr: str) -> int:
    """Map an "HH:MM" slot label to its bucket index (0..4).

    Buckets: 0 = 05:30-08:00, 1 = 08:00-10:00, 2 = 10:00-18:00,
    3 = 18:00-22:00, 4 = everything else (22:00 onward and any time before
    05:30, which covers the slots after midnight). Lower bounds are
    inclusive, upper bounds exclusive.
    """
    hour_part, minute_part = map(int, tstr.split(":"))
    minute_of_day = (hour_part * 60 + minute_part) % 1440

    edges = (FIRST_START, BOUND_08, BOUND_10, BOUND_18, BOUND_22)
    for index, (low, high) in enumerate(zip(edges, edges[1:])):
        if low <= minute_of_day < high:
            return index
    return 4


# ------------------------------
# 표 → 슬롯 DF
# ------------------------------
def parse_table_to_slots(table_outer_html: str, date_label: str) -> pd.DataFrame:
    """Expand the delay table's body rows into 30-minute slot columns.

    Each expanded ``tbody`` row is read as ``[line, direction, bucket1 ..
    bucket5]``; a 6-column row is read as ``[line, bucket1 .. bucket5]`` with
    an empty direction, and shorter rows are padded with empty strings.
    Bucket text is copied unchanged into every slot of ``SLOTS`` that
    ``bucket_for_slot`` assigns to that bucket.

    Args:
        table_outer_html: HTML of the table element.
        date_label: Value stored in the "날짜" column of every row.

    Returns:
        DataFrame with columns "날짜", "노선", "방향" followed by the slot
        labels; a single blank row when no ``tbody tr`` is found.
    """
    body_rows = BeautifulSoup(table_outer_html, "html.parser").select("tbody tr")
    if not body_rows:
        blank = {"날짜": date_label, "노선": "", "방향": "", **dict.fromkeys(SLOTS, "")}
        return pd.DataFrame([blank])

    records = []
    for cells in _build_grid(body_rows):
        # Default layout: [line, direction, first train-08, 08-10, 10-18, 18-22, 22-last train]
        if len(cells) == 6:
            line, *bucket_cells = cells
            dirn = ""
        else:
            # Longer rows are truncated to seven cells, shorter ones padded.
            line, dirn, *bucket_cells = (cells + [""] * 7)[:7]

        record = {"날짜": date_label, "노선": _clean(line), "방향": _clean(dirn)}
        bucket_texts = [_clean(cell_text) for cell_text in bucket_cells]
        record.update((slot, bucket_texts[bucket_for_slot(slot)]) for slot in SLOTS)
        records.append(record)
    return pd.DataFrame(records)


# ------------------------------
# 날짜 이동: 직접 진입 우선, 클릭 보조
# ------------------------------
def date_list_yyyy_mm_dd(n_days=8):
    """Return ``n_days`` "YYYY-MM-DD" strings, starting today (Asia/Seoul) and going back one day at a time."""
    seoul_today = datetime.now(ZoneInfo("Asia/Seoul")).date()
    labels = []
    for days_back in range(n_days):
        day = seoul_today - timedelta(days=days_back)
        labels.append(day.strftime("%Y-%m-%d"))
    return labels

def try_open_with_query(driver, base_url: str, date_str: str, sleep_after=1.2) -> bool:
    """Open ``base_url`` with ``?indate=<date_str>`` appended and report whether a table is present.

    Args:
        driver: WebDriver used for navigation.
        base_url: URL to which the query string is appended.
        date_str: Date text placed after ``indate=``.
        sleep_after: Seconds to sleep after navigation.

    Returns:
        True when at least one ``table`` element is found afterwards; False
        when none is found or any exception occurs along the way. Only the
        presence of a table is checked, not which date it shows.
    """
    try:
        target_url = "{}?indate={}".format(base_url, date_str)
        driver.get(target_url)
        time.sleep(sleep_after)
        # Cheap sanity check: is there any table on the page at all?
        found_tables = driver.find_elements(By.TAG_NAME, "table")
        return len(found_tables) > 0
    except Exception:
        return False

def click_date_fallback(driver, date_str: str, sleep_after=1.2) -> bool:
    """Click an element whose text contains ``date_str`` (anchor, button, span, then anything).

    The date may be wrapped in other text such as "금일 (YYYY-MM-DD)", so
    matching uses ``contains`` on the normalized text.

    Candidates are tried from most to least specific. The catch-all pattern
    is restricted to the innermost matching element: an element's
    normalized text includes the text of its descendants, so an
    unrestricted ``//*`` match returns the document root first, and
    clicking that does nothing while still reporting success.

    Args:
        driver: WebDriver showing the page to search.
        date_str: Date text to look for. It is embedded in a single-quoted
            XPath literal, so it must not contain a single quote.
        sleep_after: Seconds to sleep after a successful click.

    Returns:
        True once a JavaScript click has been issued on a matching element,
        False when no candidate could be found or clicked.
    """
    has_date = f"contains(normalize-space(), '{date_str}')"
    xpaths = [
        f"//a[{has_date}]",
        f"//button[{has_date}]",
        f"//span[{has_date}]",
        # Last resort: any element, but only the deepest one holding the date.
        f"//*[{has_date} and not(.//*[{has_date}])]",
    ]
    for xp in xpaths:
        try:
            elem = driver.find_element(By.XPATH, xp)
            driver.execute_script("arguments[0].click();", elem)
            time.sleep(sleep_after)
            return True
        except Exception:
            # Deliberate best-effort: fall through to the next candidate.
            continue
    return False

def find_delay_table(driver):
    """Pick the delay table on the current page using a simple text heuristic.

    Returns the first ``table`` element whose text contains both "노선" and
    "22시~막차"; if none matches, the first table on the page is returned.

    Raises:
        RuntimeError: If the page has no ``table`` element at all.
    """
    candidates = driver.find_elements(By.TAG_NAME, "table")
    if not candidates:
        raise RuntimeError("표(table)가 보이지 않습니다.")

    def looks_like_delay_table(table_el):
        # Header keywords serve as the identifying clue.
        text = table_el.text or ""
        return "노선" in text and "22시~막차" in text

    return next(
        (table_el for table_el in candidates if looks_like_delay_table(table_el)),
        candidates[0],
    )


# ------------------------------
# 메인
# ------------------------------
def scrape_korail_simple_slots(
    url="https://info.korail.com/mbs/www/neo/delay/delaylist.jsp",
    output_csv="csv/korail_delay_30min.csv",
    headless=True,
    sleep_after_click=1.3,
    n_days=8,
):
    """Scrape the delay table for the most recent days and save them as one CSV.

    For each date from ``date_list_yyyy_mm_dd(n_days)`` the page is opened
    directly through ``try_open_with_query``; if that fails, the base page
    is opened and ``click_date_fallback`` is tried. The table chosen by
    ``find_delay_table`` is parsed with ``parse_table_to_slots``, and all
    results are concatenated and written to ``output_csv`` (UTF-8 with BOM,
    no index).

    Args:
        url: Base page URL.
        output_csv: Destination CSV path. Missing parent directories are
            created before writing.
        headless: Forwarded to ``build_driver``.
        sleep_after_click: Seconds to sleep after each navigation or click.
        n_days: Number of days to collect, counting today (default 8).

    Notes:
        A date whose page or table cannot be obtained is reported and
        skipped, so tables already collected are still written.
    """
    import os  # function-scope: only needed to prepare the output directory
    from selenium.common.exceptions import WebDriverException

    # Build the date list before launching Chrome: if it raised after the
    # driver was created but outside the try block, the browser would leak.
    dates = date_list_yyyy_mm_dd(n_days)
    drv = build_driver(headless=headless)
    all_df = []
    try:
        for d in dates:
            # 1) Try to enter directly with ?indate=YYYY-MM-DD.
            ok = try_open_with_query(drv, url, d, sleep_after_click)
            # 2) On failure, open the base page and click the date text.
            if not ok:
                drv.get(url)
                time.sleep(1.0)
                ok = click_date_fallback(drv, d, sleep_after_click)
                if not ok:
                    print(f"[경고] {d}: 날짜 이동 실패 → 건너뜀")
                    continue

            # 3) Locate the table (one retry), then parse it.
            try:
                try:
                    table_el = find_delay_table(drv)
                except (RuntimeError, WebDriverException):
                    time.sleep(0.6)
                    table_el = find_delay_table(drv)
                html = table_el.get_attribute("outerHTML")
            except (RuntimeError, WebDriverException) as exc:
                # One bad date must not discard everything collected so far.
                print(f"[경고] {d}: 표 수집 실패 → 건너뜀 ({exc})")
                continue

            df_slots = parse_table_to_slots(html, date_label=d)
            all_df.append(df_slots)
            print(f"[수집] {d} 행수={len(df_slots)}")

        # 4) Save
        if all_df:
            final = pd.concat(all_df, ignore_index=True)
        else:
            final = pd.DataFrame(columns=["날짜", "노선", "방향"] + SLOTS)

        # to_csv does not create missing directories such as "csv/".
        if isinstance(output_csv, (str, os.PathLike)):
            out_dir = os.path.dirname(os.fspath(output_csv))
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

        final.to_csv(output_csv, index=False, encoding="utf-8-sig")
        print(f"[저장] {output_csv} (총 행수={len(final)})")

    finally:
        # Best-effort shutdown. Catch Exception rather than using a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        try:
            drv.quit()
        except Exception:
            pass


if __name__ == "__main__":
    # Pass headless=False to watch the browser while debugging.
    cli_options = {"headless": True, "sleep_after_click": 1.5}
    scrape_korail_simple_slots(**cli_options)


[수집] 2025-09-17 행수=22
[수집] 2025-09-16 행수=22
[수집] 2025-09-15 행수=22
[수집] 2025-09-14 행수=22
[수집] 2025-09-13 행수=22
[수집] 2025-09-12 행수=22
[수집] 2025-09-11 행수=22
[수집] 2025-09-10 행수=22
[저장] csv/korail_delay_30min.csv (총 행수=176)
