In [1]:
# Selenium 웹 자동화 관련 라이브러리
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# 데이터 처리용
import pandas as pd
import time

In [1]:
# Selenium 웹 자동화 관련 라이브러리
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# 데이터 처리용
import pandas as pd
import time

# KBO team batting records page (Basic1)
url = "https://www.koreabaseball.com/Record/Team/Hitter/Basic1.aspx"

# Seasons to collect: 2020 through 2025
years = list(range(2020, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# ------------------------------
# CSS selectors for the record table
# ------------------------------
# Defined once as constants so the selector is not retyped at every use.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"


def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The dropdown is identified by its options rather than by an element id,
    and it is looked up again on every call so that a reference obtained
    before a page refresh is never reused.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # This dropdown could not be read (for example it went stale);
            # move on to the next one. Unlike a bare ``except``, this does not
            # swallow KeyboardInterrupt or programming errors.
            continue
        if wanted & labels:
            return sel

    raise RuntimeError("연도 select를 찾지 못했습니다.")


def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    Two conditions are awaited through the module-level ``wait``:

    1. at least one body row is present;
    2. when ``prev_first_row_text`` is not ``None``, the stripped text of the
       first body row differs from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the second condition.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # Condition 1: a body row exists.
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        # Condition 2: the first row shows something new.
        wait.until(_first_row_differs)


def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    # ---------- header ----------
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    # ---------- body ----------
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])


def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Steps:
    1) find the season dropdown (looked up anew on every call);
    2) choose a baseline text that the refreshed table must differ from;
    3) select the year and wait for the table to refresh;
    4) read the table, retrying up to three times on stale elements.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)

            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")


# ==============================
# Collect every requested season
# ==============================
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)

        # Short pause between requests to go easy on the server
        time.sleep(0.5)

    # Stack the per-season frames into one
    result = pd.concat(all_df, ignore_index=True)
    result.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()


In [2]:
# ==============================
# Scrape KBO team batting detail records (Basic2)
# ==============================

# Team batting records page (Basic2)
url = "https://www.koreabaseball.com/Record/Team/Hitter/Basic2.aspx"

# Seasons to collect: 2020 through 2025
years = list(range(2020, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# Same table selectors as for the Basic1 page
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The lookup is repeated on every call.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # Unreadable dropdown (for example stale): try the next one.
            # Unlike a bare ``except``, KeyboardInterrupt is not swallowed.
            continue
        if wanted & labels:
            return sel
    raise RuntimeError("연도 select를 찾지 못했습니다.")

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    A body row must be present; when ``prev_first_row_text`` is not ``None``
    the stripped text of the first body row must also differ from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the comparison.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        wait.until(_first_row_differs)

def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)

            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Collect every requested season from the Basic2 page
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        # Short pause between requests
        time.sleep(0.5)

    # Stack the per-season frames into one
    result_ = pd.concat(all_df, ignore_index=True)
    result_.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()


In [3]:
# ==============================
# 팀 컬럼명 정규화 함수
# ==============================
# KBO 페이지별 / 연도별로
# 팀 컬럼명이 '팀명', '팀', '구단', 'TEAM' 등으로 다를 수 있기 때문에
# merge 전에 하나의 컬럼명으로 통일해준다.

def normalize_team_col(df):
    """Return ``df`` with its team-name column renamed to ``team``.

    The candidate names are checked in a fixed order and only the first one
    found is renamed, so at most one column changes. When none of them is
    present, ``df`` itself is returned unchanged.
    """
    candidates = ("팀명", "팀", "구단", "TEAM", "Team")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        return df
    return df.rename(columns={found: "team"})


# ==============================
# Unify the team column of Basic1 / Basic2
# ==============================
# Both frames need the same team column name before they can be merged.
result_basic1 = normalize_team_col(result)      # Basic1 result
result_basic2 = normalize_team_col(result_)     # Basic2 result


# ==============================
# Unify the dtype of the year column
# ==============================
# year is a merge key, so it is cast to int in both frames.
for _frame in (result_basic1, result_basic2):
    _frame["year"] = _frame["year"].astype(int)

In [4]:
# ==============================
# Merge keys
# ==============================
# Basic1 and Basic2 records are joined per season and team.
KEYS = ["year", "team"]


# ==============================
# Avoid column-name clashes (prefixes)
# ==============================
# Every column except the keys gets a prefix naming its source table.
# rename() returns a new frame, so the originals are left untouched.
b1 = result_basic1.rename(columns=lambda c: c if c in KEYS else f"h_b1_{c}")
b2 = result_basic2.rename(columns=lambda c: c if c in KEYS else f"h_b2_{c}")


# ==============================
# Merge
# ==============================
# Inner join on year + team: only combinations present in both tables are kept.
merged = b1.merge(b2, on=KEYS, how="inner")

# Preview the merged frame
merged.head()


Unnamed: 0,year,h_b1_순위,team,h_b1_AVG,h_b1_G,h_b1_PA,h_b1_AB,h_b1_R,h_b1_H,h_b1_2B,...,h_b2_IBB,h_b2_HBP,h_b2_SO,h_b2_GDP,h_b2_SLG,h_b2_OBP,h_b2_OPS,h_b2_MH,h_b2_RISP,h_b2_PH-BA
0,2020,1,두산,0.293,144,5776,5046,816,1477,263,...,20,68,796,132,0.427,0.365,0.792,142,0.291,0.246
1,2020,2,NC,0.291,144,5833,5102,888,1483,258,...,21,104,997,106,0.462,0.366,0.828,144,0.33,0.286
2,2020,3,KT,0.284,144,5762,5047,813,1432,238,...,20,52,1097,104,0.436,0.358,0.794,144,0.289,0.192
3,2020,4,LG,0.277,144,5681,4999,802,1384,253,...,21,75,969,115,0.428,0.349,0.777,144,0.31,0.223
4,2020,5,롯데,0.276,144,5669,4958,750,1366,252,...,18,55,875,148,0.408,0.353,0.761,144,0.283,0.216


In [5]:
# ==============================
# Row-count check of the merge
# ==============================
# Print the row counts of both inputs and of the merged frame so that lost
# rows are easy to spot.
for _label, _frame in (
    ("basic1 rows:", result_basic1),
    ("basic2 rows:", result_basic2),
    ("merged rows:", merged),
):
    print(_label, len(_frame))


# ==============================
# Duplicate check on (year, team)
# ==============================
# Each (year, team) pair is expected to occur once. A non-zero count points to
# a wrong merge key, duplicate rows in the source data, or a team-name
# normalisation problem.
dup = merged.duplicated(subset=["year", "team"]).sum()
print("duplicates (year,team):", dup)

basic1 rows: 60
basic2 rows: 60
merged rows: 60
duplicates (year,team): 0


In [5]:
from pathlib import Path

In [6]:
from pathlib import Path

# ==============================
# Output directory for raw data
# ==============================
# Raw team-level data is written to ../data/raw/kbo/team.
RAW_DIR = Path("../data/raw/kbo/team")

# parents=True creates missing intermediate directories;
# exist_ok=True makes the call a no-op when the directory already exists.
RAW_DIR.mkdir(parents=True, exist_ok=True)


# ==============================
# Save the merged team batting data
# ==============================
# Merged Basic1 + Basic2 batting records for the 2020-2025 seasons.
save_path = RAW_DIR.joinpath("team_hitter_basic_2020_2025.csv")

# The index is left out; the original note gives utf-8-sig as the way to keep
# Korean text from being garbled.
merged.to_csv(save_path, index=False, encoding="utf-8-sig")

---

In [7]:
# ==============================
# Scrape KBO team pitching records (Basic1)
# ==============================
url = "https://www.koreabaseball.com/Record/Team/Pitcher/Basic1.aspx"

# Seasons to collect: 2020 through 2025
years = list(range(2020, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# CSS selectors for the record table
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The lookup is repeated on every call.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # Unreadable dropdown (for example stale): try the next one.
            # Unlike a bare ``except``, KeyboardInterrupt is not swallowed.
            continue
        if wanted & labels:
            return sel
    raise RuntimeError("연도 select를 찾지 못했습니다.")

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    A body row must be present; when ``prev_first_row_text`` is not ``None``
    the stripped text of the first body row must also differ from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the comparison.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        wait.until(_first_row_differs)

def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Collect every requested season from the pitching Basic1 page
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        # Short pause between requests
        time.sleep(0.5)

    # Stack the per-season frames into one
    result = pd.concat(all_df, ignore_index=True)
    result.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()

In [9]:
# ==============================
# Scrape KBO team pitching records (Basic2)
# ==============================
url = "https://www.koreabaseball.com/Record/Team/Pitcher/Basic2.aspx"

# Seasons to collect: 2020 through 2025
years = list(range(2020, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# CSS selectors for the record table
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The lookup is repeated on every call.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # Unreadable dropdown (for example stale): try the next one.
            # Unlike a bare ``except``, KeyboardInterrupt is not swallowed.
            continue
        if wanted & labels:
            return sel
    raise RuntimeError("연도 select를 찾지 못했습니다.")

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    A body row must be present; when ``prev_first_row_text`` is not ``None``
    the stripped text of the first body row must also differ from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the comparison.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        wait.until(_first_row_differs)

def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)

            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Collect every requested season from the pitching Basic2 page
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        # Short pause between requests
        time.sleep(0.5)

    # Stack the per-season frames into one
    result_ = pd.concat(all_df, ignore_index=True)
    result_.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()


In [10]:

def normalize_team_col(df):
    """Return ``df`` with its team-name column renamed to ``team``.

    The candidate names are checked in a fixed order and only the first one
    found is renamed. When none of them is present, ``df`` itself is returned
    unchanged.
    """
    candidates = ("팀명", "팀", "구단", "TEAM", "Team")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        return df
    return df.rename(columns={found: "team"})

# Give both pitching frames the same team column name
result_basic1 = normalize_team_col(result)     # Basic1 result
result_basic2 = normalize_team_col(result_)    # Basic2 result

# Cast the year merge key to int in both frames
for _frame in (result_basic1, result_basic2):
    _frame["year"] = _frame["year"].astype(int)


In [11]:
# Join the two pitching tables per season and team
KEYS = ["year", "team"]

# Every column except the keys gets a prefix naming its source table.
# rename() returns a new frame, so the originals are left untouched.
# NOTE(review): the "h_" prefix matches the batting cell although this cell
# appears to hold pitching data; it is kept as-is because the saved CSV uses
# these column names - confirm before renaming.
b1 = result_basic1.rename(columns=lambda c: c if c in KEYS else f"h_b1_{c}")
b2 = result_basic2.rename(columns=lambda c: c if c in KEYS else f"h_b2_{c}")

# Inner join: only (year, team) pairs present in both tables are kept
merged = b1.merge(b2, on=KEYS, how="inner")
merged.head()

Unnamed: 0,year,h_b1_순위,team,h_b1_ERA,h_b1_G,h_b1_W,h_b1_L,h_b1_SV,h_b1_HLD,h_b1_WPCT,...,h_b2_TBF,h_b2_NP,h_b2_AVG,h_b2_2B,h_b2_3B,h_b2_SAC,h_b2_SF,h_b2_IBB,h_b2_WP,h_b2_BK
0,2020,1,두산,4.31,144,79,61,23,45,0.564,...,5681,22408,0.276,258,25,48,41,22,52,4
1,2020,2,LG,4.37,144,79,61,29,67,0.564,...,5655,21725,0.27,240,17,38,56,12,49,4
2,2020,3,키움,4.39,144,80,63,42,81,0.559,...,5556,21349,0.27,208,15,50,39,16,51,7
3,2020,4,KT,4.54,144,81,62,33,71,0.566,...,5680,21496,0.271,227,9,45,48,20,75,1
4,2020,5,NC,4.58,144,83,55,31,88,0.601,...,5690,22362,0.26,199,15,53,35,10,65,7


In [12]:
# Row counts of both inputs and of the merged frame
for _label, _frame in (
    ("basic1 rows:", result_basic1),
    ("basic2 rows:", result_basic2),
    ("merged rows:", merged),
):
    print(_label, len(_frame))

# Each (year, team) pair is expected to occur once in the merged frame
dup = merged.duplicated(subset=["year", "team"]).sum()
print("duplicates (year,team):", dup)

basic1 rows: 60
basic2 rows: 60
merged rows: 60
duplicates (year,team): 0


In [13]:
# Output directory for raw team-level data (created when missing)
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Write the merged pitching records without the index, encoded as utf-8-sig
save_path = RAW_DIR.joinpath("team_pitcher_basic_2020_2025.csv")
merged.to_csv(save_path, index=False, encoding="utf-8-sig")

---

In [7]:
# ==============================
# Scrape KBO team fielding records
# ==============================
url = "https://www.koreabaseball.com/Record/Team/Defense/Basic.aspx"

# Seasons to collect: 2021 through 2025
years = list(range(2021, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# CSS selectors for the record table
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The lookup is repeated on every call.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # Unreadable dropdown (for example stale): try the next one.
            # Unlike a bare ``except``, KeyboardInterrupt is not swallowed.
            continue
        if wanted & labels:
            return sel
    raise RuntimeError("연도 select를 찾지 못했습니다.")

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    A body row must be present; when ``prev_first_row_text`` is not ``None``
    the stripped text of the first body row must also differ from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the comparison.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        wait.until(_first_row_differs)

def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)

            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Collect every requested season from the fielding page
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        # Short pause between requests
        time.sleep(0.5)

    # Stack the per-season frames into one
    result = pd.concat(all_df, ignore_index=True)
    result.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()

In [8]:
# Output directory for raw team-level data (created when missing)
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Stack the per-season fielding frames collected in all_df
all_result = pd.concat(all_df, ignore_index=True)

# Write the result without the index, encoded as utf-8-sig
save_path = RAW_DIR.joinpath("team_defense_basic_2021_2025.csv")
all_result.to_csv(save_path, index=False, encoding="utf-8-sig")

---

In [16]:
# ==============================
# Scrape KBO team baserunning records
# ==============================
url = "https://www.koreabaseball.com/Record/Team/Runner/Basic.aspx"

# Seasons to collect: 2020 through 2025
years = list(range(2020, 2026))

# Start Chrome, maximise the window and open the page
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Explicit wait helper with a 15-second timeout
wait = WebDriverWait(driver, 15)

# CSS selectors for the record table
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season (year) dropdown on the current page.

    Every ``<select>`` element is inspected, and the first one whose option
    labels include any entry of the module-level ``years`` list is returned.
    The lookup is repeated on every call.

    Returns:
        Select: wrapper around the matching ``<select>`` element.

    Raises:
        RuntimeError: if no dropdown offers any of the requested years.
    """
    # Local import keeps this function self-contained. WebDriverException is
    # the base class of Selenium errors such as StaleElementReferenceException
    # and UnexpectedTagNameException.
    from selenium.common.exceptions import WebDriverException

    wanted = {str(y) for y in years}
    for el in driver.find_elements(By.CSS_SELECTOR, "select"):
        try:
            sel = Select(el)
            labels = {o.text.strip() for o in sel.options}
        except WebDriverException:
            # Unreadable dropdown (for example stale): try the next one.
            # Unlike a bare ``except``, KeyboardInterrupt is not swallowed.
            continue
        if wanted & labels:
            return sel
    raise RuntimeError("연도 select를 찾지 못했습니다.")

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has rows and, optionally, new content.

    A body row must be present; when ``prev_first_row_text`` is not ``None``
    the stripped text of the first body row must also differ from it.

    Args:
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` to skip the comparison.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)
    wait.until(EC.presence_of_element_located(row_locator))

    if prev_first_row_text is not None:

        def _first_row_differs(_):
            """Return True once the first row no longer shows the old text."""
            try:
                current_text = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # The row was replaced while being read; poll again.
                return False
            return current_text != prev_first_row_text

        wait.until(_first_row_differs)

def read_table_as_df():
    """Convert the record table currently shown in the browser to a DataFrame.

    Header and body elements are looked up afresh on every call. Blank header
    cells are named ``col_<index>``; every value is kept as stripped text.

    Returns:
        pandas.DataFrame: one row per body ``<tr>`` that contains ``<td>``
        cells. When there is no such row, an empty frame carrying the header
        names is returned.
    """
    header_cells = driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)
    headers = [th.text.strip() or f"col_{i}" for i, th in enumerate(header_cells)]

    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:
            # Rows without any <td> carry no record and would break the
            # column count below.
            data.append(cells)

    if not data:
        # An empty body used to fail with IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Trim the header list to the width of the first data row.
    return pd.DataFrame(data, columns=headers[:len(data[0])])

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and scrape the refreshed table.

    Args:
        year: Season to select; it must match an option label of the dropdown.
        prev_first_row_text: First-row text returned by the previous call, or
            ``None`` on the first call.

    Returns:
        tuple: ``(df, first_row_text)`` where ``df`` has a leading ``year``
        column and ``first_row_text`` is the stripped text of the first body
        row, meant to be passed to the next call.

    Raises:
        RuntimeError: if the table is still stale after three read attempts.
    """
    target = str(year)
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # The selection will not change, so there is no refresh to wait for;
        # waiting for different text would only end in a timeout.
        baseline = None
    elif prev_first_row_text is None:
        # First call: remember what the table shows before switching, so the
        # wait below cannot return while the old season is still displayed.
        shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
        baseline = shown[0].text.strip() if shown else ""
    else:
        baseline = prev_first_row_text

    year_select.select_by_visible_text(target)
    wait_table_refresh(baseline)

    # The table can still be replaced while it is being read; retry up to 3 times.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)

            # Kept so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Collect every requested season from the baserunning page
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        # Short pause between requests
        time.sleep(0.5)

    # Stack the per-season frames into one
    result = pd.concat(all_df, ignore_index=True)
    result.head()
finally:
    # Close the browser even when scraping fails, so no Chrome process is
    # left running.
    driver.quit()


In [17]:
# Output directory for raw team-level data (created when missing)
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Stack the per-season baserunning frames collected in all_df
all_result = pd.concat(all_df, ignore_index=True)

# Write the result without the index, encoded as utf-8-sig
save_path = RAW_DIR.joinpath("team_baserunning_basic_2020_2025.csv")
all_result.to_csv(save_path, index=False, encoding="utf-8-sig")