In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

import pandas as pd
import time

# Team hitter "Basic1" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Hitter/Basic1.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # Always close the browser, even when a season fails part-way through.
    driver.quit()

result = pd.concat(all_df, ignore_index=True)
result.head()

In [None]:
# Team hitter "Basic2" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Hitter/Basic2.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # Always close the browser, even when a season fails part-way through.
    driver.quit()

result_ = pd.concat(all_df, ignore_index=True)
result_.head()


In [3]:
# 예: 팀 컬럼명이 '팀명'일 수도, '팀'일 수도 있어서 통일
def normalize_team_col(df):
    """Return ``df`` with its team-name column renamed to ``"team"``.

    The candidate names are tried in a fixed order and only the first one
    present is renamed. When none is present, ``df`` itself is returned.
    """
    candidates = ("팀명", "팀", "구단", "TEAM", "Team")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        return df
    return df.rename(columns={found: "team"})

# Give both scraped frames the same team column name.
result_basic1 = normalize_team_col(result)     # Basic1 result
result_basic2 = normalize_team_col(result_)    # Basic2 result

# Cast year to int in both frames as well, so the merge keys share a dtype.
result_basic1["year"] = result_basic1["year"].astype(int)
result_basic2["year"] = result_basic2["year"].astype(int)


In [4]:
# Join keys shared by both frames; every other column gets a source prefix.
KEYS = ["year", "team"]

# rename() returns a new frame, so the scraped frames stay untouched.
b1 = result_basic1.rename(columns=lambda c: c if c in KEYS else f"h_b1_{c}")
b2 = result_basic2.rename(columns=lambda c: c if c in KEYS else f"h_b2_{c}")

# Inner join: keep only (year, team) pairs present in both tables.
merged = pd.merge(b1, b2, on=KEYS, how="inner")
merged.head()

Unnamed: 0,year,h_b1_순위,team,h_b1_AVG,h_b1_G,h_b1_PA,h_b1_AB,h_b1_R,h_b1_H,h_b1_2B,...,h_b2_IBB,h_b2_HBP,h_b2_SO,h_b2_GDP,h_b2_SLG,h_b2_OBP,h_b2_OPS,h_b2_MH,h_b2_RISP,h_b2_PH-BA
0,2021,1,롯데,0.278,144,5726,5009,727,1393,266,...,19,54,969,114,0.399,0.356,0.755,144,0.286,0.253
1,2021,2,두산,0.268,144,5645,4900,738,1314,235,...,13,86,943,137,0.39,0.35,0.74,143,0.284,0.255
2,2021,3,삼성,0.267,144,5552,4836,712,1292,204,...,21,55,902,98,0.399,0.344,0.743,144,0.269,0.212
3,2021,4,KT,0.265,144,5627,4810,719,1276,219,...,27,58,1047,105,0.381,0.356,0.737,142,0.271,0.227
4,2021,5,SSG,0.261,144,5736,4899,755,1278,204,...,23,93,1056,106,0.421,0.353,0.774,143,0.261,0.259


In [5]:
# Row counts before and after the merge; rows lost by the inner join show up here.
for label, frame in (
    ("basic1 rows:", result_basic1),
    ("basic2 rows:", result_basic2),
    ("merged rows:", merged),
):
    print(label, len(frame))

# Count repeated (year, team) pairs in the merged frame.
dup = merged.duplicated(subset=["year", "team"]).sum()
print("duplicates (year,team):", dup)

basic1 rows: 50
basic2 rows: 50
merged rows: 50
duplicates (year,team): 0


In [6]:
from pathlib import Path

# Output directory for raw team-level data; created on demand.
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(exist_ok=True, parents=True)

# Write the merged hitter table; "utf-8-sig" prepends a BOM to the file.
save_path = RAW_DIR.joinpath("team_hitter_basic_2021_2025.csv")
merged.to_csv(save_path, encoding="utf-8-sig", index=False)


---

In [18]:
# Team pitcher "Basic1" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Pitcher/Basic1.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # This cell never closed its browser; always release it, even on failure.
    driver.quit()

result = pd.concat(all_df, ignore_index=True)
result.head()


RuntimeError: 연도 select를 찾지 못했습니다.

In [8]:
# Team pitcher "Basic2" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Pitcher/Basic2.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # This cell never closed its browser; always release it, even on failure.
    driver.quit()

result_ = pd.concat(all_df, ignore_index=True)
result_.head()


Unnamed: 0,year,순위,팀명,ERA,CG,SHO,QS,BSV,TBF,NP,AVG,2B,3B,SAC,SF,IBB,WP,BK
0,2021,1,LG,3.57,0,18,50,14,5453,21456,0.238,197,18,45,43,18,43,5
1,2021,2,KT,3.67,2,6,76,16,5397,20893,0.246,184,13,46,48,18,56,1
2,2021,3,두산,4.26,2,10,55,15,5631,22163,0.267,212,21,61,61,16,52,7
3,2021,4,삼성,4.3,2,14,66,13,5536,21926,0.266,230,24,45,36,13,58,3
4,2021,5,키움,4.31,1,7,57,16,5599,21670,0.264,223,7,55,48,27,59,4


In [9]:
# 예: 팀 컬럼명이 '팀명'일 수도, '팀'일 수도 있어서 통일
def normalize_team_col(df):
    """Return ``df`` with its team-name column renamed to ``"team"``.

    The candidate names are tried in a fixed order and only the first one
    present is renamed. When none is present, ``df`` itself is returned.
    """
    candidates = ("팀명", "팀", "구단", "TEAM", "Team")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        return df
    return df.rename(columns={found: "team"})

# Give both scraped frames the same team column name.
result_basic1 = normalize_team_col(result)     # Basic1 result
result_basic2 = normalize_team_col(result_)    # Basic2 result

# Cast year to int in both frames as well, so the merge keys share a dtype.
result_basic1["year"] = result_basic1["year"].astype(int)
result_basic2["year"] = result_basic2["year"].astype(int)


In [10]:
# Join keys shared by both frames; every other column gets a source prefix.
KEYS = ["year", "team"]

# NOTE(review): the "h_" prefix is the one used in the hitter cell, yet the
# frames in this section come from the Pitcher pages - confirm it is intended.
# rename() returns a new frame, so the scraped frames stay untouched.
b1 = result_basic1.rename(columns=lambda c: c if c in KEYS else f"h_b1_{c}")
b2 = result_basic2.rename(columns=lambda c: c if c in KEYS else f"h_b2_{c}")

# Inner join: keep only (year, team) pairs present in both tables.
merged = pd.merge(b1, b2, on=KEYS, how="inner")
merged.head()

Unnamed: 0,year,h_b1_순위,team,h_b1_ERA,h_b1_G,h_b1_W,h_b1_L,h_b1_SV,h_b1_HLD,h_b1_WPCT,...,h_b2_TBF,h_b2_NP,h_b2_AVG,h_b2_2B,h_b2_3B,h_b2_SAC,h_b2_SF,h_b2_IBB,h_b2_WP,h_b2_BK
0,2021,1,LG,3.57,144,72,58,32,81,0.554,...,5453,21456,0.238,197,18,45,43,18,43,5
1,2021,2,KT,3.67,144,76,59,33,74,0.563,...,5397,20893,0.246,184,13,46,48,18,56,1
2,2021,3,두산,4.26,144,71,65,28,61,0.522,...,5631,22163,0.267,212,21,61,61,16,52,7
3,2021,4,삼성,4.3,144,76,59,46,80,0.563,...,5536,21926,0.266,230,24,45,36,13,58,3
4,2021,5,키움,4.31,144,70,67,30,56,0.511,...,5599,21670,0.264,223,7,55,48,27,59,4


In [11]:
# Row counts before and after the merge; rows lost by the inner join show up here.
for label, frame in (
    ("basic1 rows:", result_basic1),
    ("basic2 rows:", result_basic2),
    ("merged rows:", merged),
):
    print(label, len(frame))

# Count repeated (year, team) pairs in the merged frame.
dup = merged.duplicated(subset=["year", "team"]).sum()
print("duplicates (year,team):", dup)

basic1 rows: 50
basic2 rows: 50
merged rows: 50
duplicates (year,team): 0


In [12]:
# Output directory for raw team-level data; created on demand.
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# This section merges the Pitcher pages, so it needs its own file name;
# the previous name was the hitter file's and overwrote that CSV.
save_path = RAW_DIR / "team_pitcher_basic_2021_2025.csv"
merged.to_csv(save_path, index=False, encoding="utf-8-sig")

---

In [13]:
# Team defense "Basic" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Defense/Basic.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # This cell never closed its browser; always release it, even on failure.
    driver.quit()

result = pd.concat(all_df, ignore_index=True)
result.head()


Unnamed: 0,year,순위,팀명,G,E,PKO,PO,A,DP,FPCT,PB,SB,CS,CS%
0,2021,1,SSG,144,102,14,3830,1409,123,0.981,12,105,46,30.5
1,2021,2,LG,144,91,4,3817,1481,147,0.983,12,98,40,29.0
2,2021,3,롯데,144,85,9,3813,1403,123,0.984,8,101,44,30.3
3,2021,4,두산,144,89,8,3808,1296,120,0.983,10,83,48,36.6
4,2021,5,KIA,144,110,9,3807,1368,139,0.979,10,92,43,31.9


In [14]:
# Output directory for raw team-level data; created on demand.
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(exist_ok=True, parents=True)

# Stack the per-season frames and write them; "utf-8-sig" prepends a BOM.
save_path = RAW_DIR.joinpath("team_defense_basic_2021_2025.csv")
all_result = pd.concat(all_df, ignore_index=True)
all_result.to_csv(save_path, encoding="utf-8-sig", index=False)

---

In [15]:
# Team baserunning ("Runner") "Basic" record page and the seasons to scrape from it.
url = "https://www.koreabaseball.com/Record/Team/Runner/Basic.aspx"
years = [2021, 2022, 2023, 2024, 2025]

# Launch Chrome and open the page; the helpers below use this global driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

# Shared explicit wait (15 s timeout) used by the helpers below.
wait = WebDriverWait(driver, 15)

# (Recommended) Pin down the record table once with exact CSS selectors.
TABLE_SEL = "#cphContents_cphContents_cphContents_udpContent > div.record_result > table"
TBODY_ROW_SEL = f"{TABLE_SEL} > tbody > tr"
THEAD_TH_SEL = f"{TABLE_SEL} > thead tr th"

def find_year_select():
    """Locate the season dropdown and return it wrapped in ``Select``.

    Polls every ``<select>`` on the page (using the shared ``wait``) until one
    has an option whose stripped text equals one of ``years``. Polling covers
    the case where the dropdown is not rendered yet or is being re-rendered.

    Returns:
        The first matching ``Select``.

    Raises:
        RuntimeError: If no matching dropdown appears before ``wait`` times out.
    """
    wanted = {str(y) for y in years}

    def _matching_select(_driver):
        for el in driver.find_elements(By.CSS_SELECTOR, "select"):
            try:
                candidate = Select(el)
                labels = {o.text.strip() for o in candidate.options}
            except StaleElementReferenceException:
                # Element was replaced mid-scan; the next poll sees the new one.
                continue
            if wanted & labels:
                return candidate
        return False

    try:
        return wait.until(_matching_select)
    except TimeoutException as exc:
        raise RuntimeError("연도 select를 찾지 못했습니다.") from exc

def wait_table_refresh(prev_first_row_text: str | None):
    """Block until the record table has been refreshed after a year change.

    First waits for at least one body row to be present. When
    ``prev_first_row_text`` is given, additionally waits until the first
    row's stripped text differs from it; with ``None`` only the presence
    check is performed.
    """
    row_locator = (By.CSS_SELECTOR, TBODY_ROW_SEL)

    # 1) A body row must exist in any case.
    wait.until(EC.presence_of_element_located(row_locator))

    # 2) With a baseline available, wait for the first row to change.
    if prev_first_row_text is not None:
        def _text_differs(_):
            try:
                current = driver.find_element(*row_locator).text.strip()
            except StaleElementReferenceException:
                # Row was swapped out while reading; poll again.
                return False
            return current != prev_first_row_text

        wait.until(_text_differs)

def read_table_as_df():
    """Read the record table currently on the page into a DataFrame.

    Header and body elements are looked up afresh on every call so that no
    stale element references are reused.

    Returns:
        A DataFrame of stripped cell strings. Column names come from the
        header cells (blank headers become ``col_<index>``); if a row has more
        cells than there are headers, the extra columns are named
        ``col_<index>`` as well. Rows without any ``<td>`` are skipped, and an
        empty table yields an empty DataFrame that still carries the headers.
    """
    # Header
    headers = [th.text.strip() for th in driver.find_elements(By.CSS_SELECTOR, THEAD_TH_SEL)]
    headers = [h if h else f"col_{i}" for i, h in enumerate(headers)]

    # Body
    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL):
        cells = [td.text.strip() for td in r.find_elements(By.TAG_NAME, "td")]
        if cells:  # ignore rows that contain no data cells
            data.append(cells)

    if not data:
        # Nothing to index into; avoid the IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # Size the columns to the widest row so a short first row cannot make
    # the constructor reject later, longer rows.
    width = max(len(row) for row in data)
    columns = headers[:width] + [f"col_{i}" for i in range(len(headers), width)]
    return pd.DataFrame(data, columns=columns)

def scrape_one_year(year: int, prev_first_row_text: str | None):
    """Select ``year`` in the season dropdown and return that season's table.

    Args:
        year: Season to select; matched against the dropdown's visible text.
        prev_first_row_text: First-row text of the previously scraped table,
            or ``None`` when nothing has been scraped yet.

    Returns:
        ``(df, first_row_text)``: the table with a leading ``year`` column and
        the first row's text, which the caller passes to the next call.

    Raises:
        RuntimeError: If the table keeps going stale across three read attempts.
    """
    target = str(year)
    # Re-locate the dropdown on every call; an old reference can go stale.
    year_select = find_year_select()

    if year_select.first_selected_option.text.strip() == target:
        # Selenium's Select only clicks options that are not yet selected, so
        # no refresh will happen; waiting for a change would just time out.
        wait_table_refresh(None)
    else:
        if prev_first_row_text is None:
            # First call: the initially displayed table is already present, so
            # a presence-only wait would return before the refresh completes.
            # Snapshot the current first row to have a baseline to compare to.
            wait_table_refresh(None)
            try:
                shown = driver.find_elements(By.CSS_SELECTOR, TBODY_ROW_SEL)
                prev_first_row_text = shown[0].text.strip() if shown else None
            except StaleElementReferenceException:
                prev_first_row_text = None
        year_select.select_by_visible_text(target)
        wait_table_refresh(prev_first_row_text)

    # Read the table, retrying a couple of times if elements go stale.
    for _ in range(3):
        try:
            df = read_table_as_df()
            df.insert(0, "year", year)
            # Returned so the next call can detect its own refresh.
            first_row_text = driver.find_element(By.CSS_SELECTOR, TBODY_ROW_SEL).text.strip()
            return df, first_row_text
        except StaleElementReferenceException:
            time.sleep(0.5)

    raise RuntimeError(f"{year}년 테이블 읽기 실패(계속 stale)")

# Scrape every season, collecting one DataFrame per year.
all_df = []
prev_text = None

try:
    for y in years:
        df_y, prev_text = scrape_one_year(y, prev_text)
        all_df.append(df_y)
        time.sleep(0.5)  # throttle requests to the site
finally:
    # This cell never closed its browser; always release it, even on failure.
    driver.quit()

result = pd.concat(all_df, ignore_index=True)
result.head()


Unnamed: 0,year,순위,팀명,G,SBA,SB,CS,SB%,OOB,PKO
0,2021,1,삼성,144,160,116,44,72.5,46,7
1,2021,2,KT,144,158,112,46,70.9,48,5
2,2021,3,한화,144,165,109,56,66.1,59,14
3,2021,4,NC,144,147,101,46,68.7,39,9
4,2021,5,SSG,144,143,100,43,69.9,42,3


In [16]:
# Output directory for raw team-level data; created on demand.
RAW_DIR = Path("../data/raw/kbo/team")
RAW_DIR.mkdir(exist_ok=True, parents=True)

# Stack the per-season frames and write them; "utf-8-sig" prepends a BOM.
save_path = RAW_DIR.joinpath("team_baserunning_basic_2021_2025.csv")
all_result = pd.concat(all_df, ignore_index=True)
all_result.to_csv(save_path, encoding="utf-8-sig", index=False)