# 1. Settings

In [None]:
# Sanity check: confirm Selenium is importable and print its installed version
import selenium
print(selenium.__version__)

In [None]:
# selenium으로 키를 조작하기 위한 import
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ETC
import time # to wait not to seem as "bot"
import sys
import chromedriver_autoinstaller
import csv

# 2. 함수 정의
크롤링 페이지 : https://www.president.go.kr/president/speeches

In [None]:
# Click the button that moves the pagination bar to the next section
def click_section_button():
    """Click the section button of the pagination bar.

    Waits up to 10 seconds for the button at the section XPath to become
    clickable, scrolls it into view, pauses one second, and clicks it via
    JavaScript. Operates on the module-level ``driver``.

    Raises:
        TimeoutException: re-raised, after printing a message, when a
            timeout occurs while locating or clicking the button.
    """
    locator = (By.XPATH, '//*[@id="contents"]/article/div/div[2]/button[3]')
    try:
        # Wait at most 10 seconds for the button to be clickable.
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(locator)
        )
        driver.execute_script("arguments[0].scrollIntoView();", button)
        time.sleep(1)
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", button)
    except TimeoutException:
        print("Section button not found")
        raise  # let the caller stop the crawl

In [None]:
# Click a page-number button in the pagination bar
def click_page_button(page_num):
    """Click the pagination button that corresponds to ``page_num``.

    The button is addressed by its position in the pagination list:
    ``page_num`` is reduced modulo 5, with a remainder of 0 mapped to
    position 5, so the position is always the ``li[1]``..``li[5]`` slot
    (presumably because the bar shows five page buttons per section —
    confirm against the site). Operates on the module-level ``driver``.

    Args:
        page_num: Page number to open.

    Raises:
        TimeoutException: re-raised, after printing a message, when a
            timeout occurs while locating or clicking the button.
    """
    # A remainder of 0 means the last (fifth) slot of the current section.
    slot = page_num % 5 or 5
    locator = (
        By.XPATH,
        f'//*[@id="contents"]/article/div/div[2]/ul/li[{slot}]/button',
    )
    try:
        # Wait at most 10 seconds for the button to be clickable.
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(locator)
        )
        driver.execute_script("arguments[0].scrollIntoView();", button)
        time.sleep(1)
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", button)
    except TimeoutException:
        print(f"Page button {page_num} not found")
        raise  # let the caller stop the crawl

In [None]:
def crawl_speech():
    """Scrape the speech page currently open in ``driver``.

    Reads the title, subtitle, date and speech body from fixed XPaths,
    appends them as one dict to the module-level ``speech_data`` list,
    prints a progress line, and increments the module-level ``idx``.

    A missing element or a wait timeout is reported with ``print`` and
    swallowed; in that case nothing is appended and ``idx`` is unchanged.
    """
    global idx

    title_xpath = '//*[@id="contents"]/article/div[2]/div/dl/dt/div[1]/p'
    subtitle_xpath = '//*[@id="contents"]/article/div[2]/div/dl/dd/div/div[2]/p'
    date_xpath = '//*[@id="contents"]/article/div[2]/div/dl/dt/div[2]/span'
    body_xpath = '//*[@id="contents"]/article/div[2]/div/dl/dd/div/div[3]'

    try:
        wait = WebDriverWait(driver, 10)  # each until() waits at most 10 seconds
        title_el = wait.until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        )
        subtitle_el = driver.find_element(By.XPATH, subtitle_xpath)
        date_el = driver.find_element(By.XPATH, date_xpath)
        body_el = wait.until(
            EC.presence_of_element_located((By.XPATH, body_xpath))
        )

        # Scroll to the speech body and pause before reading the texts.
        driver.execute_script("arguments[0].scrollIntoView();", body_el)
        time.sleep(1)

        record = {
            "index": idx,
            "title": title_el.text,
            "subtitle": subtitle_el.text,
            "date": date_el.text,
            "speaker": "윤석열",
            "speech": body_el.text,
        }
        speech_data.append(record)
        print(f"{idx}. {record['title']}")
        idx += 1
    except NoSuchElementException as err:
        print(f"An element was not found: {err}")
    except TimeoutException as err:
        print(f"An element load timed out: {err}")

# 3. 크롤링 (윤석열 대통령)
: 에러 방지를 위해 index 조절하며 나누어서 수행하는 것이 좋음.

In [None]:
# Start a Chrome browser session; the crawling functions and cells in this
# notebook use this module-level `driver`
driver = webdriver.Chrome()

In [None]:
# Reset the collected records and the running record number.
speech_data = []
idx = 1

try:
    # Load the speech list page from scratch.
    driver.get('https://www.president.go.kr/president/speeches')
    time.sleep(2)

    for page_num in range(1, 41):  # adjust this range to change which pages are crawled
        click_page_button(page_num)
        for title_num in range(1, 6):  # visits list entries li[1]..li[5] of the current page
            if page_num == 1 and title_num == 1:
                # On the very first page one extra entry is addressed by its
                # own XPath outside the <ul> list (presumably a featured
                # speech — confirm against the site); crawl it before the
                # regular list entries.
                first_click = driver.find_element(By.XPATH, '//*[@id="contents"]/article/div/div[1]/div[2]/a/div[2]/p[1]/span')
                first_click.click()
                time.sleep(2)
                crawl_speech()
                driver.back()
                time.sleep(2)
            try:
                rest_click = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f'//*[@id="contents"]/article/div/div[1]/ul/li[{title_num}]/div/a/p[1]')))
                driver.execute_script("arguments[0].scrollIntoView();", rest_click)
                time.sleep(1)
                driver.execute_script("arguments[0].click();", rest_click)
                time.sleep(2)
                crawl_speech()
                driver.back()  # return to the list page for the next entry
                time.sleep(2)
            except Exception as e:
                # Best effort: report the failure, give up on the rest of
                # this page, and continue with the next page.
                print(f"An Error Occured: {str(e)}")
                break
        if page_num % 5 == 0:
            # Every fifth page is followed by a click on the section button
            # before the next page button is looked up.
            click_section_button()
finally:
    # Always close the browser, even when a page/section button times out
    # and the exception propagates out of the loop.
    driver.quit()

print("크롤링 완료!")

# 4. CSV로 저장

In [None]:
# Output location (a folder inside Google Drive); change the folder to your own
folder_path = '/content/drive/MyDrive/suyun_speech_corpus/corpus/President_Office'
file_path = folder_path + '/president_office_crawling.csv'

# Column order of the CSV file; matches the keys of each record in speech_data
fieldnames = ["index", "title", "subtitle", "date", "speaker", "speech"]

# Write the header followed by every collected record
with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(speech_data)

print(f"SAVE COMPLETED: {file_path}")  # author's note: 444 items