In [2]:
pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import pandas as pd

In [None]:
# Listing-page URL template; "{}" is replaced with the page number.
base_url_pattern = "https://www.tesat.or.kr/bbs.frm.list/tesat_study?&page={}&s_cateno=1"
# Browser-like User-Agent sent with every listing request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}
# Seconds to wait for the server before giving up on a single request.
request_timeout = 10

# Every matching link from every crawled listing page, in crawl order.
all_post_links = []

# Crawl listing pages 2 through 125.
# NOTE(review): page 1 is never requested — confirm that is intentional.
for page in range(2, 126):
    url = base_url_pattern.format(page)
    try:
        # Without a timeout a stalled connection would block the crawl forever;
        # raise_for_status() keeps HTTP error pages from being parsed as listings.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # One bad page should not discard the links already collected.
        print(f"페이지 {page} 요청 실패: {e}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep every anchor whose href mentions the board name, resolved against
    # the listing URL so relative hrefs become absolute.
    post_links = [
        urljoin(url, a_tag['href'])
        for a_tag in soup.find_all('a', href=True)
        if "tesat_study" in a_tag['href']
    ]
    all_post_links.extend(post_links)

    # Report what this page contributed.
    print(f"페이지 {page}에서 {len(post_links)}개의 링크 수집")
    for idx, link in enumerate(post_links, start=1):
        print(f"  {idx}: {link}")

# Final tally across all pages.
print(f"총 {len(all_post_links)}개의 링크를 수집")


페이지 2에서 34개의 링크 수집
  1: https://www.tesat.or.kr/bbs.frm.list/tesat_study
  2: https://www.tesat.or.kr/bbs.frm.list/tesat_study
  3: https://www.tesat.or.kr/bbs.frm.list/tesat_study?s_cateno=1
  4: https://www.tesat.or.kr/bbs.frm.list/tesat_study?s_cateno=5
  5: https://www.tesat.or.kr/bbs.frm.list/tesat_study?s_cateno=4
  6: https://www.tesat.or.kr/bbs.frm.list/tesat_study?s_cateno=3
  7: https://www.tesat.or.kr/bbs.frm.list/tesat_study?s_cateno=2
  8: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23931
  9: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23896
  10: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23870
  11: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23850
  12: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23834
  13: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23833
  14: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23818
  15: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23815
  16: https://www.tesat.or.kr/bbs.frm.

In [5]:
# 링크에 no 있는거만 필터링
import re
def filter_links(urls):
    """Return the URLs that carry a numeric ``no`` query parameter.

    A URL is kept when it contains ``no=<digits>`` as a query parameter of its
    own (directly after ``?`` or ``&``) and does not contain an
    ``s_cateno=<digits>`` parameter. Input order and duplicates are preserved.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list of the URLs that passed both checks.
    """
    # Anchor on '?' / '&' so parameters that merely end in "no"
    # (e.g. "pageno=3") are not mistaken for the post number.
    post_number = re.compile(r'[?&]no=\d+')
    category = re.compile(r's_cateno=\d+')
    return [url for url in urls if post_number.search(url) and not category.search(url)]

# Keep only the URLs that pass filter_links. The crawl records every matching
# anchor it sees, so the same post can appear more than once; dict.fromkeys
# drops the repeats while preserving first-seen order, so each post is
# fetched and stored only once downstream.
filtered_links = list(dict.fromkeys(filter_links(all_post_links)))

# Print the final list of links.
print("\n최종 URL 결과:")
for idx, link in enumerate(filtered_links, start=1):
    print(f"{idx}: {link}")

print(f"\n유효 링크 개수: {len(filtered_links)}개")



최종 URL 결과:
1: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23931
2: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23896
3: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23870
4: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23850
5: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23834
6: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23833
7: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23818
8: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23815
9: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23814
10: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23803
11: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23789
12: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23773
13: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23761
14: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23751
15: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23710
16: https://www.tesat.or.kr/bbs.frm.view/tesat_study?no=23703
17: h

In [None]:
# Download every post and split its body paragraphs into question,
# explanation and answer text. One dict per successfully fetched post.
data = []
for url in filtered_links:
    try:
        # Send the same User-Agent as the listing crawl, bound the wait, and
        # treat HTTP error responses as failures instead of parsing them.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # NOTE(review): decoding is forced to EUC-KR — confirm the site really serves it.
        response.encoding = 'euc-kr'
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.select('.txt_view01 p')

        # Per-section accumulators, addressable by section label.
        questions = []
        explanations = []
        answers = []
        sections = {"문제": questions, "해설": explanations, "정답": answers}
        current_section = None  # label of the section the previous marker opened

        for p in paragraphs:
            text = p.get_text(strip=True)
            if not text:
                # Empty paragraphs would only add stray separators to the joined text.
                continue

            # A marker paragraph switches the section; the checks run in this
            # order, so a paragraph flagged as question or explanation is never
            # treated as an answer even if it also contains "정답".
            # NOTE(review): an answer embedded in an explanation paragraph is
            # therefore not extracted — confirm against real posts.
            if text.startswith("문제") or "[문제]" in text:
                current_section = "문제"
            elif text.startswith("해설") or "[해설]" in text:
                current_section = "해설"
            elif "정답" in text:
                current_section = "정답"

            # Paragraphs seen before the first marker belong to no section and are dropped.
            if current_section is not None:
                sections[current_section].append(text)

        # Answer post-processing: keep what follows "정답" (and an optional
        # ':' or '은'); answer-section paragraphs without "정답" are discarded.
        processed_answers = []
        for answer in answers:
            match = re.search(r"정답[:은]?(.+)", answer)
            if match:
                processed_answers.append(match.group(1).strip())

        data.append({
            "문제": " ".join(questions),
            "해설": " ".join(explanations),
            "정답": ", ".join(processed_answers)
        })
    except Exception as e:
        # Best effort: report the failing URL and continue with the next post.
        print(f"처리 중 오류 발생: {url}, 오류: {e}")


In [15]:
# Destination CSV for the scraped records.
output_file = "training_data.csv"

# newline="" leaves line endings to the csv module; utf-8-sig writes a BOM
# at the start of the file.
with open(output_file, mode="w", newline="", encoding="utf-8-sig") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=["문제", "해설", "정답"])
    csv_writer.writeheader()
    # One CSV row per scraped record.
    for record in data:
        csv_writer.writerow(record)

print("저장 완료")

저장 완료


In [16]:
# Data clean-up: reload the CSV, drop rows that are empty in every column,
# and write the result back over the same file.
input_file = "training_data.csv"
df = pd.read_csv(input_file, encoding="utf-8-sig").dropna(how="all")
df.to_csv(input_file, index=False, encoding="utf-8-sig")