In [7]:
import collections
import collections.abc
collections.Callable = collections.abc.Callable

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Create the Selenium driver
def create_driver():
    """Build a Chrome WebDriver with headless/container-friendly arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose options carry the
        ``--headless``, ``--no-sandbox`` and ``--disable-dev-shm-usage``
        arguments, added in that order.
    """
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Collect title + URL of the top N documents for a keyword
def get_article_urls_from_keyword(keyword, max_items=5):
    """Search encykorea.aks.ac.kr for *keyword* and collect the top results.

    Args:
        keyword: Text typed into the site's search box (element id
            ``keyword``).
        max_items: Maximum number of result entries to inspect.

    Returns:
        A list of ``(title, href)`` tuples, at most ``max_items`` long.
        Entries whose link or title element cannot be read are skipped.

    Raises:
        Any exception raised while loading the page or locating the search
        box/button propagates to the caller; the browser is still closed.
    """
    base_url = "https://encykorea.aks.ac.kr"
    driver = create_driver()
    # try/finally guarantees the browser process is shut down even when the
    # page load or one of the element lookups below raises.
    try:
        driver.get(base_url)
        time.sleep(1)  # pause after navigation before touching the page

        search_input = driver.find_element(By.ID, "keyword")
        search_input.clear()
        search_input.send_keys(keyword)
        driver.find_element(By.CLASS_NAME, "main-search").click()
        time.sleep(2)  # pause after submitting before reading the results

        items = driver.find_elements(By.CSS_SELECTOR, "ul.encyclopedia-list li.item")
        results = []
        for item in items[:max_items]:
            try:
                link_tag = item.find_element(By.CSS_SELECTOR, "a")
                title_tag = item.find_element(By.CSS_SELECTOR, ".title")
                href = link_tag.get_attribute("href")
                title = title_tag.text.strip()
                results.append((title, href))
            except Exception:
                # Best effort: skip entries that cannot be read. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate so the crawl can actually be interrupted.
                continue
        return results
    finally:
        driver.quit()

# Parse the article body
def parse_article_contents(title, url, keyword):
    """Download an article page and split it into titled sections.

    Args:
        title: Article title, copied into the result under ``"제목"``.
        url: Article URL to fetch (10 second timeout).
        keyword: Search keyword, copied into the result under ``"키워드"``.

    Returns:
        A dict that always contains ``"키워드"``, ``"제목"`` and ``"URL"``.
        On success it additionally maps each section title (text of the
        ``section-title`` element) to the section text (``section-body``
        element, joined with newlines). If the
        ``div.contents-detail-contents`` container is missing, ``"정의"`` is
        ``"본문 없음"``. If any exception occurs (network failure, HTTP error
        status, ...), ``"정의"`` holds the error message; the function does
        not raise.
    """
    base = {"키워드": keyword, "제목": title, "URL": url}
    try:
        res = requests.get(url, timeout=10)
        # Surface HTTP errors (404, 500, ...) as errors instead of parsing
        # the error page and reporting it as an article without a body.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        content = soup.find("div", class_="contents-detail-contents")
        if not content:
            return {**base, "정의": "본문 없음"}

        result = dict(base)
        for sec in content.find_all("div", class_="detail-section"):
            title_tag = sec.find(class_="section-title")
            body_tag = sec.find(class_="section-body")
            # Only sections that have both a heading and a body are kept.
            if title_tag and body_tag:
                key = title_tag.get_text(strip=True)
                result[key] = body_tag.get_text(separator="\n").strip()

        return result
    except Exception as e:
        return {**base, "정의": f"❌ 오류: {e}"}

# Full crawl over all keywords
def crawl_multiple_articles_per_keyword(keywords, max_items=5):
    """Crawl up to ``max_items`` articles for every keyword.

    Args:
        keywords: Iterable of search keywords.
        max_items: Maximum number of articles fetched per keyword.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed article. A keyword
        with no search results contributes a single placeholder row.
    """
    rows = []
    for term in keywords:
        print(f"🔍 [{term}] 문서 {max_items}개 검색 중...")
        found = get_article_urls_from_keyword(term, max_items=max_items)
        if found:
            for article_title, article_url in found:
                rows.append(parse_article_contents(article_title, article_url, term))
                time.sleep(1)  # pause between article downloads
        else:
            # Placeholder row so the keyword still shows up in the output.
            rows.append({"키워드": term, "제목": "❌ 문서 없음", "URL": "", "정의": "N/A"})
    return pd.DataFrame(rows)

# Example keywords to crawl
keywords = ["유관순", "청동기", "비파형 동검"]
df = crawl_multiple_articles_per_keyword(keywords=keywords, max_items=5)

# Write the collected rows to an Excel workbook and report the output file
df.to_excel(excel_writer="키워드별_5개문서_백과결과.xlsx", index=False)
print("✅ 저장 완료: 키워드별_5개문서_백과결과.xlsx")


🔍 [유관순] 문서 5개 검색 중...
🔍 [청동기] 문서 5개 검색 중...
🔍 [비파형 동검] 문서 5개 검색 중...
✅ 저장 완료: 키워드별_5개문서_백과결과.xlsx


In [10]:
import collections
import collections.abc
collections.Callable = collections.abc.Callable

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Create the Selenium driver
def create_driver():
    """Build a Chrome WebDriver with headless/container-friendly arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose options carry the
        ``--headless``, ``--no-sandbox`` and ``--disable-dev-shm-usage``
        arguments, added in that order.
    """
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Search the top N documents for a keyword
def get_article_urls_from_keyword(keyword, max_items=5):
    """Search encykorea.aks.ac.kr for *keyword* and collect the top results.

    Args:
        keyword: Text typed into the site's search box (element id
            ``keyword``).
        max_items: Maximum number of result entries to inspect.

    Returns:
        A list of ``(title, href)`` tuples, at most ``max_items`` long.
        Entries whose link or title element cannot be read are skipped.

    Raises:
        Any exception raised while loading the page or locating the search
        box/button propagates to the caller; the browser is still closed.
    """
    base_url = "https://encykorea.aks.ac.kr"
    driver = create_driver()
    # try/finally guarantees the browser process is shut down even when the
    # page load or one of the element lookups below raises.
    try:
        driver.get(base_url)
        time.sleep(1)  # pause after navigation before touching the page

        search_input = driver.find_element(By.ID, "keyword")
        search_input.clear()
        search_input.send_keys(keyword)
        driver.find_element(By.CLASS_NAME, "main-search").click()
        time.sleep(2)  # pause after submitting before reading the results

        items = driver.find_elements(By.CSS_SELECTOR, "ul.encyclopedia-list li.item")
        results = []
        for item in items[:max_items]:
            try:
                link_tag = item.find_element(By.CSS_SELECTOR, "a")
                title_tag = item.find_element(By.CSS_SELECTOR, ".title")
                href = link_tag.get_attribute("href")
                title = title_tag.text.strip()
                results.append((title, href))
            except Exception:
                # Best effort: skip entries that cannot be read. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate so the crawl can actually be interrupted.
                continue
        return results
    finally:
        driver.quit()

# Parse the article body
def parse_article_contents(title, url):
    """Download an article page and split it into titled sections.

    Args:
        title: Article title, copied into the result under ``"제목"``.
        url: Article URL to fetch (10 second timeout).

    Returns:
        A dict that always contains ``"제목"`` and ``"URL"``. On success it
        additionally maps each section title (text of the ``section-title``
        element) to the section text (``section-body`` element, joined with
        newlines). If the ``div.contents-detail-contents`` container is
        missing, ``"정의"`` is ``"본문 없음"``. If any exception occurs
        (network failure, HTTP error status, ...), ``"정의"`` holds the
        error message; the function does not raise.
    """
    base = {"제목": title, "URL": url}
    try:
        res = requests.get(url, timeout=10)
        # Surface HTTP errors (404, 500, ...) as errors instead of parsing
        # the error page and reporting it as an article without a body.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        content = soup.find("div", class_="contents-detail-contents")
        if not content:
            return {**base, "정의": "본문 없음"}

        result = dict(base)
        for sec in content.find_all("div", class_="detail-section"):
            title_tag = sec.find(class_="section-title")
            body_tag = sec.find(class_="section-body")
            # Only sections that have both a heading and a body are kept.
            if title_tag and body_tag:
                key = title_tag.get_text(strip=True)
                result[key] = body_tag.get_text(separator="\n").strip()

        return result
    except Exception as e:
        return {**base, "정의": f"❌ 오류: {e}"}

# Crawl articles driven by the rows of the source workbook
def crawl_articles_from_excel_rows(excel_path, max_keywords=5, articles_per_keyword=5):
    """Crawl encyclopedia articles for the keywords listed in an Excel file.

    Each row's ``"열4"`` cell is read as a comma-separated keyword list. The
    first ``max_keywords`` comma-separated entries are considered (blank
    entries are skipped after slicing), and up to ``articles_per_keyword``
    articles are fetched for each.

    Args:
        excel_path: Path of the workbook passed to ``pandas.read_excel``.
        max_keywords: Number of leading keyword entries used per row.
        articles_per_keyword: Maximum articles fetched per keyword.

    Returns:
        A ``pandas.DataFrame`` in which every row carries the source columns
        ``"열1"``–``"열4"``, the keyword, and the parsed article fields.
    """
    source = pd.read_excel(excel_path)
    collected = []

    for idx, row in source.iterrows():
        # Source columns copied onto every output row (missing ones -> "").
        row_info = {col: row.get(col, "") for col in ("열1", "열2", "열3", "열4")}

        raw_keywords = row_info["열4"]
        keywords = str(raw_keywords).split(',') if pd.notna(raw_keywords) else []

        for raw_kw in keywords[:max_keywords]:
            kw = raw_kw.strip()
            if not kw:
                continue

            print(f"🔍 ({idx}) 키워드 '{kw}' 문서 검색 중...")
            articles = get_article_urls_from_keyword(kw, articles_per_keyword)
            if not articles:
                # Placeholder row so the keyword still appears in the output.
                collected.append(
                    {**row_info, "키워드": kw, "제목": "❌ 문서 없음", "URL": "", "정의": "N/A"}
                )
                continue

            for title, url in articles:
                parsed = parse_article_contents(title, url)
                # Parsed fields are merged last, so they win on key clashes.
                collected.append({**row_info, "키워드": kw, **parsed})
                time.sleep(1)  # pause between article downloads

    return pd.DataFrame(collected)

# Run the crawl
excel_path = "split_4.xlsx"  # actual input file name
df_final = crawl_articles_from_excel_rows(excel_path, max_keywords=2, articles_per_keyword=5)

# Save the results; the message reports the file that was actually written
# (it previously named a different workbook than the one saved).
output_path = "4번.xlsx"
df_final.to_excel(output_path, index=False)
print(f"✅ 저장 완료: {output_path}")


🔍 (0) 키워드 '최윤덕' 문서 검색 중...
🔍 (0) 키워드 '김종서' 문서 검색 중...
🔍 (1) 키워드 '무역소' 문서 검색 중...


KeyboardInterrupt: 

In [12]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Selenium driver factory
def create_driver():
    """Build a Chrome WebDriver with headless/container-friendly arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose options carry the
        ``--headless`` (run without a window), ``--no-sandbox`` and
        ``--disable-dev-shm-usage`` arguments, added in that order.
    """
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Collect images from the media search for a keyword
def get_media_results(keyword, max_items=5):
    """Collect image titles and URLs from the encykorea media search.

    Args:
        keyword: Search keyword placed in the ``/Media/Search/`` URL path.
        max_items: Maximum number of result entries to inspect.

    Returns:
        A list of ``{"제목": title, "이미지 URL": url}`` dicts. An entry is
        skipped (with a printed message) when its title or ``.media-img``
        element cannot be read or when no ``url(...)`` value is found in the
        element's ``style`` attribute.

    Raises:
        Any exception raised while loading the page propagates to the
        caller; the browser is still closed.
    """
    import re
    from urllib.parse import quote

    # Percent-encode the keyword so characters such as spaces, '?' or '#'
    # cannot change the structure of the request URL.
    base_url = f"https://encykorea.aks.ac.kr/Media/Search/{quote(keyword)}"
    # Matches url("..."), url('...') and url(...); group 2 is the URL itself.
    url_pattern = re.compile(r"""url\(\s*(["']?)(.*?)\1\s*\)""")

    driver = create_driver()
    # try/finally guarantees the browser process is shut down on any error.
    try:
        driver.get(base_url)
        time.sleep(2)  # pause after navigation before reading the results

        items = driver.find_elements(By.CSS_SELECTOR, "ul.media-list li.item")
        results = []

        for item in items[:max_items]:
            try:
                title = item.find_element(By.CLASS_NAME, "title").text.strip()
                img_tag = item.find_element(By.CSS_SELECTOR, ".media-img")
                style_attr = img_tag.get_attribute("style")
                match = url_pattern.search(style_attr)
                if match is None:
                    # Previously a missing url("...") produced a garbage
                    # slice of the style string; report it instead.
                    raise ValueError(f"no url(...) in style: {style_attr!r}")
                results.append({"제목": title, "이미지 URL": match.group(2)})
            except Exception as e:
                print("❌ 항목 처리 실패:", e)

        return results
    finally:
        driver.quit()

# Image crawl driven by the rows of an Excel workbook
def crawl_media_from_excel(excel_path, max_keywords=3, images_per_keyword=5, max_rows=3):
    """Crawl media-search images for the keywords listed in an Excel file.

    Each row's ``"열4"`` cell is read as a comma-separated keyword list. The
    first ``max_keywords`` comma-separated entries are considered (blank
    entries are skipped after slicing), and up to ``images_per_keyword``
    images are collected for each.

    Args:
        excel_path: Path of the workbook passed to ``pandas.read_excel``.
        max_keywords: Number of leading keyword entries used per row.
        images_per_keyword: Maximum images collected per keyword.
        max_rows: Number of leading workbook rows to process. The default of
            3 keeps the previously hard-coded limit; pass ``None`` to
            process every row.

    Returns:
        A ``pandas.DataFrame`` in which every row carries the source columns
        ``"열1"``–``"열4"``, the keyword, and the image title/URL.
    """
    df = pd.read_excel(excel_path)
    if max_rows is not None:
        df = df.iloc[:max_rows]
    all_results = []

    for idx, row in df.iterrows():
        # Source columns copied onto every output row (missing ones -> "").
        row_info = {
            "열1": row.get("열1", ""),
            "열2": row.get("열2", ""),
            "열3": row.get("열3", ""),
            "열4": row.get("열4", "")
        }

        keywords = str(row_info["열4"]).split(',') if pd.notna(row_info["열4"]) else []
        for kw in keywords[:max_keywords]:
            kw = kw.strip()
            if not kw:
                continue

            print(f"🖼️ ({idx}) 키워드 '{kw}' 이미지 검색 중...")
            images = get_media_results(kw, max_items=images_per_keyword)
            if not images:
                # Placeholder row so the keyword still appears in the output.
                result = row_info.copy()
                result.update({"키워드": kw, "제목": "❌ 이미지 없음", "이미지 URL": ""})
                all_results.append(result)
                continue

            for img in images:
                result = row_info.copy()
                result["키워드"] = kw
                result.update(img)
                all_results.append(result)
                # NOTE(review): this pause runs once per collected image,
                # after the search page has already been fetched.
                time.sleep(0.5)

    return pd.DataFrame(all_results)

# Entry point: crawl images for the workbook and save them to Excel
if __name__ == "__main__":
    excel_path = "교과서_본문_정리_열이름통일 (3).xlsx"  # actual input file name
    df_media_final = crawl_media_from_excel(
        excel_path=excel_path,
        max_keywords=2,
        images_per_keyword=5,
    )
    df_media_final.to_excel(excel_writer="교과서_키워드_이미지결과.xlsx", index=False)
    print("✅ 저장 완료: 교과서_키워드_이미지결과.xlsx")


🖼️ (1) 키워드 '청동기' 이미지 검색 중...
🖼️ (1) 키워드 '청동기시대' 이미지 검색 중...
🖼️ (2) 키워드 '청동기 유물' 이미지 검색 중...
✅ 저장 완료: 교과서_키워드_이미지결과.xlsx


In [1]:
import collections
import collections.abc
collections.Callable = collections.abc.Callable

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Create the Selenium driver
def create_driver():
    """Build a Chrome WebDriver with headless/container-friendly arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose options carry the
        ``--headless``, ``--no-sandbox`` and ``--disable-dev-shm-usage``
        arguments, added in that order.
    """
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Search the top N documents for a keyword
def get_article_urls_from_keyword(keyword, max_items=5):
    """Search encykorea.aks.ac.kr for *keyword* and collect the top results.

    Args:
        keyword: Text typed into the site's search box (element id
            ``keyword``).
        max_items: Maximum number of result entries to inspect.

    Returns:
        A list of ``(title, href)`` tuples, at most ``max_items`` long.
        Entries whose link or title element cannot be read are skipped.

    Raises:
        Any exception raised while loading the page or locating the search
        box/button propagates to the caller; the browser is still closed.
    """
    base_url = "https://encykorea.aks.ac.kr"
    driver = create_driver()
    # try/finally guarantees the browser process is shut down even when the
    # page load or one of the element lookups below raises.
    try:
        driver.get(base_url)
        time.sleep(1)  # pause after navigation before touching the page

        search_input = driver.find_element(By.ID, "keyword")
        search_input.clear()
        search_input.send_keys(keyword)
        driver.find_element(By.CLASS_NAME, "main-search").click()
        time.sleep(2)  # pause after submitting before reading the results

        items = driver.find_elements(By.CSS_SELECTOR, "ul.encyclopedia-list li.item")
        results = []
        for item in items[:max_items]:
            try:
                link_tag = item.find_element(By.CSS_SELECTOR, "a")
                title_tag = item.find_element(By.CSS_SELECTOR, ".title")
                href = link_tag.get_attribute("href")
                title = title_tag.text.strip()
                results.append((title, href))
            except Exception:
                # Best effort: skip entries that cannot be read. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate so the crawl can actually be interrupted.
                continue
        return results
    finally:
        driver.quit()

# Parse the article body
def parse_article_contents(title, url):
    """Download an article page and split it into titled sections.

    Args:
        title: Article title, copied into the result under ``"제목"``.
        url: Article URL to fetch (10 second timeout).

    Returns:
        A dict that always contains ``"제목"`` and ``"URL"``. On success it
        additionally maps each section title (text of the ``section-title``
        element) to the section text (``section-body`` element, joined with
        newlines). If the ``div.contents-detail-contents`` container is
        missing, ``"정의"`` is ``"본문 없음"``. If any exception occurs
        (network failure, HTTP error status, ...), ``"정의"`` holds the
        error message; the function does not raise.
    """
    base = {"제목": title, "URL": url}
    try:
        res = requests.get(url, timeout=10)
        # Surface HTTP errors (404, 500, ...) as errors instead of parsing
        # the error page and reporting it as an article without a body.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        content = soup.find("div", class_="contents-detail-contents")
        if not content:
            return {**base, "정의": "본문 없음"}

        result = dict(base)
        for sec in content.find_all("div", class_="detail-section"):
            title_tag = sec.find(class_="section-title")
            body_tag = sec.find(class_="section-body")
            # Only sections that have both a heading and a body are kept.
            if title_tag and body_tag:
                key = title_tag.get_text(strip=True)
                result[key] = body_tag.get_text(separator="\n").strip()

        return result
    except Exception as e:
        return {**base, "정의": f"❌ 오류: {e}"}

# Crawl articles driven by the rows of the source workbook
def crawl_articles_from_excel_rows(excel_path, max_keywords=5, articles_per_keyword=5, save_path="4번_1-1.xlsx", start_row=50):
    """Crawl encyclopedia articles for workbook keywords and save to Excel.

    Each row's ``"열4"`` cell is read as a comma-separated keyword list. The
    first ``max_keywords`` comma-separated entries are considered (blank
    entries are skipped after slicing), and up to ``articles_per_keyword``
    articles are fetched for each. Whatever has been collected is written to
    ``save_path`` in a ``finally`` clause, so partial results are saved even
    when the crawl fails or is interrupted.

    Args:
        excel_path: Path of the workbook passed to ``pandas.read_excel``.
        max_keywords: Number of leading keyword entries used per row.
        articles_per_keyword: Maximum articles fetched per keyword.
        save_path: Output workbook path.
        start_row: Positional index of the first workbook row to process.
            The default of 50 keeps the previously hard-coded offset; pass
            0 to process every row.

    Returns:
        None. Results are written to ``save_path``.
    """
    df = pd.read_excel(excel_path).iloc[start_row:]
    all_results = []

    try:
        for idx, row in df.iterrows():
            # Source columns copied onto every output row (missing -> "").
            row_info = {
                "열1": row.get("열1", ""),
                "열2": row.get("열2", ""),
                "열3": row.get("열3", ""),
                "열4": row.get("열4", "")
            }

            keywords = str(row_info["열4"]).split(',') if pd.notna(row_info["열4"]) else []
            for kw in keywords[:max_keywords]:
                kw = kw.strip()
                if not kw:
                    continue

                print(f"🔍 ({idx}) 키워드 '{kw}' 문서 검색 중...")
                articles = get_article_urls_from_keyword(kw, articles_per_keyword)
                if not articles:
                    # Placeholder row so the keyword still appears in output.
                    result = row_info.copy()
                    result.update({"키워드": kw, "제목": "❌ 문서 없음", "URL": "", "정의": "N/A"})
                    all_results.append(result)
                    continue

                for title, url in articles:
                    parsed = parse_article_contents(title, url)
                    result = row_info.copy()
                    result["키워드"] = kw
                    result.update(parsed)
                    all_results.append(result)
                    time.sleep(1)  # pause between article downloads

    except Exception as e:
        # Ordinary errors are reported and swallowed; KeyboardInterrupt is
        # not an Exception subclass, so it still propagates after saving.
        print(f"❌ 오류 발생: {e}")
        print("⚠️ 수집된 데이터까지 저장 후 종료합니다...")

    finally:
        final_df = pd.DataFrame(all_results)
        final_df.to_excel(save_path, index=False)
        print(f"✅ 저장 완료: {save_path}")

# Run the crawl; results are written to the function's default save path
crawl_articles_from_excel_rows(
    excel_path="split_4.xlsx",
    max_keywords=2,
    articles_per_keyword=5,
)


🔍 (50) 키워드 '도요토미 히데요시' 문서 검색 중...
🔍 (51) 키워드 '임진왜란' 문서 검색 중...
🔍 (51) 키워드 '부산성' 문서 검색 중...
🔍 (52) 키워드 '임진왜란' 문서 검색 중...
🔍 (52) 키워드 '수군' 문서 검색 중...
🔍 (53) 키워드 '이순신' 문서 검색 중...
🔍 (53) 키워드 '옥포해전' 문서 검색 중...
🔍 (54) 키워드 '의병' 문서 검색 중...
🔍 (54) 키워드 '승군' 문서 검색 중...
🔍 (56) 키워드 '정유재란' 문서 검색 중...
✅ 저장 완료: 4번_1-1.xlsx


KeyboardInterrupt: 