In [None]:
# !pip install firecrawl-py playwright browser-use langchain-mcp-adapters python-dotenv
# !playwright install chromium

In [None]:
from langchain_mcp_adapters.client import (
    MultiServerMCPClient,
)  # ✅ 수정: client에서 import
from dotenv import load_dotenv
import os, platform
import asyncio
import json
from datetime import datetime

load_dotenv()
_client = None
_tools = None


def get_exa_config():
    """Exa MCP 예시"""
    MCP_KEY = os.getenv("MCP_KEY")

    if platform.system() == "Windows":
        return {
            "command": "cmd",
            "args": [
                "/c",
                "npx",
                "-y",
                "@smithery/cli@latest",
                "run",
                "exa",
                "--key",
                MCP_KEY,
                "--profile",
                "usual-reindeer-MZSQQr",
            ],
            "transport": "stdio",
        }
    else:
        return {
            "command": "npx",
            "args": [
                "-y",
                "@smithery/cli@latest",
                "run",
                "exa",
                "--key",
                MCP_KEY,
                "--profile",
                "usual-reindeer-MZSQQr",
            ],
            "transport": "stdio",
        }


def get_firecrawl_config():
    """Firecrawl MCP 예시"""
    FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

    if platform.system() == "Windows":
        return {
            "command": "cmd",
            "args": [
                "/c",
                "npx",
                "-y",
                "@smithery/cli@latest",
                "run",
                "@Krieg2065/firecrawl-mcp-server",
                "--key",
                "c3944d56-893b-4175-b5ea-712c46eb16f8",
                "--profile",
                "continental-lynx-d4NE7j",
            ],
            "transport": "stdio",
        }
    else:
        return {
            "command": "npx",
            "args": [
                "-y",
                "@smithery/cli@latest",
                "run",
                "@Krieg2065/firecrawl-mcp-server",
                "--key",
                "c3944d56-893b-4175-b5ea-712c46eb16f8",
                "--profile",
                "continental-lynx-d4NE7j",
            ],
            "transport": "stdio",
        }


def get_playwright_config():
    """Playwright MCP 설정"""
    if platform.system() == "Windows":
        return {
            "command": "cmd",
            "args": [
                "/c",
                "npx",
                "-y",
                "@smithery/cli@latest",
                "run",
                "@microsoft/playwright-mcp",
                "--key",
                "c3944d56-893b-4175-b5ea-712c46eb16f8",
            ],
            "transport": "stdio",
        }
    else:
        return {
            "command": "npx",
            "args": [
                "-y",
                "@smithery/cli@latest",
                "run",
                "@microsoft/playwright-mcp",
                "--key",
                "c3944d56-893b-4175-b5ea-712c46eb16f8",
            ],
            "transport": "stdio",
        }


async def get_client():
    """MCP 클라이언트 초기화 (0.1.0+ 호환)"""
    global _client
    if _client is None:
        _client = MultiServerMCPClient(
            {
                "firecrawl": get_firecrawl_config(),
                "playwright": get_playwright_config(),
            }
        )
        # ⚠️ __aenter__() 제거: 0.1.0+에서 NotImplementedError 발생
    return _client


async def get_tools():
    """사용 가능한 도구 가져오기"""
    global _tools
    if _tools is None:
        client = await get_client()
        _tools = await client.get_tools()  # ✅ 수정: get_tools() 메서드 사용
    return _tools


print("✅ MCP 클라이언트 설정 완료!")

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time


def create_chrome_driver():
    """Chrome 드라이버 생성"""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    return webdriver.Chrome(options=options)


def wait_for_page_load(driver, wait_time=3):
    """페이지 로딩 대기"""
    time.sleep(wait_time)


def get_current_page_html(driver):
    """현재 페이지 HTML 가져오기"""
    return driver.page_source


def find_article_elements(driver):
    """페이지에서 기사 요소들 찾기"""
    # 기사 목록 컨테이너의 모든 링크 찾기
    articles = driver.find_elements(By.CSS_SELECTOR, "a[href*='Detail.aspx']")
    return articles if articles else driver.find_elements(By.TAG_NAME, "a")


def click_article(driver, article_element, index):
    """기사 클릭하고 상세 페이지로 이동"""
    print(f"      [{index}] 클릭 중...")
    article_element.click()
    wait_for_page_load(driver)


def go_back_to_list(driver):
    """목록 페이지로 돌아가기"""
    driver.back()
    wait_for_page_load(driver)


def extract_title_from_soup(soup):
    """제목 추출"""
    title_tag = soup.find("h1") or soup.find("h2") or soup.find("title")
    return title_tag.get_text(strip=True) if title_tag else ""


def extract_date_from_soup(soup):
    """날짜 추출"""
    date_tag = soup.find("time") or soup.find(
        class_=lambda x: x and "date" in str(x).lower()
    )
    return date_tag.get_text(strip=True) if date_tag else ""


def extract_content_from_soup(soup):
    """내용 추출"""
    body_tag = (
        soup.find("article") or soup.find("div", class_="content") or soup.find("body")
    )
    return body_tag.get_text(separator=" ", strip=True)[:1000] if body_tag else ""


def parse_current_page_as_article(driver):
    """현재 페이지를 기사 데이터로 파싱"""
    html = get_current_page_html(driver)
    soup = BeautifulSoup(html, "html.parser")

    return {
        "url": driver.current_url,
        "title": extract_title_from_soup(soup),
        "date": extract_date_from_soup(soup),
        "content": extract_content_from_soup(soup),
        "source": "하우징포스트",
    }


print("Selenium 클릭 크롤링 함수 준비 완료")

Selenium 클릭 크롤링 함수 준비 완료


In [2]:
import os
from pathlib import Path
from datetime import datetime
import json


def navigate_to_page(driver, url):
    """페이지로 이동"""
    driver.get(url)
    wait_for_page_load(driver)


def scrape_article_by_clicking(driver, article_element, index):
    """기사를 클릭해서 크롤링"""
    try:
        click_article(driver, article_element, index)
        article_data = parse_current_page_as_article(driver)
        print(f"         수집: {article_data['title'][:30]}")
        go_back_to_list(driver)
        return article_data
    except Exception as e:
        print(f"         실패: {e}")
        return None


def scrape_articles_from_page(driver):
    """현재 페이지의 모든 기사 크롤링"""
    article_elements = find_article_elements(driver)
    print(f"   기사 요소 {len(article_elements)}개 발견")

    articles = []
    for index, element in enumerate(article_elements, 1):
        # 매번 요소를 다시 찾기 (stale element 방지)
        current_elements = find_article_elements(driver)
        if index <= len(current_elements):
            article = scrape_article_by_clicking(
                driver, current_elements[index - 1], index
            )
            if article and article.get("title"):
                articles.append(article)
            time.sleep(1)

    return articles


def build_page_url(base_url, page_number):
    """페이지 URL 생성"""
    return base_url if page_number == 1 else f"{base_url}&page={page_number}"


def scrape_single_page(driver, page_number, base_url):
    """단일 페이지 크롤링"""
    page_url = build_page_url(base_url, page_number)
    print(f"\n페이지 {page_number}: {page_url}")

    navigate_to_page(driver, page_url)
    articles = scrape_articles_from_page(driver)

    print(f"   수집 완료: {len(articles)}개")
    time.sleep(2)

    return articles


def crawl_multiple_pages(base_url, total_pages):
    """여러 페이지 크롤링"""
    driver = create_chrome_driver()

    try:
        all_articles = []
        for page_num in range(1, total_pages + 1):
            articles = scrape_single_page(driver, page_num, base_url)
            all_articles.extend(articles)
        return all_articles
    finally:
        driver.quit()


def generate_filename_with_timestamp():
    """타임스탬프가 포함된 파일명 생성"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"housing_post_{timestamp}.json"


def create_json_output(articles):
    """JSON 출력 데이터 생성"""
    return {
        "crawled_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total": len(articles),
        "articles": articles,
    }


def save_articles_to_json(articles, output_dir="src/data/policy_factors"):
    """기사를 JSON 파일로 저장"""
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    filename = generate_filename_with_timestamp()
    filepath = Path(output_dir) / filename

    json_data = create_json_output(articles)

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

    print(f"\n저장: {filepath}")
    print(f"   기사 수: {len(articles)}개")

    return str(filepath)


def crawl_housing_post(
    base_url="https://housing-post.com/List.aspx?CNO=11389", pages=3
):
    """하우징포스트 크롤링 (클릭 방식)"""
    print(f"크롤링 시작: {pages}페이지")
    print("=" * 60)

    articles = crawl_multiple_pages(base_url, pages)

    print("\n" + "=" * 60)
    print(f"완료! 총 {len(articles)}개 수집")

    return articles


print("클릭 방식 크롤링 함수 준비 완료")

클릭 방식 크롤링 함수 준비 완료


In [None]:
# 실행


def show_article_preview(article):
    """기사 미리보기 출력"""
    print(f"\n제목: {article['title']}")
    print(f"날짜: {article['date']}")
    print(f"URL: {article['url']}")
    print(f"내용: {article['content'][:80]}...")


def show_results_preview(articles, preview_count=3):
    """결과 미리보기"""
    print("\n" + "=" * 60)
    print("결과 미리보기")
    print("=" * 60)

    preview_articles = articles[:preview_count]
    list(map(show_article_preview, preview_articles))

    remaining = len(articles) - preview_count
    if remaining > 0:
        print(f"\n... 외 {remaining}개")


def main():
    """메인 실행"""
    articles = crawl_housing_post(
        base_url="https://housing-post.com/List.aspx?CNO=11389", pages=3
    )

    if articles:
        save_articles_to_json(articles)
        show_results_preview(articles)
        return articles
    else:
        print("결과 없음")
        return []


# 실행
main()

크롤링 시작: 3페이지

페이지 1: https://housing-post.com/List.aspx?CNO=11389
   기사 요소 183개 발견
      [1] 클릭 중...
         실패: Message: element not interactable
  (Session info: chrome=141.0.7390.108); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementnotinteractableexception
Stacktrace:
	GetHandleVerifier [0x0x7ff7dcd2e8e5+80021]
	GetHandleVerifier [0x0x7ff7dcd2e940+80112]
	(No symbol) [0x0x7ff7dcab0425]
	(No symbol) [0x0x7ff7dcb09dc3]
	(No symbol) [0x0x7ff7dcafb7c8]
	(No symbol) [0x0x7ff7dcb3122a]
	(No symbol) [0x0x7ff7dcafb056]
	(No symbol) [0x0x7ff7dcb31440]
	(No symbol) [0x0x7ff7dcb5968a]
	(No symbol) [0x0x7ff7dcb31003]
	(No symbol) [0x0x7ff7dcaf95d1]
	(No symbol) [0x0x7ff7dcafa3f3]
	GetHandleVerifier [0x0x7ff7dcfedc7d+2960429]
	GetHandleVerifier [0x0x7ff7dcfe7f3a+2936554]
	GetHandleVerifier [0x0x7ff7dd008977+3070247]
	GetHandleVerifier [0x0x7ff7dcd483ce+185214]
	GetHandleVerifier [0x0x7ff7dcd4fe1f+216527]
	GetHandleVe

KeyboardInterrupt: 

: 

In [None]:
# 현재 작업 디렉토리 확인
import os

print(f"현재 작업 디렉토리: {os.getcwd()}")
print(f"저장될 경로: {os.path.join(os.getcwd(), 'housing_post_articles.json')}")

In [None]:
# HTML 구조 확인 (디버깅용)
def debug_page_structure(url):
    """페이지 HTML 구조 확인"""
    html = scrape_page(url)
    if not html:
        return

    soup = BeautifulSoup(html, "html.parser")

    # 모든 링크 출력
    print("\n모든 링크:")
    all_links = soup.find_all("a", href=True)
    print(f"총 {len(all_links)}개 링크 발견")

    # 처음 10개 링크만 출력
    for i, link in enumerate(all_links[:10], 1):
        href = link.get("href", "")
        text = link.get_text(strip=True)[:50]
        print(f"  [{i}] {href[:80]}")
        print(f"      텍스트: {text}")

    # Detail이 포함된 링크만 찾기
    print("\n'Detail' 포함 링크:")
    detail_links = [l for l in all_links if "detail" in l.get("href", "").lower()]
    print(f"  {len(detail_links)}개 발견")

    for link in detail_links[:5]:
        print(f"  - {link.get('href', '')}")


# 테스트
debug_page_structure("https://housing-post.com/List.aspx?CNO=11389")

# Selenium 클릭 방식 크롤링

## 특징
- 실제 클릭: 기사 링크를 추출하지 않고 직접 클릭
- JavaScript 지원: 동적 콘텐츠 크롤링
- 뒤로가기: 클릭 후 자동으로 목록으로 복귀
- 가독성: 작은 함수들로 구성

## 동작 방식

```
1. 목록 페이지 접근
2. 기사 요소 찾기
3. 첫 번째 기사 클릭
4. 상세 페이지에서 데이터 추출
5. 뒤로가기
6. 두 번째 기사 클릭
7. 반복...
```

## 핵심 함수

```python
# 드라이버 생성
create_chrome_driver()

# 페이지 이동
navigate_to_page(driver, url)

# 기사 요소 찾기
find_article_elements(driver)

# 기사 클릭
click_article(driver, element, index)

# 현재 페이지 데이터 추출
parse_current_page_as_article(driver)

# 뒤로가기
go_back_to_list(driver)

# 크롤링 실행
scrape_article_by_clicking(driver, element, index)
scrape_articles_from_page(driver)
crawl_multiple_pages(base_url, total_pages)
```

## 저장 위치
```
src/data/policy_factors/housing_post_20250120_143522.json
```

## 필요 패키지
```bash
pip install selenium beautifulsoup4
```