In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_cnn_news_with_selenium(category_url):
    # Selenium WebDriver 설정
    options = webdriver.ChromeOptions()
    options.headless = False 
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        driver.get(category_url)
        
        # 기사 요소 대기
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "container__headline-text"))
        )
        
        # BeautifulSoup으로 HTML 파싱
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        articles = []

        # 기사 제목과 링크 추출
        for article in soup.find_all('span', class_='container__headline-text'):
            title = article.get_text().strip()  # 기사 제목
            link_element = article.find_parent("a")  # 상위 <a> 태그 찾기
            if link_element and "href" in link_element.attrs:
                link = link_element['href']
                if not link.startswith("http"):  # 상대 경로 처리
                    link = f"https://edition.cnn.com{link}"

                # 기사 본문 전문 추출
                content = extract_article_content(driver, link)

                articles.append({"title": title, "link": link, "content": content})

        return articles

    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []
    
    finally:
        driver.quit()  # 브라우저 종료


def extract_article_content(driver, article_url):
    """기사의 전문 추출"""
    try:
        driver.get(article_url)
        time.sleep(3)  # 페이지 로딩 대기
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # 본문 내용 추출 
        paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph')
        content = " ".join([p.get_text().strip() for p in paragraphs])
        return content[:200] + "..." if len(content) > 200 else content
    except Exception as e:
        print(f"Error while extracting content from {article_url}: {e}")
        return "No content available."


# 카테고리
categories = {
    "world": "https://edition.cnn.com/world",
    "business": "https://edition.cnn.com/business",
    "science": "https://edition.cnn.com/science",
    "health": "https://edition.cnn.com/health",
    "politics": "https://edition.cnn.com/politics",
    "entertainment": "https://edition.cnn.com/entertainment",
    "sport": "https://edition.cnn.com/sport"
}

# 전체 뉴스 데이터를 저장할 리스트 초기화
cnn_news = []

# 결과 출력
for category, url in categories.items():
    print(f"Fetching articles for category: {category}")
    articles = scrape_cnn_news_with_selenium(url)
    
    if articles:
        cnn_news.extend(articles)
        for article in articles[:1]:  # 상위 1개 기사만 출력
            print(f"Title: {article['title']}\nLink: {article['link']}\nContent: {article['content'][:200]}...\n")
    else:
        print(f"No articles found for category: {category}")
    print("="*50)

# 데이터프레임으로 변환 및 CSV 저장
if cnn_news:
    df = pd.DataFrame(cnn_news)
    
    # CSV 파일로 저장
    csv_file_name = 'cnn_news.csv'
    df.to_csv(csv_file_name, index=False, encoding='utf-8-sig')
    print(f"\nSaved {len(all_news)} articles to {csv_file_name}")
else:
    print("No news data to save.")


Fetching articles for category: world
Title: An Austrian woman has been kidnapped in Niger’s Agadez city, authorities say
Link: https://edition.cnn.com/2025/01/12/africa/austrian-kidnapped-niger-intl-latam/index.html
Content: An Austrian woman has been kidnapped by gunmen in Niger’s Agadez city, local residents and the Austrian foreign ministry said on Sunday, the first time a European citizen is known to have been kidnapp...

Fetching articles for category: business
Title: Meta oversight co-chair says the company looks like it’s ‘buckling to political pressure’ by ending fact-checking program
Link: https://edition.cnn.com/2025/01/12/business/meta-oversight-fact-checking-political/index.html?utm_source=business_ribbon
Content: Meta’s decision to end its fact-checking program looks like the company is “caving” to political pressures, said Meta’s oversight board co-chair Michael McConnell. McConnell, who is a law professor at...

Fetching articles for category: science
Title: Ice core ma

NameError: name 'all_news' is not defined

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

def fetch_article_content(driver, article_url):
    """
    기사 본문을 추출하는 함수. 100자까지 요약.
    """
    try:
        driver.get(article_url)
        time.sleep(3)  # 페이지 로딩 대기
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # 기사 본문 추출
        paragraphs = soup.find_all('div', class_='zn-body__paragraph')
        content = " ".join([p.get_text().strip() for p in paragraphs])

        # 본문 100자 제한
        if len(content) > 100:
            content = content[:100] + "..."  # 100자까지만 표시하고 나머지는 생략
        return content if content else "No content available"
    except Exception as e:
        print(f"Error occurred while fetching article content: {e}")
        return "Failed to fetch content"

def scrape_cnn_news_with_selenium(category_url, max_articles=5):
    """
    주어진 카테고리 URL에서 기사를 크롤링하는 함수.
    """
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(category_url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        articles = []
        # 주어진 클래스 이름으로 기사 목록 추출 (수정된 클래스 사용)
        article_elements = soup.find_all('span', class_='container__headline-text', limit=max_articles)

        for article in article_elements:
            title = article.get_text().strip()  # 기사 제목
            link_tag = article.find_parent('a')  # 부모 <a> 태그에서 링크 추출
            if link_tag and link_tag.get('href'):
                link = link_tag['href']
                if not link.startswith("http"):
                    link = f"https://edition.cnn.com{link}"

                # 본문 추출 및 요약
                summary = fetch_article_content(driver, link)
                articles.append({"title": title, "link": link, "summary": summary})

    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []
    finally:
        driver.quit()

    return articles

# 카테고리별로 크롤링
categories = {
    "world": "https://edition.cnn.com/world",
}

# 각 카테고리에서 기사를 크롤링하고 출력
for category, url in categories.items():
    print(f"Fetching up to 5 articles for category: {category}")
    articles = scrape_cnn_news_with_selenium(url, max_articles=5)

    if articles:
        for article in articles:
            print(f"Title: {article['title']}")
            print(f"Link: {article['link']}")
            print(f"Summary: {article['summary']}\n")
    else:
        print(f"No articles found for category: {category}")
    print("=" * 50)


Fetching up to 5 articles for category: world
Title: A cut undersea internet cable is making Taiwan worried about ‘gray zone’ tactics from Beijing
Link: https://edition.cnn.com/2025/01/09/china/undersea-cable-taiwan-intl-hnk/index.html
Summary: No content available

Title: Philippines’ Black Nazarene procession draws hundreds of thousands of devotees
Link: https://edition.cnn.com/2025/01/09/asia/philippines-black-nazarene-procession-intl-hnk/index.html
Summary: No content available

Title: Kenyan government critics mysteriously disappeared. They came back silenced
Link: https://edition.cnn.com/2025/01/09/africa/kenya-government-critics-intl/index.html
Summary: No content available

Title: Airstrike on village in western Myanmar kills at least 40 people, groups say
Link: https://edition.cnn.com/2025/01/09/asia/myanmar-village-army-airstrike-intl-hnk/index.html
Summary: No content available

Title: Venezuelan opposition leader Machado free after being ‘violently intercepted,’ her team sa