In [18]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [24]:
def setup_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, Chrome is started with ``--headless``.

    Returns:
        A ``webdriver.Chrome`` instance. ``--disable-gpu`` and
        ``--no-sandbox`` are always passed. The caller is responsible for
        calling ``quit()`` on it.
    """
    # Keep the argument order of the flags stable: headless first, then the rest.
    chrome_args = ["--disable-gpu", "--no-sandbox"]
    if headless:
        chrome_args.insert(0, "--headless")

    chrome_options = webdriver.ChromeOptions()
    for chrome_arg in chrome_args:
        chrome_options.add_argument(chrome_arg)

    return webdriver.Chrome(service=ChromeService(), options=chrome_options)

In [25]:
def scrape_cnn_world_news(driver):
    """Scrape the headline listing from the CNN World section page.

    Loads ``https://edition.cnn.com/world``, waits up to 10 seconds for the
    first ``container__headline`` element, then records each headline's text
    together with the ``href`` of its nearest enclosing ``<a>`` element.

    Args:
        driver: Selenium WebDriver instance used to load the page.

    Returns:
        DataFrame with columns ``title``, ``url`` and ``scraped_at`` (local
        time, ISO-8601). The three columns are present even when no headline
        could be read, so callers can rely on the schema. A link that was
        already recorded is not added a second time; headlines whose link is
        missing are kept with ``url`` set to ``None``.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no headline element
        appears within the timeout (Selenium's ``TimeoutException``).
    """
    url = "https://edition.cnn.com/world"
    driver.get(url)
    # Wait until at least one headline block is present in the DOM.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "container__headline"))
    )

    headlines = driver.find_elements(By.CLASS_NAME, "container__headline")

    news_data = []
    seen_links = set()
    for item in headlines:
        try:
            text_element = item.find_element(By.CLASS_NAME, "container__headline-text")
            title = (text_element.text or "").strip()
            if not title:
                # ``.text`` only returns rendered text; fall back to the raw
                # DOM text so off-screen/hidden headlines still get a title.
                title = (text_element.get_attribute("textContent") or "").strip()
            link = item.find_element(By.XPATH, "ancestor::a[1]").get_attribute("href")
        except Exception as e:
            # Best effort: one malformed headline must not abort the listing.
            print(f"⚠️ Problem with element: {e}")
            continue

        if link:
            # The same story can be linked from several containers on the page.
            if link in seen_links:
                continue
            seen_links.add(link)

        news_data.append({
            "title": title,
            "url": link,
            "scraped_at": datetime.now().isoformat()
        })

    return pd.DataFrame(news_data, columns=["title", "url", "scraped_at"])

In [30]:
# CSS selectors tried in order for each article field; the first selector
# that yields non-empty text wins.
_TITLE_SELECTORS = (
    "h1[data-editable='headlineText']",
    ".headline__text",
    "h1.pg-headline",
    "h1",
    ".article-title",
    "[data-component-name='Headline'] h1",
    ".content-title",
)

_CONTENT_SELECTORS = (
    # Original selectors
    ".article__content .zn-body__paragraph",
    ".zn-body__paragraph",
    ".article-body .zn-body__paragraph",
    "[data-component-name='ArticleBody'] p",
    # Additional selectors
    ".article-body p",
    ".story-body p",
    ".content-body p",
    ".article-content p",
    "[data-component-name='StandardArticleBody'] p",
    ".inline-placeholder p",
    ".l-container .zn-body__paragraph",
    "div[data-module='ArticleBody'] p",
    # Fallback - any paragraph in main content areas
    "main p",
    "article p",
    ".content p",
)

_DATE_SELECTORS = (
    ".timestamp",
    "[data-editable='dateTime']",
    ".article__meta time",
    "time",
    ".publish-date",
    ".article-date",
    "[datetime]",
)

_AUTHOR_SELECTORS = (
    ".byline__name",
    "[data-editable='byline']",
    ".article__byline",
    ".byline",
    ".author-name",
    ".article-author",
    "[data-component-name='Byline']",
)


def _rendered_text(element):
    """Return the element's rendered text (``WebElement.text``), stripped."""
    return (element.text or "").strip()


def _dom_text(element):
    """Return the element's raw DOM ``textContent``, stripped."""
    return (element.get_attribute("textContent") or "").strip()


def _date_text(element):
    """Return the ``datetime`` attribute if set, otherwise the rendered text."""
    return element.get_attribute("datetime") or _rendered_text(element)


def _first_text(driver, selectors, read=_rendered_text):
    """Return ``(text, selector)`` for the first selector whose first match has text."""
    for selector in selectors:
        try:
            value = read(driver.find_element(By.CSS_SELECTOR, selector))
        except Exception:
            # Selector not present on this page (or lookup failed): try the next.
            continue
        if value:
            return value, selector
    return "", None


def _first_paragraphs(driver, selectors, read=_rendered_text, debug=False):
    """Return ``(paragraphs, selector)`` for the first selector yielding any text."""
    for selector in selectors:
        try:
            # Read each element once; every read is a round-trip to the browser.
            texts = [read(element) for element in driver.find_elements(By.CSS_SELECTOR, selector)]
        except Exception as e:
            if debug:
                print(f"❌ Selector '{selector}' failed: {e}")
            continue
        texts = [text for text in texts if text]
        if texts:
            return texts, selector
    return [], None


def _print_page_diagnostics(driver):
    """Print what was actually loaded, to help explain an empty extraction."""
    print("🔍 Page title:", driver.title)
    print("🔍 URL loaded:", driver.current_url)
    try:
        body_text = driver.find_element(By.TAG_NAME, "body").text
    except Exception:
        print("🔍 Could not extract body text")
        return
    print(f"🔍 Total body text length: {len(body_text)} characters")
    if len(body_text) > 100:
        print(f"🔍 First 200 chars: {body_text[:200]}...")


def _article_record(url, title="", content="", author="", publish_date="",
                    successful_selector=None, error=None):
    """Build the result dict; success and failure share exactly the same keys."""
    return {
        "title": title,
        "content": content,
        "author": author,
        "publish_date": publish_date,
        "url": url,
        "scraped_at": datetime.now().isoformat(),
        "content_length": len(content),
        "successful_selector": successful_selector,
        "error": error,
    }


def scrape_article_content(driver, url, debug=True, page_timeout=15, settle_seconds=2):
    """Scrape title, body text, author and publish date from one article page.

    Each field is looked up with an ordered list of CSS selectors; the first
    selector that yields non-empty text is used. If no selector yields
    rendered paragraph text, the content selectors are tried once more
    reading the DOM ``textContent`` instead, because ``WebElement.text`` only
    returns text that is actually rendered.

    Args:
        driver: Selenium WebDriver instance.
        url: Article URL to load.
        debug: Print per-field progress and, when no content is found, page
            diagnostics.
        page_timeout: Seconds to wait for the body element to be present.
        settle_seconds: Extra pause after the body is present, giving the
            page time to finish loading. ``0`` disables the pause.

    Returns:
        dict with keys ``title``, ``content``, ``author``, ``publish_date``,
        ``url``, ``scraped_at``, ``content_length``, ``successful_selector``
        and ``error``. ``error`` is ``None`` on success. On any exception the
        text fields are empty, ``content_length`` is 0 and ``error`` holds the
        exception message; the function itself does not raise.
    """
    try:
        if debug:
            print(f"🌐 Loading: {url}")

        driver.get(url)
        WebDriverWait(driver, page_timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Give page time to fully load
        if settle_seconds and settle_seconds > 0:
            time.sleep(settle_seconds)

        # --- Title ---
        title, title_selector = _first_text(driver, _TITLE_SELECTORS)
        if debug:
            if title:
                print(f"✅ Title found with selector '{title_selector}': {title[:50]}...")
            else:
                print("⚠️ No title found with any selector")

        # --- Body text ---
        paragraphs, successful_selector = _first_paragraphs(
            driver, _CONTENT_SELECTORS, debug=debug
        )
        if not paragraphs:
            # Second pass: same selectors, but read the DOM text so paragraphs
            # that exist without being rendered are still captured.
            paragraphs, successful_selector = _first_paragraphs(
                driver, _CONTENT_SELECTORS, read=_dom_text
            )
            if paragraphs and debug:
                print("ℹ️ No rendered text found; using DOM textContent instead")

        content = "\n\n".join(paragraphs)
        if debug:
            if paragraphs:
                print(f"✅ Content found with selector '{successful_selector}': {len(paragraphs)} paragraphs")
            else:
                print("⚠️ No content found with any selector")
                _print_page_diagnostics(driver)

        # --- Publish date ---
        publish_date, _ = _first_text(driver, _DATE_SELECTORS, read=_date_text)
        if publish_date and debug:
            print(f"✅ Date found: {publish_date}")

        # --- Author ---
        author, _ = _first_text(driver, _AUTHOR_SELECTORS)
        if author and debug:
            print(f"✅ Author found: {author}")

        result = _article_record(
            url,
            title=title,
            content=content,
            author=author,
            publish_date=publish_date,
            successful_selector=successful_selector,
        )

        if debug:
            print(f"📊 Result: Title={'✅' if title else '❌'}, Content={'✅' if content else '❌'} ({len(content)} chars), Author={'✅' if author else '❌'}")

        return result

    except Exception as e:
        # Best effort: report the failure in the record instead of raising, so
        # a batch run can continue with the next article.
        print(f"❌ Failed to scrape {url}: {e}")
        return _article_record(url, error=str(e))

In [31]:
def scrape_cnn_articles_with_content(driver, max_articles=10, debug=False, delay_seconds=2):
    """Scrape CNN world news with full article content.

    Fetches the headline listing with ``scrape_cnn_world_news`` and then loads
    each article with ``scrape_article_content``.

    Args:
        driver: Selenium WebDriver instance
        max_articles: Maximum number of articles to scrape (default: 10)
        debug: Enable debug output (default: False)
        delay_seconds: Pause between two article requests, to be respectful
            to the server (default: 2). No pause follows the last article.

    Returns:
        DataFrame with one row per processed article, built from the dicts
        returned by ``scrape_article_content`` (title, content, author,
        publish_date, url, scraped_at, content_length, successful_selector,
        plus ``error`` when the article scraper reports one). An empty
        DataFrame is returned when the listing yields no usable URL.
    """
    # First get the list of articles from the main page
    print("📰 Getting article list from CNN World News...")
    df_links = scrape_cnn_world_news(driver)

    if df_links.empty:
        print("⚠️ No articles found on main page")
        return pd.DataFrame()

    print(f"📋 Found {len(df_links)} articles. Scraping content for first {max_articles}...")

    # Drop missing and blank URLs, and visit each article only once: the
    # listing can link the same story from several places.
    candidate_urls = df_links["url"].dropna().astype(str).str.strip()
    valid_urls = (
        candidate_urls[candidate_urls != ""]
        .drop_duplicates()
        .head(max_articles)
        .tolist()
    )

    if not valid_urls:
        print("⚠️ No valid URLs found")
        return pd.DataFrame()

    total = len(valid_urls)
    print(f"🔗 Processing {total} valid URLs...")

    articles_data = []
    successful_scrapes = 0

    for i, url in enumerate(valid_urls, 1):
        print(f"\n🔍 Scraping article {i}/{total}")
        # Only mark the URL as truncated when it really was shortened.
        if debug or len(url) <= 80:
            print(f"URL: {url}")
        else:
            print(f"URL: {url[:80]}...")

        article_data = scrape_article_content(driver, url, debug=debug)
        articles_data.append(article_data)

        content_length = article_data.get("content_length", 0)
        if content_length > 0:
            successful_scrapes += 1
            print(f"✅ Success: {content_length} characters")
        else:
            print("❌ No content extracted")

        # Throttle between requests; waiting after the final one is pointless.
        if i < total and delay_seconds and delay_seconds > 0:
            time.sleep(delay_seconds)

    print("\n📊 Scraping completed:")
    print(f"Total articles processed: {len(articles_data)}")
    print(f"Successful content extractions: {successful_scrapes}")
    print(f"Success rate: {(successful_scrapes/len(articles_data)*100):.1f}%")

    return pd.DataFrame(articles_data)

In [32]:
# Debug cell - Test individual article scraping
def debug_single_article(max_urls=3, headless=False):
    """Scrape a few articles with debug output to inspect the extraction.

    Starts its own browser, fetches the headline listing, runs
    ``scrape_article_content`` with ``debug=True`` on the first URLs and
    prints a short summary for each. The browser is always closed at the end.

    Args:
        max_urls: Number of listing URLs to test (default: 3).
        headless: Run Chrome headless. The default ``False`` opens a visible
            window so you can see what's happening.
    """
    driver = setup_driver(headless=headless)
    try:
        # First get some URLs to test
        df_links = scrape_cnn_world_news(driver)
        if df_links.empty:
            print("No URLs found to test")
            return

        # ``head`` already caps the number of URLs, so the loop needs no
        # additional break condition.
        test_urls = df_links['url'].dropna().head(max_urls).tolist()
        print(f"Testing {len(test_urls)} URLs:")

        for number, url in enumerate(test_urls, 1):
            print(f"\n{'='*60}")
            print(f"Testing URL {number}: {url}")
            print('='*60)

            result = scrape_article_content(driver, url, debug=True)

            print(f"\nResults for URL {number}:")
            print(f"Title: {result['title'][:100] if result['title'] else 'EMPTY'}")
            print(f"Content length: {result['content_length']} characters")
            print(f"Author: {result['author'] if result['author'] else 'EMPTY'}")
            print(f"Publish date: {result['publish_date'] if result['publish_date'] else 'EMPTY'}")

            if result['content']:
                print(f"Content preview: {result['content'][:200]}...")

    finally:
        driver.quit()

# Uncomment the line below to run the debug function
# debug_single_article()

In [33]:
def save_to_csv(df: pd.DataFrame, path: "str | Path"):
    """Save DataFrame to CSV file (without the index column).

    Args:
        df: DataFrame to write.
        path: Destination file, given as a ``Path`` or as anything ``Path()``
            accepts, such as a plain string. Missing parent directories are
            created first.
    """
    # Coerce so that a plain string path works too (``str`` has no ``.parent``).
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"✅ Data saved to {path}")

In [34]:
if __name__ == "__main__":
    driver = setup_driver(headless=True)
    try:
        # Stage 1: headline listing only (title and link per story).
        print("Option 1: Scraping headlines only...")
        df_headlines = scrape_cnn_world_news(driver)
        if not df_headlines.empty:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = "data/cnn_world_news_headlines_" + timestamp + ".csv"
            save_to_csv(df_headlines, Path(filename))

        # Stage 2: follow the links and collect the article bodies as well.
        # Kept to 5 articles for a quick first run.
        print("\nOption 2: Scraping full articles with content...")
        df_articles = scrape_cnn_articles_with_content(driver, max_articles=5)
        if df_articles.empty:
            print("⚠️ No articles with content found.")
        else:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = "data/cnn_world_news_full_" + timestamp + ".csv"
            save_to_csv(df_articles, Path(filename))

            # Summary of what was collected.
            print("\n📊 Scraping Statistics:")
            print(f"Total articles scraped: {len(df_articles)}")
            print(f"Average content length: {df_articles['content_length'].mean():.0f} characters")
            print(f"Articles with content: {df_articles['content_length'].gt(0).sum()}")

    except Exception as e:
        # Report the failure instead of a traceback; the browser is still closed below.
        print(f"❌ Error during scraping: {e}")
    finally:
        driver.quit()

Option 1: Scraping headlines only...
✅ Data saved to data\cnn_world_news_headlines_20250917_200833.csv

Option 2: Scraping full articles with content...
📰 Getting article list from CNN World News...
📋 Found 89 articles. Scraping content for first 5...
🔗 Processing 5 valid URLs...

🔍 Scraping article 1/5
URL: https://edition.cnn.com/2025/09/17/middleeast/saudi-arabia-pakistan-defense-pact...
❌ No content extracted

🔍 Scraping article 2/5
URL: https://edition.cnn.com/2025/09/17/middleeast/northern-afghanistan-taliban-inter...
❌ No content extracted

🔍 Scraping article 3/5
URL: https://edition.cnn.com/world/live-news/trump-uk-state-visit-09-17-2025-intl...
❌ No content extracted

🔍 Scraping article 4/5
URL: https://edition.cnn.com/2025/09/17/americas/bolsonaro-skin-cancer-brazil-latam-i...
❌ No content extracted

🔍 Scraping article 5/5
URL: https://edition.cnn.com/2025/09/17/middleeast/israeli-oscars-palestinian-film-in...
❌ No content extracted

📊 Scraping completed:
Total articles proce