# Scrap Latest News from CNN

## Headlines and Links

In [2]:
import requests
from bs4 import BeautifulSoup
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


def get_cnn_news():
    """Extract headlines from CNN's website using more reliable methods"""
    url = "https://www.cnn.com"  # Use main CNN domain which has more content
    
    try:
        # Make request with headers and appropriate timeout
        response = requests.get(url, headers=HEADERS, timeout=15)
        
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return []
            
        soup = BeautifulSoup(response.content, "html.parser")
        
        # CNN structure changes frequently - try multiple strategies
        headlines = []
        seen_urls = set()
        
        # Strategy 1: Find headline containers directly
        for container in soup.select('div[class*="headline"], span[class*="headline"], h3[class*="headline"]'):
            link_tag = container.find('a', href=True) or container.find_parent('a', href=True)
            if link_tag:
                text = container.get_text(strip=True)
                link = link_tag.get('href', '')
                if text and link and link not in seen_urls:
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    headlines.append((text, link))
                    seen_urls.add(link)
        
        # Strategy 2: Find all article links by their URL pattern
        if len(headlines) < 5:
            for link_tag in soup.find_all('a', href=True):
                link = link_tag.get('href', '')
                text = link_tag.get_text(strip=True)
                
                # CNN articles typically include year in URL
                if (text and link and link not in seen_urls and 
                    ('/20' in link or 'article' in link) and 
                    not link.endswith('.jpg') and not link.endswith('.png')):
                    
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    
                    # Ensure it's a CNN link
                    if 'cnn.com' in link:
                        headlines.append((text, link))
                        seen_urls.add(link)
        
        # Limit to 10 headlines
        return headlines[:10]
        
    except Exception as e:
        print(f"Error scraping CNN: {e}")
        return []

print("\nCNN News Headlines:")
for idx, (text, link) in enumerate(get_cnn_news(), 1):
    print(f"{idx}. {text}\n   {link}")


CNN News Headlines:
1. Brooklyn Bridge incident
   https://www.cnn.com/2025/05/17/us/manhattan-brooklyn-bridge-ship
2. Brooklyn Bridge incident
   https://www.cnn.com/2025/05/17/us/manhattan-brooklyn-bridge-ship
3. Russia-Ukraine talks
   https://www.cnn.com/2025/05/16/europe/ukraine-russia-talks-npw-analysis-intl
4. Russia-Ukraine talks
   https://www.cnn.com/2025/05/16/europe/ukraine-russia-talks-npw-analysis-intl
5. Romania’s presidential election re-run
   https://www.cnn.com/2025/05/16/europe/romania-presidential-election-simion-dan-intl-cmd
6. Romania’s presidential election re-run
   https://www.cnn.com/2025/05/16/europe/romania-presidential-election-simion-dan-intl-cmd
7. Eurovision Song Contest
   https://www.cnn.com/2025/05/17/entertainment/eurovision-song-contest-winner-austria-jj-latam-intl
8. Eurovision Song Contest
   https://www.cnn.com/2025/05/17/entertainment/eurovision-song-contest-winner-austria-jj-latam-intl
9. Poland election
   https://www.cnn.com/2025/05/16/euro

## Save to file with Images

In [3]:
import requests
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime
import time
from urllib.parse import urlparse
import hashlib

# Define headers to avoid being blocked
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
}

def get_cnn_news(save_images=True, fetch_content=True, max_articles=10):
    """
    Extract headlines from CNN's website with enhanced functionality:
    - Extracts headlines and links
    - Fetches article content (paragraphs)
    - Downloads and saves images
    - Creates organized folders
    
    Args:
        save_images (bool): Whether to download and save images
        fetch_content (bool): Whether to fetch the full article content
        max_articles (int): Maximum number of articles to scrape
        
    Returns:
        list: List of dictionaries containing article data
    """
    url = "https://www.cnn.com"
    
    # Create directory for saving data
    today = datetime.now().strftime("%Y%m%d")
    base_dir = f"_images_cnn_{today}"
    if save_images and not os.path.exists(base_dir):
        os.makedirs(base_dir)
    
    try:
        # Make request with headers and appropriate timeout
        response = requests.get(url, headers=HEADERS, timeout=15)
        
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return []
            
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find headline containers using multiple strategies
        headlines = []
        seen_urls = set()
        
        # Strategy 1: Find headline containers directly
        for container in soup.select('div[class*="headline"], span[class*="headline"], h3[class*="headline"]'):
            link_tag = container.find('a', href=True) or container.find_parent('a', href=True)
            if link_tag:
                text = container.get_text(strip=True)
                link = link_tag.get('href', '')
                if text and link and link not in seen_urls:
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    
                    # Initialize article data dictionary
                    article_data = {
                        "headline": text,
                        "link": link,
                        "content": "",
                        "images": []
                    }
                    
                    headlines.append(article_data)
                    seen_urls.add(link)
        
        # Strategy 2: Find all article links by their URL pattern
        if len(headlines) < 3:
            for link_tag in soup.find_all('a', href=True):
                link = link_tag.get('href', '')
                text = link_tag.get_text(strip=True)
                
                # CNN articles typically include year in URL
                if (text and link and link not in seen_urls and 
                    ('/20' in link or 'article' in link) and 
                    not link.endswith('.jpg') and not link.endswith('.png')):
                    
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    
                    # Ensure it's a CNN link
                    if 'cnn.com' in link:
                        article_data = {
                            "headline": text,
                            "link": link,
                            "content": "",
                            "images": []
                        }
                        
                        headlines.append(article_data)
                        seen_urls.add(link)
        
        # Limit to max_articles
        headlines = headlines[:max_articles]
        
        # Fetch article content and images if requested
        if fetch_content:
            for idx, article in enumerate(headlines):
                print(f"Fetching article {idx+1}/{len(headlines)}: {article['headline']}")
                
                try:
                    # Add a delay to avoid overwhelming the server
                    time.sleep(1)
                    
                    # Fetch article page
                    article_response = requests.get(article['link'], headers=HEADERS, timeout=15)
                    if article_response.status_code != 200:
                        print(f"  Error: Could not fetch article. Status code: {article_response.status_code}")
                        continue
                        
                    article_soup = BeautifulSoup(article_response.content, "html.parser")
                    
                    # Extract article content (paragraphs)
                    # Look for the main content area with multiple fallbacks
                    content_container = article_soup.select_one('div[class*="article__content"], div[class*="body"], div[class*="story-body"]')
                    if not content_container:
                        content_container = article_soup  # Fallback to the entire page
                    
                    # Extract paragraphs
                    paragraphs = content_container.find_all('p')
                    article_text = '\n\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
                    article['content'] = article_text
                    
                    # Extract images if requested
                    if save_images:
                        # Find all images in the article
                        images = content_container.find_all('img', src=True)
                        
                        # Create article-specific directory (using a hash of the headline to avoid invalid filenames)
                        article_hash = hashlib.md5(article['headline'].encode()).hexdigest()[:10]
                        article_dir = os.path.join(base_dir, article_hash)
                        if not os.path.exists(article_dir):
                            os.makedirs(article_dir)
                        
                        # Download and save images
                        for i, img in enumerate(images):
                            img_url = img['src']
                            
                            # Skip data URLs and invalid URLs
                            if img_url.startswith('data:') or not img_url or img_url == '#':
                                continue
                                
                            # Make URL absolute if needed
                            if img_url.startswith('/'):
                                img_url = f"https://www.cnn.com{img_url}"
                                
                            try:
                                # Get the image extension
                                img_ext = os.path.splitext(urlparse(img_url).path)[1]
                                if not img_ext:
                                    img_ext = '.jpg'  # Default extension
                                
                                # Create the image filename
                                img_filename = f"image_{i+1}{img_ext}"
                                img_path = os.path.join(article_dir, img_filename)
                                
                                # Download the image
                                img_response = requests.get(img_url, headers=HEADERS, timeout=10)
                                if img_response.status_code == 200:
                                    with open(img_path, 'wb') as f:
                                        f.write(img_response.content)
                                    article['images'].append({
                                        'url': img_url,
                                        'local_path': img_path
                                    })
                                    print(f"  Saved image: {img_path}")
                            except Exception as e:
                                print(f"  Error saving image {img_url}: {e}")
                
                except Exception as e:
                    print(f"  Error processing article: {e}")
        
        return headlines
        
    except Exception as e:
        print(f"Error scraping CNN: {e}")
        return []

# Get the news with enhanced functionality
print("CNN News Headlines with Content and Images:")
articles = get_cnn_news(save_images=True, fetch_content=True, max_articles=5)

# Display the results
for idx, article in enumerate(articles, 1):
    print(f"\n{idx}. {article['headline']}")
    print(f"   Link: {article['link']}")
    
    # Show content preview (first 200 characters)
    content_preview = article['content'][:200].replace('\n', ' ')
    if article['content'] and len(article['content']) > 200:
        content_preview += "..."
    print(f"   Content preview: {content_preview}")
    
    # Show images
    if article['images']:
        print(f"   Downloaded {len(article['images'])} images")
    else:
        print("   No images downloaded")
    
    print("-" * 80)

# Save all data to a markdown file for easy viewing
markdown_path = f"cnn_news_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
with open(markdown_path, 'w', encoding='utf-8') as f:
    f.write(f"# CNN News Articles - {datetime.now().strftime('%Y-%m-%d')}\n\n")
    
    for idx, article in enumerate(articles, 1):
        f.write(f"## {idx}. {article['headline']}\n\n")
        f.write(f"**Link**: [{article['link']}]({article['link']})\n\n")
        
        if article['content']:
            f.write("### Content:\n\n")
            f.write(article['content'].replace('\n', '\n\n'))
            f.write("\n\n")
        
        if article['images']:
            f.write("### Images:\n\n")
            for img in article['images']:
                f.write(f"- [{img['url']}]({img['local_path']})\n")
            f.write("\n")
        
        f.write("---\n\n")

print(f"\nSaved all articles to {markdown_path}")

CNN News Headlines with Content and Images:
Fetching article 1/5: Brooklyn Bridge incident
  Saved image: _images_cnn_20250518\16f2d32bf2\image_1.jpg
  Saved image: _images_cnn_20250518\16f2d32bf2\image_2.jpg
  Saved image: _images_cnn_20250518\16f2d32bf2\image_3.jpg
Fetching article 2/5: Brooklyn Bridge incident
  Saved image: _images_cnn_20250518\16f2d32bf2\image_1.jpg
  Saved image: _images_cnn_20250518\16f2d32bf2\image_2.jpg
  Saved image: _images_cnn_20250518\16f2d32bf2\image_3.jpg
Fetching article 3/5: Russia-Ukraine talks
  Saved image: _images_cnn_20250518\6c9d82cac4\image_1.jpg
  Saved image: _images_cnn_20250518\6c9d82cac4\image_2.JPG
  Saved image: _images_cnn_20250518\6c9d82cac4\image_3.jpg
Fetching article 4/5: Russia-Ukraine talks
  Saved image: _images_cnn_20250518\6c9d82cac4\image_1.jpg
  Saved image: _images_cnn_20250518\6c9d82cac4\image_2.JPG
  Saved image: _images_cnn_20250518\6c9d82cac4\image_3.jpg
Fetching article 5/5: Romania’s presidential election re-run
  Saved

## Avoid yourself from being in trouble

In [15]:
import requests
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime
import time
from urllib.parse import urlparse
import hashlib
import random
import logging
from requests.exceptions import RequestException, Timeout, ConnectionError
from fake_useragent import UserAgent

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='cnn_scraper.log'
)
logger = logging.getLogger('cnn_scraper')

# Generate random user agent
try:
    ua = UserAgent()
    random_ua = ua.random
except:
    random_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Define headers to avoid being blocked
HEADERS = {
    'User-Agent': random_ua,
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml',
    'Referer': 'https://www.google.com/',
    'DNT': '1'
}

# List of free proxies - update these with working proxies
PROXIES = [
    'http://103.83.232.122:80',
    'http://103.216.82.19:6666',
    'http://34.87.84.105:80',
    None  # No proxy as fallback
]

def get_cnn_news(save_images=True, fetch_content=True, max_articles=10, retry_limit=3):
    """
    Extract headlines from CNN's website with enhanced functionality:
    - Extracts headlines and links
    - Fetches article content (paragraphs)
    - Downloads and saves images
    - Creates organized folders
    - Uses proxies and implements retry logic
    
    Args:
        save_images (bool): Whether to download and save images
        fetch_content (bool): Whether to fetch the full article content
        max_articles (int): Maximum number of articles to scrape
        retry_limit (int): Max number of retries for failed requests
        
    Returns:
        list: List of dictionaries containing article data
    """
    url = "https://www.cnn.com"
    
    # Create directory for saving data
    today = datetime.now().strftime("%Y%m%d")
    base_dir = f"_images_cnn_{today}"
    if save_images and not os.path.exists(base_dir):
        os.makedirs(base_dir)
    
    try:
        # Make request with headers, proxies and appropriate timeout
        articles_fetched = False
        for attempt in range(retry_limit):
            try:
                # Random delay to avoid scraping patterns
                time.sleep(random.uniform(1, 3))
                
                # Try with different proxies
                proxy = random.choice(PROXIES)
                proxies = {'http': proxy, 'https': proxy} if proxy else None
                
                logger.info(f"Connecting to CNN with proxy: {proxy}")
                response = requests.get(
                    url, 
                    headers=HEADERS, 
                    proxies=proxies,
                    timeout=15,
                    verify=True
                )
                
                if response.status_code == 200:
                    articles_fetched = True
                    break
                else:
                    logger.warning(f"Attempt {attempt+1}: Received status code {response.status_code}")
            
            except (ConnectionError, Timeout) as e:
                logger.warning(f"Attempt {attempt+1} failed: {str(e)}")
                continue
        
        if not articles_fetched:
            logger.error("Failed to connect to CNN after multiple attempts")
            return []
            
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find headline containers using multiple strategies
        headlines = []
        seen_urls = set()
        
        # Strategy 1: Find headline containers directly
        for container in soup.select('div[class*="headline"], span[class*="headline"], h3[class*="headline"]'):
            link_tag = container.find('a', href=True) or container.find_parent('a', href=True)
            if link_tag:
                text = container.get_text(strip=True)
                link = link_tag.get('href', '')
                if text and link and link not in seen_urls:
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    
                    # Initialize article data dictionary
                    article_data = {
                        "headline": text,
                        "link": link,
                        "content": "",
                        "images": [],
                        "timestamp": datetime.now().isoformat()
                    }
                    
                    headlines.append(article_data)
                    seen_urls.add(link)
        
        # Strategy 2: Find all article links by their URL pattern
        if len(headlines) < 3:
            for link_tag in soup.find_all('a', href=True):
                link = link_tag.get('href', '')
                text = link_tag.get_text(strip=True)
                
                # CNN articles typically include year in URL
                if (text and link and link not in seen_urls and 
                    ('/20' in link or 'article' in link) and 
                    not link.endswith('.jpg') and not link.endswith('.png')):
                    
                    if link.startswith('/'):
                        link = f"https://www.cnn.com{link}"
                    
                    # Ensure it's a CNN link
                    if 'cnn.com' in link:
                        article_data = {
                            "headline": text,
                            "link": link,
                            "content": "",
                            "images": [],
                            "timestamp": datetime.now().isoformat()
                        }
                        
                        headlines.append(article_data)
                        seen_urls.add(link)
        
        # Limit to max_articles
        headlines = headlines[:max_articles]
        
        # Fetch article content and images if requested
        if fetch_content:
            for idx, article in enumerate(headlines):
                logger.info(f"Fetching article {idx+1}/{len(headlines)}: {article['headline']}")
                
                try:
                    # Random delay between requests (1-5 seconds)
                    time.sleep(random.uniform(1, 5))
                    
                    # Try multiple proxies for each article
                    article_fetched = False
                    for attempt in range(retry_limit):
                        try:
                            # Rotate proxies
                            proxy = random.choice(PROXIES)
                            proxies = {'http': proxy, 'https': proxy} if proxy else None
                            
                            # Fetch article page
                            article_response = requests.get(
                                article['link'], 
                                headers=HEADERS, 
                                proxies=proxies,
                                timeout=15
                            )
                            
                            if article_response.status_code == 200:
                                article_fetched = True
                                break
                            else:
                                logger.warning(f"  Attempt {attempt+1}: Status code: {article_response.status_code}")
                        
                        except (ConnectionError, Timeout) as e:
                            logger.warning(f"  Article fetch attempt {attempt+1} failed: {str(e)}")
                            continue
                    
                    if not article_fetched:
                        logger.error(f"  Failed to fetch article after {retry_limit} attempts")
                        continue
                        
                    article_soup = BeautifulSoup(article_response.content, "html.parser")
                    
                    # Extract article content (paragraphs)
                    # Look for the main content area with multiple fallbacks
                    content_container = article_soup.select_one('div[class*="article__content"], div[class*="body"], div[class*="story-body"]')
                    if not content_container:
                        content_container = article_soup  # Fallback to the entire page
                    
                    # Extract paragraphs
                    paragraphs = content_container.find_all('p')
                    article_text = '\n\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
                    article['content'] = article_text
                    
                    # Extract images if requested
                    if save_images:
                        # Find all images in the article
                        images = content_container.find_all('img', src=True)
                        
                        # Create article-specific directory (using a hash of the headline to avoid invalid filenames)
                        article_hash = hashlib.md5(article['headline'].encode()).hexdigest()[:10]
                        article_dir = os.path.join(base_dir, article_hash)
                        if not os.path.exists(article_dir):
                            os.makedirs(article_dir)
                        
                        # Download and save images
                        for i, img in enumerate(images):
                            img_url = img['src']
                            
                            # Skip data URLs and invalid URLs
                            if img_url.startswith('data:') or not img_url or img_url == '#':
                                continue
                                
                            # Make URL absolute if needed
                            if img_url.startswith('/'):
                                img_url = f"https://www.cnn.com{img_url}"
                                
                            try:
                                # Random delay between image downloads
                                time.sleep(random.uniform(0.5, 2))
                                
                                # Get the image extension
                                img_ext = os.path.splitext(urlparse(img_url).path)[1]
                                if not img_ext:
                                    img_ext = '.jpg'  # Default extension
                                
                                # Create the image filename
                                img_filename = f"image_{i+1}{img_ext}"
                                img_path = os.path.join(article_dir, img_filename)
                                
                                # Try multiple proxies for each image
                                for attempt in range(retry_limit):
                                    try:
                                        # Rotate proxies
                                        proxy = random.choice(PROXIES)
                                        proxies = {'http': proxy, 'https': proxy} if proxy else None
                                        
                                        # Download the image
                                        img_response = requests.get(
                                            img_url, 
                                            headers=HEADERS, 
                                            proxies=proxies,
                                            timeout=10
                                        )
                                        
                                        if img_response.status_code == 200:
                                            with open(img_path, 'wb') as f:
                                                f.write(img_response.content)
                                            article['images'].append({
                                                'url': img_url,
                                                'local_path': img_path
                                            })
                                            logger.info(f"  Saved image: {img_path}")
                                            break
                                    except Exception as e:
                                        logger.warning(f"  Image download attempt {attempt+1} failed: {str(e)}")
                                        if attempt == retry_limit - 1:
                                            logger.error(f"  Failed to download image {img_url} after {retry_limit} attempts")
                            except Exception as e:
                                logger.error(f"  Error processing image {img_url}: {e}")
                
                except Exception as e:
                    logger.error(f"  Error processing article: {e}")
        
        return headlines
        
    except Exception as e:
        logger.error(f"Error scraping CNN: {e}")
        return []

# Get the news with enhanced functionality
print("CNN News Headlines with Content and Images:")
articles = get_cnn_news(save_images=True, fetch_content=True, max_articles=5)

# Display the results
for idx, article in enumerate(articles, 1):
    print(f"\n{idx}. {article['headline']}")
    print(f"   Link: {article['link']}")
    
    # Show content preview (first 200 characters)
    content_preview = article['content'][:200].replace('\n', ' ')
    if article['content'] and len(article['content']) > 200:
        content_preview += "..."
    print(f"   Content preview: {content_preview}")
    
    # Show images
    if article['images']:
        print(f"   Downloaded {len(article['images'])} images")
    else:
        print("   No images downloaded")
    
    print("-" * 80)

# Save all data to a markdown file for easy viewing
markdown_path = f"cnn_news_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
with open(markdown_path, 'w', encoding='utf-8') as f:
    f.write(f"# CNN News Articles - {datetime.now().strftime('%Y-%m-%d')}\n\n")
    
    for idx, article in enumerate(articles, 1):
        f.write(f"## {idx}. {article['headline']}\n\n")
        f.write(f"**Link**: [{article['link']}]({article['link']})\n\n")
        
        if article['content']:
            f.write("### Content:\n\n")
            f.write(article['content'].replace('\n', '\n\n'))
            f.write("\n\n")
        
        if article['images']:
            f.write("### Images:\n\n")
            for img in article['images']:
                f.write(f"- [{img['url']}]({img['local_path']})\n")
            f.write("\n")
        
        f.write("---\n\n")

print(f"\nSaved all articles to {markdown_path}")

CNN News Headlines with Content and Images:

1. Nervous about the prospect of empty shelves and inflation, the US president sent in his even-keeled, professional negotiators to Geneva
   Link: https://www.cnn.com/2025/05/12/business/china-trade-deal-trump
   Content preview: President Donald Trump’s shock-and-awe tariff approach threatened to rupture the global financial system and drive the US economy into recession. Nervous about the prospect of empty store shelves and ...
   No images downloaded
--------------------------------------------------------------------------------

2. Nervous about the prospect of empty shelves and inflation, the US president sent in his even-keeled, professional negotiators to Geneva
   Link: https://www.cnn.com/2025/05/12/business/china-trade-deal-trump
   Content preview: President Donald Trump’s shock-and-awe tariff approach threatened to rupture the global financial system and drive the US economy into recession. Nervous about the prospect of empty s