In [2]:
from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient

# MongoDB setup
client = MongoClient("mongodb+srv://jashanpreetkaur:jashangill@newsanalytics.rq1k3.mongodb.net/?retryWrites=true&w=majority&appName=NewsAnalytics")
db = client["news_database"]
collection = db["cnn_news"]

# CNN homepage URL
base_url = "https://www.cnn.com"

# Send a GET request to the CNN homepage
try:
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Failed to retrieve page: {e}")
    exit()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all article links on the homepage
articles = soup.find_all('a', class_='container__link')

news_set = set()
news_list = []

def extract_details(article_url):
    """Fetch the article page and extract timestamp and full content."""
    try:
        article_response = requests.get(article_url, timeout=10)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        
        # Extract timestamp
        timestamp_tag = article_soup.find('div', class_='timestamp vossi-timestamp')
        timestamp = timestamp_tag.text.strip() if timestamp_tag else None
        
        # Extract full article content
        paragraphs = article_soup.find_all('p', class_='paragraph')
        summary = " ".join(p.text.strip() for p in paragraphs) if paragraphs else None
        
        return timestamp, summary
    except requests.exceptions.RequestException:
        return None, None

for article in articles:
    headline = article.find('span', class_='container__headline-text')
    link = article.get('href', '')
    full_link = base_url + link if link.startswith("/") else link
    
    # Avoid duplicates
    if full_link in news_set:
        continue
    news_set.add(full_link)
    
    timestamp, summary = extract_details(full_link)
    
    news_item = {
        "headline": headline.text.strip() if headline else None,
        "link": full_link,
        "summary": summary,
        "timestamp": timestamp
    }
    
    # Avoid inserting items with null values
    if all(news_item.values()):
        news_list.append(news_item)

# Insert data into MongoDB
if news_list:
    collection.insert_many(news_list)
    print("News data successfully uploaded to MongoDB.")
else:
    print("No valid news data to upload.")

# Print extracted news headlines, links, summaries, and timestamps
for news in news_list:
    print(f"Headline: {news['headline']}")
    print(f"Link: {news['link']}")
    print(f"Summary: {news['summary']}")
    print(f"Timestamp: {news['timestamp']}\n")

print("News data successfully scraped.")


News data successfully uploaded to MongoDB.
Headline: Trump presidency
Link: https://www.cnn.com/politics/live-news/trump-administration-news-02-16-25/index.html
Timestamp: Updated
        6:35 PM EST, Sun February 16, 2025

Headline: ‘SNL’ 50th anniversary
Link: https://www.cnn.com/2025/02/16/entertainment/how-to-watch-snl50-anniversary-special/index.html
Summary: “Saturday Night Live” is celebrating its big 50th anniversary milestone in a way that only the long-running sketch show can: with a very star-studded special. “SNL50: The Anniversary Special” is set to air this Sunday with favorite “SNL” alums like Adam Sandler, Eddie Murphy and Kate McKinnon, among many others. Paul McCartney is the featured musical guest, while Bad Bunny, Sabrina Carpenter and other artists are also tapped to appear on the special. Here’s everything you need to know about how to tune in to this three-hour birthday celebration for NBC’s most beloved live comedy sketch show: The “Anniversary Special” will ai