In [6]:
!pip install requests beautifulsoup4
!pip install pymongo


Collecting pymongo
  Downloading pymongo-4.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [24]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient, errors
import datetime

# Function to connect to MongoDB
def connect_mongo():
    try:
        client = MongoClient("mongodb+srv://Muskan:MUSKAN25@newsanalytics.rzmco.mongodb.net/?retryWrites=true&w=majority&appName=NEWSANALYTICS")
        db = client["news_database"]
        collection = db["bbc_headlines"]

        # Remove duplicate entries with empty or duplicate links before creating the index
        print("Cleaning up duplicate entries...")
        collection.delete_many({"link": ""})  # Remove records with empty links
        print("Removed empty links.")

        # Ensure unique index on the 'link' field
        collection.create_index("link", unique=True)
        print("Unique index created on 'link' field.")

        return collection
    except Exception as e:
        print("Error connecting to MongoDB:", e)
        return None

# Function to scrape BBC News headlines
def scrape_bbc_news():
    url = "https://www.bbc.com/news"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all headlines
            headlines = soup.select('h2')

            if not headlines:
                print("No headlines found. The page structure may have changed.")
                return []

            news_list = []
            for headline in headlines:
                title = headline.get_text(strip=True)
                link_tag = headline.find_parent('a')  # Get the parent anchor tag for link
                link = link_tag['href'] if link_tag else ''
                full_link = f"https://www.bbc.com{link}" if link.startswith('/') else link

                # Skip empty links
                if full_link.strip():
                    news_list.append({
                        "title": title,
                        "link": full_link,
                        "scraped_at": datetime.datetime.now().isoformat()
                    })

            return news_list
        else:
            print(f"Failed to retrieve BBC News. Status code: {response.status_code}")
            return []

    except Exception as e:
        print("Error occurred while scraping BBC News:", e)
        return []

# Function to store news in MongoDB with deduplication
def store_in_mongo(news_data):
    collection = connect_mongo()
    if collection is not None:
        inserted_count = 0
        for news in news_data:
            try:
                collection.insert_one(news)
                inserted_count += 1
            except errors.DuplicateKeyError:
                print(f"Duplicate entry found, skipping: {news['title']}")

        print(f"Inserted {inserted_count} new articles into MongoDB.")
    else:
        print("Failed to connect to MongoDB.")

# Main function to run the scraper and store data
if __name__ == "__main__":
    print("Starting BBC News scraping...")
    news_data = scrape_bbc_news()

    if news_data:
        for news in news_data:
            print(f"Title: {news['title']}\nLink: {news['link']}\n")

        store_in_mongo(news_data)
    else:
        print("No news articles scraped.")


Starting BBC News scraping...
Title: Nvidia shares sink as Chinese AI app spooks markets
Link: https://www.bbc.com/news/articles/c0qw7z2v1pgo

Title: Is China's AI tool DeepSeek as good as it seems?
Link: https://www.bbc.com/news/articles/cx2jxvn0r51o

Title: Palestinians return to north Gaza on foot, with belongings in hand
Link: https://www.bbc.com/news/videos/cy5k03yq16lo

Title: Belgian footballer Nainggolan arrested in cocaine trafficking sting
Link: https://www.bbc.com/news/articles/c627zx21k94o

Title: Holocaust survivors recall horrors of Auschwitz as Prince William and Kate attend London memorial
Link: https://www.bbc.com/news/live/c5yep0l5545t

Title: Is China's AI tool DeepSeek as good as it seems?
Link: https://www.bbc.com/news/articles/cx2jxvn0r51o

Title: Trump administration fires justice department officials who investigated him
Link: https://www.bbc.com/news/live/cjw461nelzdt

Title: Israel says eight hostages due to be freed in first phase are dead
Link: https://www.b