In [1]:
!pip install requests beautifulsoup4 pandas openpyxl



In [4]:
import requests   # Import the requests library to fetch web content
from bs4 import BeautifulSoup   # Import BeautifulSoup for HTML parsing
import pandas as pd  # Import pandas for data manipulation and Excel handling


# Fetch the HTML of the BBC homepage
url = "https://www.bbc.com" # The URL of the website to scrape
response = requests.get(url)  # Fetch the page content using the GET method

# Checking if the request was successful
if response.status_code == 200:
    print("Successfully fetched the webpage.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

soup = BeautifulSoup(response.content, 'html.parser') # Parse the HTML content using BeautifulSoup


headlines = soup.find_all('a', class_='gs-c-promo-heading') # Find all <a> tags with the class 'gs-c-promo-heading'
print(f"Found {len(headlines)} headlines!")

headlines_list = []  # Initialize an empty list to store our headlines and URLs

for headline in headlines:  # Loop through each headline in the 'headline' list
    headline_text = headline.get_text() # Extract the text of the headline
    headline_url = headline.get('href') # Extract the URL (href attribute) of the link

    # If the URL starts with a '/', it is a relative URL
    if headline_url.startswith("/"):
        headline_url = "https://www.bbc.com" + headline_url # Fix the relative URL by pretending the base URL

    # Add the headline and its URL to our list
    headlines_list.append({"headline": headline_text, "url": headline_url})

# Check if any headlines were added to the list
print(f"Extracted {len(headlines_list)} headlines!")

df = pd.DataFrame(headlines_list) # Convert the list of dictionaries into a pandas DataFrame
print(df.head())

df.to_excel('bbc_headlines.xlsx', index=False)  # save the DataFrame to an Excel file
print("Data saved to 'bbc_headlines.xlsx' successfully!")
# 'bbc_headlines.xlsx' is the name of the file where the data will be saved.
# 'index=False' means we don't want to save the index (row numbers) in the Excel file.







Successfully fetched the webpage.
Found 0 headlines!
Extracted 0 headlines!
Empty DataFrame
Columns: []
Index: []
Data saved to 'bbc_headlines.xlsx' successfully!
