In [1]:
!pip install requests beautifulsoup4 pandas openpyxl



In [9]:
from urllib.parse import urljoin  # Import urljoin to resolve relative links against the page URL

import requests   # Import the requests library to fetch web content
from bs4 import BeautifulSoup   # Import BeautifulSoup for HTML parsing
import pandas as pd  # Import pandas for data manipulation and Excel handling


# Fetch the HTML of the Hacker News front page
url = "https://news.ycombinator.com/"  # The URL of the website to scrape
REQUEST_TIMEOUT_SECONDS = 10  # Give up instead of hanging forever if the server never answers
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)  # Fetch the page content using the GET method

# Stop right here if the request was not successful: parsing an error page
# would silently produce zero headlines and an empty spreadsheet.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")
print("Successfully fetched the webpage.")

soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content using BeautifulSoup


# On Hacker News, each headline is inside a <span> tag with class 'titleline'
headlines = soup.find_all('span', class_='titleline')  # Find all <span> tags with the class 'titleline'
print(f"Found {len(headlines)} headlines.")

data = []  # Create an empty list to store the final structured data

# Loop over each headline element
for h in headlines:
    anchor = h.find('a')  # The first <a> inside the <span> carries the story link
    if anchor is None or not anchor.get('href'):
        # Skip malformed entries instead of crashing on a missing link
        continue

    # Visible text of the whole <span>; this includes the trailing "(site)" label
    title = h.get_text()
    # An href may be relative to the site; urljoin resolves it against `url`
    # and leaves already-absolute URLs unchanged.
    link = urljoin(url, anchor['href'])

    # Append the text and link as a dictionary into the list
    data.append({"headline": title, "url": link})

# Convert the list of dictionaries to a pandas DataFrame (like a table).
# Naming the columns explicitly keeps the header row even if nothing was scraped.
df = pd.DataFrame(data, columns=["headline", "url"])

print(df.head())  # Print the first 5 rows to see what we scraped

# Write the table to an Excel workbook (uses the openpyxl engine for .xlsx files)
df.to_excel("hacker_news_headlines.xlsx", index=False)

# Final confirmation message (printed once)
print("Saved to hacker_news_headlines.xlsx!")




Successfully fetched the webpage.
Found 30 headlines.
                                            headline  \
0            LLMs understand nullability (dmodel.ai)   
1  Decoding the 90s: Cryptography in Early Softwa...   
2  Why Catullus Continues to Seduce Us (newyorker...   
3  Show HN: Uncurl.dev – Convert curl commands to...   
4  A startup doesn't need to be a unicorn (mattgi...   

                                                 url  
0              https://dmodel.ai/nullability-gentle/  
1  https://www.botanica.software/post/decoding-th...  
2  https://www.newyorker.com/magazine/2025/04/07/...  
3                                https://uncurl.dev/  
4  https://mattgiustwilliamson.substack.com/p/you...  
Saved to hacker_news_headlines.xlsx!
Saved to hacker_news_headlines.xlsx!
