In [3]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd


In [6]:
def extract_headlines(timeout=10):
    """Scrape headline links from the Reuters World landing page.

    Sends a single GET request with browser-like headers, parses the
    returned HTML and collects every anchor matching
    ``a[data-testid="Heading"]``.

    Args:
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        A list of dicts, one per anchor with non-empty text, with keys:

        * ``"title"`` - the anchor text with surrounding whitespace removed.
        * ``"url"`` - hrefs starting with ``/`` are prefixed with
          ``https://www.reuters.com``; any other href is kept unchanged
          (``None`` when the anchor has no href).
        * ``"date"`` - naive UTC ISO-8601 timestamp taken once per call,
          so it is identical for every row.

        An empty list is returned when the request raises a
        ``requests.RequestException`` or the response status is not 200.
    """
    url = "https://www.reuters.com/world/"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }

    # Without a timeout a stalled connection would block the cell forever.
    # Network-level failures are reported the same way as a bad status code
    # so callers always get a list back.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to fetch page:", exc)
        return []

    if response.status_code != 200:
        print("Failed to fetch page:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # The selector matches on the data-testid attribute only; it is not
    # scoped to any container element.
    # NOTE(review): the output saved in this notebook shows section links
    # (Sports, Science, ...) matching it - confirm the selector against the
    # live markup before relying on these rows as article headlines.
    articles = soup.select('a[data-testid="Heading"]')

    # One timestamp per scrape. datetime.utcnow() is deprecated since
    # Python 3.12; dropping tzinfo keeps the same naive ISO string format.
    scraped_at = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )

    headlines = []
    for article in articles:
        title = article.get_text(strip=True)
        if not title:
            # Anchors without text (e.g. image-only links) carry no headline.
            continue
        href = article.get('href')
        link = f"https://www.reuters.com{href}" if href and href.startswith('/') else href
        headlines.append({
            "title": title,
            "url": link,
            "date": scraped_at
        })

    return headlines


In [7]:
# Run the scraper and load the rows into a DataFrame. Naming the columns
# explicitly keeps the title/url/date schema even when the scrape returns
# an empty list, so the frame (and any CSV written from it) still has a
# header instead of being completely empty.
headlines = extract_headlines()
df = pd.DataFrame(headlines, columns=["title", "url", "date"])
df.head()


Unnamed: 0,title,url,date
0,Sports,https://www.reuters.com/sports/,2025-07-07T01:00:47.674494
1,Science,https://www.reuters.com/science/,2025-07-07T01:00:47.674494
2,Lifestyle,https://www.reuters.com/lifestyle/,2025-07-07T01:00:47.674494
3,Graphics,https://www.reuters.com/graphics/,2025-07-07T01:00:47.674494
4,Pictures,https://www.reuters.com/pictures/,2025-07-07T01:00:47.674494


In [8]:
# Persist the scraped rows as the raw-data CSV. index=False leaves the
# DataFrame's positional index out of the file so only the data columns
# are written.
df.to_csv(path_or_buf="../data/headlines_raw.csv", index=False)
print("Saved extracted headlines to data/headlines_raw.csv")


Saved extracted headlines to data/headlines_raw.csv
