In [3]:
import json
import os
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Chrome options for the scraping session
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # Run in the background
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# Build the driver service from the path returned by ChromeDriverManager,
# then start Chrome with the options assembled above
driver_path = ChromeDriverManager().install()
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)

# Function to generate sitemap date strings (default range: 1-7 January 2025)
def get_january_2025_dates(start_date=datetime(2025, 1, 1), end_date=datetime(2025, 1, 7)):
    """Return one ``YYYY_MM_DD`` string per day from start_date to end_date.

    Despite the function's name, the default range is only the first week of
    January 2025 (1 January through 7 January, inclusive). Pass other bounds
    to cover a different period, e.g. ``end_date=datetime(2025, 1, 31)`` for
    the whole month.

    Args:
        start_date: First day to include (a ``datetime``).
        end_date: Last day to include (a ``datetime``). A day is included
            only if ``start_date`` plus a whole number of days is
            ``<= end_date``.

    Returns:
        list[str]: Dates formatted with ``"%Y_%m_%d"`` in ascending order;
        an empty list when ``end_date`` is earlier than ``start_date``.
    """
    # timedelta.days floors toward negative infinity, so this count is <= 0
    # whenever end_date precedes start_date and range() then yields nothing.
    day_count = (end_date - start_date).days + 1
    return [
        (start_date + timedelta(days=offset)).strftime("%Y_%m_%d")
        for offset in range(day_count)
    ]

# Dictionary to store the count of news links per day
daily_counts = {}

# Get the sitemap dates to scrape (whatever get_january_2025_dates returns)
january_dates = get_january_2025_dates()

# Counter for total links across all days
total_links = 0

# Directory that receives one JSON file per scraped date
OUTPUT_DIR = "data"
# Seconds to pause after each page load
PAGE_LOAD_WAIT_SECONDS = 5

# Create the output directory up front; otherwise open() below raises
# FileNotFoundError only after a whole day of pages has been fetched.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# try/finally guarantees the browser is shut down even when a page load or
# a file write raises part-way through the run.
try:
    # Iterate through each date
    for date in january_dates:
        print(f"Scraping sitemap for date: {date}")
        current_url = f"https://finance.yahoo.com/sitemap/{date}"

        # List to store news links for this date
        news_links = []
        # URLs already fetched for this date; stops a "Next" link that points
        # back at an earlier page from looping forever.
        visited_urls = set()

        while True:
            visited_urls.add(current_url)

            # Load the page
            driver.get(current_url)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Wait for JavaScript to load

            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find the sitemap content div
            sitemap_div = soup.find("div", {"id": "module-sitemapcontent"})

            # Scrape news links from the current page
            if sitemap_div:
                for li in sitemap_div.find_all("li", class_="List(n) Py(3px) Lh(1.2)"):
                    a_tag = li.find("a", href=True)
                    if a_tag and a_tag["href"].startswith("https://finance.yahoo.com/news"):
                        news_links.append(a_tag["href"])
            else:
                print(f"Could not find the sitemap content section on {current_url}")

            # Check for the "Next" link
            next_link = soup.find("a", class_="C($c-fuji-grey-k)", string="Next")
            next_url = None
            if next_link and "href" in next_link.attrs:
                # urljoin leaves an absolute href as it is and resolves a
                # relative one against the page that contained it.
                next_url = urljoin(current_url, next_link["href"])

            if next_url is None or next_url in visited_urls:
                # No new "Next" page, we've reached the final page for this date
                print(f"Finished scraping pages for date: {date}")
                break

            # Update the current_url to the next page
            current_url = next_url
            print(f"Moving to next page: {current_url}")

        # Store the count for this date
        formatted_date = date.replace("_", "-")
        daily_counts[formatted_date] = len(news_links)
        total_links += len(news_links)

        # Save the links for this date to a separate JSON file
        json_file = f"{OUTPUT_DIR}/yahoo_finance_news_{date}.json"
        with open(json_file, mode="w", encoding="utf-8") as file:
            json.dump({"date": formatted_date, "links": news_links}, file, indent=4)
        print(f"Saved {len(news_links)} links to {json_file}")
finally:
    # Close the driver
    driver.quit()

Scraping sitemap for date: 2025_01_01
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735698000000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735708163000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735721040000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735723800000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735728300000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735730408000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735733283000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735736410000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735740000000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735741020000
Moving to next page: https://finance.yahoo.com/sitemap/2025_01_01_start1735742416000
Moving to next page: https:

In [4]:
# Report the per-day link counts gathered by the scraping cell, then the total
summary_header_lines = (
    "\nSummary of news links collected per day:",
    "Date        | Number of News Links",
    "------------|---------------------",
)
for header_line in summary_header_lines:
    print(header_line)

# One row per scraped day, in the order the days were recorded
for day, link_count in daily_counts.items():
    print(f"{day} | {link_count}")

print(f"\nTotal news links collected: {total_links}")


Summary of news links collected per day:
Date        | Number of News Links
------------|---------------------
2025-01-01 | 831
2025-01-02 | 1799
2025-01-03 | 1895
2025-01-04 | 498
2025-01-05 | 361
2025-01-06 | 747
2025-01-07 | 449

Total news links collected: 6580
