In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import json
import csv
from datetime import datetime, timedelta
import os

# Configure Chrome so it can run unattended
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # Run in the background
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# Start the browser, using the chromedriver path returned by webdriver_manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)

# Function to generate the dates to process (currently limited to January 1-3, 2025)
def get_january_2025_dates(start_date=None, end_date=None):
    """Return consecutive days as ``YYYY_MM_DD`` strings, oldest first.

    Args:
        start_date: First day of the range (inclusive). Defaults to
            2025-01-01.
        end_date: Last day of the range (inclusive). Defaults to
            2025-01-03, so a call without arguments covers only the first
            three days of January 2025 despite the function's name.

    Returns:
        A list of date strings formatted with ``%Y_%m_%d``. The list is
        empty when ``end_date`` is earlier than ``start_date``.
    """
    if start_date is None:
        start_date = datetime(2025, 1, 1)
    if end_date is None:
        end_date = datetime(2025, 1, 3)

    # ``.days`` floors the difference, so a partial trailing day is included
    # exactly when ``start_date + k days <= end_date`` still holds.
    day_count = (end_date - start_date).days + 1
    return [
        (start_date + timedelta(days=offset)).strftime("%Y_%m_%d")
        for offset in range(day_count)
    ]

# Folder that holds the per-day JSON files with the news links
data_folder = "data"

# Dates to process, as YYYY_MM_DD strings
january_dates = get_january_2025_dates()

# Running tallies: number of links per day, and the total across all days
daily_counts, total_links = {}, 0

# Scrape every article linked from each day's JSON file, write one CSV per
# day, then print a per-day summary.
#
# The whole run sits inside try/finally so the Chrome process started by
# `driver` is shut down even when the run is interrupted (for example by a
# KeyboardInterrupt) or an unexpected error propagates.

# Local import: WebDriverException ships with the selenium package that this
# file already depends on.
from selenium.common.exceptions import WebDriverException

try:
    # Iterate through each date
    for date in january_dates:
        print(f"Processing date: {date}")

        # Construct the path to the JSON file
        json_file = os.path.join(data_folder, f"yahoo_finance_news_{date}.json")
        try:
            with open(json_file, mode="r", encoding="utf-8") as file:
                data = json.load(file)
        except FileNotFoundError:
            print(f"JSON file for {date} not found in {data_folder}. Skipping...")
            # The summary keys use dashes, so convert YYYY_MM_DD accordingly.
            daily_counts[date.replace("_", "-")] = 0
            continue

        # A file that lacks these keys is left to fail loudly (KeyError)
        # rather than being silently skipped.
        news_links = data["links"]
        formatted_date = data["date"]

        # Store the count for this date
        daily_counts[formatted_date] = len(news_links)
        total_links += len(news_links)

        # Rows of (date, link, article text) for this date
        news_articles = []

        # Scrape the content of each news link
        for link in news_links:
            print(f"Scraping article: {link}")
            try:
                driver.get(link)
                time.sleep(3)  # Wait for JavaScript to load
                page_html = driver.page_source
            except WebDriverException as exc:
                # One bad page must not discard the articles already scraped
                # for this day; record the failure and move on.
                print(f"Failed to load {link}: {exc}")
                news_articles.append(
                    (formatted_date, link, "Article could not be loaded")
                )
                continue

            # Parse the rendered page with BeautifulSoup
            soup = BeautifulSoup(page_html, "html.parser")

            # Find the article body.
            # NOTE(review): the class string looks site-generated; confirm it
            # still matches the live page markup.
            article_body = soup.find("div", class_="body yf-lk6bwk")
            if article_body:
                # Join the text of all <p> tags within the article body
                paragraphs = article_body.find_all("p")
                article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
            else:
                article_text = "Article body not found"

            # Add the article data to the list
            news_articles.append((formatted_date, link, article_text))

        # Save the news articles for this date to a separate CSV file
        csv_file = f"yahoo_finance_news_content_{date}.csv"
        with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(["Date", "Link", "Content"])
            writer.writerows(news_articles)
        print(f"Saved {len(news_articles)} articles to {csv_file}")

    # Print the summary of daily counts
    print("\nSummary of news links processed per day:")
    print("Date        | Number of News Links")
    print("------------|---------------------")
    for date, count in daily_counts.items():
        print(f"{date} | {count}")

    print(f"\nTotal news links processed: {total_links}")
finally:
    # Close the driver, whether the run finished or was cut short
    driver.quit()

Processing date: 2025_01_01
Scraping article: https://finance.yahoo.com/news/nova-wellness-group-berhad-klse-000256211.html
Scraping article: https://finance.yahoo.com/news/undiscovered-gems-3-small-caps-000358318.html
Scraping article: https://finance.yahoo.com/news/sinofert-holdings-2-other-dividend-000457419.html
Scraping article: https://finance.yahoo.com/news/3-us-penny-stocks-market-000916579.html
Scraping article: https://finance.yahoo.com/news/withdrew-95k-retirement-plan-put-001212716.html
Scraping article: https://finance.yahoo.com/news/sentry-donates-additional-1-million-001500873.html
Scraping article: https://finance.yahoo.com/news/social-security-checks-came-early-001809129.html
Scraping article: https://finance.yahoo.com/news/one-don-agro-international-insider-001816919.html
Scraping article: https://finance.yahoo.com/news/walgreens-intel-lead-list-stocks-002000767.html
Scraping article: https://finance.yahoo.com/news/shareholders-orica-asx-ori-red-002559768.html
Scrapin

KeyboardInterrupt: 