In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime, timedelta

# Ticker symbols whose Yahoo Finance news pages will be scraped.
companies = ['META', 'AAPL', 'GOOG', 'AMZN', 'MSFT']  # Add other company symbols/names as needed

# Chrome options for Selenium.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# To pin a specific ChromeDriver binary, build a Service and pass it as well:
# webdriver_service = Service('path/to/chromedriver')  # Replace with the correct path to chromedriver
# driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Create the Chrome driver. The options object has to be handed to the
# constructor; without `options=` the flags configured above are silently
# ignored and a regular (non-headless) browser is started.
driver = webdriver.Chrome(options=chrome_options)

# Titles already collected (used for de-duplication) and the collected rows.
articles_seen = set()
article_data = []  # One dict per unique article

# Function to scroll to the bottom of the page
def scroll_to_bottom(max_scrolls, pause_seconds=3):
    """Scroll the page held by the module-level ``driver`` to load more content.

    After each scroll the function waits ``pause_seconds`` and compares the
    document height with the previous one; it stops as soon as the height no
    longer grows or ``max_scrolls`` successful scrolls have been made.

    Args:
        max_scrolls: Maximum number of scrolls that may extend the page.
        pause_seconds: Seconds to wait after each scroll so new content can
            load. Defaults to 3, the value that used to be hard-coded.

    Returns:
        The number of scrolls after which the page height increased.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_count = 0

    while scroll_count < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)  # Give newly requested articles time to render
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break  # Height unchanged: nothing more was loaded

        last_height = new_height
        scroll_count += 1

    return scroll_count

# Loop through each company and scrape the data
MAX_ARTICLES = 800  # Stop once this many unique articles are collected (adjust as needed)

try:
    for company in companies:
        # The cap applies to the whole run, so stop visiting further companies
        # once it is reached (a `break` in the inner loop alone would not).
        if len(articles_seen) >= MAX_ARTICLES:
            break

        # Construct the URL for the current company and load it
        url = f'https://finance.yahoo.com/quote/{company}/news/'
        driver.get(url)

        # Wait for the page to load completely
        time.sleep(5)

        # Scroll to the bottom of the page to load all articles
        scroll_to_bottom(10)

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all article list items
        articles = soup.select('article section:nth-of-type(2) div div div div ul li')

        # Loop through articles and extract titles, links, and details
        for article in articles:
            # Check the cap before adding so the total never exceeds it
            if len(articles_seen) >= MAX_ARTICLES:
                break

            title_element = article.select_one('section div a h3')
            link_element = article.select_one('section div a')
            body_element = article.select_one('section div a p')  # For the short body
            time_element = article.select_one('.publishing')  # Publishing time or additional info

            # Skip list items that lack a title, link, or body
            if not (title_element and link_element and body_element):
                continue

            link = link_element.get('href')
            if not link:
                continue  # Anchor without an href: nothing to link to

            title = title_element.get_text()
            body = body_element.get_text()
            # Keep only the text after the last '•' of the publishing line
            additional_info = time_element.get_text(strip=True).split('•')[-1].strip() if time_element else None

            # Ensure the link is absolute
            if not link.startswith('http'):
                link = 'https://finance.yahoo.com' + link

            # Avoid duplicates based on title
            if title in articles_seen:
                continue

            articles_seen.add(title)  # Track seen titles
            article_data.append({
                'Title': title,
                'Link': link,
                'Short Body': body,
                'Additional Info': additional_info,
                'Company': company  # Add the company name to the article data
            })
finally:
    # Always close the browser, even if scraping raised part-way through
    driver.quit()

# Convert list to DataFrame; explicit columns keep the schema stable even
# when no article was collected.
df = pd.DataFrame(
    article_data,
    columns=['Title', 'Link', 'Short Body', 'Additional Info', 'Company'],
)

print("Data scraping completed successfully!")


In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so the
# column additions and drops below would also modify the scraped frame.
df_copy = df.copy()

def parse_date_and_time(additional_info, now=None):
    """Convert a relative timestamp such as '3 hours ago' to a date and time.

    Args:
        additional_info: Relative time text (e.g. '45 minutes ago',
            '3 hours ago', 'yesterday', '2 days ago', '1 week ago'). Matching
            is case-insensitive. Non-string or blank values (``None``, NaN)
            are treated as missing.
        now: Reference ``datetime`` the offset is subtracted from. Defaults
            to ``datetime.now()``; pass a fixed value for reproducible results.

    Returns:
        A ``(date, 'HH:MM:SS')`` tuple. ``(None, None)`` when the input is
        missing. Text that matches none of the known units falls back to
        ``now``.
    """
    if now is None:
        now = datetime.now()

    # Articles without a publishing element carry no timestamp text at all.
    if not isinstance(additional_info, str) or not additional_info.strip():
        return None, None

    text = additional_info.strip().lower()

    # Leading count; phrases like 'an hour ago' have no digit and mean one unit.
    try:
        amount = int(text.split()[0])
    except ValueError:
        amount = 1

    if 'minute' in text:
        posted_date = now - timedelta(minutes=amount)
    elif 'hour' in text:
        posted_date = now - timedelta(hours=amount)
    elif 'yesterday' in text:
        # Must be tested before 'day', which is a substring of 'yesterday'.
        posted_date = now - timedelta(days=1)
    elif 'day' in text:
        posted_date = now - timedelta(days=amount)
    elif 'week' in text:
        posted_date = now - timedelta(weeks=amount)
    else:
        posted_date = now

    return posted_date.date(), posted_date.strftime('%H:%M:%S')

# Derive 'Date' and 'Time' from the relative timestamp text. Building plain
# lists (instead of apply + pd.Series) also works when the frame is empty.
parsed_timestamps = [parse_date_and_time(info) for info in df_copy['Additional Info']]
df_copy['Date'] = [posted_date for posted_date, _ in parsed_timestamps]
df_copy['Time'] = [posted_time for _, posted_time in parsed_timestamps]

# No visible cell creates a 'Posted Date' column, so an unconditional drop
# raises KeyError on a fresh run; errors='ignore' makes this a no-op then.
df_copy = df_copy.drop(columns='Posted Date', errors='ignore')

In [None]:
# Save the processed articles as CSV (path is relative to the working
# directory); index=False leaves the DataFrame index out of the file.
df_copy.to_csv("yahoo_finance_articles.csv",index=False)