In [1]:
import time
from datetime import datetime, timedelta

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
# Multipliers for abbreviated magnitudes such as "1.5M" or "2.3B".
_SUFFIX_MULTIPLIERS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

# Column order of the DataFrame returned by scrape_yahoo_finance().
_COLUMNS = ["Timestamp", "Company", "Symbol", "Price", "Change", "%Change", "Volume", "Market Cap (B)"]


def _to_float(text):
    """Convert a table cell such as "+1,234.5", "-0.20%" or "1.5M" to a float."""
    cleaned = text.strip().replace(',', '').replace('+', '').replace('%', '')
    multiplier = _SUFFIX_MULTIPLIERS.get(cleaned[-1:].upper())
    if multiplier is None:
        return float(cleaned)
    return float(cleaned[:-1]) * multiplier


def _parse_row(row):
    """Return the parsed fields of one table row, or None if it has fewer than six cells."""
    cols = row.find_all("td")
    if len(cols) < 6:
        return None
    symbol = cols[0].text.strip()
    company = cols[1].text.strip()
    price = _to_float(cols[2].text)
    change = _to_float(cols[3].text)
    percent_change = _to_float(cols[4].text)
    volume = cols[5].text.strip()  # kept as the displayed text in the output
    market_cap = round(price * _to_float(volume) / 1e9, 2)
    return [company, symbol, price, change, percent_change, volume, market_cap]


def _fetch_page_source(url, wait_seconds):
    """Load ``url`` in headless Chrome and return its HTML, always closing the browser."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    # Selenium 4 takes the driver binary through a Service object; passing the
    # path positionally collides with the ``options`` parameter.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        if wait_seconds > 0:
            time.sleep(wait_seconds)  # give the page time to render its table
        return driver.page_source
    finally:
        # Runs on failure too, so no headless Chrome process is left behind.
        driver.quit()


def scrape_yahoo_finance(n_rows=50000, wait_seconds=5):
    """Scrape Yahoo Finance "most active" and tile the rows into a synthetic dataset.

    The page is fetched once. Its table rows are then repeated cyclically until
    ``n_rows`` iterations have been made, each iteration getting a timestamp one
    minute after the previous one, starting five days before the call. The
    output is therefore simulated data, not a real price history.

    Args:
        n_rows: Number of iterations over the scraped rows (default 50000).
            Rows with fewer than six cells are skipped but still consume
            their minute slot, so the result can hold fewer than ``n_rows`` rows.
        wait_seconds: Seconds to sleep after loading the page (default 5).

    Returns:
        pandas.DataFrame with columns "Timestamp", "Company", "Symbol", "Price",
        "Change", "%Change", "Volume" and "Market Cap (B)". "Volume" holds the
        cell text as displayed; "Market Cap (B)" is price * volume / 1e9 rounded
        to two decimals. The frame is empty (columns still set) when the table
        is missing or has no body rows.

    Raises:
        ValueError: If a numeric cell in a used row cannot be parsed.
    """
    page_source = _fetch_page_source("https://finance.yahoo.com/most-active", wait_seconds)

    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find("table", class_="W(100%)")
    rows = table.find_all("tr")[1:] if table else []  # skip header
    if not rows:
        # Avoids the modulo-by-zero a header-only table would otherwise cause.
        return pd.DataFrame([], columns=_COLUMNS)

    # Parse each scraped row once instead of on every repetition; only rows that
    # will actually be used are parsed.
    parsed = [_parse_row(row) for row in rows[:max(n_rows, 0)]]

    data = []
    start = datetime.now() - timedelta(days=5)  # start 5 days ago
    for i in range(n_rows):
        fields = parsed[i % len(parsed)]
        if fields is None:
            continue
        # One minute per iteration, including skipped ones.
        data.append([start + timedelta(minutes=i)] + fields)

    return pd.DataFrame(data, columns=_COLUMNS)

In [None]:
# Run the scraper; `df` holds the resulting frame.
df = scrape_yahoo_finance()

# Write the frame to CSV (row index omitted) and report completion.
df.to_csv(path_or_buf="stock_market_dataset_50000_rows.csv", index=False)
print("Scraping completed. Dataset saved.")