In [None]:
import yfinance as yf
import os
import pandas as pd

# Download daily OHLCV history for every S&P 500 constituent and store one
# CSV per ticker in `directory`. Tickers with fewer than 100 rows of data are
# skipped and no file is written for them.

# Create a directory to store S&P 500 stock data (no-op if it already exists)
directory = "SP500_05_25"
os.makedirs(directory, exist_ok=True)

# Retrieve the list of S&P 500 stocks from Wikipedia (first table on the page)
sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
table = pd.read_html(sp500_url)[0]

# Wikipedia writes share classes with a dot (e.g. "BRK.B", "BF.B") while
# Yahoo Finance expects a hyphen ("BRK-B", "BF-B"). Without this conversion
# those symbols return no data and are dropped by the length check below.
tickers = [str(symbol).strip().replace(".", "-") for symbol in table['Symbol']]

# Fetch stock data and save as CSV
for ticker in tickers:
    print(f"Fetching data for: {ticker}")
    stock_data = yf.download(ticker, start="2005-01-01", end="2025-03-25")

    # Check if the number of data points is less than 100 days; a failed or
    # empty download also lands here because it yields a zero-length frame.
    if len(stock_data) < 100:
        print(f"Skipping {ticker}, insufficient data: {len(stock_data)} days")
        continue  # Skip this stock and do not save data

    # Keep only the required columns (Open, High, Low, Close, Volume)
    stock_data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']]

    # Save data as CSV. The frame is written as-is (header layout unchanged)
    # because the cleaning step that follows expects this raw layout.
    stock_data.to_csv(os.path.join(directory, f"{ticker}.csv"))
    print(f"Data for {ticker} has been saved")

print("All eligible stock data has been successfully saved.")


In [None]:
import os
import pandas as pd

# Clean every CSV in `directory` in place: strip leftover header rows, drop
# rows with missing values, and name the first column 'Date'.

# Set storage path
directory = "SP500_05_25"


def clean_price_frame(df):
    """Return a cleaned copy of one raw price table.

    The raw files may carry up to two non-data rows directly below the header
    (presumably the ticker-level header and the index-name line written for
    multi-level columns -- confirm against the download step). Those rows are
    removed only when they are actually present, so cleaning a file twice, or
    cleaning a file that has a single header line, never deletes real rows.

    Args:
        df: Frame as returned by ``pd.read_csv`` on a raw file; the first
            column holds the dates, the remaining columns hold the values.

    Returns:
        A new frame with header residue and NaN rows removed, a fresh
        0..n-1 index, and the first column renamed to ``'Date'``.
    """
    # Only the first two rows can be header residue. A row counts as residue
    # when any of its value columns is not numeric; a genuine row that fails
    # this check has a missing value and would be removed by dropna() anyway.
    head_values = df.iloc[:2, 1:].apply(pd.to_numeric, errors="coerce")
    not_numeric = head_values.isna().any(axis=1)
    residue = not_numeric[not_numeric].index

    cleaned = df.drop(residue).dropna().reset_index(drop=True)

    # Rename through the public API; writing into `columns.values` mutates
    # the Index's backing array, which pandas does not support.
    return cleaned.rename(columns={cleaned.columns[0]: 'Date'})


if os.path.isdir(directory):
    # Process each CSV file (`ticker` is the file name, e.g. "AAPL.csv")
    for ticker in os.listdir(directory):
        if not ticker.endswith(".csv"):
            continue
        file_path = os.path.join(directory, ticker)

        # Read, clean, and overwrite the file without the positional index
        df = clean_price_frame(pd.read_csv(file_path))
        df.to_csv(file_path, index=False)

        print(f"Processing complete: {ticker}")

    print("🎉 All CSV files have been cleaned!")
else:
    # Nothing has been downloaded yet; report it instead of raising from
    # os.listdir so the cell can be run in any order.
    print(f"Directory not found, nothing to clean: {directory}")
