In [2]:
import yfinance as yf
import pandas as pd
import datetime
import os

def generate_sp500_data():
    try:
        payload = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
        sp500_table = payload[0]
        tickers = sp500_table['Symbol'].values.tolist()
        tickers = [ticker.replace('.', '-') for ticker in tickers]
        print(f"Found {len(tickers)} tickers on Wikipedia.")
    except Exception as e:
        print(f"Error fetching Wikipedia list: {e}")
        return
    selected_tickers = tickers[:200] 
    end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(days=20*365)
    
    print(f"Downloading data for {len(selected_tickers)} companies from {start_date.date()} to {end_date.date()}...")
    print("This may take a minute...")

    data = yf.download(selected_tickers, start=start_date, end=end_date, progress=True, threads=True)['Adj Close']
    

    threshold = 0.95 * len(data)
    before_count = data.shape[1]
    data_clean = data.dropna(axis=1, thresh=threshold)

    data_clean = data_clean.ffill().dropna()
    
    after_count = data_clean.shape[1]
    print(f"Data cleaning: kept {after_count} of {before_count} companies that have ~20 years of history.")
    print(f"Final dataset shape: {data_clean.shape}")

    output_path = 'data/portfolio_allocation.csv'

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    data_clean.to_csv(output_path)
    print(f"Success! File saved to: {output_path}")
    print("\nFirst 5 rows:")
    print(data_clean.head())

if __name__ == "__main__":
    generate_sp500_data()

Error fetching Wikipedia list: HTTP Error 403: Forbidden
