# In [None]:
import datetime
import math
import os

import pandas as pd
import yfinance as yf

def generate_sp500_data(
    *,
    max_tickers=200,
    years=20,
    min_coverage=0.95,
    output_path='data/portfolio_allocation.csv',
):
    """Download adjusted close prices for S&P 500 tickers and save them as CSV.

    The ticker list is read from the first table of the Wikipedia
    "List of S&P 500 companies" page, prices are downloaded with yfinance,
    sparse columns are dropped, remaining gaps are forward-filled, and the
    result is written to ``output_path``. Progress is reported via ``print``.

    Called without arguments, the function uses the same settings as before:
    200 tickers, a 20-year window, 95% coverage and the same output file.

    Args:
        max_tickers: Number of tickers to take from the top of the table, or
            ``None`` to use every ticker found.
        years: Length of the look-back window in calendar years, ending now.
        min_coverage: Fraction (0-1) of rows that must be non-NaN for a
            ticker's column to be kept.
        output_path: Destination CSV path; its parent directory is created
            when missing.

    Returns:
        None. On failure (ticker list unavailable, nothing downloaded, or
        nothing left after cleaning) a message is printed and no file is
        written.
    """
    print("--- Starting S&P 500 Data Download ---")

    # 1. Get S&P 500 tickers from Wikipedia
    try:
        payload = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
        sp500_table = payload[0]
        tickers = sp500_table['Symbol'].dropna().astype(str).tolist()
    except Exception as e:
        # Best-effort boundary: network, parser and schema errors all end up
        # here and are reported rather than raised.
        print(f"Error fetching Wikipedia list: {e}")
        return

    # Fix tickers with dots (e.g., BRK.B -> BRK-B) for Yahoo Finance
    tickers = [ticker.replace('.', '-') for ticker in tickers]
    print(f"Found {len(tickers)} tickers on Wikipedia.")

    # 2. Select the first `max_tickers` tickers (None keeps all of them).
    # NOTE(review): this is plain table order, not a ranking by index weight;
    # confirm the table's sort order before treating these as the "top" names.
    selected_tickers = tickers[:max_tickers]
    if not selected_tickers:
        print("No tickers selected; nothing to download.")
        return

    # 3. Define date range: last `years` calendar years.
    # DateOffset steps whole calendar years, so leap days are accounted for
    # (a fixed 365-day year would come up a few days short).
    end_date = datetime.datetime.now()
    start_date = (pd.Timestamp(end_date) - pd.DateOffset(years=years)).to_pydatetime()

    print(f"Downloading data for {len(selected_tickers)} companies from {start_date.date()} to {end_date.date()}...")
    print("This may take a minute...")

    # 4. Download data (Adj Close only)
    # threads=True uses multi-threading to download faster.
    # auto_adjust=False is passed explicitly: the 'Adj Close' column only
    # exists when auto-adjustment is off, and the library default for this
    # flag differs between yfinance releases.
    raw = yf.download(
        selected_tickers,
        start=start_date,
        end=end_date,
        progress=True,
        threads=True,
        auto_adjust=False,
    )
    if raw is None or raw.empty:
        print("Error: no price data was downloaded.")
        return

    data = raw['Adj Close']
    if isinstance(data, pd.Series):
        # A single-ticker download may yield a Series; keep the frame shape
        # the cleaning steps below expect.
        data = data.to_frame(selected_tickers[0])

    # 5. Clean Data
    # Remove columns with too many NaNs (companies without enough history).
    # For integer counts, `count >= ceil(x)` is equivalent to `count >= x`,
    # so rounding up only satisfies dropna's integer `thresh` contract.
    threshold = math.ceil(min_coverage * len(data))
    before_count = data.shape[1]
    data_clean = data.dropna(axis=1, thresh=threshold)

    # Forward fill remaining small gaps and drop any leading NaNs
    data_clean = data_clean.ffill().dropna()

    after_count = data_clean.shape[1]
    print(f"Data cleaning: kept {after_count} of {before_count} companies that have ~{years} years of history.")
    print(f"Final dataset shape: {data_clean.shape}")

    if data_clean.empty:
        # Do not write an empty file and report it as a success.
        print("Error: no data left after cleaning; nothing was saved.")
        return

    # 6. Save to CSV
    # Ensure the target directory exists; a bare filename has no directory
    # part and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data_clean.to_csv(output_path)
    print(f"Success! File saved to: {output_path}")
    print("\nFirst 5 rows:")
    print(data_clean.head())

# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    # Requires the third-party yfinance package: pip install yfinance
    generate_sp500_data()