Phase 1: Data Collection and Exploration for Equity Factor Model

In [10]:
import pandas as pd
import numpy as np
import yfinance as yf
import datetime
import time
import os
import requests
import io
import glob
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Suppress warnings
# NOTE: with no category or module filter this ignores *every* warning for the
# whole process (not only this file), including deprecation notices raised by
# the libraries imported above.
warnings.filterwarnings('ignore')

class EquityDataCollector:
    """
    Collects equity price data for multiple stocks and indexes
    """
    def __init__(self, cache_dir='data'):
        """
        Initialize the data collector
        
        Parameters:
        -----------
        cache_dir : str
            Directory to store cached data
        """
        self.cache_dir = cache_dir
        
        # Create cache directory if it doesn't exist
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
            
        # Create subdirectories
        for subdir in ['prices', 'options', 'factors', 'preprocessed']:
            subdir_path = os.path.join(cache_dir, subdir)
            if not os.path.exists(subdir_path):
                os.makedirs(subdir_path)
        
        # Initialize S&P 500 constituents
        self.sp500 = None
        
    def get_sp500_constituents(self, refresh=False):
        """
        Get current S&P 500 constituents with sector information
        
        Parameters:
        -----------
        refresh : bool
            Whether to refresh the constituents list from the web
            
        Returns:
        --------
        pandas.DataFrame: S&P 500 constituents with sector information
        """
        cache_file = os.path.join(self.cache_dir, 'sp500_constituents.csv')
        
        if os.path.exists(cache_file) and not refresh:
            # Load from cache
            print(f"Loading S&P 500 constituents from cache: {cache_file}")
            sp500 = pd.read_csv(cache_file)
        else:
            try:
                # Scrape Wikipedia for S&P 500 constituents
                print("Fetching S&P 500 constituents from Wikipedia...")
                url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
                tables = pd.read_html(url)
                sp500 = tables[0]
                
                # Rename columns
                sp500 = sp500.rename(columns={
                    'Symbol': 'ticker',
                    'Security': 'company_name',
                    'GICS Sector': 'sector',
                    'GICS Sub-Industry': 'sub_industry',
                    'CIK': 'cik',
                    'Headquarters Location': 'location',
                    'Date added': 'date_added',
                    'Founded': 'founded'
                })
                
                # Clean tickers (some may have dots or special characters)
                sp500['ticker'] = sp500['ticker'].str.replace('.', '-')
                
                # Save to cache
                sp500.to_csv(cache_file, index=False)
                print(f"Saved S&P 500 constituents to: {cache_file}")
                
            except Exception as e:
                print(f"Error fetching S&P 500 constituents: {e}")
                # Use a fallback if available
                if os.path.exists(cache_file):
                    print(f"Using cached S&P 500 constituents as fallback")
                    sp500 = pd.read_csv(cache_file)
                else:
                    # Create a simple fallback with major stocks
                    print("Creating fallback S&P 500 data with major stocks")
                    sp500 = pd.DataFrame({
                        'ticker': ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'NVDA', 'JPM', 'V', 'PG',
                                 'UNH', 'HD', 'BAC', 'XOM', 'PFE', 'CSCO', 'CVX', 'ADBE', 'CRM', 'NFLX'],
                        'company_name': ['Apple Inc.', 'Microsoft Corp.', 'Amazon.com Inc.', 'Alphabet Inc.',
                                       'Meta Platforms Inc.', 'Tesla Inc.', 'NVIDIA Corp.', 'JPMorgan Chase & Co.',
                                       'Visa Inc.', 'Procter & Gamble Co.', 'UnitedHealth Group Inc.',
                                       'Home Depot Inc.', 'Bank of America Corp.', 'Exxon Mobil Corp.',
                                       'Pfizer Inc.', 'Cisco Systems Inc.', 'Chevron Corp.',
                                       'Adobe Inc.', 'Salesforce Inc.', 'Netflix Inc.'],
                        'sector': ['Information Technology', 'Information Technology', 'Consumer Discretionary',
                                 'Communication Services', 'Communication Services', 'Consumer Discretionary',
                                 'Information Technology', 'Financials', 'Financials', 'Consumer Staples',
                                 'Health Care', 'Consumer Discretionary', 'Financials', 'Energy',
                                 'Health Care', 'Information Technology', 'Energy', 'Information Technology',
                                 'Information Technology', 'Communication Services']
                    })
                    sp500.to_csv(cache_file, index=False)
        
        self.sp500 = sp500
        return sp500
    
    def get_sample_stocks(self, n_stocks=50, min_per_sector=3, seed=42):
        """
        Get a balanced sample of stocks from the S&P 500
        
        Parameters:
        -----------
        n_stocks : int
            Number of stocks to sample
        min_per_sector : int
            Minimum number of stocks per sector
        seed : int
            Random seed for reproducibility
            
        Returns:
        --------
        pandas.DataFrame: Sampled stocks with sector information
        """
        if self.sp500 is None:
            self.get_sp500_constituents()
        
        np.random.seed(seed)
        
        # Ensure we have at least min_per_sector stocks from each sector
        sectors = self.sp500['sector'].unique()
        
        # Initialize sampled stocks
        sampled_stocks = pd.DataFrame()
        
        # First, take min_per_sector stocks from each sector
        for sector in sectors:
            sector_stocks = self.sp500[self.sp500['sector'] == sector]
            
            # If sector has fewer stocks than min_per_sector, take all of them
            if len(sector_stocks) <= min_per_sector:
                sampled_stocks = pd.concat([sampled_stocks, sector_stocks])
            else:
                # Sample min_per_sector stocks from this sector
                sector_sample = sector_stocks.sample(min_per_sector)
                sampled_stocks = pd.concat([sampled_stocks, sector_sample])
        
        # If we need more stocks, sample randomly from the remaining
        if len(sampled_stocks) < n_stocks:
            remaining = self.sp500[~self.sp500['ticker'].isin(sampled_stocks['ticker'])]
            additional_sample = remaining.sample(min(len(remaining), n_stocks - len(sampled_stocks)))
            sampled_stocks = pd.concat([sampled_stocks, additional_sample])
        
        # If we have more stocks than requested, trim randomly
        if len(sampled_stocks) > n_stocks:
            sampled_stocks = sampled_stocks.sample(n_stocks)
        
        # Reset index
        sampled_stocks = sampled_stocks.reset_index(drop=True)
        
        print(f"Selected {len(sampled_stocks)} stocks from {len(sectors)} sectors")
        
        # Print sector distribution
        sector_counts = sampled_stocks['sector'].value_counts()
        print("\nSector distribution:")
        for sector, count in sector_counts.items():
            print(f"  {sector}: {count} stocks")
        
        return sampled_stocks
    
    def fetch_historical_prices(self, tickers, start_date='2020-01-01', end_date=None, 
                              interval='1d', adjust=True, refresh=False):
        """
        Fetch historical price data for multiple tickers
        
        Parameters:
        -----------
        tickers : list
            List of ticker symbols
        start_date : str
            Start date for data (YYYY-MM-DD)
        end_date : str, optional
            End date for data (YYYY-MM-DD), defaults to today
        interval : str
            Data interval ('1d', '1wk', '1mo')
        adjust : bool
            Whether to use adjusted prices
        refresh : bool
            Whether to refresh data from the source
            
        Returns:
        --------
        dict: Dictionary of pandas.DataFrame price data by ticker
        """
        # Set default end date to today if not provided
        if end_date is None:
            end_date = datetime.datetime.now().strftime('%Y-%m-%d')
        
        # Create filename based on parameters
        filename = f"prices_{start_date}_to_{end_date}_{interval}.pkl"
        cache_file = os.path.join(self.cache_dir, 'prices', filename)
        
        # Check if data is cached and we don't want to refresh
        if os.path.exists(cache_file) and not refresh:
            print(f"Loading cached price data from: {cache_file}")
            try:
                price_data = pd.read_pickle(cache_file)
                
                # Check if we have all the tickers
                missing_tickers = [ticker for ticker in tickers if ticker not in price_data]
                
                if not missing_tickers:
                    return price_data
                else:
                    print(f"Missing {len(missing_tickers)} tickers in cache. Fetching missing data...")
                    tickers = missing_tickers
            except Exception as e:
                print(f"Error loading cached price data: {e}")
                price_data = {}
        else:
            price_data = {}
        
        # Fetch data for all tickers
        print(f"Fetching historical price data for {len(tickers)} tickers...")
        
        # Use tqdm for progress bar
        for ticker in tqdm(tickers):
            try:
                # Fetch data with retry logic
                attempts = 0
                max_attempts = 3
                success = False
                
                while not success and attempts < max_attempts:
                    try:
                        stock = yf.Ticker(ticker)
                        hist = stock.history(start=start_date, end=end_date, interval=interval, auto_adjust=adjust)
                        
                        if not hist.empty:
                            # Store in dictionary
                            price_data[ticker] = hist
                            success = True
                        else:
                            print(f"No data returned for {ticker}, retrying ({attempts+1}/{max_attempts})...")
                            attempts += 1
                            time.sleep(1)  # Wait before retrying
                    except Exception as e:
                        print(f"Error fetching data for {ticker}: {e}")
                        attempts += 1
                        time.sleep(2)  # Longer wait on exception
                
                if not success:
                    print(f"Failed to fetch data for {ticker} after {max_attempts} attempts")
            
            except Exception as e:
                print(f"Unexpected error with {ticker}: {e}")
        
        # Save to cache
        print(f"Saving price data to cache: {cache_file}")
        pd.to_pickle(price_data, cache_file)
        
        return price_data
    
    def fetch_factor_data(self, start_date='2020-01-01', end_date=None, refresh=False):
        """
        Fetch common risk factor data (Fama-French, momentum, volatility)
        
        Parameters:
        -----------
        start_date : str
            Start date for data (YYYY-MM-DD)
        end_date : str, optional
            End date for data (YYYY-MM-DD), defaults to today
        refresh : bool
            Whether to refresh data from the source
            
        Returns:
        --------
        pandas.DataFrame: Factor data
        """
        # Set default end date to today if not provided
        if end_date is None:
            end_date = datetime.datetime.now().strftime('%Y-%m-%d')
        
        # Create filename based on parameters
        filename = f"factors_{start_date}_to_{end_date}.csv"
        cache_file = os.path.join(self.cache_dir, 'factors', filename)
        
        # Check if data is cached and we don't want to refresh
        if os.path.exists(cache_file) and not refresh:
            print(f"Loading cached factor data from: {cache_file}")
            try:
                return pd.read_csv(cache_file, index_col=0, parse_dates=True)
            except Exception as e:
                print(f"Error loading cached factor data: {e}")
                # Continue to fetch new data
        
        # Create a DataFrame with dates as index
        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)
        
        # Create date range
        date_range = pd.date_range(start=start_dt, end=end_dt, freq='B')  # Business days
        factor_data = pd.DataFrame(index=date_range)
        
        # First, try to get Fama-French 5 factors
        try:
            print("Fetching Fama-French factor data...")
            
            # Define URL for Fama-French data
            ff_url = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
            
            # Create a specific fallback function for Fama-French data
            def get_fama_french_data():
                # Direct download and read
                try:
                    raw_data = pd.read_csv(ff_url, skiprows=3)
                    
                    # Find where the data starts (after header rows)
                    for i, row in raw_data.iterrows():
                        if isinstance(row.iloc[0], str) and row.iloc[0].isdigit():
                            data_start = i
                            break
                    else:
                        data_start = 0
                    
                    # Extract actual data
                    ff_data = raw_data.iloc[data_start:]
                    
                    # Convert first column to date if needed
                    if ff_data.columns[0] != 'date':
                        ff_data = ff_data.rename(columns={ff_data.columns[0]: 'date'})
                    
                    # Ensure date is in correct format
                    ff_data['date'] = pd.to_datetime(ff_data['date'], format='%Y%m%d', errors='coerce')
                    
                    # Remove rows with invalid dates
                    ff_data = ff_data.dropna(subset=['date'])
                    
                    # Set date as index
                    ff_data = ff_data.set_index('date')
                    
                    # Convert numeric columns to float
                    for col in ff_data.columns:
                        ff_data[col] = pd.to_numeric(ff_data[col], errors='coerce')
                    
                    # Convert from percent to decimal
                    for col in ff_data.columns:
                        if ff_data[col].dtype == 'float64':
                            ff_data[col] = ff_data[col] / 100.0
                    
                    return ff_data
                
                except Exception as e:
                    print(f"Error in Fama-French data extraction: {e}")
                    return None
            
            # Try to get Fama-French data
            ff_factors = get_fama_french_data()
            
            # If successful, add to factor_data
            if ff_factors is not None and not ff_factors.empty:
                # Filter for date range
                ff_factors = ff_factors[(ff_factors.index >= start_dt) & (ff_factors.index <= end_dt)]
                
                # Add to factor_data
                for col in ff_factors.columns:
                    if col in ['Mkt-RF', 'Mkt_RF', 'Mkt.RF', 'mkt_rf', 'Mkt RF']:
                        factor_data['mkt_rf'] = ff_factors[col]
                    elif col.lower() in ['smb']:
                        factor_data['smb'] = ff_factors[col]
                    elif col.lower() in ['hml']:
                        factor_data['hml'] = ff_factors[col]
                    elif col.lower() in ['rmw']:
                        factor_data['rmw'] = ff_factors[col]
                    elif col.lower() in ['cma']:
                        factor_data['cma'] = ff_factors[col]
                    elif col.lower() in ['rf']:
                        factor_data['rf'] = ff_factors[col]
                
                print("Successfully added Fama-French factors")
            else:
                print("Could not retrieve Fama-French factors, using simulated data")
                
                # Create simulated Fama-French factors
                np.random.seed(42)  # For reproducibility
                
                # Generate random values with realistic parameters
                factor_data['mkt_rf'] = np.random.normal(0.0005, 0.01, len(date_range))  # Market minus risk-free
                factor_data['smb'] = np.random.normal(0.0002, 0.005, len(date_range))    # Small minus big
                factor_data['hml'] = np.random.normal(0.0001, 0.006, len(date_range))    # High minus low
                factor_data['rmw'] = np.random.normal(0.0001, 0.004, len(date_range))    # Robust minus weak
                factor_data['cma'] = np.random.normal(0.0001, 0.004, len(date_range))    # Conservative minus aggressive
                factor_data['rf'] = np.random.normal(0.0001, 0.0001, len(date_range))    # Risk-free rate
                
                print("Using simulated Fama-French factors")
        
        except Exception as e:
            print(f"Error fetching Fama-French factor data: {e}")
            
            # Create simulated factors if not already present
            if 'mkt_rf' not in factor_data.columns:
                np.random.seed(42)  # For reproducibility
                factor_data['mkt_rf'] = np.random.normal(0.0005, 0.01, len(date_range))  # Market minus risk-free
                factor_data['smb'] = np.random.normal(0.0002, 0.005, len(date_range))    # Small minus big
                factor_data['hml'] = np.random.normal(0.0001, 0.006, len(date_range))    # High minus low
                factor_data['rmw'] = np.random.normal(0.0001, 0.004, len(date_range))    # Robust minus weak
                factor_data['cma'] = np.random.normal(0.0001, 0.004, len(date_range))    # Conservative minus aggressive
                factor_data['rf'] = np.random.normal(0.0001, 0.0001, len(date_range))    # Risk-free rate
                
                print("Using simulated Fama-French factors")
        
        # Now get market data for momentum and volatility
        try:
            print("Fetching market data for momentum and volatility factors...")
            
            # Use SPY as market proxy
            spy = yf.download('SPY', start=start_date, end=end_date, progress=False)
            
            if not spy.empty and 'Close' in spy.columns:
                # Calculate returns
                spy['return'] = spy['Close'].pct_change()
                
                # Calculate momentum (12-month rolling return)
                spy['momentum'] = spy['Close'].pct_change(252).fillna(0)
                
                # Calculate volatility (21-day rolling std dev of returns)
                spy['volatility'] = spy['return'].rolling(21).std().fillna(spy['return'].std())
                
                # Create new DataFrame with momentum and volatility
                spy_factors = pd.DataFrame({'momentum': spy['momentum'], 'volatility': spy['volatility']})
                
                # Align indices between factor_data and spy_factors using merge
                # Convert spy index to DataFrame first
                spy_df = spy_factors.reset_index()
                if 'index' in spy_df.columns:
                    spy_df.rename(columns={'index': 'date'}, inplace=True)
                elif 'Date' in spy_df.columns:
                    spy_df.rename(columns={'Date': 'date'}, inplace=True)
                
                # Convert factor_data index to DataFrame
                factor_df = factor_data.reset_index()
                if 'index' in factor_df.columns:
                    factor_df.rename(columns={'index': 'date'}, inplace=True)
                
                # Merge on date
                merged_df = pd.merge(factor_df, spy_df, on='date', how='left')
                
                # Set date as index again
                merged_df.set_index('date', inplace=True)
                
                # Update factor_data with the merged values
                factor_data = merged_df
                
                print("Successfully added momentum and volatility factors")
            else:
                print("Could not retrieve market data, using simulated momentum and volatility")
                
                # Create simulated momentum and volatility
                np.random.seed(43)  # Different seed from Fama-French
                factor_data['momentum'] = np.random.normal(0.001, 0.02, len(date_range))
                factor_data['volatility'] = np.abs(np.random.normal(0.01, 0.005, len(date_range)))
                
                print("Using simulated momentum and volatility factors")
            
        except Exception as e:
            print(f"Error fetching market data: {e}")
            
            # Create simulated momentum and volatility if not already present
            if 'momentum' not in factor_data.columns:
                np.random.seed(43)  # Different seed from Fama-French
                factor_data['momentum'] = np.random.normal(0.001, 0.02, len(date_range))
                factor_data['volatility'] = np.abs(np.random.normal(0.01, 0.005, len(date_range)))
                
                print("Using simulated momentum and volatility factors")
        
        # Fill missing values with forward and backward fill
        factor_data = factor_data.fillna(method='ffill').fillna(method='bfill')
        
        # Save to cache
        print(f"Saving factor data to cache: {cache_file}")
        factor_data.to_csv(cache_file)
        
        return factor_data
    
    def get_options_data(self, tickers, dte_min=10, dte_max=120, refresh=False):
        """
        Fetch options data for a list of tickers using an OptionsDataFetcher
        
        Parameters:
        -----------
        tickers : list
            List of ticker symbols
        dte_min : int
            Minimum days to expiration
        dte_max : int
            Maximum days to expiration
        refresh : bool
            Whether to refresh data from the source
            
        Returns:
        --------
        dict: Dictionary of options data by ticker
        """
        # Create filename based on parameters
        today = datetime.datetime.now().strftime('%Y%m%d')
        filename = f"options_data_{today}.pkl"
        cache_file = os.path.join(self.cache_dir, 'options', filename)
        
        # Check if data is cached and we don't want to refresh
        if os.path.exists(cache_file) and not refresh:
            print(f"Loading cached options data from: {cache_file}")
            try:
                options_data = pd.read_pickle(cache_file)
                
                # Check if we have all the tickers
                missing_tickers = [ticker for ticker in tickers if ticker not in options_data]
                
                if not missing_tickers:
                    return options_data
                else:
                    print(f"Missing {len(missing_tickers)} tickers in cache. Fetching missing data...")
                    tickers = missing_tickers
            except Exception as e:
                print(f"Error loading cached options data: {e}")
                options_data = {}
        else:
            options_data = {}
        
        # Import your OptionsDataFetcher class
        try:
            # Try to import your existing module
            from options_module import OptionsDataFetcher
            options_fetcher = OptionsDataFetcher()
            print("Using your custom OptionsDataFetcher")
        except ImportError:
            # Create a basic mock OptionsDataFetcher if your module isn't available
            print("Custom OptionsDataFetcher not found, using mock implementation")
            options_fetcher = MockOptionsDataFetcher()
        
        # Fetch options data for all tickers
        print(f"Fetching options data for {len(tickers)} tickers...")
        
        for ticker in tqdm(tickers):
            try:
                # Fetch options data
                option_chain = options_fetcher.get_option_chain(ticker, dte_min=dte_min, dte_max=dte_max)
                
                if option_chain is not None:
                    options_data[ticker] = option_chain
                    
                    # Optional: Save individual ticker data as CSV
                    calls_file = os.path.join(self.cache_dir, 'options', f"{ticker}_calls.csv")
                    puts_file = os.path.join(self.cache_dir, 'options', f"{ticker}_puts.csv")
                    
                    option_chain['calls'].to_csv(calls_file, index=False)
                    option_chain['puts'].to_csv(puts_file, index=False)
                    
                    print(f"Saved options data for {ticker}")
                else:
                    print(f"No options data available for {ticker}")
                
                # Be nice to the API
                time.sleep(1)
                
            except Exception as e:
                print(f"Error fetching options data for {ticker}: {e}")
        
        # Save to cache
        print(f"Saving options data to cache: {cache_file}")
        pd.to_pickle(options_data, cache_file)
        
        return options_data


class MockOptionsDataFetcher:
    """Mock implementation of OptionsDataFetcher for testing"""
    
    def __init__(self):
        self.current_date = datetime.datetime.now().date()
    
    def get_option_chain(self, ticker, dte_min=10, dte_max=120):
        """
        Generate mock option chain data for testing
        
        Parameters:
        -----------
        ticker : str
            Ticker symbol
        dte_min : int
            Minimum days to expiration
        dte_max : int
            Maximum days to expiration
            
        Returns:
        --------
        dict: Mock option chain data with calls and puts
        """
        print(f"Generating mock options data for {ticker}")
        
        # Create fake spot price - different for each ticker for variety
        # Hash the ticker to get a consistent but varied price
        ticker_hash = sum(ord(c) for c in ticker)
        base_price = 100 + (ticker_hash % 400)  # Price between 100 and 500
        spot_price = base_price + (np.random.random() * 10 - 5)  # Add some randomness
        
        # Generate expiration dates
        current_date = self.current_date
        expiration_dates = []
        
        # Weekly expirations
        for week in range(1, 8):
            dte = week * 7
            if dte_min <= dte <= dte_max:
                exp_date = (current_date + datetime.timedelta(days=dte)).strftime('%Y-%m-%d')
                expiration_dates.append(exp_date)
        
        # Monthly expirations
        for month in range(1, 6):
            dte = month * 30
            if dte_min <= dte <= dte_max:
                exp_date = (current_date + datetime.timedelta(days=dte)).strftime('%Y-%m-%d')
                expiration_dates.append(exp_date)
        
        # Generate strike prices (centered around spot price)
        strike_range = 0.3  # Strike prices within ±30% of spot
        strikes = np.linspace(
            spot_price * (1 - strike_range),
            spot_price * (1 + strike_range),
            15
        )
        
        # Generate option chains
        calls_data = []
        puts_data = []
        
        for exp_date in expiration_dates:
            # Convert to datetime for calculations
            exp_datetime = datetime.datetime.strptime(exp_date, '%Y-%m-%d').date()
            dte = (exp_datetime - current_date).days
            tte = dte / 365.0  # Time to expiration in years
            
            for strike in strikes:
                # Calculate moneyness
                moneyness = strike / spot_price
                
                # Base IV with smile effect
                base_iv = 0.2 + 0.1 * (moneyness - 1) ** 2
                
                # Add term structure effect (higher IV for longer dates)
                iv_term = 0.05 * np.sqrt(tte)
                
                # Generate call option
                call = {
                    'contractSymbol': f"{ticker}{exp_date.replace('-', '')}C{int(strike * 100):08d}",
                    'strike': strike,
                    'lastPrice': max(0.01, spot_price - strike) + np.random.random() * 2,
                    'bid': max(0.01, spot_price - strike) * 0.95,
                    'ask': max(0.01, spot_price - strike) * 1.05 + 0.1,
                    'impliedVolatility': base_iv + iv_term,
                    'volume': int(np.random.exponential(100) * (1.5 - abs(moneyness - 1))),
                    'openInterest': int(np.random.exponential(500) * (1.5 - abs(moneyness - 1))),
                    'inTheMoney': strike < spot_price,
                    'expirationDate': exp_date,
                    'dte': dte,
                    'tte': tte,
                    'moneyness': moneyness
                }
                
                # Generate put option
                put = {
                    'contractSymbol': f"{ticker}{exp_date.replace('-', '')}P{int(strike * 100):08d}",
                    'strike': strike,
                    'lastPrice': max(0.01, strike - spot_price) + np.random.random() * 2,
                    'bid': max(0.01, strike - spot_price) * 0.95,
                    'ask': max(0.01, strike - spot_price) * 1.05 + 0.1,
                    'impliedVolatility': base_iv + iv_term + 0.02,  # Puts typically have higher IV
                    'volume': int(np.random.exponential(100) * (1.5 - abs(moneyness - 1))),
                    'openInterest': int(np.random.exponential(500) * (1.5 - abs(moneyness - 1))),
                    'inTheMoney': strike > spot_price,
                    'expirationDate': exp_date,
                    'dte': dte,
                    'tte': tte,
                    'moneyness': moneyness
                }
                
                # Adjust prices for realistic values
                call['lastPrice'] = max(0.01, call['lastPrice'])
                put['lastPrice'] = max(0.01, put['lastPrice'])
                call['bid'] = max(0.01, call['bid'])
                put['bid'] = max(0.01, put['bid'])
                call['ask'] = max(call['bid'] + 0.01, call['ask'])
                put['ask'] = max(put['bid'] + 0.01, put['ask'])
                
                # Add a usePrice column (mid price)
                call['usePrice'] = (call['bid'] + call['ask']) / 2
                put['usePrice'] = (put['bid'] + put['ask']) / 2
                
                calls_data.append(call)
                puts_data.append(put)
        
        # Convert to DataFrames
        calls_df = pd.DataFrame(calls_data)
        puts_df = pd.DataFrame(puts_data)
        
        # Return options data
        return {
            'ticker': ticker,
            'spot_price': spot_price,
            'calls': calls_df,
            'puts': puts_df,
            'quote_date': current_date,
            'risk_free_rate': 0.05  # 5% risk-free rate
        }


class DataPreprocessor:
    """
    Preprocesses and cleans financial data for modeling

    Turns per-ticker price DataFrames into returns, derives simple
    stock-level factors (size, value, momentum, volatility, beta) and joins
    returns, external factor data and stock factors into one long-format
    table with one row per (date, ticker).
    """

    # Number of rows each supported return period looks back
    _PERIOD_LAGS = {'daily': 1, 'weekly': 5, 'monthly': 21}

    # Factor columns that are zero-filled when factor data cannot be merged
    _PLACEHOLDER_FACTORS = ('mkt_rf', 'smb', 'hml', 'momentum', 'volatility')

    # Columns produced for every stock by calculate_stock_factors
    _STOCK_FACTOR_COLUMNS = ['return', 'size', 'value', 'momentum_12m', 'volatility', 'beta']

    def __init__(self, cache_dir='data'):
        """
        Initialize the data preprocessor
        
        Parameters:
        -----------
        cache_dir : str
            Directory for cached data
        """
        self.cache_dir = cache_dir

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _pick_price_column(df):
        """
        Return the column to treat as the price series, or None.

        Preference order: 'Close', 'Adj Close', then the first numeric column.
        """
        for name in ('Close', 'Adj Close'):
            if name in df.columns:
                return name
        numeric_cols = df.select_dtypes(include='number').columns
        return numeric_cols[0] if len(numeric_cols) > 0 else None

    @staticmethod
    def _fill_missing(frame, group_col=None):
        """
        Forward-fill, then back-fill, then zero-fill missing values.

        When ``group_col`` names a column of ``frame`` the directional fills
        are done within each group, so values never leak from one ticker into
        the next one in a concatenated table. In that case the result gets a
        fresh 0..n-1 index.
        """
        if group_col is None or group_col not in frame.columns:
            return frame.ffill().bfill().fillna(0)

        filled = frame.reset_index(drop=True)
        value_cols = [col for col in filled.columns if col != group_col]
        if value_cols:
            filled[value_cols] = filled.groupby(
                group_col, sort=False, dropna=False)[value_cols].ffill()
            filled[value_cols] = filled.groupby(
                group_col, sort=False, dropna=False)[value_cols].bfill()
        return filled.fillna(0)

    @staticmethod
    def _index_as_date(frame):
        """
        Reset the index and expose it as a 'date' column.

        An existing 'date' column wins; otherwise 'Date', 'index' or the
        original index name is renamed to 'date'.
        """
        index_label = frame.index.name
        out = frame.reset_index()
        if 'date' in out.columns:
            return out
        for candidate in ('Date', 'index', index_label):
            if candidate is not None and candidate in out.columns:
                return out.rename(columns={candidate: 'date'})
        return out

    @staticmethod
    def _to_naive_datetime(values):
        """Convert a Series to datetime and strip any timezone information."""
        return pd.to_datetime(values).dt.tz_localize(None)

    def _returns_only_dataset(self, price_returns):
        """
        Build the fallback table used when factor data cannot be merged.

        Contains date, return (when available) and ticker for every ticker,
        plus zero-valued placeholder factor columns.
        """
        frames = []
        for ticker, returns in price_returns.items():
            if 'return' in returns.columns:
                ticker_returns = returns[['return']].copy()
            else:
                ticker_returns = pd.DataFrame(index=returns.index)
            ticker_returns['ticker'] = ticker

            ticker_returns = self._index_as_date(ticker_returns)
            ticker_returns['date'] = self._to_naive_datetime(ticker_returns['date'])
            frames.append(ticker_returns)

        dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        for col in self._PLACEHOLDER_FACTORS:
            dataset[col] = 0
        return dataset

    def _market_returns(self, price_data, market_ticker):
        """
        Return the benchmark's daily returns as a Series, or None.

        Falls back to synthetic normally distributed returns when the
        benchmark is missing or has no usable price column; returns None
        only when ``price_data`` is empty.
        """
        if market_ticker in price_data:
            market_prices = price_data[market_ticker]
            price_col = self._pick_price_column(market_prices)
            if price_col is not None:
                return market_prices[price_col].pct_change()

            print("Could not calculate market returns: no suitable price column found")
            return pd.Series(np.random.normal(0.0005, 0.01, len(market_prices)),
                             index=market_prices.index)

        print(f"Market ticker {market_ticker} not found in price data. Using synthetic market returns.")
        if not price_data:
            return None

        # Borrow the dates of the first ticker for the synthetic index
        first_prices = next(iter(price_data.values()))
        return pd.Series(np.random.normal(0.0005, 0.01, len(first_prices)),
                         index=first_prices.index)

    @staticmethod
    def _rolling_beta(stock_returns, market_returns, window=252):
        """
        Rolling beta of a stock against the market, aligned by date.

        The beta on a given row is estimated from the ``window`` most recent
        jointly observed returns strictly before that row. Covariance and
        variance use the same observations and the same degrees of freedom.
        Rows without enough history are NaN.
        """
        market = market_returns.reindex(stock_returns.index)
        stock_values = stock_returns.to_numpy(dtype=float)
        market_values = market.to_numpy(dtype=float)
        valid = ~(np.isnan(stock_values) | np.isnan(market_values))

        beta = np.full(len(stock_values), np.nan)
        if valid.sum() >= window:
            stock_obs = pd.Series(stock_values[valid])
            market_obs = pd.Series(market_values[valid])
            cov = stock_obs.rolling(window).cov(market_obs)
            var = market_obs.rolling(window).var()
            # shift(1) keeps the current row out of its own estimate
            beta[valid] = (cov / var.where(var != 0)).shift(1).to_numpy()
        return pd.Series(beta, index=stock_returns.index)

    def _single_stock_factors(self, ticker, prices, market_returns):
        """Compute the factor columns for one ticker (index = price index)."""
        factors = pd.DataFrame(index=prices.index)

        price_col = self._pick_price_column(prices)
        if price_col is not None:
            factors['return'] = prices[price_col].pct_change()
            # 12-month momentum: return over the last 252 rows
            factors['momentum_12m'] = prices[price_col].pct_change(252)
        else:
            print(f"Could not calculate returns for {ticker}: no suitable price column found")
            factors['return'] = 0  # Default to zero returns
            factors['momentum_12m'] = 0.0

        # Size proxy: log of dollar volume (price * volume)
        if 'Close' in prices.columns and 'Volume' in prices.columns:
            dollar_volume = prices['Close'] * prices['Volume']
            # Mask non-positive values so the log never yields -inf
            factors['size'] = np.log(dollar_volume.where(dollar_volume > 0))
        else:
            # Use a random size factor if volume not available
            factors['size'] = np.random.normal(10, 2, len(prices))

        # Value proxy: low relative to the 20-row average high
        if 'Low' in prices.columns and 'High' in prices.columns:
            factors['value'] = prices['Low'] / prices['High'].rolling(20).mean()
        else:
            # Use a random value factor
            factors['value'] = np.random.normal(0.8, 0.1, len(prices))

        # 1-month rolling volatility of daily returns
        factors['volatility'] = factors['return'].rolling(21).std()

        # 1-year rolling beta against the market
        factors['beta'] = self._rolling_beta(factors['return'], market_returns)

        factors = self._fill_missing(factors[self._STOCK_FACTOR_COLUMNS])
        factors['ticker'] = ticker
        return factors

    def _merge_stock_factors(self, aligned_data, stock_factors):
        """
        Left-join stock factors onto the aligned returns/factor table.

        Joins on whichever of 'date' and 'ticker' exist in both frames;
        clashing stock-factor columns get a '_stock' suffix. Returns the
        aligned table unchanged (apart from normalised key columns) when no
        join key is shared.
        """
        factors = stock_factors.copy()

        # Expose the date as a 'date' column
        if 'date' not in factors.columns:
            if factors.index.name == 'date':
                factors = factors.reset_index()
            elif 'Date' in factors.columns:
                factors = factors.rename(columns={'Date': 'date'})
            elif factors.index.name == 'Date':
                factors = factors.reset_index().rename(columns={'Date': 'date'})

        aligned = aligned_data.copy()

        # Make the join keys comparable: naive datetimes and string tickers
        for frame in (factors, aligned):
            if 'date' in frame.columns:
                frame['date'] = self._to_naive_datetime(frame['date'])
            if 'ticker' in frame.columns:
                frame['ticker'] = frame['ticker'].astype(str)

        merge_columns = [col for col in ('date', 'ticker')
                         if col in factors.columns and col in aligned.columns]
        if not merge_columns:
            print("No common columns found for merging. Using aligned data only.")
            return aligned

        return pd.merge(aligned, factors, on=merge_columns, how='left',
                        suffixes=('', '_stock'))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def calculate_returns(self, price_data, period='daily'):
        """
        Calculate returns from price data
        
        The price column is 'Close' when present, otherwise 'Adj Close',
        otherwise the first numeric column. The requested period is applied
        whichever column is used. Tickers without any usable column, or whose
        calculation fails, get zero returns.
        
        Parameters:
        -----------
        price_data : dict
            Dictionary of price DataFrames by ticker
        period : str
            'daily', 'weekly', or 'monthly' (1, 5 and 21 rows respectively)
            
        Returns:
        --------
        dict: Dictionary of return DataFrames by ticker (the price columns
            plus a 'return' column)
            
        Raises:
        -------
        ValueError: If period is not one of the supported values
        """
        # Validate once, outside the per-ticker try, so a bad period is
        # reported instead of silently producing all-zero returns
        if period not in self._PERIOD_LAGS:
            raise ValueError(f"Invalid period: {period}")
        lag = self._PERIOD_LAGS[period]

        returns_data = {}

        for ticker, prices in price_data.items():
            try:
                df = prices.copy()

                price_col = self._pick_price_column(df)
                if price_col is not None:
                    df['return'] = df[price_col].pct_change(lag)
                else:
                    print(f"Could not calculate returns for {ticker}: no suitable price column found")
                    df['return'] = 0  # Default to zero returns

                returns_data[ticker] = df
            except Exception as e:
                print(f"Error calculating returns for {ticker}: {e}")
                # Add placeholder with zero returns
                returns_data[ticker] = prices.copy()
                returns_data[ticker]['return'] = 0

        return returns_data

    def calculate_stock_factors(self, price_data, market_ticker='SPY'):
        """
        Calculate stock-specific factors
        
        For every ticker except the benchmark this computes daily return,
        size (log dollar volume), value (low / 20-row mean high), 12-month
        momentum, 21-row volatility and a 252-observation rolling beta.
        Missing values are forward-, back- and zero-filled per ticker.
        Size and value fall back to random draws when their inputs are
        missing, and the market falls back to synthetic returns when the
        benchmark is unavailable, so results are not reproducible then.
        Tickers whose calculation fails are skipped with a message.
        
        Parameters:
        -----------
        price_data : dict
            Dictionary of price DataFrames by ticker
        market_ticker : str
            Ticker to use as market benchmark
            
        Returns:
        --------
        pandas.DataFrame: Stock factors by ticker and date (empty when
            nothing could be calculated)
        """
        market_returns = self._market_returns(price_data, market_ticker)
        if market_returns is None:
            print("No price data available. Cannot calculate stock factors.")
            return pd.DataFrame()

        # Date alignment in the beta calculation needs unique market dates
        market_returns = market_returns[~market_returns.index.duplicated(keep='last')]

        factors = []
        for ticker, prices in price_data.items():
            if ticker == market_ticker:
                continue

            try:
                factors.append(self._single_stock_factors(ticker, prices, market_returns))
            except Exception as e:
                print(f"Error calculating factors for {ticker}: {e}")
                # Skip this ticker

        if not factors:
            print("No valid factors calculated.")
            return pd.DataFrame()

        # Move the date index into a column
        combined_factors = pd.concat(factors).reset_index()

        # An unnamed index comes out as 'index'; call it 'date'
        if 'index' in combined_factors.columns and 'date' not in combined_factors.columns:
            combined_factors = combined_factors.rename(columns={'index': 'date'})

        return combined_factors

    def align_data(self, price_returns, factor_data):
        """
        Align price returns with factor data
        
        Stacks the 'return' column of every ticker into a long table and
        left-joins the factor data on date (timezone information removed on
        both sides). Missing values are filled within each ticker. If the
        factor data cannot be merged, a returns-only table with zero-valued
        placeholder factor columns is returned instead.
        
        Parameters:
        -----------
        price_returns : dict
            Dictionary of return DataFrames by ticker
        factor_data : pandas.DataFrame
            Factor data, with dates in the index or in a 'date'/'Date' column
            
        Returns:
        --------
        pandas.DataFrame: Aligned returns and factors (empty when no ticker
            has a 'return' column)
        """
        frames = []

        for ticker, returns in price_returns.items():
            try:
                if 'return' not in returns.columns:
                    print(f"No return column for {ticker}, skipping")
                    continue

                ticker_returns = self._index_as_date(returns[['return']])
                ticker_returns['ticker'] = ticker
                frames.append(ticker_returns)

            except Exception as e:
                print(f"Error aligning data for {ticker}: {e}")

        if not frames:
            print("No valid return data to align with factors.")
            return pd.DataFrame()

        combined_returns = pd.concat(frames, ignore_index=True)
        combined_returns['date'] = self._to_naive_datetime(combined_returns['date'])

        try:
            if 'date' in factor_data.columns or 'Date' in factor_data.columns:
                # Dates already live in a column; the index carries nothing
                # worth keeping and would only add a junk 'index' column
                factor_frame = factor_data.reset_index(drop=True)
                if 'date' not in factor_frame.columns:
                    factor_frame = factor_frame.rename(columns={'Date': 'date'})
            else:
                factor_frame = self._index_as_date(factor_data)

            factor_frame['date'] = self._to_naive_datetime(factor_frame['date'])

            aligned = pd.merge(combined_returns, factor_frame, on='date', how='left')
            return self._fill_missing(aligned, group_col='ticker')

        except Exception as e:
            print(f"Error merging returns and factors: {e}")
            print("Creating minimal aligned dataset with returns only.")
            return self._returns_only_dataset(price_returns)

    def prepare_modeling_dataset(self, price_data, factor_data, stock_factors=None):
        """
        Prepare the final dataset for modeling
        
        Computes daily returns, aligns them with the factor data, merges in
        stock-specific factors when available, fills missing values within
        each ticker and writes the result to
        <cache_dir>/preprocessed/modeling_dataset.csv (the directory is
        created if needed).
        
        Parameters:
        -----------
        price_data : dict
            Dictionary of price DataFrames by ticker
        factor_data : pandas.DataFrame
            Factor data
        stock_factors : pandas.DataFrame, optional
            Stock-specific factors; calculated from price_data when omitted
            
        Returns:
        --------
        pandas.DataFrame: Final modeling dataset
        """
        returns_data = self.calculate_returns(price_data)

        # If stock factors not provided, calculate them
        if stock_factors is None:
            try:
                stock_factors = self.calculate_stock_factors(price_data)
            except Exception as e:
                print(f"Error calculating stock factors: {e}")
                stock_factors = None

        # Align returns and factor data
        try:
            aligned_data = self.align_data(returns_data, factor_data)
        except Exception as e:
            print(f"Error aligning returns and factors: {e}")
            aligned_data = self._returns_only_dataset(returns_data)

        # Merge with stock factors if available
        final_data = aligned_data
        if stock_factors is not None and not stock_factors.empty:
            try:
                final_data = self._merge_stock_factors(aligned_data, stock_factors)
            except Exception as e:
                print(f"Error merging with stock factors: {e}")
                final_data = aligned_data

        # Handle any remaining missing values without crossing tickers
        final_data = self._fill_missing(final_data, group_col='ticker')

        # Save to file
        output_file = os.path.join(self.cache_dir, 'preprocessed', 'modeling_dataset.csv')
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        final_data.to_csv(output_file, index=False)
        print(f"Saved modeling dataset to: {output_file}")

        return final_data


def run_data_collection():
    """
    Run the complete data collection process

    Samples S&P 500 stocks, downloads prices, factor data and options data
    (always refreshing, never reading caches), derives stock factors and the
    modeling dataset, prints a short summary and returns everything.

    Returns:
    --------
    dict: 'price_data', 'factor_data', 'stock_factors', 'options_data' and
        'modeling_data'
    """
    collector = EquityDataCollector(cache_dir='data')
    preprocessor = DataPreprocessor(cache_dir='data')

    # Step 1: refresh the constituents list, then draw the stock sample
    collector.get_sp500_constituents(refresh=True)
    sample = collector.get_sample_stocks(n_stocks=80, min_per_sector=5)
    tickers = sample['ticker'].tolist()

    # For debugging/testing with fewer stocks
    # tickers = tickers[:10]  # Uncomment to use fewer stocks

    # SPY serves as the market benchmark
    if 'SPY' not in tickers:
        tickers.append('SPY')

    # Step 2: historical prices (end_date=None means "up to today")
    price_data = collector.fetch_historical_prices(
        tickers,
        start_date='2020-01-01',
        end_date=None,
        interval='1d',
        refresh=True
    )

    # Step 3: factor data over the same window
    factor_data = collector.fetch_factor_data(
        start_date='2020-01-01',
        end_date=None,
        refresh=True
    )

    # Step 4: options data, restricted to liquid names that were sampled
    # (SPY is always kept)
    liquid_names = ['SPY', 'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'NVDA', 'JPM', 'BAC']
    sampled = set(tickers)
    option_tickers = [name for name in liquid_names if name == 'SPY' or name in sampled]

    options_data = collector.get_options_data(
        option_tickers,
        dte_min=10,
        dte_max=120,
        refresh=True
    )

    # Step 5: stock-specific factors
    stock_factors = preprocessor.calculate_stock_factors(price_data)

    # Step 6: modeling dataset
    modeling_data = preprocessor.prepare_modeling_dataset(price_data, factor_data, stock_factors)

    # Figures for the printed summary
    num_stocks = len(price_data)
    if modeling_data.empty:
        date_range = None
    else:
        date_range = (modeling_data['date'].min(), modeling_data['date'].max())
    num_options_stocks = len(options_data)
    collected_option_tickers = list(options_data.keys()) if options_data else []

    print("\nData Collection Summary:")
    print(f"Collected price data for {num_stocks} stocks")
    if date_range:
        first_date, last_date = date_range
        print(f"Date range: {first_date} to {last_date}")
    print(f"Collected options data for {num_options_stocks} stocks")
    print(f"Options tickers: {', '.join(collected_option_tickers)}")

    return {
        'price_data': price_data,
        'factor_data': factor_data,
        'stock_factors': stock_factors,
        'options_data': options_data,
        'modeling_data': modeling_data
    }


# Exploratory data analysis function
def explore_data(data):
    """
    Perform basic exploratory data analysis
    
    Saves price charts, a cumulative factor-return chart, an implied
    volatility smile and a factor correlation heatmap as PNG files under
    data/plots (created if needed) and prints progress messages. Sections
    whose data is missing or empty are skipped.
    
    Parameters:
    -----------
    data : dict
        Dictionary containing the collected data; the keys 'price_data',
        'factor_data', 'options_data' and 'modeling_data' are read
    """
    plots_dir = 'data/plots'
    os.makedirs(plots_dir, exist_ok=True)

    _plot_price_charts(data.get('price_data'), plots_dir)
    _plot_factor_returns(data.get('factor_data'), plots_dir)
    _plot_iv_smile(data.get('options_data'), plots_dir)
    _plot_factor_correlation(data.get('modeling_data'), plots_dir)

    print("\nExploratory data analysis complete. Charts saved to data/plots/ directory.")


def _plot_price_charts(price_data, plots_dir, max_tickers=5):
    """Save a closing-price chart for the first few tickers that have 'Close'."""
    if not price_data:
        return

    print("\n=== Price Data Analysis ===")

    for ticker in list(price_data)[:max_tickers]:
        prices = price_data[ticker]
        if 'Close' not in prices.columns:
            # Nothing to draw, so do not open a figure at all
            continue

        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(prices.index, prices['Close'])
            plt.title(f"{ticker} Stock Price")
            plt.xlabel("Date")
            plt.ylabel("Price ($)")
            plt.grid(True, alpha=0.3)
            plt.savefig(f"{plots_dir}/{ticker}_price.png")
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

        print(f"Created price chart for {ticker}")


def _plot_factor_returns(factor_data, plots_dir):
    """Save one chart with the cumulative return of each known factor column."""
    if factor_data is None or factor_data.empty:
        return

    print("\n=== Factor Data Analysis ===")

    known_factors = ['mkt_rf', 'smb', 'hml', 'momentum', 'volatility']
    factor_cols = [col for col in known_factors if col in factor_data.columns]
    if not factor_cols:
        return

    fig = plt.figure(figsize=(12, 8))
    try:
        for col in factor_cols:
            cumulative_return = (1 + factor_data[col]).cumprod() - 1
            plt.plot(factor_data.index, cumulative_return, label=col)

        plt.title("Cumulative Factor Returns")
        plt.xlabel("Date")
        plt.ylabel("Cumulative Return")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(f"{plots_dir}/factor_returns.png")
    finally:
        plt.close(fig)

    print("Created factor returns chart")


def _plot_iv_smile(options_data, plots_dir):
    """Print option counts and save the IV smile of the nearest expiration."""
    if not options_data:
        return

    print("\n=== Options Data Analysis ===")

    # Prefer SPY; otherwise take whichever ticker comes first
    sample_ticker = 'SPY' if 'SPY' in options_data else list(options_data.keys())[0]
    option_data = options_data[sample_ticker]
    calls = option_data['calls']
    puts = option_data['puts']

    print(f"{sample_ticker} options data:")
    print(f"  Spot price: ${option_data['spot_price']:.2f}")
    print(f"  Call options: {len(calls)}")
    print(f"  Put options: {len(puts)}")

    smile_cols = ('expirationDate', 'moneyness', 'impliedVolatility')
    if not all(col in calls.columns for col in smile_cols):
        return

    expirations = sorted(calls['expirationDate'].unique())
    if not expirations:
        return

    nearest_exp = expirations[0]
    exp_calls = calls[calls['expirationDate'] == nearest_exp]
    if all(col in puts.columns for col in smile_cols):
        exp_puts = puts[puts['expirationDate'] == nearest_exp]
    else:
        # Puts without the needed columns are simply left off the chart
        exp_puts = puts.iloc[0:0]

    fig = plt.figure(figsize=(10, 6))
    try:
        if not exp_calls.empty:
            plt.scatter(exp_calls['moneyness'], exp_calls['impliedVolatility'],
                        label='Calls', alpha=0.7, color='blue')

        if not exp_puts.empty:
            plt.scatter(exp_puts['moneyness'], exp_puts['impliedVolatility'],
                        label='Puts', alpha=0.7, color='red')

        plt.axvline(x=1.0, color='black', linestyle='--', alpha=0.5, label='ATM')

        plt.title(f"{sample_ticker} Implied Volatility Smile - {nearest_exp}")
        plt.xlabel("Moneyness (Strike/Spot)")
        plt.ylabel("Implied Volatility")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(f"{plots_dir}/{sample_ticker}_iv_smile.png")
    finally:
        plt.close(fig)

    print(f"Created IV smile chart for {sample_ticker}")


def _plot_factor_correlation(modeling_data, plots_dir):
    """Print dataset stats and save a return/factor correlation heatmap."""
    if modeling_data is None or modeling_data.empty:
        return

    print("\n=== Modeling Data Analysis ===")
    print(f"Modeling dataset shape: {modeling_data.shape}")
    print(f"Number of tickers: {modeling_data['ticker'].nunique()}")

    wanted = ['return', 'mkt_rf', 'smb', 'hml', 'momentum', 'volatility']
    numeric_cols = [col for col in wanted if col in modeling_data.columns]
    if not numeric_cols:
        return

    corr = modeling_data[numeric_cols].corr()

    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
        plt.title("Factor Correlation Matrix")
        plt.tight_layout()
        plt.savefig(f"{plots_dir}/factor_correlation.png")
    finally:
        plt.close(fig)

    print("Created factor correlation matrix")


# Main execution: runs only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module
if __name__ == "__main__":
    # Run data collection; this fetches everything with refresh=True and
    # returns a dict of the collected and derived datasets
    collected_data = run_data_collection()
    
    # Run exploratory data analysis on the collected datasets
    explore_data(collected_data)
    
    print("\nPhase 1 (Data Collection and Exploration) complete!")

Fetching S&P 500 constituents from Wikipedia...
Saved S&P 500 constituents to: data/sp500_constituents.csv
Selected 80 stocks from 11 sectors

Sector distribution:
  Health Care: 10 stocks
  Information Technology: 10 stocks
  Utilities: 9 stocks
  Industrials: 8 stocks
  Financials: 7 stocks
  Real Estate: 7 stocks
  Consumer Staples: 7 stocks
  Materials: 6 stocks
  Consumer Discretionary: 6 stocks
  Communication Services: 5 stocks
  Energy: 5 stocks
Fetching historical price data for 81 tickers...


100%|███████████████████████████████████████████| 81/81 [00:20<00:00,  3.93it/s]


Saving price data to cache: data/prices/prices_2020-01-01_to_2025-04-10_1d.pkl
Fetching Fama-French factor data...
Successfully added Fama-French factors
Fetching market data for momentum and volatility factors...
Successfully added momentum and volatility factors
Saving factor data to cache: data/factors/factors_2020-01-01_to_2025-04-10.csv
Custom OptionsDataFetcher not found, using mock implementation
Fetching options data for 2 tickers...


  0%|                                                     | 0/2 [00:00<?, ?it/s]

Generating mock options data for SPY
Saved options data for SPY


 50%|██████████████████████▌                      | 1/2 [00:01<00:01,  1.01s/it]

Generating mock options data for TSLA
Saved options data for TSLA


100%|█████████████████████████████████████████████| 2/2 [00:02<00:00,  1.02s/it]


Saving options data to cache: data/options/options_data_20250410.pkl
Saved modeling dataset to: data/preprocessed/modeling_dataset.csv

Data Collection Summary:
Collected price data for 81 stocks
Date range: 2020-01-02 00:00:00 to 2025-04-09 00:00:00
Collected options data for 2 stocks
Options tickers: SPY, TSLA

=== Price Data Analysis ===
Created price chart for HUBB
Created price chart for MMM
Created price chart for HII
Created price chart for CTAS
Created price chart for CARR

=== Factor Data Analysis ===
Created factor returns chart

=== Options Data Analysis ===
SPY options data:
  Spot price: $352.72
  Call options: 150
  Put options: 150
Created IV smile chart for SPY

=== Modeling Data Analysis ===
Modeling dataset shape: (106568, 17)
Number of tickers: 81
Created factor correlation matrix

Exploratory data analysis complete. Charts saved to data/plots/ directory.

Phase 1 (Data Collection and Exploration) complete!
