# First, assign Alpaca API Keys from untracked file and test connection

In [3]:
import json

with open('api_keys.json', 'r') as file:
    # Code to load the data goes here
    api_keys = json.load(file)

API_KEY = api_keys['API_KEY']
SECRET_KEY = api_keys['SECRET_KEY']

In [2]:
from alpaca.data.historical import CryptoHistoricalDataClient

# No keys required for crypto data
client = CryptoHistoricalDataClient()

In [4]:
from alpaca.data.requests import CryptoBarsRequest
from alpaca.data.timeframe import TimeFrame
from datetime import datetime

# Creating request object
request_params = CryptoBarsRequest(
  symbol_or_symbols=["BTC/USD"],
  timeframe=TimeFrame.Day,
  start=datetime(2022, 9, 1),
  end=datetime(2022, 9, 7)
)

In [5]:
# Retrieve daily bars for Bitcoin in a DataFrame and printing it
btc_bars = client.get_crypto_bars(request_params)

# Convert to dataframe
btc_bars.df

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,trade_count,vwap
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BTC/USD,2022-09-01 00:00:00+00:00,20051.81,20205.83,19564.86,20132.97,7529.674053,114052.0,19934.701556
BTC/USD,2022-09-02 00:00:00+00:00,20132.5,20444.0,19757.72,19954.16,7392.679014,98745.0,20095.899441
BTC/USD,2022-09-03 00:00:00+00:00,19950.63,20054.69,19658.04,19832.06,3077.135497,52729.0,19839.406563
BTC/USD,2022-09-04 00:00:00+00:00,19834.87,20030.89,19587.86,20002.38,3712.178165,60722.0,19813.537532
BTC/USD,2022-09-05 00:00:00+00:00,19998.77,20058.0,19635.96,19795.12,4817.489036,66396.0,19801.578592
BTC/USD,2022-09-06 00:00:00+00:00,19795.12,20180.5,18668.9,18790.39,11753.830278,139147.0,19480.98637
BTC/USD,2022-09-07 00:00:00+00:00,18789.4,19462.02,18534.06,19290.53,8092.183326,89704.0,18952.481132


# After testing if it works, import all historical stock symbols

In [3]:
# Fetch S&P 500 + Russell 1000 stock symbols
import pandas as pd
import requests
import io

all_symbols = set()

headers = {'User-Agent': 'Mozilla/5.0'}

# 1. S&P 500 from Wikipedia
try:
    sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    response = requests.get(sp500_url, headers=headers)
    sp500_tables = pd.read_html(io.StringIO(response.text))
    sp500_symbols = sp500_tables[0]['Symbol'].str.replace('.', '-', regex=False).tolist()
    all_symbols.update(sp500_symbols)
    print(f"✓ S&P 500: {len(sp500_symbols)} symbols")
except Exception as e:
    print(f"✗ S&P 500 failed: {e}")

# 2. Russell 1000 from iShares IWB ETF holdings (tracks Russell 1000)
try:
    # iShares Russell 1000 ETF holdings CSV
    iwb_url = "https://www.ishares.com/us/products/239707/ishares-russell-1000-etf/1467271812596.ajax?fileType=csv&fileName=IWB_holdings&dataType=fund"
    response = requests.get(iwb_url, headers=headers)
    
    # Skip the header rows and read the CSV
    lines = response.text.split('\n')
    # Find where the actual data starts (after header info)
    start_idx = 0
    for i, line in enumerate(lines):
        if line.startswith('Ticker,'):
            start_idx = i
            break
    
    csv_data = '\n'.join(lines[start_idx:])
    russell_df = pd.read_csv(io.StringIO(csv_data))
    
    if 'Ticker' in russell_df.columns:
        russell_symbols = russell_df['Ticker'].dropna().str.strip().tolist()
        # Filter out non-stock entries (like cash, futures, etc.)
        russell_symbols = [s for s in russell_symbols if s and s.isalpha() and len(s) <= 5]
        all_symbols.update(russell_symbols)
        print(f"✓ Russell 1000 (iShares IWB): {len(russell_symbols)} symbols")
    else:
        print(f"✗ Russell 1000: Ticker column not found. Columns: {russell_df.columns.tolist()}")
except Exception as e:
    print(f"✗ Russell 1000 failed: {e}")
    print("  Falling back to S&P 500 only")

# Convert to sorted list
SYMBOLS = sorted([s for s in all_symbols if s and isinstance(s, str)])

print(f"\n{'='*50}")
print(f"Total unique symbols: {len(SYMBOLS)}")
print(f"First 20 symbols: {SYMBOLS[:20]}")
print(f"\nFull list available in SYMBOLS variable")

✓ S&P 500: 503 symbols
✓ Russell 1000 (iShares IWB): 1012 symbols

Total unique symbols: 1019
First 20 symbols: ['A', 'AA', 'AAL', 'AAON', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACHC', 'ACI', 'ACM', 'ACN', 'ADBE', 'ADC', 'ADI', 'ADM', 'ADP', 'ADSK', 'ADT']

Full list available in SYMBOLS variable
✓ Russell 1000 (iShares IWB): 1012 symbols

Total unique symbols: 1019
First 20 symbols: ['A', 'AA', 'AAL', 'AAON', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACHC', 'ACI', 'ACM', 'ACN', 'ADBE', 'ADC', 'ADI', 'ADM', 'ADP', 'ADSK', 'ADT']

Full list available in SYMBOLS variable


# Next, use symbols Import historical Russell 1000 stock data from alpaca

In [4]:
import pandas as pd
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame
from datetime import datetime, timedelta
import pytz
import time

# SYMBOLS is defined in the previous cell (S&P 500 list)
# Uncomment below to use a smaller test set:
# SYMBOLS = ["AAPL", "MSFT", "GOOGL", "AMZN", "META"]

# 1. Initialize the client
stock_client = StockHistoricalDataClient(API_KEY, SECRET_KEY)

# 2. Define the date range for the last year
end_date = datetime.now(pytz.timezone('America/New_York')) - timedelta(days=1)
start_date = end_date - timedelta(days=365)

# 3. Dictionary to store all DataFrames
all_stock_data = {}
failed_symbols = []

# 4. Iterate through each symbol and fetch data
print(f"Fetching data for {len(SYMBOLS)} symbols...")
print(f"Date range: {start_date.date()} to {end_date.date()}\n")

for i, symbol in enumerate(SYMBOLS):
    try:
        request_params = StockBarsRequest(
            symbol_or_symbols=[symbol],
            timeframe=TimeFrame.Day,
            start=start_date,
            end=end_date
        )
        
        stock_bars = stock_client.get_stock_bars(request_params)
        bars_df = stock_bars.df
        
        # Reset index to make symbol a column instead of multi-index
        bars_df = bars_df.reset_index()
        
        all_stock_data[symbol] = bars_df
        
        # Progress update every 50 symbols
        if (i + 1) % 50 == 0:
            print(f"Progress: {i + 1}/{len(SYMBOLS)} symbols fetched...")
        
        # Small delay to avoid rate limiting
        time.sleep(0.1)
        
    except Exception as e:
        failed_symbols.append(symbol)
        print(f"  ✗ Error fetching {symbol}: {e}")

# 5. Combine all DataFrames into one
if all_stock_data:
    combined_df = pd.concat(all_stock_data.values(), ignore_index=True)
    print(f"\n{'='*50}")
    print(f"✓ Successfully fetched {len(combined_df)} data points for {len(all_stock_data)} stocks")
    if failed_symbols:
        print(f"✗ Failed to fetch {len(failed_symbols)} symbols: {failed_symbols[:10]}{'...' if len(failed_symbols) > 10 else ''}")
    print(f"\nCombined DataFrame shape: {combined_df.shape}")
    print(f"\nCombined DataFrame preview:")
    display(combined_df.head(10))
else:
    print("No data was fetched.")

Fetching data for 1019 symbols...
Date range: 2025-01-07 to 2026-01-07

Progress: 50/1019 symbols fetched...
Progress: 50/1019 symbols fetched...
Progress: 100/1019 symbols fetched...
Progress: 100/1019 symbols fetched...
  ✗ Error fetching BF-B: {"message":"invalid symbol: BF-B"}

  ✗ Error fetching BF-B: {"message":"invalid symbol: BF-B"}

  ✗ Error fetching BRK-B: {"message":"invalid symbol: BRK-B"}

  ✗ Error fetching BRK-B: {"message":"invalid symbol: BRK-B"}

Progress: 150/1019 symbols fetched...
Progress: 150/1019 symbols fetched...
Progress: 200/1019 symbols fetched...
Progress: 200/1019 symbols fetched...
Progress: 250/1019 symbols fetched...
Progress: 250/1019 symbols fetched...
Progress: 300/1019 symbols fetched...
Progress: 300/1019 symbols fetched...
Progress: 350/1019 symbols fetched...
Progress: 350/1019 symbols fetched...
Progress: 400/1019 symbols fetched...
Progress: 400/1019 symbols fetched...
Progress: 450/1019 symbols fetched...
Progress: 450/1019 symbols fetched..

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,index
0,A,2025-01-08 05:00:00+00:00,137.68,137.68,135.63,137.0,1684573.0,19948.0,137.068421,
1,A,2025-01-10 05:00:00+00:00,134.75,140.14,134.709,137.47,1369875.0,25383.0,137.592663,
2,A,2025-01-13 05:00:00+00:00,137.22,142.82,137.0,141.95,1561959.0,28739.0,141.776934,
3,A,2025-01-14 05:00:00+00:00,142.0,145.38,140.15,143.43,2445434.0,36636.0,143.373405,
4,A,2025-01-15 05:00:00+00:00,144.14,146.5,138.68,142.23,2328643.0,35076.0,142.841548,
5,A,2025-01-16 05:00:00+00:00,142.78,145.11,140.43,144.72,1661474.0,25916.0,143.989713,
6,A,2025-01-17 05:00:00+00:00,145.88,148.46,145.195,147.36,3210310.0,45636.0,147.281027,
7,A,2025-01-21 05:00:00+00:00,148.67,153.18,148.01,152.57,2759636.0,42383.0,152.14676,
8,A,2025-01-22 05:00:00+00:00,152.83,153.76,151.72,152.6,1730996.0,27740.0,152.631508,
9,A,2025-01-23 05:00:00+00:00,152.83,152.955,148.18,152.45,1332235.0,24444.0,151.813582,


In [None]:
# Save the combined DataFrame to CSV
import os

output_file = "raw_data/stock_data.csv"
combined_df.to_csv(output_file, index=False)
print(f"✓ Data saved to {output_file}")
print(f"  File size: {os.path.getsize(output_file) / (1024*1024):.2f} MB")

✓ Data saved to stock_data.csv
  File size: 20.36 MB


In [None]:
# Save the list of tickers to CSV
tickers_df = pd.DataFrame({'symbol': SYMBOLS})
tickers_file = "raw_data/tickers.csv"
tickers_df.to_csv(tickers_file, index=False)
print(f"✓ Tickers saved to {tickers_file}")
print(f"  Total tickers: {len(SYMBOLS)}")

✓ Tickers saved to tickers.csv
  Total tickers: 1019


# Next import historical financial statement data from Yahoo Finance

In [None]:
# Fetch historical financials from Yahoo Finance
import yfinance as yf
import pandas as pd
import time

# Load tickers from CSV (or use SYMBOLS if already in memory)
tickers_df = pd.read_csv("raw_data/tickers.csv")
symbols = tickers_df['symbol'].tolist()

# Lists to store financials data
all_income_stmt = []
all_balance_sheet = []
all_cashflow = []
all_earnings_dates = []
failed_financials = []

print(f"Fetching financials for {len(symbols)} symbols...")
print("This may take a while...\n")

for i, symbol in enumerate(symbols):
    try:
        ticker = yf.Ticker(symbol)
        
        # Get annual financials (income statement, balance sheet, cash flow)
        income_stmt = ticker.income_stmt
        balance_sheet = ticker.balance_sheet
        cashflow = ticker.cashflow
        
        # Get earnings release dates (historical and upcoming)
        try:
            earnings_dates = ticker.earnings_dates
            if earnings_dates is not None and not earnings_dates.empty:
                earnings_df = earnings_dates.reset_index()
                earnings_df.insert(0, 'symbol', symbol)
                earnings_df.rename(columns={'Earnings Date': 'release_date'}, inplace=True)
                all_earnings_dates.append(earnings_df)
        except Exception:
            pass  # Some tickers may not have earnings dates
        
        # Add symbol column and append to lists
        if not income_stmt.empty:
            income_df = income_stmt.T.reset_index()
            income_df.insert(0, 'symbol', symbol)
            income_df.rename(columns={'index': 'fiscal_period_end'}, inplace=True)
            all_income_stmt.append(income_df)
        
        if not balance_sheet.empty:
            balance_df = balance_sheet.T.reset_index()
            balance_df.insert(0, 'symbol', symbol)
            balance_df.rename(columns={'index': 'fiscal_period_end'}, inplace=True)
            all_balance_sheet.append(balance_df)
        
        if not cashflow.empty:
            cashflow_df = cashflow.T.reset_index()
            cashflow_df.insert(0, 'symbol', symbol)
            cashflow_df.rename(columns={'index': 'fiscal_period_end'}, inplace=True)
            all_cashflow.append(cashflow_df)
        
        # Progress update every 50 symbols
        if (i + 1) % 50 == 0:
            print(f"Progress: {i + 1}/{len(symbols)} symbols fetched...")
        
        # Small delay to avoid rate limiting
        time.sleep(0.1)
        
    except Exception as e:
        failed_financials.append(symbol)
        if (i + 1) % 100 == 0:  # Only print errors occasionally to reduce noise
            print(f"  ✗ Error fetching {symbol}: {e}")

# Combine all DataFrames
income_stmt_df = pd.concat(all_income_stmt, ignore_index=True) if all_income_stmt else pd.DataFrame()
balance_sheet_df = pd.concat(all_balance_sheet, ignore_index=True) if all_balance_sheet else pd.DataFrame()
cashflow_df = pd.concat(all_cashflow, ignore_index=True) if all_cashflow else pd.DataFrame()
earnings_dates_df = pd.concat(all_earnings_dates, ignore_index=True) if all_earnings_dates else pd.DataFrame()

print(f"\n{'='*50}")
print(f"✓ Income Statement: {len(income_stmt_df)} rows, {income_stmt_df['symbol'].nunique() if not income_stmt_df.empty else 0} stocks")
print(f"✓ Balance Sheet: {len(balance_sheet_df)} rows, {balance_sheet_df['symbol'].nunique() if not balance_sheet_df.empty else 0} stocks")
print(f"✓ Cash Flow: {len(cashflow_df)} rows, {cashflow_df['symbol'].nunique() if not cashflow_df.empty else 0} stocks")
print(f"✓ Earnings Dates: {len(earnings_dates_df)} rows, {earnings_dates_df['symbol'].nunique() if not earnings_dates_df.empty else 0} stocks")
if failed_financials:
    print(f"✗ Failed: {len(failed_financials)} symbols")

print("\nEarnings Release Dates preview:")
display(earnings_dates_df.head(10))

print("\nIncome Statement preview:")
display(income_stmt_df.head())

Fetching financials for 1019 symbols...
This may take a while...

Progress: 50/1019 symbols fetched...
Progress: 50/1019 symbols fetched...
Progress: 100/1019 symbols fetched...
Progress: 100/1019 symbols fetched...


BFA: No earnings dates found, symbol may be delisted
BFB: No earnings dates found, symbol may be delisted
BFB: No earnings dates found, symbol may be delisted
BRKB: No earnings dates found, symbol may be delisted
BRKB: No earnings dates found, symbol may be delisted


Progress: 150/1019 symbols fetched...
Progress: 200/1019 symbols fetched...
Progress: 200/1019 symbols fetched...
Progress: 250/1019 symbols fetched...
Progress: 250/1019 symbols fetched...


CWENA: No earnings dates found, symbol may be delisted


Progress: 300/1019 symbols fetched...
Progress: 350/1019 symbols fetched...
Progress: 350/1019 symbols fetched...
Progress: 400/1019 symbols fetched...
Progress: 400/1019 symbols fetched...


HEIA: No earnings dates found, symbol may be delisted


Progress: 450/1019 symbols fetched...
Progress: 500/1019 symbols fetched...
Progress: 500/1019 symbols fetched...


LENB: No earnings dates found, symbol may be delisted


Progress: 550/1019 symbols fetched...
Progress: 600/1019 symbols fetched...
Progress: 600/1019 symbols fetched...
Progress: 650/1019 symbols fetched...
Progress: 650/1019 symbols fetched...
Progress: 700/1019 symbols fetched...
Progress: 700/1019 symbols fetched...
Progress: 750/1019 symbols fetched...
Progress: 750/1019 symbols fetched...


REXR: No earnings dates found, symbol may be delisted


Progress: 800/1019 symbols fetched...


SGAFT: No earnings dates found, symbol may be delisted


Progress: 850/1019 symbols fetched...
Progress: 900/1019 symbols fetched...
Progress: 900/1019 symbols fetched...


UHALB: No earnings dates found, symbol may be delisted
USD: No earnings dates found, symbol may be delisted
USD: No earnings dates found, symbol may be delisted


Progress: 950/1019 symbols fetched...
Progress: 1000/1019 symbols fetched...
Progress: 1000/1019 symbols fetched...


XTSLA: No earnings dates found, symbol may be delisted



✓ Income Statement: 4165 rows, 1008 stocks
✓ Balance Sheet: 4201 rows, 1008 stocks
✓ Cash Flow: 4209 rows, 1008 stocks
✓ Earnings Dates: 23789 rows, 1008 stocks

Earnings Release Dates preview:


Unnamed: 0,symbol,release_date,EPS Estimate,Reported EPS,Surprise(%)
0,A,2026-02-25 16:00:00-05:00,1.37,,
1,A,2025-11-24 16:00:00-05:00,1.58,1.59,0.37
2,A,2025-08-27 16:00:00-04:00,1.37,1.37,0.17
3,A,2025-05-28 16:00:00-04:00,1.26,1.31,3.62
4,A,2025-02-26 16:00:00-05:00,1.27,1.31,2.99
5,A,2024-11-25 16:00:00-05:00,1.41,1.46,3.9
6,A,2024-08-21 16:00:00-04:00,1.26,1.32,4.93
7,A,2024-05-29 16:00:00-04:00,1.19,1.22,2.42
8,A,2024-02-27 16:00:00-05:00,1.23,1.29,4.68
9,A,2023-11-20 16:00:00-05:00,1.34,1.38,3.02



Income Statement preview:


Unnamed: 0,symbol,fiscal_period_end,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,...,Other Taxes,Provision For Doubtful Accounts,Other Non Interest Expense,Occupancy And Equipment,Professional Expense And Contract Services Expense,Excise Taxes,Depletion Income Statement,Net Income From Tax Loss Carryforward,Net Income Extraordinary,Securities Amortization
0,A,2025-10-31,0.0,0.092,1835000000.0,1303000000.0,288000000.0,3305000000.0,1835000000.0,1547000000.0,...,,,,,,,,,,
1,A,2024-10-31,0.0,0.153,1874000000.0,1289000000.0,257000000.0,2975000000.0,1874000000.0,1617000000.0,...,,,,,,,,,,
2,A,2023-10-31,0.0,0.074,1705000000.0,1240000000.0,271000000.0,3368000000.0,1705000000.0,1434000000.0,...,,,,,,,,,,
3,A,2022-10-31,0.0,0.166,1905000000.0,1254000000.0,317000000.0,3126000000.0,1905000000.0,1588000000.0,...,,,,,,,,,,
4,A,2021-10-31,,,,,,,,,...,,,,,,,,,,


In [None]:
# Save all financial DataFrames to CSV files
import os

# Save Income Statement
income_stmt_df.to_csv("raw_data/income_statement.csv", index=False)
print(f"✓ Income Statement saved to income_statement.csv ({os.path.getsize('income_statement.csv') / (1024*1024):.2f} MB)")

# Save Balance Sheet
balance_sheet_df.to_csv("raw_data/balance_sheet.csv", index=False)
print(f"✓ Balance Sheet saved to balance_sheet.csv ({os.path.getsize('balance_sheet.csv') / (1024*1024):.2f} MB)")

# Save Cash Flow
cashflow_df.to_csv("raw_data/cashflow.csv", index=False)
print(f"✓ Cash Flow saved to cashflow.csv ({os.path.getsize('cashflow.csv') / (1024*1024):.2f} MB)")

# Save Earnings Dates
earnings_dates_df.to_csv("raw_data/earnings_dates.csv", index=False)
print(f"✓ Earnings Dates saved to earnings_dates.csv ({os.path.getsize('earnings_dates.csv') / (1024*1024):.2f} MB)")

print(f"\n{'='*50}")
print("All financial data exported successfully!")

✓ Income Statement saved to income_statement.csv (2.23 MB)
✓ Balance Sheet saved to balance_sheet.csv (3.48 MB)
✓ Cash Flow saved to cashflow.csv (2.71 MB)
✓ Earnings Dates saved to earnings_dates.csv (1.03 MB)

All financial data exported successfully!


# Import historical News Data From Alpaca

In [None]:
# Fetch ALL Alpaca news data for all tickers using pagination
from alpaca.data.historical import NewsClient
from alpaca.data.requests import NewsRequest
from datetime import datetime, timedelta
import pandas as pd
import time

# Load tickers from CSV
tickers_df = pd.read_csv("raw_data/tickers.csv")
symbols = tickers_df['symbol'].tolist()

# Initialize news client
news_client = NewsClient(API_KEY, SECRET_KEY)

# Date range for last 370 days
end_date = datetime.now()
start_date = end_date - timedelta(days=370)

# List to store all news articles
all_news = []
failed_news = []

print(f"Fetching ALL news for {len(symbols)} symbols...")
print(f"Date range: {start_date.date()} to {end_date.date()}\n")

# Fetch news one symbol at a time with pagination
for i, symbol in enumerate(symbols):
    try:
        page_token = None
        symbol_article_count = 0
        
        # Keep fetching until no more pages
        while True:
            # Create news request for this symbol
            request_params = NewsRequest(
                symbols=symbol,
                start=start_date,
                end=end_date,
                limit=50  # Max per request
            )
            
            # Add page token if we have one (for pagination)
            if page_token:
                request_params.page_token = page_token
            
            news_response = news_client.get_news(request_params)
            
            # Extract news articles from response.data['news']
            news_articles = news_response.data['news']
            
            # News objects use attribute access (e.g., article.id)
            for article in news_articles:
                all_news.append({
                    'id': article.id,
                    'headline': article.headline,
                    'summary': article.summary,
                    'author': article.author,
                    'created_at': article.created_at,
                    'updated_at': article.updated_at,
                    'url': article.url,
                    'symbols': ','.join(article.symbols) if article.symbols else '',
                    'source': article.source
                })
            
            symbol_article_count += len(news_articles)
            
            # Check if there are more pages
            page_token = news_response.next_page_token
            if not page_token or len(news_articles) == 0:
                break
            
            # Small delay between pages to avoid rate limiting
            time.sleep(0.05)
        
        # Progress update every 100 symbols
        if (i + 1) % 100 == 0:
            print(f"Progress: {i + 1}/{len(symbols)} symbols processed, {len(all_news)} articles collected...")
        
        # Small delay between symbols to avoid rate limiting
        time.sleep(0.1)
        
    except Exception as e:
        failed_news.append(symbol)
        # Only print first few errors
        if len(failed_news) <= 5:
            print(f"  ✗ Error fetching news for {symbol}: {e}")

# Create DataFrame from the news data
news_df = pd.DataFrame(all_news)

# Remove duplicates (same article may appear for multiple symbols)
if not news_df.empty:
    news_df = news_df.drop_duplicates(subset=['id'])

print(f"\n{'='*50}")
print(f"✓ Total unique news articles: {len(news_df)}")
if failed_news:
    print(f"✗ Failed: {len(failed_news)} symbols")

print(f"\nDataFrame columns: {news_df.columns.tolist()}")
print(f"\nNews DataFrame preview:")
display(news_df.head(10))

Fetching ALL news for 1019 symbols...
Date range: 2025-01-05 to 2026-01-10

Progress: 100/1019 symbols processed, 4859 articles collected...
Progress: 100/1019 symbols processed, 4859 articles collected...
Progress: 200/1019 symbols processed, 9428 articles collected...
Progress: 200/1019 symbols processed, 9428 articles collected...
Progress: 300/1019 symbols processed, 14239 articles collected...
Progress: 300/1019 symbols processed, 14239 articles collected...
Progress: 400/1019 symbols processed, 18943 articles collected...
Progress: 400/1019 symbols processed, 18943 articles collected...
Progress: 500/1019 symbols processed, 23645 articles collected...
Progress: 500/1019 symbols processed, 23645 articles collected...
Progress: 600/1019 symbols processed, 28216 articles collected...
Progress: 600/1019 symbols processed, 28216 articles collected...
Progress: 700/1019 symbols processed, 32931 articles collected...
Progress: 700/1019 symbols processed, 32931 articles collected...
Prog

Unnamed: 0,id,headline,summary,author,created_at,updated_at,url,symbols,source
0,49701666,Evercore ISI Group Upgrades Agilent Technologi...,,Benzinga Newsdesk,2026-01-05 17:50:55+00:00,2026-01-05 17:50:56+00:00,https://www.benzinga.com/news/26/01/49701666/e...,A,benzinga
1,49391324,Barclays Upgrades Agilent Technologies to Over...,,Benzinga Newsdesk,2025-12-15 13:49:28+00:00,2025-12-15 13:49:29+00:00,https://www.benzinga.com/news/25/12/49391324/b...,A,benzinga
2,49342760,What's Driving the Market Sentiment Around Agi...,,Benzinga Insights,2025-12-11 18:00:38+00:00,2025-12-11 18:00:39+00:00,https://www.benzinga.com/insights/short-seller...,A,benzinga
3,49276887,Goldman Sachs Initiates Coverage On Agilent Te...,,Benzinga Newsdesk,2025-12-09 12:32:55+00:00,2025-12-09 12:32:56+00:00,https://www.benzinga.com/news/25/12/49276887/g...,A,benzinga
4,49190655,10 Health Care Stocks With Whale Alerts In Tod...,,Benzinga Insights,2025-12-03 17:35:29+00:00,2025-12-03 17:35:29+00:00,https://www.benzinga.com/insights/options/25/1...,"A,COR,IOVA,LLY,NKTR,PFE,PROK,SRPT,VEEV,VKTX",benzinga
5,49161291,Morgan Stanley Initiates Coverage On Agilent T...,,Benzinga Newsdesk,2025-12-02 15:02:49+00:00,2025-12-02 15:02:49+00:00,https://www.benzinga.com/news/25/12/49161291/m...,A,benzinga
6,49069188,Spotlight on Agilent Technologies: Analyzing t...,,Benzinga Insights,2025-11-25 20:02:46+00:00,2025-11-25 20:02:47+00:00,https://www.benzinga.com/insights/options/25/1...,A,benzinga
7,49067586,Citigroup Maintains Buy on Agilent Technologie...,,Benzinga Newsdesk,2025-11-25 19:05:00+00:00,2025-11-25 19:05:01+00:00,https://www.benzinga.com/news/25/11/49067586/c...,A,benzinga
8,49065822,These Analysts Boost Their Forecasts On Agilen...,,Avi Kapoor,2025-11-25 18:06:00+00:00,2025-11-25 18:06:00+00:00,https://www.benzinga.com/analyst-stock-ratings...,A,benzinga
9,49065306,"UBS Maintains Buy on Agilent Technologies, Rai...",,Benzinga Newsdesk,2025-11-25 17:53:34+00:00,2025-11-25 17:53:35+00:00,https://www.benzinga.com/news/25/11/49065306/u...,A,benzinga


In [None]:
# Save news DataFrame to CSV
import os

news_file = "raw_data/news_data.csv"
news_df.to_csv(news_file, index=False)
print(f"✓ News data saved to {news_file}")
print(f"  File size: {os.path.getsize(news_file) / (1024*1024):.2f} MB")
print(f"  Total articles: {len(news_df)}")

✓ News data saved to news_data.csv
  File size: 12.42 MB
  Total articles: 39758
