# Stock News Retrieval Test

This notebook tests retrieving the latest news for a given stock ticker from the yfinance API.

Uses the existing `StockDataDownloader` class from `app.py`.


In [None]:
# Import the stock data downloader
from app import StockDataDownloader
import pandas as pd
from datetime import datetime
import json

# Clear any cached imports to ensure we get the latest version
import importlib
import sys
if 'app' in sys.modules:
    importlib.reload(sys.modules['app'])


In [74]:
# Initialize the downloader
downloader = StockDataDownloader()
print("✓ StockDataDownloader initialized")


2025-10-06 16:29:05,707 - app - INFO - Initialized StockDataDownloader with base path: data/price-history


✓ StockDataDownloader initialized


In [75]:
# Configure the ticker and max news items
ticker = "NVDA"  # Change this to any ticker you want
max_news_items = 10  # Number of news items to retrieve

print(f"Ticker: {ticker}")
print(f"Max news items: {max_news_items}")


Ticker: NVDA
Max news items: 10


In [None]:
# First, let's inspect the raw news data structure from yfinance
import yfinance as yf

print(f"\n{'='*80}")
print(f"RAW NEWS DATA INSPECTION FOR {ticker}")
print(f"{'='*80}\n")

stock = yf.Ticker(ticker)
raw_news = stock.news

if raw_news:
    print(f"✓ Found {len(raw_news)} news items\n")
    
    # Show the structure of the first news item
    if len(raw_news) > 0:
        print("First news item structure:")
        print("-" * 80)
        for key, value in raw_news[0].items():
            if key == 'content':
                print(f"{key}: [NESTED CONTENT - see below]")
                # Show the content structure
                content = value
                print("  Content fields:")
                for ckey, cvalue in content.items():
                    if isinstance(cvalue, str) and len(cvalue) > 100:
                        print(f"    {ckey}: {cvalue[:100]}...")
                    else:
                        print(f"    {ckey}: {cvalue}")
            else:
                print(f"{key}: {value}")
        print("-" * 80)
else:
    print("✗ No news data returned from API")

print(f"\n{'='*80}")
print("NOW FETCHING USING UPDATED StockDataDownloader")
print(f"{'='*80}\n")

# Now fetch using our updated downloader
news_df = downloader.get_recent_news(ticker=ticker, max_items=max_news_items)

if news_df is not None:
    print(f"\n✓ Successfully retrieved {len(news_df)} news items")
    print(f"✓ Columns: {list(news_df.columns)}")
else:
    print(f"\n✗ No news found for {ticker}")



RAW NEWS DATA INSPECTION FOR NVDA



2025-10-06 16:29:06,309 - app - INFO - Fetching recent news for NVDA (attempt 1/3)


✓ Found 10 news items

First news item structure:
--------------------------------------------------------------------------------
id: 4731fc11-ab62-444a-af68-cddf98eca95e
content: {'id': '4731fc11-ab62-444a-af68-cddf98eca95e', 'contentType': 'STORY', 'title': "AMD CEO Lisa Su says AI critics are 'thinking too small' after massive OpenAI deal", 'description': '', 'summary': 'The chipmaker inked a multi-gigawatt GPU agreement with OpenAI, setting the stage for a potential 10-year AI supercycle', 'pubDate': '2025-10-06T20:28:18Z', 'displayTime': '2025-10-06T20:28:19Z', 'isHosted': True, 'bypassModal': False, 'previewUrl': None, 'thumbnail': {'originalUrl': 'https://s.yimg.com/os/creatr-uploaded-images/2025-10/8b3d5a70-a2f0-11f0-bff3-5539b367b53e', 'originalWidth': 6000, 'originalHeight': 4015, 'caption': '', 'resolutions': [{'url': 'https://s.yimg.com/uu/api/res/1.2/oaGTChmBk9av.dDIWPZEjw--~B/aD00MDE1O3c9NjAwMDthcHBpZD15dGFjaHlvbg--/https://s.yimg.com/os/creatr-uploaded-images/2025-10/8b

2025-10-06 16:29:06,518 - app - INFO - Successfully fetched 10 news items for NVDA



✓ Successfully retrieved 10 news items


In [None]:
# Display DataFrame info and check for actual content
if news_df is not None:
    print("\n" + "="*80)
    print("NEWS DATA INFO")
    print("="*80)
    print(f"\nShape: {news_df.shape}")
    print(f"\nColumns: {list(news_df.columns)}")
    print(f"\nData types:")
    print(news_df.dtypes)
    
    # Check if data is actually populated
    print("\n" + "="*80)
    print("CONTENT CHECK")
    print("="*80)
    
    # Check first row to see if we have real data
    if len(news_df) > 0:
        first_row = news_df.iloc[0]
        print(f"\nFirst row values:")
        print(f"  TICKER: '{first_row['TICKER']}'")
        print(f"  ID: '{first_row['ID']}'")
        print(f"  TITLE: '{first_row['TITLE']}'")
        print(f"  SUMMARY: '{first_row['SUMMARY']}'")
        print(f"  DESCRIPTION: '{first_row['DESCRIPTION']}'")
        print(f"  PUBLISHER: '{first_row['PUBLISHER']}'")
        print(f"  LINK: '{first_row['LINK']}'")
        print(f"  PUBLISH_TIME: {first_row['PUBLISH_TIME']}")
        print(f"  CONTENT_TYPE: '{first_row['CONTENT_TYPE']}'")
        print(f"  IS_PREMIUM: {first_row['IS_PREMIUM']}")
        print(f"  IS_HOSTED: {first_row['IS_HOSTED']}")
        
        # Check if all titles are empty
        empty_titles = news_df['TITLE'].str.strip() == ''
        print(f"\n  Empty titles: {empty_titles.sum()} / {len(news_df)}")
        
        # Check if all links are empty
        empty_links = news_df['LINK'].str.strip() == ''
        print(f"  Empty links: {empty_links.sum()} / {len(news_df)}")
        
        # Check if we have summaries
        has_summaries = news_df['SUMMARY'].str.strip() != ''
        print(f"  Has summaries: {has_summaries.sum()} / {len(news_df)}")
        
        # Check if we have descriptions
        has_descriptions = news_df['DESCRIPTION'].str.strip() != ''
        print(f"  Has descriptions: {has_descriptions.sum()} / {len(news_df)}")



NEWS DATA INFO

Shape: (10, 8)

Columns: ['TICKER', 'TITLE', 'PUBLISHER', 'LINK', 'PUBLISH_TIME', 'TYPE', 'THUMBNAIL_URL', 'DOWNLOAD_TIMESTAMP']

Data types:
TICKER                        object
TITLE                         object
PUBLISHER                     object
LINK                          object
PUBLISH_TIME          datetime64[ns]
TYPE                          object
THUMBNAIL_URL                 object
DOWNLOAD_TIMESTAMP    datetime64[ns]
dtype: object

CONTENT CHECK

First row values:
  TICKER: 'NVDA'
  TITLE: ''
  PUBLISHER: ''
  LINK: ''
  PUBLISH_TIME: 1969-12-31 19:00:00
  TYPE: ''

  Empty titles: 10 / 10
  Empty links: 10 / 10


In [None]:
# Display all news items with full text content
if news_df is not None:
    print("\n" + "="*80)
    print(f"ALL NEWS ITEMS FOR {ticker} - FULL CONTENT")
    print("="*80)
    
    # Set pandas display options to show full content
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 200)  # Show more text
    pd.set_option('display.width', None)
    
    # Show key columns for full-text database
    key_columns = ['ID', 'TITLE', 'SUMMARY', 'DESCRIPTION', 'PUBLISHER', 'LINK', 'PUBLISH_TIME', 'CONTENT_TYPE']
    display(news_df[key_columns])
    
    print(f"\n{'='*80}")
    print("FULL TEXT CONTENT FOR DATABASE")
    print("="*80)
    
    # Show the full text content that would go into a full-text database
    for idx, row in news_df.iterrows():
        print(f"\n[{idx + 1}] ID: {row['ID']}")
        print(f"    TITLE: {row['TITLE']}")
        print(f"    SUMMARY: {row['SUMMARY']}")
        print(f"    DESCRIPTION: {row['DESCRIPTION']}")
        print(f"    PUBLISHER: {row['PUBLISHER']}")
        print(f"    CONTENT_TYPE: {row['CONTENT_TYPE']}")
        print(f"    PUBLISH_TIME: {row['PUBLISH_TIME']}")
        print(f"    LINK: {row['LINK']}")
        print("-" * 80)



ALL NEWS ITEMS FOR NVDA


Unnamed: 0,TICKER,TITLE,PUBLISHER,LINK,PUBLISH_TIME,TYPE,THUMBNAIL_URL,DOWNLOAD_TIMESTAMP
0,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517689
1,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517691
2,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517692
3,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517693
4,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517693
5,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517694
6,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517695
7,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517696
8,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517697
9,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517698


In [None]:
# Display news in a more readable format with all text content
if news_df is not None:
    print("\n" + "="*80)
    print(f"FORMATTED NEWS FOR {ticker} - READABLE FORMAT")
    print("="*80)
    
    for idx, row in news_df.iterrows():
        print(f"\n[{idx + 1}] {row['TITLE']}")
        print(f"    ID: {row['ID']}")
        print(f"    Summary: {row['SUMMARY']}")
        print(f"    Description: {row['DESCRIPTION']}")
        print(f"    Publisher: {row['PUBLISHER']}")
        print(f"    Published: {row['PUBLISH_TIME']}")
        print(f"    Content Type: {row['CONTENT_TYPE']}")
        print(f"    Is Premium: {row['IS_PREMIUM']}")
        print(f"    Is Hosted: {row['IS_HOSTED']}")
        print(f"    Link: {row['LINK']}")
        if row['THUMBNAIL_URL']:
            print(f"    Thumbnail: {row['THUMBNAIL_URL']}")
        print("-" * 80)



FORMATTED NEWS FOR NVDA

[1] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[2] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[3] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[4] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[5] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[6] 
    Publisher: 
    Published: 1969-12-31 19:00:00
    Type: 
    Link: 
--------------------------------------------------------------------------------

[7] 
    Publi

In [None]:
# Display summary statistics
if news_df is not None:
    print("\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)
    
    # Count by publisher
    print("\nNews items by publisher:")
    publisher_counts = news_df['PUBLISHER'].value_counts()
    for publisher, count in publisher_counts.items():
        print(f"  {publisher}: {count}")
    
    # Count by content type
    print("\nNews items by content type:")
    type_counts = news_df['CONTENT_TYPE'].value_counts()
    for news_type, count in type_counts.items():
        print(f"  {news_type}: {count}")
    
    # Count by premium status
    print("\nNews items by premium status:")
    premium_counts = news_df['IS_PREMIUM'].value_counts()
    for is_premium, count in premium_counts.items():
        print(f"  Premium: {is_premium} - {count} items")
    
    # Date range
    print(f"\nDate range:")
    print(f"  Earliest: {news_df['PUBLISH_TIME'].min()}")
    print(f"  Latest: {news_df['PUBLISH_TIME'].max()}")
    
    # Check for thumbnails
    has_thumbnail = news_df['THUMBNAIL_URL'].notna() & (news_df['THUMBNAIL_URL'] != '')
    print(f"\nItems with thumbnails: {has_thumbnail.sum()} / {len(news_df)}")
    
    # Text content statistics
    print(f"\nText content statistics:")
    has_titles = news_df['TITLE'].str.strip() != ''
    has_summaries = news_df['SUMMARY'].str.strip() != ''
    has_descriptions = news_df['DESCRIPTION'].str.strip() != ''
    print(f"  Items with titles: {has_titles.sum()} / {len(news_df)}")
    print(f"  Items with summaries: {has_summaries.sum()} / {len(news_df)}")
    print(f"  Items with descriptions: {has_descriptions.sum()} / {len(news_df)}")
    
    # Average text lengths
    if has_titles.any():
        avg_title_length = news_df[has_titles]['TITLE'].str.len().mean()
        print(f"  Average title length: {avg_title_length:.1f} characters")
    if has_summaries.any():
        avg_summary_length = news_df[has_summaries]['SUMMARY'].str.len().mean()
        print(f"  Average summary length: {avg_summary_length:.1f} characters")
    if has_descriptions.any():
        avg_desc_length = news_df[has_descriptions]['DESCRIPTION'].str.len().mean()
        print(f"  Average description length: {avg_desc_length:.1f} characters")



SUMMARY STATISTICS

News items by publisher:
  : 10

News items by type:
  : 10

Date range:
  Earliest: 1969-12-31 19:00:00
  Latest: 1969-12-31 19:00:00

Items with thumbnails: 0 / 10


In [81]:
# Test with multiple tickers
print("\n" + "="*80)
print("TESTING MULTIPLE TICKERS")
print("="*80)

test_tickers = ["AAPL", "MSFT", "GOOGL", "TSLA"]
results = {}

for test_ticker in test_tickers:
    print(f"\nFetching news for {test_ticker}...")
    df = downloader.get_recent_news(ticker=test_ticker, max_items=5)
    
    if df is not None:
        results[test_ticker] = len(df)
        print(f"  ✓ Retrieved {len(df)} items")
        # Show first headline
        if len(df) > 0:
            print(f"  Latest: {df.iloc[0]['TITLE'][:80]}...")
    else:
        results[test_ticker] = 0
        print(f"  ✗ No news found")

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
for ticker_name, count in results.items():
    print(f"{ticker_name}: {count} news items")


2025-10-06 16:29:06,546 - app - INFO - Fetching recent news for AAPL (attempt 1/3)



TESTING MULTIPLE TICKERS

Fetching news for AAPL...


2025-10-06 16:29:06,866 - app - INFO - Successfully fetched 5 news items for AAPL
2025-10-06 16:29:06,866 - app - INFO - Fetching recent news for MSFT (attempt 1/3)


  ✓ Retrieved 5 items
  Latest: ...

Fetching news for MSFT...


2025-10-06 16:29:07,073 - app - INFO - Successfully fetched 5 news items for MSFT
2025-10-06 16:29:07,073 - app - INFO - Fetching recent news for GOOGL (attempt 1/3)
2025-10-06 16:29:07,273 - app - INFO - Successfully fetched 5 news items for GOOGL


  ✓ Retrieved 5 items
  Latest: ...

Fetching news for GOOGL...


2025-10-06 16:29:07,275 - app - INFO - Fetching recent news for TSLA (attempt 1/3)


  ✓ Retrieved 5 items
  Latest: ...

Fetching news for TSLA...


2025-10-06 16:29:07,475 - app - INFO - Successfully fetched 5 news items for TSLA


  ✓ Retrieved 5 items
  Latest: ...

SUMMARY
AAPL: 5 news items
MSFT: 5 news items
GOOGL: 5 news items
TSLA: 5 news items


In [None]:
# Save the news to a parquet file
if news_df is not None:
    print("\n" + "="*80)
    print("SAVE NEWS TO FILE")
    print("="*80)
    
    # Save the data to ./data/news/{ticker}/news-{ticker}.parquet
    success = downloader.format_and_save_data(news_df, ticker, data_type="news")
    
    if success:
        print(f"\n✓ News data saved successfully!")
        
        # Get the full absolute path
        import os
        filepath = f"./data/news/{ticker}/news-{ticker}.parquet"
        abs_filepath = os.path.abspath(filepath)
        
        print(f"  Location: {filepath}")
        print(f"  Full Path: {abs_filepath}")
        print(f"  Records: {len(news_df)}")
        
        # Check if there were any existing records
        if os.path.exists(filepath):
            saved_df = pd.read_parquet(filepath)
            print(f"  Total records in file: {len(saved_df)}")
            
            # Show the schema of the saved file
            print(f"\n  Schema of saved file:")
            print(f"    Columns: {list(saved_df.columns)}")
            print(f"    Data types:")
            for col, dtype in saved_df.dtypes.items():
                print(f"      {col}: {dtype}")
            
            # Show sample of saved data
            print(f"\n  Sample of saved data:")
            sample_cols = ['ID', 'TITLE', 'SUMMARY', 'PUBLISHER', 'CONTENT_TYPE']
            print(saved_df[sample_cols].head(2).to_string())
            
    else:
        print(f"\n✗ Failed to save news data")


2025-10-06 16:29:07,486 - app - INFO - Loaded 1 existing records for NVDA
2025-10-06 16:29:07,487 - app - INFO - Merging 10 new records with 1 existing records
2025-10-06 16:29:07,488 - app - INFO - Removed 10 duplicate news items
2025-10-06 16:29:07,492 - app - INFO - Successfully saved 1 total records to data/price-history/NVDA/news-NVDA.parquet



SAVE NEWS TO FILE

✓ News data saved successfully!
  Location: ./data/news/NVDA/news-NVDA.parquet
  Records: 10


In [None]:
# Create full-text database schema and sample
if news_df is not None:
    print("\n" + "="*80)
    print("FULL-TEXT DATABASE SCHEMA")
    print("="*80)
    
    # Define the full-text database schema
    full_text_schema = {
        'id': 'VARCHAR(36) PRIMARY KEY',  # UUID
        'ticker': 'VARCHAR(10)',
        'title': 'TEXT',
        'summary': 'TEXT', 
        'description': 'TEXT',
        'full_text': 'TEXT',  # Combined searchable text
        'publisher': 'VARCHAR(100)',
        'content_type': 'VARCHAR(50)',
        'publish_time': 'TIMESTAMP',
        'is_premium': 'BOOLEAN',
        'link': 'VARCHAR(1000)',
        'thumbnail_url': 'VARCHAR(1000)',
        'download_timestamp': 'TIMESTAMP'
    }
    
    print("Full-text database schema:")
    for field, data_type in full_text_schema.items():
        print(f"  {field}: {data_type}")
    
    # Create sample full-text records
    print(f"\n{'='*80}")
    print("SAMPLE FULL-TEXT RECORDS")
    print("="*80)
    
    for idx, row in news_df.head(3).iterrows():
        # Combine all text fields for full-text search
        full_text = f"{row['TITLE']} {row['SUMMARY']} {row['DESCRIPTION']}".strip()
        
        print(f"\nRecord {idx + 1}:")
        print(f"  ID: {row['ID']}")
        print(f"  Ticker: {row['TICKER']}")
        print(f"  Title: {row['TITLE']}")
        print(f"  Summary: {row['SUMMARY']}")
        print(f"  Description: {row['DESCRIPTION']}")
        print(f"  Full Text (for search): {full_text[:200]}...")
        print(f"  Publisher: {row['PUBLISHER']}")
        print(f"  Content Type: {row['CONTENT_TYPE']}")
        print(f"  Publish Time: {row['PUBLISH_TIME']}")
        print(f"  Is Premium: {row['IS_PREMIUM']}")
        print(f"  Link: {row['LINK']}")
        print("-" * 80)
    
    # Show SQL for creating full-text search table
    print(f"\n{'='*80}")
    print("SQL FOR FULL-TEXT SEARCH TABLE")
    print("="*80)
    
    sql_create = """
CREATE TABLE stock_news_fulltext (
    id VARCHAR(36) PRIMARY KEY,
    ticker VARCHAR(10),
    title TEXT,
    summary TEXT,
    description TEXT,
    full_text TEXT,
    publisher VARCHAR(100),
    content_type VARCHAR(50),
    publish_time TIMESTAMP,
    is_premium BOOLEAN,
    link VARCHAR(1000),
    thumbnail_url VARCHAR(1000),
    download_timestamp TIMESTAMP,
    
    -- Full-text search index
    FULLTEXT(title, summary, description, full_text)
);
"""
    print(sql_create)
    
    # Show sample INSERT statements
    print(f"\n{'='*80}")
    print("SAMPLE INSERT STATEMENTS")
    print("="*80)
    
    for idx, row in news_df.head(2).iterrows():
        full_text = f"{row['TITLE']} {row['SUMMARY']} {row['DESCRIPTION']}".strip()
        
        insert_sql = f"""
INSERT INTO stock_news_fulltext VALUES (
    '{row['ID']}',
    '{row['TICKER']}',
    '{row['TITLE'].replace("'", "''")}',
    '{row['SUMMARY'].replace("'", "''")}',
    '{row['DESCRIPTION'].replace("'", "''")}',
    '{full_text.replace("'", "''")}',
    '{row['PUBLISHER']}',
    '{row['CONTENT_TYPE']}',
    '{row['PUBLISH_TIME']}',
    {row['IS_PREMIUM']},
    '{row['LINK']}',
    '{row['THUMBNAIL_URL']}',
    '{row['DOWNLOAD_TIMESTAMP']}'
);
"""
        print(insert_sql)


In [83]:
newsFile = "/Users/jdacosta/Library/CloudStorage/GoogleDrive-john.dacosta@snowflake.com/My Drive/_local/Downloads/_cursor_demos/Snowflake_Intelligence_HOL/jdacosta/pricehistory/data/price-history/NVDA/news-NVDA.parquet"

In [84]:
dfNews = pd.read_parquet(newsFile)

In [85]:
dfNews

Unnamed: 0,TICKER,TITLE,PUBLISHER,LINK,PUBLISH_TIME,TYPE,THUMBNAIL_URL,DOWNLOAD_TIMESTAMP
0,NVDA,,,,1969-12-31 19:00:00,,,2025-10-06 16:29:06.517698
