In [None]:
import pandas as pd
from newsapi import NewsApiClient
from transformers import pipeline
import torch
from tqdm import tqdm
from IPython.display import display
import os
from datetime import datetime, timedelta

print("=== 1. NewsAPI Initialization ===")
# SECURITY: the API key is read from the environment instead of being stored
# in the notebook. Export NEWSAPI_KEY before running this cell; any key that
# was ever committed to version control should be revoked and regenerated.
NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY', '')
newsapi = NewsApiClient(api_key=NEWSAPI_KEY)

# Date range setup (last 30 days)
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

max_pages = 5        # Upper bound on pages requested (100 articles per page)
max_articles = 100   # Number of headlines kept for sentiment analysis

try:
    if not NEWSAPI_KEY:
        raise RuntimeError("NEWSAPI_KEY environment variable is not set")

    # Fetch Bitcoin news, one page at a time, until a page is empty or fails.
    print("\n=== 1. Fetching News Articles with Pagination ===")
    articles = []
    pages_fetched = 0  # Counts only pages that actually returned articles
    for page in range(1, max_pages + 1):
        try:
            response = newsapi.get_everything(
                q='Bitcoin OR BTC',
                language='en',
                from_param=start_date.strftime('%Y-%m-%d'),
                to=end_date.strftime('%Y-%m-%d'),
                sort_by='publishedAt',
                page_size=100,
                page=page
            )
        except Exception as e:
            # A failing page (e.g. a plan limit on result count) ends the
            # pagination but keeps whatever was already collected.
            print(f"Page {page} fetch failed: {str(e)}")
            break

        page_articles = response.get('articles', [])
        if not page_articles:  # No articles left: stop paginating
            break
        articles.extend(page_articles)
        pages_fetched += 1

    if not articles:
        # Fail explicitly so the fallback below reports a meaningful reason
        # instead of an incidental KeyError on a column-less DataFrame.
        raise RuntimeError("NewsAPI returned no articles")

    # Build the DataFrame and clean it once: strip timezone information, keep
    # only the two columns used later, drop duplicates and incomplete rows,
    # and renumber the rows so the index is contiguous (0..n-1).
    news_df = pd.DataFrame(articles)
    news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    news_df = (
        news_df[['title', 'publishedAt']]
        .drop_duplicates()
        .dropna()
        .reset_index(drop=True)
    )

    print(f"Retrieved {len(news_df)} news articles across {pages_fetched} page(s). Sample:")
    display(news_df.head(3))

    # Limit to the first max_articles rows if too many were retrieved
    news_df = news_df.head(max_articles)

    print(f"Retrieved {len(news_df)} news articles. Sample:")
    display(news_df.head(2))

except Exception as e:
    # Any failure above falls back to two hand-written headlines so the rest
    # of the notebook can still run; the reason is printed, not swallowed.
    print(f"News fetch failed: {e}")
    test_news = [
        "Bitcoin soars to $50,000 as institutional investors flock in",
        "SEC delays decision on Bitcoin ETF, causing market panic"
    ]
    news_df = pd.DataFrame({
        'title': test_news,
        'publishedAt': [datetime.now() - timedelta(days=x) for x in range(2)]
    })
    print("Using test data instead:")
    display(news_df)

#sentiment analysis 
print("\n=== 2. Sentiment Analysis Setup ===")
print(f"PyTorch CUDA available: {torch.cuda.is_available()}")

try:
    # Initialize FinBERT
    device = 0 if torch.cuda.is_available() else -1
    finbert = pipeline(
        "text-classification",
        model="ProsusAI/finbert",
        return_all_scores=True,
        device=device,
        truncation=True
    )
    print(f"FinBERT loaded on {'GPU' if device == 0 else 'CPU'}")
    
    # Sentiment extraction function
    def get_sentiment(text):
        try:
            results = finbert(text[:512])[0]  # Truncate long texts
            return {
                'sent_pos': next(r['score'] for r in results if r['label'] == 'positive'),
                'sent_neg': next(r['score'] for r in results if r['label'] == 'negative'),
                'sent_neutral': next(r['score'] for r in results if r['label'] == 'neutral')
            }
        except Exception as e:
            print(f"Error processing: '{text[:30]}...' - {str(e)}")
            return {'sent_pos': 0, 'sent_neg': 0, 'sent_neutral': 1}

    # Process with progress bar
    print("\n=== 3. Processing Headlines ===")
    tqdm.pandas(desc="Analyzing")
    sentiment_df = pd.DataFrame(news_df['title'].progress_apply(get_sentiment).tolist())
    enhanced_news = pd.concat([news_df, pd.DataFrame(sentiment_df)], axis=1)
    
    # VALIDATION BEFORE SAVING
    print("\n=== 4. Data Validation ===")
    # Ensure all sentiment columns are numeric
    for col in ['sent_pos', 'sent_neg', 'sent_neutral']:
        enhanced_news[col] = pd.to_numeric(enhanced_news[col], errors='coerce')
        if enhanced_news[col].isnull().any():
            print(f"⚠️ Found {enhanced_news[col].isnull().sum()} null values in {col} - filling with 0")
            enhanced_news[col] = enhanced_news[col].fillna(0)
    
    text_in_sentiment = enhanced_news[
        enhanced_news[['sent_pos', 'sent_neg', 'sent_neutral']]
        .applymap(lambda x: isinstance(x, str))
        .any(axis=1)
    ]
    if not text_in_sentiment.empty:
        print(" Found text")
        display(text_in_sentiment.head())
        raise ValueError("Text values found in sentiment columns")
    
    enhanced_news.to_csv(
        'news_with_sentiment.csv',
        index=False,
        float_format='%.15f'  
    )
    print(f"\n Saved {len(enhanced_news)} records to 'news_with_sentiment.csv'")
    
    # Verify saved file can be loaded correctly
    test_load = pd.read_csv('news_with_sentiment.csv')
    print("\n=== 5. File Verification ===")
    print("Loaded data types:")
    print(test_load.dtypes)
    print("\nSample from saved file:")
    display(test_load[['title', 'sent_pos', 'sent_neg', 'sent_neutral']].head(3))

except Exception as e:
    print(f" error: {str(e)}")
    if 'enhanced_news' in locals():
        print("\nCurrent data sample:")
        display(enhanced_news.head())

  from .autonotebook import tqdm as notebook_tqdm


=== 1. NewsAPI Initialization ===

=== 1. Fetching News Articles with Pagination ===
⚠️ Page 2 fetch failed: {'status': 'error', 'code': 'maximumResultsReached', 'message': 'You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results.'}
✅ Retrieved 99 news articles across 2 page(s). Sample:


Unnamed: 0,title,publishedAt
0,Bitcoin CDD Shows Uptick In Coin Movement — Ea...,2025-06-04 04:00:10
1,Bitcoin profit-taking underway as ‘big whales’...,2025-06-04 03:56:26
2,DMG Blockchain Solutions Announces Preliminary...,2025-06-04 03:54:10


✅ Retrieved 99 news articles. Sample:


Unnamed: 0,title,publishedAt
0,Bitcoin CDD Shows Uptick In Coin Movement — Ea...,2025-06-04 04:00:10
1,Bitcoin profit-taking underway as ‘big whales’...,2025-06-04 03:56:26



=== 2. Sentiment Analysis Setup ===
PyTorch CUDA available: False


Device set to use cpu


✅ FinBERT loaded on CPU

=== 3. Processing Headlines ===


Analyzing: 100%|██████████| 99/99 [00:02<00:00, 36.46it/s]


=== 4. Data Validation ===
⚠️ Found 1 null values in sent_pos - filling with 0
⚠️ Found 1 null values in sent_neg - filling with 0
⚠️ Found 1 null values in sent_neutral - filling with 0

✅ Saved 100 records to 'news_with_sentiment.csv'

=== 5. File Verification ===
Loaded data types:
title            object
publishedAt      object
sent_pos        float64
sent_neg        float64
sent_neutral    float64
dtype: object

Sample from saved file:



  .applymap(lambda x: isinstance(x, str))


Unnamed: 0,title,sent_pos,sent_neg,sent_neutral
0,Bitcoin CDD Shows Uptick In Coin Movement — Ea...,0.91758,0.039124,0.043296
1,Bitcoin profit-taking underway as ‘big whales’...,0.868116,0.086131,0.045753
2,DMG Blockchain Solutions Announces Preliminary...,0.044501,0.049754,0.905745
