<a href="https://colab.research.google.com/github/MaxStam1/AI-and-GPT-Bootcamp-Final-Project/blob/main/AI_Bootcamp_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install newsapi-python nltk transformers newspaper3k beautifulsoup4 plotly emoji==0.6.0 lxml[html_clean]



In [5]:
import requests
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from newspaper import Article
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Download required NLTK data
nltk.download('vader_lexicon')

class EnhancedFinancialNewsAnalyzer:
    def __init__(self, api_key):
        self.api_key = api_key
        # Traditional VADER sentiment analyzer
        self.vader_analyzer = SentimentIntensityAnalyzer()
        # Hugging Face models with explicit models
        print("Loading models...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.hf_sentiment = pipeline("sentiment-analysis",
                                   model="finiteautomata/bertweet-base-sentiment-analysis")
        self.classifier = pipeline("zero-shot-classification",
                                 model="facebook/bart-large-mnli")

        # Define categories for classification
        self.topic_categories = [
            "Market Analysis",
            "Company Earnings",
            "Economic Policy",
            "Technology Trends",
            "Mergers & Acquisitions",
            "Market Sentiment",
            "Risk Analysis"
        ]

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def get_news(self, query='finance', days_back=7):
        """Fetch financial news using NewsAPI"""
        base_url = "https://newsapi.org/v2/everything"

        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)

        params = {
            'q': query,
            'from': start_date.strftime('%Y-%m-%d'),
            'to': end_date.strftime('%Y-%m-%d'),
            'language': 'en',
            'sortBy': 'relevancy',
            'apiKey': self.api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching news: {e}")
            return None

    def calculate_market_sentiment_score(self, vader_scores, hf_sentiment):
        """Calculate combined market sentiment score"""
        # Convert VADER compound score to 0-10 scale
        vader_score = (vader_scores['compound'] + 1) * 5

        # Convert HuggingFace sentiment to numeric
        hf_numeric = 7.5 if hf_sentiment['label'] == 'POS' else 2.5
        hf_confidence = hf_sentiment['score']

        # Weighted combination
        combined_score = (vader_score * 0.6) + (hf_numeric * 0.4 * hf_confidence)

        # Ensure score is between 1 and 10
        final_score = max(1, min(10, combined_score))

        # Map score to sentiment category
        if final_score <= 3:
            category = "Extreme Fear"
        elif final_score <= 5:
            category = "Fear"
        elif final_score <= 6:
            category = "Neutral"
        elif final_score <= 8:
            category = "Greed"
        else:
            category = "Extreme Greed"

        return {
            'score': round(final_score, 1),
            'category': category
        }

    def analyze_article(self, article_data):
        """Analyze a single article with enhanced analysis"""
        title = article_data.get('title', '')
        url = article_data.get('url', '')
        description = article_data.get('description', '')

        print(f"\nAnalyzing: {title}")
        print(f"URL: {url}")

        # Extract content
        try:
            article = Article(url)
            article.download()
            article.parse()
            content = article.text if article.text else title
        except Exception as e:
            content = description or title
            print(f"Error extracting content: {e}")

        # Generate summary - handle long texts better
        try:
            # Break content into smaller chunks if it's too long
            content_words = content.split()
            chunk_size = 100  # Reduced chunk size for better handling
            first_chunk = ' '.join(content_words[:chunk_size])

            # Ensure the input is within model limits
            summary = self.summarizer(
                first_chunk,
                max_length=60,  # Reduced max length
                min_length=20,  # Reduced min length
                do_sample=False,
                truncation=True  # Ensure truncation
            )[0]['summary_text']
        except Exception as e:
            print(f"Error in summarization: {e}")
            summary = description or "Summary generation failed"

        # For sentiment analysis, use a shortened version if content is too long
        try:
            # Ensure content is not too long for sentiment analysis
            analysis_content = ' '.join(content_words[:200])  # Use first 200 words

            # VADER sentiment (works with any length)
            vader_sentiment = self.vader_analyzer.polarity_scores(analysis_content)

            # HuggingFace sentiment with proper truncation
            hf_sentiment = self.hf_sentiment(
                analysis_content[:256],  # Further reduced length
                truncation=True
            )[0]
        except Exception as e:
            print(f"Sentiment analysis failed: {e}")
            vader_sentiment = {'compound': 0, 'pos': 0, 'neg': 0, 'neu': 1}
            hf_sentiment = {'label': 'NEU', 'score': 0.5}

        # Topic classification with shorter text
        try:
            topic_text = f"{title}. {' '.join(content_words[:100])}"  # Use title and first 100 words
            topics = self.classifier(
                topic_text,
                self.topic_categories,
                truncation=True
            )
        except Exception as e:
            print(f"Topic classification failed: {e}")
            topics = {
                'labels': ['Market Analysis'],
                'scores': [1.0]
            }

        # Calculate market sentiment score
        market_sentiment = self.calculate_market_sentiment_score(vader_sentiment, hf_sentiment)

        return {
            'title': title,
            'url': url,
            'summary': summary,
            'vader_sentiment': vader_sentiment,
            'huggingface_sentiment': hf_sentiment,
            'market_sentiment': market_sentiment,
            'topics': {
                'main_topic': topics['labels'][0],
                'confidence': topics['scores'][0],
                'all_topics': list(zip(topics['labels'], topics['scores']))
            },
            'processed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

    def create_visualizations(self, results):
        """Create interactive visualizations of the analysis results"""
        # 1. Market Sentiment Overview
        fig1 = go.Figure(go.Indicator(
            mode = "gauge+number",
            value = sum(r['market_sentiment']['score'] for r in results) / len(results),
            title = {'text': "Average Market Sentiment"},
            gauge = {
                'axis': {'range': [1, 10]},
                'steps': [
                    {'range': [1, 3], 'color': "red"},
                    {'range': [3, 5], 'color': "orange"},
                    {'range': [5, 7], 'color': "yellow"},
                    {'range': [7, 9], 'color': "lightgreen"},
                    {'range': [9, 10], 'color': "green"}
                ],
                'bar': {'color': "darkblue"}
            }
        ))

        # 2. Topic Distribution
        topic_data = []
        for result in results:
            for topic, score in result['topics']['all_topics']:
                topic_data.append({'topic': topic, 'confidence': score})

        topic_df = pd.DataFrame(topic_data)
        fig2 = px.bar(
            topic_df.groupby('topic')['confidence'].mean().reset_index(),
            x='topic',
            y='confidence',
            title='Topic Distribution'
        )

        # 3. Sentiment Comparison
        fig3 = make_subplots(rows=1, cols=2,
                            subplot_titles=('VADER Sentiment', 'HuggingFace Sentiment'))

        # VADER scatter
        fig3.add_trace(
            go.Scatter(
                x=list(range(len(results))),
                y=[r['vader_sentiment']['compound'] for r in results],
                mode='markers+lines',
                name='VADER'
            ),
            row=1, col=1
        )

        # HuggingFace scatter
        fig3.add_trace(
            go.Scatter(
                x=list(range(len(results))),
                y=[1 if r['huggingface_sentiment']['label'] == 'POS' else 0
                   for r in results],
                mode='markers+lines',
                name='HuggingFace'
            ),
            row=1, col=2
        )

        return {
            'market_sentiment_gauge': fig1,
            'topic_distribution': fig2,
            'sentiment_comparison': fig3
        }

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
from IPython.display import display, HTML
from getpass import getpass  # This will hide the API key when typing

print("Please enter your NewsAPI key:")
api_key = getpass("API Key: ")

# Configure your search parameters here
SEARCH_TOPIC = 'financial markets'  # You can change the topic
MAX_ARTICLES = 10  # You can change the number of articles

# Initialize and run analysis
analyzer = EnhancedFinancialNewsAnalyzer(api_key)
print(f"\nAnalyzing {MAX_ARTICLES} articles about '{SEARCH_TOPIC}'...")

# Get news and analyze
news_data = analyzer.get_news(SEARCH_TOPIC)
results = []

if news_data and 'articles' in news_data:
    for article in news_data['articles'][:MAX_ARTICLES]:
        result = analyzer.analyze_article(article)
        if result:
            results.append(result)
            print(f"\nProcessed: {result['title']}")
            print(f"Summary: {result['summary']}")
            print(f"Market Sentiment: {result['market_sentiment']['score']}/10 "
                  f"({result['market_sentiment']['category']})")
            print(f"Main Topic: {result['topics']['main_topic']} "
                  f"(confidence: {result['topics']['confidence']:.2f})")

# Create and display visualizations
if results:
    figs = analyzer.create_visualizations(results)

    # Display visualizations
    for name, fig in figs.items():
        fig.show()
else:
    print("\nNo articles were successfully analyzed. Please check your API key and search parameters.")

Please enter your NewsAPI key:
API Key: ··········
Loading models...


Device set to use cpu
Device set to use cpu
Device set to use cpu



Analyzing 10 articles about 'financial markets'...

Analyzing: Wealth strategies that used to be reserved for billionaires are becoming more accessible
URL: https://www.businessinsider.com/wealth-strategies-for-billionaires-are-becoming-more-accessible-2024-12


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processed: Wealth strategies that used to be reserved for billionaires are becoming more accessible
Summary: New tech is lowering the price of entry in fields like direct indexing and private markets. These personalized portfolios used to be out of reach.
Market Sentiment: 7.7/10 (Greed)
Main Topic: Technology Trends (confidence: 0.79)

Analyzing: Private credit firms are hot acquisition targets. As M&A ramps up next year, here are the firms likely to be bought.
URL: https://www.businessinsider.com/private-market-asset-management-deal-outlook-2024-12

Processed: Private credit firms are hot acquisition targets. As M&A ramps up next year, here are the firms likely to be bought.
Summary: Firms want more private market products to offer clients. Private credit firms with $30 billion to $70 billion in assets will be the firms to watch.
Market Sentiment: 6.8/10 (Greed)
Main Topic: Mergers & Acquisitions (confidence: 0.51)

Analyzing: AirPods to Be Made in India for the First Time Next Year