In [1]:
!pip install praw
!pip install newsapi-python
!pip install transformers
!pip install trafilatura



In [2]:
# System libraries
import os
import time
import re
import requests

# Data analysis libraries
import numpy as np
import pandas as pd

# Time handling
from datetime import datetime, timedelta

# API libraries
import praw
from newsapi import NewsApiClient

# Natural Language Processing and Machine Learning
from transformers import pipeline
import trafilatura


In [3]:
# -------------------------------------------
# 2) Reddit Data Collection
# -------------------------------------------
# Initialize Reddit API connection
reddit = praw.Reddit(
    client_id="28fE0w7nKpv349-WrrRv-w",
    client_secret="2L4owi5krxRJXzFVp_OFQib8xV6Z1w",
    user_agent="python:apple_sentiment_analysis:v1.0 (by THE ALGO TEAM)"
)

# Define the subreddit you want to access
subreddit_name = "aapl"
subreddit = reddit.subreddit(subreddit_name)

# Fetch submissions from the subreddit
# limit=1000 will try to pull up to 1000 new posts
print(f"Collecting posts from r/{subreddit_name}...")
posts_data = []

for submission in subreddit.new(limit=1000):
    submission_date = datetime.utcfromtimestamp(submission.created_utc)

    # Get the post text content
    post_text = submission.selftext if submission.selftext else "No content available"

    # Add post data to our collection
    posts_data.append({
        'title': submission.title,
        'text': post_text,
        'Date Created': submission_date,
        'author': submission.author.name if submission.author else 'Deleted',
        'upvotes': submission.score,
        'comments': submission.num_comments,
        'url': submission.url
    })

# Convert the list of posts to a DataFrame
posts_df = pd.DataFrame(posts_data)

# Sort by 'Date Created' in descending order (most recent first)
posts_df = posts_df.sort_values(by='Date Created', ascending=False)

# Filter posts that have real content (not the placeholder)
content_posts = posts_df[posts_df['text'] != "No content available"]

# Print statistics
print(f"Total posts collected: {len(posts_df)}")
print(f"Posts with content: {len(content_posts)}")

# Get exactly 100 posts with content (or all if less than 100)
max_posts = 100  # Set the limit to exactly 100
if len(content_posts) > max_posts:
    content_posts = content_posts.head(max_posts)  # Take only the first 100 posts
    print(f"Limited to {max_posts} posts with content")
else:
    print(f"Found {len(content_posts)} posts with content (less than 100)")

Collecting posts from r/aapl...


  submission_date = datetime.utcfromtimestamp(submission.created_utc)


Total posts collected: 963
Posts with content: 502
Limited to 100 posts with content


In [4]:
# -------------------------------------------
# 3) Sentiment Analysis on Reddit Text Posts
# -------------------------------------------

# Load the sentiment analysis model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
)

def get_sentiment(text):

    if not isinstance(text, str) or not text.strip():
        return ("neutral", 0.0)

    # Truncate text if it's longer than 512 characters
    max_length = 512
    if len(text) > max_length:
        text = text[:max_length]

    # Run text through the sentiment pipeline with truncation
    result = sentiment_pipeline(text, truncation=True)
    return (result[0]['label'], result[0]['score'])

# Apply sentiment analysis to both title and text
print("Analyzing sentiment for titles...")
content_posts['title_sentiment'] = content_posts['title'].apply(get_sentiment)

print("Analyzing sentiment for text content...")
content_posts['text_sentiment'] = content_posts['text'].apply(get_sentiment)

# Extract label and score for both title and text
content_posts['title_sentiment_label'] = content_posts['title_sentiment'].apply(lambda x: x[0])
content_posts['title_sentiment_score'] = content_posts['title_sentiment'].apply(lambda x: x[1])
content_posts['text_sentiment_label'] = content_posts['text_sentiment'].apply(lambda x: x[0])
content_posts['text_sentiment_score'] = content_posts['text_sentiment'].apply(lambda x: x[1])

# Calculate weighted sentiment score (60% title, 40% text)
# Convert sentiment labels to numeric values for calculation
def sentiment_to_value(label, score):
    if label == "positive":
        return score
    elif label == "negative":
        return -score
    else:  # neutral
        return 0.0

# Calculate weighted sentiment
title_weight = 0.10
text_weight = 0.90

content_posts['title_value'] = content_posts.apply(
    lambda row: sentiment_to_value(row['title_sentiment_label'], row['title_sentiment_score']),
    axis=1
)
content_posts['text_value'] = content_posts.apply(
    lambda row: sentiment_to_value(row['text_sentiment_label'], row['text_sentiment_score']),
    axis=1
)

# Calculate combined sentiment
content_posts['combined_sentiment_value'] = (
    content_posts['title_value'] * title_weight +
    content_posts['text_value'] * text_weight
)

# Determine combined sentiment label
def value_to_sentiment(value):
    if value > 0.5:
        return "positive"
    elif value < -0.5:
        return "negative"
    else:
        return "neutral"

content_posts['combined_sentiment_label'] = content_posts['combined_sentiment_value'].apply(value_to_sentiment)
content_posts['combined_sentiment_score'] = content_posts['combined_sentiment_value'].apply(abs)

# Clean up temporary columns
content_posts = content_posts.drop(['title_sentiment', 'text_sentiment', 'title_value', 'text_value'], axis=1)




Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Analyzing sentiment for titles...
Analyzing sentiment for text content...


In [5]:
# -------------------------------------------
# 4) Calculating a "Virality" Score
# -------------------------------------------

# Step 1: Compute a raw virality score
content_posts['virality'] = content_posts['upvotes'] + 0.3 * content_posts['comments']

# Step 2: Take the log norm
content_posts['virality_scaled'] = np.log1p(content_posts['virality']) / np.log1p(100)

# Step 3: If virality_scaled > 1, cap it at 1.
# Using clip method for cleaner code
content_posts['virality_scaled'] = content_posts['virality_scaled'].clip(upper=1.0)

# Step 5: Compute weighted sentiment using combined sentiment score
content_posts['sentiment_virality_score'] = (
    (content_posts['combined_sentiment_score'] * 0.65) +
    (content_posts['virality_scaled'] * 0.35 )
)

# Print summary statistics
print("\nVirality Score Analysis:")
print(f"Average virality score: {content_posts['virality_scaled'].mean():.4f}")
print(f"Average weighted sentiment: {content_posts['sentiment_virality_score'].mean():.4f}")

# Show the posts with highest virality
print("\nTop 5 Most Viral Posts:")
top_viral = content_posts.sort_values('virality_scaled', ascending=False).head(5)
for idx, row in top_viral.iterrows():
    print(f"Title: {row['title']}")
    print(f"Upvotes: {row['upvotes']}, Comments: {row['comments']}")
    print(f"Virality: {row['virality_scaled']:.4f}, Sentiment: {row['combined_sentiment_label']}")
    print("-" * 80)



Virality Score Analysis:
Average virality score: 0.4787
Average weighted sentiment: 0.4309

Top 5 Most Viral Posts:
Title: Upvote if HOLDING AAPL (that dip opportunity was REALLY good!)
Upvotes: 158, Comments: 14
Virality: 1.0000, Sentiment: neutral
--------------------------------------------------------------------------------
Title: Thank you, Apple
Upvotes: 102, Comments: 30
Virality: 1.0000, Sentiment: neutral
--------------------------------------------------------------------------------
Title: Upvote if still here on AAPL (Wanting to see how many are active here) [ Updated Technical Analysis post for AAPL ]
Upvotes: 98, Comments: 7
Virality: 1.0000, Sentiment: positive
--------------------------------------------------------------------------------
Title: Why Apple Stock Gained 30% Last Year
Upvotes: 114, Comments: 46
Virality: 1.0000, Sentiment: positive
--------------------------------------------------------------------------------
Title: Like or Upvote this POST if you thi

In [6]:
# Creat clean data frame for virality score

final_virality_df = pd.DataFrame({
    'title': content_posts['title'],
    'text': content_posts['text'],
    'Date Created': content_posts['Date Created'],
    'upvotes': content_posts['upvotes'],
    'comments': content_posts['comments'],
    'author': content_posts['author'],
    'url': content_posts['url'],
    'combined_sentiment_label': content_posts['combined_sentiment_label'],
    'sentiment_virality_score': content_posts['sentiment_virality_score'],
})


In [7]:

# NewsAPI configuration
api_key = "7a226dec8a994247863f12caa7bc8ad5"  # Replace with your actual NewsAPI key
base_url = "https://newsapi.org/v2/everything"

# Define trusted news sources
financial_sources = [
    "nytimes.com", "bbc.com", "reuters.com", "bloomberg.com", "ft.com",
    "wsj.com", "cnbc.com", "forbes.com", "marketwatch.com", "theverge.com",
    "techcrunch.com", "engadget.com", "cnet.com", "zdnet.com", 'finance.yahoo.com',
    'nasdaq.com',
]
sources_param = ",".join(financial_sources)

# The earliest date allowed according to the error message
api_earliest_date = "2025-01-25"
now = datetime.now()

def fetch_articles_with_criteria(query, date_from=None, date_to=None, source_domains=None, page=1):

    params = {
        "apiKey": api_key,
        "q": query,
        "language": "en",
        "sortBy": "publishedAt",
        "pageSize": 100,
        "page": page
    }

    if date_from:
        params["from"] = date_from
    if date_to:
        params["to"] = date_to
    if source_domains:
        params["domains"] = source_domains

    print(f"Fetching articles for query: {query} | Date range: {date_from} to {date_to} | Page: {page}")
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        data = response.json()
        articles = data.get("articles", [])
        print(f"Retrieved {len(articles)} articles for this query.")
        return articles
    else:
        print(f"Error fetching data: {response.status_code}")
        print(response.json())
        return []

# Split date ranges into multiple parts within the allowed period to maximize results
date_ranges = [
    # Full allowed period
    (api_earliest_date, now.strftime('%Y-%m-%d')),
    # First week
    (
        api_earliest_date,
        (datetime.strptime(api_earliest_date, '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')
    ),
    # Second week
    (
        (datetime.strptime(api_earliest_date, '%Y-%m-%d') + timedelta(days=8)).strftime('%Y-%m-%d'),
        (datetime.strptime(api_earliest_date, '%Y-%m-%d') + timedelta(days=15)).strftime('%Y-%m-%d')
    ),
    # Third week to now
    (
        (datetime.strptime(api_earliest_date, '%Y-%m-%d') + timedelta(days=16)).strftime('%Y-%m-%d'),
        now.strftime('%Y-%m-%d')
    )
]

# Expanded query variations related to Apple to get more results
search_queries = [
    "Apple AND iPhone",
    "Apple AND MacBook",
    "Apple AND Cook",
    "AAPL AND stock",
    "Apple AND revenue",
    "Apple AND product",
    "Apple AND Vision Pro",
    "Apple AND iOS",
    "Apple AND M3",
    "Apple AND AI",
    "Apple AND sales",
    "Apple AND quarterly",
    "AAPL AND market",
    "Apple AND investment",
    "Apple AND hardware",
    "Apple AND software",
    "Apple AND services",
    "Apple AND iPad",
    "Apple AND Watch",
    "Apple AND App Store",
    "Apple AND Music",
    "Apple AND TV+",
    "Apple AND AirPods",
    "Apple AND Silicon",
    "Apple AND privacy",
    "Apple AND HomePod",
    "Apple AND competitors"
]

def collect_articles(target_count=200):  # Increased target to 200 to ensure enough after deduplication

    all_articles = []
    seen_urls = set()

    # First attempt: try all queries and date ranges
    for query in search_queries:
        for date_from, date_to in date_ranges:
            if len(seen_urls) >= target_count:
                break

            articles = fetch_articles_with_criteria(query, date_from, date_to, sources_param)

            for article in articles:
                url = article.get("url", "")
                if url and url not in seen_urls:
                    all_articles.append(article)
                    seen_urls.add(url)

            time.sleep(0.5)  # Small delay to prevent rate limiting

    # Second attempt: try additional pages if we still need more articles
    if len(seen_urls) < target_count:
        print("\nTrying additional pages to get more articles...")

        for query in search_queries:  # Try all queries
            for page in range(2, 5):  # Try pages 2, 3, 4
                if len(seen_urls) >= target_count:
                    break

                articles = fetch_articles_with_criteria(
                    query,
                    api_earliest_date,
                    now.strftime('%Y-%m-%d'),
                    sources_param,
                    page
                )

                for article in articles:
                    url = article.get("url", "")
                    if url and url not in seen_urls:
                        all_articles.append(article)
                        seen_urls.add(url)

                time.sleep(0.5)

    return all_articles

# Collect articles - aiming for more than we need to account for duplicates
all_articles = collect_articles(target_count=200)
print(f"\nTotal articles collected: {len(all_articles)}")

# Filter articles that mention Apple in the title or description
apple_articles = []
for article in all_articles:
    title = article.get("title", "")
    description = article.get("description", "")

    # Case-insensitive search for "apple" or "aapl"
    if re.search(r'apple|aapl', title + " " + description, re.IGNORECASE):
        apple_articles.append(article)

print(f"Articles specifically about Apple: {len(apple_articles)}")

# Process and prepare data for DataFrame
articles_data = []
for article in apple_articles:
    title = article.get("title", "")
    description = article.get("description", "")
    url = article.get("url", "")
    published_at = article.get("publishedAt", "")
    source_name = article.get("source", {}).get("name", "")

    articles_data.append({
        "Title": title,
        "Description": description,
        "URL": url,
        "PublishedAt": published_at,
        "Source": source_name
    })

# Create DataFrame from collected data
df = pd.DataFrame(articles_data)
print(f"Processing all {len(df)} Apple-related articles")

def extract_full_text(article_url):

    try:
        downloaded = trafilatura.fetch_url(article_url)
        if downloaded:
            return trafilatura.extract(downloaded)
        return "[Failed to fetch content]"
    except Exception as e:
        return f"[Error fetching content: {e}]"

# Extract full content from each article
print("\nExtracting full article content...")
contents = []

for idx, row in df.iterrows():
    print(f"Processing article {idx + 1}/{len(df)}: {row['Title'][:50]}...")
    content = extract_full_text(row["URL"])
    contents.append(content)
    time.sleep(1)  # Delay to avoid overloading servers

# Add content to DataFrame
df["Content"] = contents

# Convert published date to datetime format
df["PublishedAt"] = pd.to_datetime(df["PublishedAt"])

# Filter articles that don't have valid content
def is_valid_content(text):

    if not text:
        return False
    if text.startswith("[Failed") or "Error fetching content" in text:
        return False
    if len(text.strip()) < 100:  # Ensure content has meaningful length
        return False
    return True

# Filter for valid content
valid_df = df[df["Content"].apply(is_valid_content)].copy()
print(f"\nArticles with valid content: {len(valid_df)}")

# Remove duplicate content
unique_content_df = valid_df.drop_duplicates(subset=['Content'])
print(f"Articles after removing content duplicates: {len(unique_content_df)}")

# If we have more than 100 unique articles, limit to 100
if len(unique_content_df) > 100:
    final_df = unique_content_df.iloc[:100]
    print(f"Trimmed to 100 unique articles")
else:
    final_df = unique_content_df
    print(f"Using all {len(unique_content_df)} unique articles")

print(f"Final DataFrame shape: {final_df.shape}")
print("\nSample of final articles:")
print(final_df[["Title", "Source", "PublishedAt"]].head())




Fetching articles for query: Apple AND iPhone | Date range: 2025-01-25 to 2025-02-27 | Page: 1
Error fetching data: 426
{'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-01-26, but you have requested 2025-01-25. You may need to upgrade to a paid plan.'}
Fetching articles for query: Apple AND iPhone | Date range: 2025-01-25 to 2025-02-01 | Page: 1
Error fetching data: 426
{'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-01-26, but you have requested 2025-01-25. You may need to upgrade to a paid plan.'}
Fetching articles for query: Apple AND iPhone | Date range: 2025-02-02 to 2025-02-09 | Page: 1
Retrieved 94 articles for this query.
Fetching articles for query: Apple AND iPhone | Date range: 2025-02-10 to 2025-02-27 | Page: 1
Retrieved 

In [8]:

# Initialize the sentiment analysis pipeline with the financial news model
print("\nLoading sentiment analysis model...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
)

# Function to analyze sentiment using the transformer model
def analyze_sentiment(text):
    if not isinstance(text, str) or not text.strip():
        return {'label': 'neutral', 'score': 0.5}

    # Handle long texts due to model limitations
    max_length = 512 * 4  # Characters, roughly estimating 4 chars per token
    if len(text) > max_length:
        text = text[:max_length]

    try:
        result = sentiment_pipeline(text)[0]
        return result
    except Exception as e:
        print(f"Error analyzing text: {e}")
        return {'label': 'neutral', 'score': 0.5}

# Apply sentiment analysis to different parts of the articles
print("\nAnalyzing sentiment of article content, titles, and descriptions...")

content_sentiments = []
title_sentiments = []
description_sentiments = []

# Process articles
for i, row in final_df.iterrows():
    print(f"Analyzing article {i+1}/{len(final_df)}...")

    # Analyze full content
    content_sentiment = analyze_sentiment(row['Content'])
    content_sentiments.append(content_sentiment)

    # Analyze title
    title_sentiment = analyze_sentiment(row['Title'])
    title_sentiments.append(title_sentiment)

    # Analyze description/summary
    description_sentiment = analyze_sentiment(row['Description'])
    description_sentiments.append(description_sentiment)

    time.sleep(0.1)  # Small pause to prevent overloading

# Add sentiment results to DataFrame
# Content sentiment
final_df['content_sentiment_label'] = [s['label'] for s in content_sentiments]
final_df['content_sentiment_score'] = [s['score'] for s in content_sentiments]

# Title sentiment
final_df['title_sentiment_label'] = [s['label'] for s in title_sentiments]
final_df['title_sentiment_score'] = [s['score'] for s in title_sentiments]

# Description sentiment
final_df['desc_sentiment_label'] = [s['label'] for s in description_sentiments]
final_df['desc_sentiment_score'] = [s['score'] for s in description_sentiments]

# Create numeric sentiment values (-1, 0, 1)
sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
final_df['content_sentiment_value'] = final_df['content_sentiment_label'].map(sentiment_map)
final_df['title_sentiment_value'] = final_df['title_sentiment_label'].map(sentiment_map)
final_df['desc_sentiment_value'] = final_df['desc_sentiment_label'].map(sentiment_map)

# Display sentiment distribution for content
content_sentiment_counts = final_df['content_sentiment_label'].value_counts()
total_articles = len(final_df)
print("\n=== CONTENT SENTIMENT ANALYSIS ===")
print("Sentiment Distribution:")
print(content_sentiment_counts)
print(f"Positive: {content_sentiment_counts.get('positive', 0)} articles ({content_sentiment_counts.get('positive', 0)/total_articles*100:.1f}%)")
print(f"Neutral: {content_sentiment_counts.get('neutral', 0)} articles ({content_sentiment_counts.get('neutral', 0)/total_articles*100:.1f}%)")
print(f"Negative: {content_sentiment_counts.get('negative', 0)} articles ({content_sentiment_counts.get('negative', 0)/total_articles*100:.1f}%)")

# Display sentiment distribution for titles
title_sentiment_counts = final_df['title_sentiment_label'].value_counts()
print("\n=== TITLE SENTIMENT ANALYSIS ===")
print("Sentiment Distribution:")
print(title_sentiment_counts)
print(f"Positive: {title_sentiment_counts.get('positive', 0)} articles ({title_sentiment_counts.get('positive', 0)/total_articles*100:.1f}%)")
print(f"Neutral: {title_sentiment_counts.get('neutral', 0)} articles ({title_sentiment_counts.get('neutral', 0)/total_articles*100:.1f}%)")
print(f"Negative: {title_sentiment_counts.get('negative', 0)} articles ({title_sentiment_counts.get('negative', 0)/total_articles*100:.1f}%)")

# Display sentiment distribution for descriptions
desc_sentiment_counts = final_df['desc_sentiment_label'].value_counts()
print("\n=== DESCRIPTION SENTIMENT ANALYSIS ===")
print("Sentiment Distribution:")
print(desc_sentiment_counts)
print(f"Positive: {desc_sentiment_counts.get('positive', 0)} articles ({desc_sentiment_counts.get('positive', 0)/total_articles*100:.1f}%)")
print(f"Neutral: {desc_sentiment_counts.get('neutral', 0)} articles ({desc_sentiment_counts.get('neutral', 0)/total_articles*100:.1f}%)")
print(f"Negative: {desc_sentiment_counts.get('negative', 0)} articles ({desc_sentiment_counts.get('negative', 0)/total_articles*100:.1f}%)")

# Compare sentiment agreement between content and title
agreement_count = sum(final_df['content_sentiment_label'] == final_df['title_sentiment_label'])
agreement_percentage = agreement_count / total_articles * 100

print("\n=== SENTIMENT AGREEMENT ANALYSIS ===")
print(f"Content and title sentiments agree in {agreement_count} articles ({agreement_percentage:.1f}%)")

# Analyze cases where content and title sentiments differ
print("\nCases where content and title sentiments differ:")
different_sentiment = final_df[final_df['content_sentiment_label'] != final_df['title_sentiment_label']]
disagreement_counts = different_sentiment.groupby(['title_sentiment_label', 'content_sentiment_label']).size()
print(disagreement_counts)

# Top articles with different sentiment between content and title
print("\nExamples of articles with sentiment differences between content and title:")
for i, (_, row) in enumerate(different_sentiment.head(5).iterrows(), 1):
    print(f"{i}. Title: {row['Title']}")
    print(f"   Title Sentiment: {row['title_sentiment_label']} (score: {row['title_sentiment_score']:.2f})")
    print(f"   Content Sentiment: {row['content_sentiment_label']} (score: {row['content_sentiment_score']:.2f})")
    print()

print("Sentiment analysis completed!")





Loading sentiment analysis model...


Device set to use cuda:0



Analyzing sentiment of article content, titles, and descriptions...
Analyzing article 1/100...
Analyzing article 2/100...
Analyzing article 3/100...
Analyzing article 4/100...
Analyzing article 5/100...
Analyzing article 6/100...
Analyzing article 7/100...
Analyzing article 8/100...
Analyzing article 9/100...
Analyzing article 10/100...
Analyzing article 12/100...
Analyzing article 13/100...
Analyzing article 14/100...
Analyzing article 16/100...
Analyzing article 17/100...
Analyzing article 18/100...
Analyzing article 19/100...
Analyzing article 20/100...


Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


Analyzing article 21/100...
Analyzing article 22/100...
Error analyzing text: The expanded size of the tensor (526) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 526].  Tensor sizes: [1, 514]
Analyzing article 23/100...
Analyzing article 25/100...
Analyzing article 26/100...
Analyzing article 27/100...
Analyzing article 28/100...
Analyzing article 29/100...
Analyzing article 33/100...
Analyzing article 34/100...
Analyzing article 35/100...
Analyzing article 36/100...
Analyzing article 37/100...
Analyzing article 38/100...
Analyzing article 40/100...
Analyzing article 41/100...
Analyzing article 42/100...
Analyzing article 43/100...
Analyzing article 44/100...
Analyzing article 47/100...
Analyzing article 48/100...
Analyzing article 49/100...
Analyzing article 50/100...
Analyzing article 51/100...
Analyzing article 52/100...
Analyzing article 53/100...
Analyzing article 54/100...
Analyzing article 55/100...
Analyzing article 56/100...
Analyzing arti

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['content_sentiment_label'] = [s['label'] for s in content_sentiments]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['content_sentiment_score'] = [s['score'] for s in content_sentiments]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['title_sentiment_label'] = [s['label']

In [9]:
# ====== Weighted Sentiment Analysis ======

# Calculate a weighted sentiment score for each article
print("\nCalculating weighted sentiment scores...")

# Define weights for different parts of the article
CONTENT_WEIGHT = 0.75  # Main content has the highest weight
TITLE_WEIGHT = 0.10    # Title has significant impact
DESC_WEIGHT = 0.15     # Description has the smallest weight

# Calculate weighted score combining sentiment values and confidence scores
final_df['weighted_sentiment_score'] = (
    final_df['content_sentiment_value'] * final_df['content_sentiment_score'] * CONTENT_WEIGHT +
    final_df['title_sentiment_value'] * final_df['title_sentiment_score'] * TITLE_WEIGHT +
    final_df['desc_sentiment_value'] * final_df['desc_sentiment_score'] * DESC_WEIGHT
)

# Define sentiment categories based on the weighted score
def categorize_weighted_sentiment(score):
    if score >= 0.75:
        return "Very Bullish"
    elif score >= 0.5:
        return "Bullish"
    elif score > -0.5:
        return "Neutral"
    elif score > -0.75:
        return "Bearish"
    else:
        return "Very Bearish"

# Categorize the weighted sentiment scores
final_df['weighted_sentiment_category'] = final_df['weighted_sentiment_score'].apply(categorize_weighted_sentiment)

# Display results
print("\n=== WEIGHTED SENTIMENT ANALYSIS ===")

# Show distribution of weighted sentiment categories
weighted_sentiment_counts = final_df['weighted_sentiment_category'].value_counts()
print("Weighted Sentiment Distribution:")
print(weighted_sentiment_counts)

# Calculate percentage for each category
for category in ["Very Bullish", "Bullish", "Neutral", "Bearish", "Very Bearish"]:
    count = weighted_sentiment_counts.get(category, 0)
    percentage = count / len(final_df) * 100
    print(f"{category}: {count} articles ({percentage:.1f}%)")




Calculating weighted sentiment scores...

=== WEIGHTED SENTIMENT ANALYSIS ===
Weighted Sentiment Distribution:
weighted_sentiment_category
Neutral         66
Bullish         14
Very Bullish     8
Bearish          6
Very Bearish     6
Name: count, dtype: int64
Very Bullish: 8 articles (8.0%)
Bullish: 14 articles (14.0%)
Neutral: 66 articles (66.0%)
Bearish: 6 articles (6.0%)
Very Bearish: 6 articles (6.0%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['weighted_sentiment_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['weighted_sentiment_category'] = final_df['weighted_sentiment_score'].apply(categorize_weighted_sentiment)
