In [1]:
# =============================================================================
# TASK 3: Sentiment Analysis & Correlation with Stock Returns
# KAIM Week 1 - Financial Sentiment Challenge
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from scipy import stats
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("=" * 50)
print("TASK 3: SENTIMENT & CORRELATION ANALYSIS")
print("=" * 50)
print(f"\n✓ Libraries imported successfully!")
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

TASK 3: SENTIMENT & CORRELATION ANALYSIS

✓ Libraries imported successfully!
Analysis Date: 2025-11-22 23:26


#### Sentiment Analysis

In [7]:
# =============================================================================
# STEP 1: Load Data and Perform Sentiment Analysis
# =============================================================================

# Load news data
news_df = pd.read_csv('../data/newsData/raw_analyst_ratings.csv')  # Adjust path if needed

# Parse dates. errors='coerce' converts every value that fails to parse into
# NaT instead of raising, so the failures are counted and reported below;
# otherwise rows can silently drop out of any later date-based step.
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
news_df['date_only'] = news_df['date'].dt.date

# Counts both originally-missing dates and values that failed to parse.
n_missing_dates = int(news_df['date'].isna().sum())

print("DATA LOADED:")
print(f"  Total articles: {len(news_df):,}")
if n_missing_dates < len(news_df):
    # min()/max() skip NaT, so this is the range of the successfully parsed rows only.
    print(f"  Date range: {news_df['date'].min().date()} to {news_df['date'].max().date()}")
else:
    # Empty frame or no parseable date: min() would be NaT and .date() would fail.
    print("  Date range: n/a (no parseable dates)")
print(f"  Unique stocks: {news_df['stock'].nunique()}")
if n_missing_dates:
    # NOTE(review): a large share here usually means the column mixes timestamp
    # formats (e.g. with and without a UTC offset). If so, consider
    # pd.to_datetime(..., format='mixed', utc=True) on pandas >= 2.0 -- confirm
    # against the raw file before changing the parsing.
    print(f"  ⚠ Missing/unparseable dates: {n_missing_dates:,} "
          f"({n_missing_dates / len(news_df):.1%} of rows)")

DATA LOADED:
  Total articles: 1,407,328
  Date range: 2011-04-27 to 2020-06-11
  Unique stocks: 6204


In [None]:
# Sentiment Analysis Function
def get_sentiment(text) -> float:
    """
    Calculate sentiment polarity using TextBlob.

    Args:
        text: Headline to score. Non-string values are converted with str().
            Missing values (None or float NaN) are treated as having no
            sentiment and are not passed to TextBlob.

    Returns:
        float between -1 (negative) and +1 (positive); 0.0 for missing input
        or when TextBlob fails on the text.
    """
    # Missing headlines: score as neutral instead of analysing the literal
    # strings "None" / "nan" that str() would produce.
    if text is None or (isinstance(text, float) and text != text):
        return 0.0
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort: one bad headline must not abort the whole run. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # through, so interrupting the kernel actually stops a long .apply().
        return 0.0

def categorize_sentiment(score, threshold=0.1):
    """
    Categorize a sentiment score into Positive/Neutral/Negative.

    Args:
        score: Sentiment polarity to classify.
        threshold: Width of the neutral band around zero. Scores strictly
            above +threshold are 'Positive', strictly below -threshold are
            'Negative'. Expected to be non-negative. Defaults to 0.1.

    Returns:
        'Positive', 'Negative' or 'Neutral'. Scores exactly on a boundary, and
        NaN (for which both comparisons are False), are 'Neutral'.
    """
    if score > threshold:
        return 'Positive'
    if score < -threshold:
        return 'Negative'
    return 'Neutral'

# Score every headline, then bucket the scores into categories
print("\nCalculating sentiment scores...")
print("(This may take 1-2 minutes for large datasets)")

news_df['sentiment'] = news_df['headline'].apply(get_sentiment)
news_df['sentiment_category'] = news_df['sentiment'].apply(categorize_sentiment)

# Summary: category counts followed by descriptive statistics of the raw scores
section_rule = "=" * 40
scores = news_df['sentiment']

print("\n✓ SENTIMENT ANALYSIS COMPLETE!")
print("\n" + section_rule)
print("SENTIMENT DISTRIBUTION:")
print(section_rule)
print(news_df['sentiment_category'].value_counts())
print("\nSentiment Statistics:")
print(f"  Mean:   {scores.mean():.4f}")
print(f"  Median: {scores.median():.4f}")
print(f"  Std:    {scores.std():.4f}")
print(f"  Min:    {scores.min():.4f}")
print(f"  Max:    {scores.max():.4f}")


Calculating sentiment scores...
(This may take 1-2 minutes for large datasets)
