# 🧠 Task 2 – Quantitative Sentiment–Price Analysis  
📘 Version: 2025-06-01  

This notebook initiates exploratory and quantitative analysis of stock price movements in relation to sentiment signals extracted from financial news.  
It supports the computation of technical indicators, return alignment, and visual diagnostics.

---

### This notebook covers:
- Modular loading of historical stock price data  
- Basic technical indicator computation using TA-Lib  
- Alignment of stock data with enriched sentiment signals  
- Preparation for downstream correlation analysis


In [3]:
# 🗂️ Project Directory Setup

#To ensure smooth path handling across environments, we standardize the working directory and define data locations relative to the project root.

#- **Sentiment Dataset**: `data/cleaned_headlines_sample.csv`
#- **Stock Price Data**: `data/yfinance_data/*.csv`

#All modules are assumed to be accessible from the `src/` folder when running the notebook from the root.

# 🛠 Standardize working directory
import os

if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print("📂 Working directory is now:", os.getcwd())

# 🔎 Confirm required sentiment and price files exist
sentiment_file = "data/cleaned_headlines_sample.csv"
price_dir = "data/yfinance_data"
expected_tickers = ["AAPL", "AMZN", "GOOG", "META", "MSFT", "NVDA", "TSLA"]

# Check sentiment file
print("📄 Sentiment file found ✅" if os.path.exists(sentiment_file) else f"❌ Sentiment file missing: {sentiment_file}")

# Check stock price files
for ticker in expected_tickers:
    price_path = os.path.join(price_dir, f"{ticker}_historical_data.csv")
    status = "✅" if os.path.exists(price_path) else "❌"
    print(f"{status} {ticker} data file: {price_path}")


📂 Working directory is now: c:\Users\admin\Documents\GIT Repositories\b5w1-stock-market-challenge
📄 Sentiment file found ✅
✅ AAPL data file: data/yfinance_data\AAPL_historical_data.csv
✅ AMZN data file: data/yfinance_data\AMZN_historical_data.csv
✅ GOOG data file: data/yfinance_data\GOOG_historical_data.csv
✅ META data file: data/yfinance_data\META_historical_data.csv
✅ MSFT data file: data/yfinance_data\MSFT_historical_data.csv
✅ NVDA data file: data/yfinance_data\NVDA_historical_data.csv
✅ TSLA data file: data/yfinance_data\TSLA_historical_data.csv


In [6]:
# ------------------------------------------------------------------------------
# 📦 Core Libraries
# ------------------------------------------------------------------------------
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------------------------------------
# 📈 Financial Analysis & Signal Tools
# ------------------------------------------------------------------------------
# TA-Lib for technical indicators (MACD, RSI, etc.)
import talib

# For future NLP-driven sentiment joins
from rapidfuzz import fuzz, process  # Optional: for fuzzy ticker/event joins

# ------------------------------------------------------------------------------
# 🔧 Display & Notebook Config
# ------------------------------------------------------------------------------
from IPython.display import display

# ------------------------------------------------------------------------------
# 🛠️ Module Reloading for Dev Iteration
# ------------------------------------------------------------------------------
import importlib
import src.price_data_loader
import src.news_loader

importlib.reload(src.price_data_loader)
importlib.reload(src.news_loader)

from src.price_data_loader import PriceDataLoader
from src.news_loader import NewsDataLoader

# ------------------------------------------------------------------------------
# 📋 Display Settings
# ------------------------------------------------------------------------------
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)
sns.set(style="whitegrid")

In [4]:
# ------------------------------------------------------------------------------
# 🔁 Reload custom modules for live development
# ------------------------------------------------------------------------------

import importlib
import src.price_data_loader
import src.news_loader

# Reload to ensure any updates to the file are picked up
importlib.reload(src.price_data_loader)
importlib.reload(src.news_loader)

# Bring updated class definitions into scope
from src.price_data_loader import PriceDataLoader
from src.news_loader import NewsDataLoader

In [5]:
# ------------------------------------------------------------------------------
# 📥 Load Stock Price Data & Enriched News Sentiment Dataset
# ------------------------------------------------------------------------------

from src.price_data_loader import PriceDataLoader
from src.news_loader import NewsDataLoader  # ✅ Reuse Task 1 loader

# Define paths to stock price directory and sentiment CSV file
PRICE_DATA_DIR = "data/yfinance_data"
SENTIMENT_DATA_PATH = "data/cleaned_headlines_sample.csv"

# Initialize and load all stock price CSVs in the directory
try:
    price_loader = PriceDataLoader(
        folder_path=PRICE_DATA_DIR, verbose=True
    )  # ✅ Correct param
    prices_df = price_loader.load_all()
    print("✅ All stock price files loaded successfully.")
except Exception as e:
    print(f"❌ Failed to load stock price data: {e}")
    prices_df = None  # gracefully degrade

# Initialize and load the enriched sentiment dataset
try:
    sentiment_loader = NewsDataLoader(
        path=SENTIMENT_DATA_PATH, parse_dates=["cleaned_date"], verbose=True
    )
    sentiment_df = sentiment_loader.load()
    print("✅ Enriched sentiment dataset loaded successfully.")
except Exception as e:
    print(f"❌ Failed to load sentiment dataset: {e}")
    sentiment_df = None  # gracefully degrade


📄 Loaded: AAPL | Path: data/yfinance_data\AAPL_historical_data.csv
📦 Encoding used: utf-8
🔢 Rows: 10,998 | Columns: 10
🗓️ Date range: 1980-12-12 → 2024-07-30
🧪 Columns: Date, Open, High, Low, Close, Adj Close, Volume, Dividends, Stock Splits, cleaned_date


📄 Loaded: AMZN | Path: data/yfinance_data\AMZN_historical_data.csv
📦 Encoding used: utf-8
🔢 Rows: 6,846 | Columns: 10
🗓️ Date range: 1997-05-15 → 2024-07-30
🧪 Columns: Date, Open, High, Low, Close, Adj Close, Volume, Dividends, Stock Splits, cleaned_date


📄 Loaded: GOOG | Path: data/yfinance_data\GOOG_historical_data.csv
📦 Encoding used: utf-8
🔢 Rows: 5,020 | Columns: 10
🗓️ Date range: 2004-08-19 → 2024-07-30
🧪 Columns: Date, Open, High, Low, Close, Adj Close, Volume, Dividends, Stock Splits, cleaned_date


📄 Loaded: META | Path: data/yfinance_data\META_historical_data.csv
📦 Encoding used: utf-8
🔢 Rows: 2,926 | Columns: 10
🗓️ Date range: 2012-12-12 → 2024-07-30
🧪 Columns: Date, Open, High, Low, Close, Adj Close, Volume, Dividends,


📄 File loaded: data/cleaned_headlines_sample.csv
📦 Encoding used: utf-8
🔢 Rows: 1,407,328 | Columns: 23
🧪 Columns: Unnamed: 0, headline, url, publisher, date, stock, cleaned_date, cleaned_headline, headline_length, word_count, publisher_domain, date_only, hour, day_of_week, is_weekend, bullish_flag, bearish_flag, vader_scores, vader_compound, vader_sentiment, textblob_polarity, ensemble_sentiment, ensemble_confidence

✅ Enriched sentiment dataset loaded successfully.


## ✅ Initial Sanity Checks – Structure, Missingness, Duplicates

Before diving into analysis, we run a quick diagnostic to verify data health:

- Preview the top 3 rows for structural validation.
- Print full column data types to confirm schema expectations.
- Check for missing values across all columns.
- Identify and count any fully duplicated rows.

These checks help detect formatting issues or corrupt entries early, ensuring downstream feature engineering operates on clean and consistent data.


In [7]:
# ------------------------------------------------------------------------------
# ✅ Initial Sanity Check – Stock Price and Sentiment Datasets
# ------------------------------------------------------------------------------


def run_sanity_check(df, name="DataFrame"):
    """
    Runs basic data quality checks on a given DataFrame.

    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame to check.
    name : str
        A label for the dataset (used in printouts).
    """
    print(f"\n🧪 Sanity Check – {name}")
    print("-" * 60)

    if df is not None:
        # Show preview
        display(df.head(3))

        # Column types
        print("🧬 Column Data Types:")
        print(df.dtypes)

        # Nulls
        print("\n🔍 Missing Value Summary:")
        missing = df.isna().sum()
        print(
            missing[missing > 0] if missing.any() else "✅ No missing values detected."
        )

        # Duplicates
        duplicate_count = df.duplicated().sum()
        if duplicate_count > 0:
            print(f"⚠️ Found {duplicate_count:,} duplicate rows.")
        else:
            print("✅ No duplicate rows found.")
    else:
        print(f"🚫 Skipping {name} – DataFrame not loaded.")


# Run checks for both datasets
run_sanity_check(prices_df, name="Stock Price Data")
run_sanity_check(sentiment_df, name="Enriched Sentiment Data")


🧪 Sanity Check – Stock Price Data
------------------------------------------------------------


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,cleaned_date,ticker
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0,1980-12-12,AAPL
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.093781,175884800,0.0,0.0,1980-12-15,AAPL
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0,1980-12-16,AAPL


🧬 Column Data Types:
Date                    object
Open                   float64
High                   float64
Low                    float64
Close                  float64
Adj Close              float64
Volume                   int64
Dividends              float64
Stock Splits           float64
cleaned_date    datetime64[ns]
ticker                  object
dtype: object

🔍 Missing Value Summary:
✅ No missing values detected.
✅ No duplicate rows found.

🧪 Sanity Check – Enriched Sentiment Data
------------------------------------------------------------


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,cleaned_date,cleaned_headline,headline_length,word_count,publisher_domain,date_only,hour,day_of_week,is_weekend,bullish_flag,bearish_flag,vader_scores,vader_compound,vader_sentiment,textblob_polarity,ensemble_sentiment,ensemble_confidence
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A,2020-06-05,stocks hit 52week highs friday,39,7,benzinga.com,2020-06-04,20,Thursday,False,True,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral,0.0,neutral,0.5
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A,2020-06-03,stocks hit 52week highs wednesday,42,7,benzinga.com,2020-06-02,20,Tuesday,False,True,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral,0.0,neutral,0.5
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A,2020-05-26,71 biggest movers friday,29,5,benzinga.com,2020-05-25,20,Monday,False,False,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,neutral,0.0,neutral,0.5


🧬 Column Data Types:
Unnamed: 0                      int64
headline                       object
url                            object
publisher                      object
date                           object
stock                          object
cleaned_date           datetime64[ns]
cleaned_headline               object
headline_length                 int64
word_count                      int64
publisher_domain               object
date_only                      object
hour                            int64
day_of_week                    object
is_weekend                       bool
bullish_flag                     bool
bearish_flag                     bool
vader_scores                   object
vader_compound                float64
vader_sentiment                object
textblob_polarity             float64
ensemble_sentiment             object
ensemble_confidence           float64
dtype: object

🔍 Missing Value Summary:
cleaned_headline    2
dtype: int64
✅ No duplicate rows found.


# 🔗 Sentiment–Price Signal Alignment

This section performs a robust alignment between financial news sentiment signals and historical stock price data. The alignment pipeline includes:

### 🧮 Key Logic:

- **Sentiment Encoding**: Converts `ensemble_sentiment` (bullish, neutral, bearish) into a normalized numerical score in the range [-1, 1].

- **Exponential Time Decay**: When multiple headlines occur on the same day for a stock, sentiment signals are weighted using an **exponential decay function**, prioritizing more recent headlines. The decay factor (λ) controls how aggressively past signals are discounted.

- **Return Computation**:
  - `return_t`: Daily percentage change in closing prices.
  - `return_t+1`: Forward return for next-day modeling tasks.

- **Final Merge**: Sentiment and return data are joined on both `ticker` and `cleaned_date`, producing a rich dataset for correlation analysis or signal evaluation.

### 📦 Output:
A merged DataFrame containing:
- `ticker`, `cleaned_date`
- OHLCV price data (`Open`, `Close`, etc.)
- `weighted_sentiment` (decayed score)
- `return_t`, `return_t+1` (realized returns)


In [12]:
# ------------------------------------------------------------------------------
# 🧾 Standardize sentiment_df for alignment
# ------------------------------------------------------------------------------
# Rename 'stock' column to 'ticker' for consistency with price_df
sentiment_df.rename(columns={"stock": "ticker"}, inplace=True)

# ------------------------------------------------------------------------------
# 🔗 Align Enriched Sentiment with Price Data using Exponential Decay
# ------------------------------------------------------------------------------
from src.sentiment_return_aligner import SentimentReturnAligner

try:
    # Initialize the aligner with decay factor and verbosity
    aligner = SentimentReturnAligner(
        price_df=prices_df,
        sentiment_df=sentiment_df,
        decay_lambda=0.7,  # Strong exponential decay
        verbose=True,
    )

    # Run alignment process to merge and enrich sentiment–price dataset
    aligned_df = aligner.align()

    # Show preview of final merged dataset
    display(aligned_df.head(5))
    print("✅ Sentiment and return data successfully aligned.")

except Exception as e:
    print(f"❌ Failed to align sentiment and price data: {e}")
    aligned_df = None  # fallback in case of error

✅ Sentiment labels converted to numeric scale [-1, 0, 1]
📉 Exponential decay applied with λ = 0.7
📈 Returns computed: current (t), forward (t+1)
❌ Failed to align sentiment and price data: 'ticker'


------------------------------------------------------------------------------
📐 Technical Indicators – RSI, MACD, ATR, SMA, EMA
------------------------------------------------------------------------------

We now extend the sentiment–return aligned dataset with core technical indicators using **TA-Lib**.

Indicators added:
- **SMA (14-day)** – Simple moving average
- **EMA (14-day)** – Exponential moving average
- **RSI (14-day)** – Relative strength index
- **MACD** and **MACD signal** – Momentum indicators
- **ATR (14-day)** – Average true range (volatility)

These indicators support downstream predictive modeling and correlation testing by capturing trend, momentum, and volatility patterns for each stock.


In [11]:
# ------------------------------------------------------------------------------
# 🧮 Compute Technical Indicators on Aligned Dataset
# ------------------------------------------------------------------------------

from src.technical_indicator_calculator import TechnicalIndicatorCalculator

try:
    # Initialize indicator engine
    indicator_calc = TechnicalIndicatorCalculator(
        df=aligned_df,
        ticker_col="ticker",
        date_col="cleaned_date",
        verbose=True,
    )

    # Add RSI, SMA, EMA, MACD, ATR
    enriched_df = indicator_calc.add_indicators()

    # Show result sample
    display(enriched_df.head(5))
    print("✅ Technical indicators computed and added.")

except Exception as e:
    print(f"❌ Failed to compute technical indicators: {e}")
    enriched_df = None  # fallback in case of failure

❌ Failed to compute technical indicators: 'NoneType' object has no attribute 'copy'


# 📈 Signal Visual Diagnostics

This section visualizes the relationship between sentiment scores, stock price movements, and technical indicators for exploratory signal analysis. It includes:

- 🧠 **Sentiment vs. Price Trends**: Track weighted sentiment overlays on adjusted closing prices.
- 🎯 **Sentiment–Return Scatter**: Examine correlation between sentiment and next-day returns.
- 🛠️ **Technical Indicators**: Plot SMA, EMA, RSI, and MACD per ticker.

Use this to validate signal strength, detect lags, and identify alignment with market behaviors.


In [None]:
# ------------------------------------------------------------------------------
# 📊 Visualize Signals for Diagnostic Insights
# ------------------------------------------------------------------------------

from src.signal_visualizer import SignalVisualizer

# Initialize the visualizer with the enriched dataset
visualizer = SignalVisualizer(df=enriched_df)

# Choose a stock ticker to analyze (example: "AAPL", "GOOG", etc.)
selected_ticker = "AAPL"

# Plot sentiment vs. price
visualizer.plot_sentiment_vs_price(ticker=selected_ticker)

# Plot scatter of sentiment vs. next-day return
visualizer.plot_sentiment_return_scatter(ticker=selected_ticker)

# Plot technical indicators (SMA, EMA, RSI, MACD)
visualizer.plot_technical_indicators(ticker=selected_ticker)