In [1]:
# 04_sentiment_analysis.ipynb
# Purpose: Apply FinBERT sentiment analysis models to JPM data
# Models: yiyanghkust/finbert-tone, ProsusAI/finbert
# Input: processed_jpm_q1_2025_df, processed_jpm_q2_2025_df, processed_jpm_multi_2025_df
# Output: sentiment_results with multi-level analysis

## Import Libraries

import pandas as pd
import numpy as np
import json
import torch
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Hugging Face transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch.nn.functional as F

# Progress tracking
from tqdm import tqdm
tqdm.pandas()

# Location A: Google Drive (Primary drive)
from google.colab import drive
drive.mount("/content/drive")

# Load configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r") as f:
    config = json.load(f)

SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

# Set seeds for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)

print(f"Sentiment analysis for bank: {BANK_CODE.upper()}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")


Mounted at /content/drive
Sentiment analysis for bank: JPM
Device: CPU


In [2]:
## Define Paths

processed_data_path = drive_base / "data/processed/jpm"
clean_data_path = drive_base / "data/clean/jpm"
results_sentiment_path = drive_base / "results/sentiment/jpm"
models_path = drive_base / "models"

# Ensure directories exist
results_sentiment_path.mkdir(parents=True, exist_ok=True)
models_path.mkdir(parents=True, exist_ok=True)


In [4]:
## Load Processed Data

def load_processed_dataset(filename: str) -> pd.DataFrame:
    """Load processed dataset with error handling."""
    file_path = processed_data_path / filename

    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return None

    try:
        df = pd.read_csv(file_path)
        print(f"✓ Loaded {filename}: {df.shape}")
        return df
    except Exception as e:
        print(f"❌ Error loading {filename}: {str(e)}")
        return None

print("Loading processed datasets...")
processed_jpm_q1_2025_df = load_processed_dataset("processed_jpm_q1_2025.csv")
processed_jpm_q2_2025_df = load_processed_dataset("processed_jpm_q2_2025.csv")
processed_jpm_multi_2025_df = load_processed_dataset("processed_jpm_multi_2025.csv")

Loading processed datasets...
✓ Loaded processed_jpm_q1_2025.csv: (578, 9)
✓ Loaded processed_jpm_q2_2025.csv: (532, 9)
✓ Loaded processed_jpm_multi_2025.csv: (1110, 9)


In [5]:
## Model Setup and Configuration

class FinBERTSentimentAnalyzer:
    """Wrapper class for FinBERT sentiment analysis models."""

    def __init__(self, model_name: str, device: str = None):
        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = None
        self.model = None
        self.pipeline = None

        # Model-specific configurations
        self.model_configs = {
            'yiyanghkust/finbert-tone': {
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            },
            'ProsusAI/finbert': {
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            }
        }

        self.config = self.model_configs.get(model_name, {})

    def load_model(self):
        """Load the FinBERT model and tokenizer."""
        try:
            print(f"Loading model: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

            # Move model to device
            self.model.to(self.device)
            self.model.eval()

            # Create pipeline for easier inference
            self.pipeline = pipeline(
                'sentiment-analysis',
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == 'cuda' else -1,
                return_all_scores=True
            )

            print(f"✓ Model loaded on {self.device}")
            return True

        except Exception as e:
            print(f"❌ Error loading model {self.model_name}: {str(e)}")
            return False

    def predict_sentiment(self, texts: List[str], batch_size: int = None) -> List[Dict]:
        """Predict sentiment for a list of texts."""
        if self.pipeline is None:
            raise ValueError("Model not loaded. Call load_model() first.")

        batch_size = batch_size or self.config.get('batch_size', 16)
        results = []

        print(f"Processing {len(texts)} texts in batches of {batch_size}")

        for i in tqdm(range(0, len(texts), batch_size), desc="Sentiment Analysis"):
            batch = texts[i:i + batch_size]

            try:
                # Get predictions for batch
                batch_results = self.pipeline(batch)

                # Process results
                for text_idx, text_result in enumerate(batch_results):
                    # Convert to our standard format
                    scores_dict = {item['label'].lower(): item['score'] for item in text_result}

                    # Get the predicted label (highest score)
                    predicted_label = max(scores_dict, key=scores_dict.get)
                    predicted_score = scores_dict[predicted_label]

                    result = {
                        'text': batch[text_idx],
                        'predicted_label': predicted_label,
                        'predicted_score': predicted_score,
                        'positive_score': scores_dict.get('positive', 0.0),
                        'neutral_score': scores_dict.get('neutral', 0.0),
                        'negative_score': scores_dict.get('negative', 0.0),
                        'model_name': self.model_name
                    }

                    results.append(result)

            except Exception as e:
                print(f"❌ Error processing batch {i//batch_size + 1}: {str(e)}")
                # Add placeholder results for failed batch
                for j in range(len(batch)):
                    results.append({
                        'text': batch[j],
                        'predicted_label': 'neutral',
                        'predicted_score': 0.33,
                        'positive_score': 0.33,
                        'neutral_score': 0.34,
                        'negative_score': 0.33,
                        'model_name': self.model_name,
                        'error': True
                    })

        return results

# Initialize models
finbert_tone_model = FinBERTSentimentAnalyzer('yiyanghkust/finbert-tone')
prosus_finbert_model = FinBERTSentimentAnalyzer('ProsusAI/finbert')

# Load models
print("Loading FinBERT models...")
finbert_tone_loaded = finbert_tone_model.load_model()
prosus_finbert_loaded = prosus_finbert_model.load_model()

Loading FinBERT models...
Loading model: yiyanghkust/finbert-tone


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


✓ Model loaded on cpu
Loading model: ProsusAI/finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cpu


✓ Model loaded on cpu


In [6]:
## Sentiment Analysis Functions

def run_sentiment_analysis_on_dataset(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Run both FinBERT models on a dataset."""
    if df is None:
        print(f"❌ Cannot process {dataset_name} - dataset is None")
        return None

    print(f"\n🔍 RUNNING SENTIMENT ANALYSIS - {dataset_name}")
    print("-" * 50)
    print(f"Input shape: {df.shape}")

    # Prepare text data
    texts = df['text'].fillna('').astype(str).tolist()
    print(f"Processing {len(texts)} text entries")

    results_df = df.copy()

    # Run FinBERT-tone analysis
    if finbert_tone_loaded:
        print("\n🤖 Running FinBERT-tone analysis...")
        try:
            finbert_tone_results = finbert_tone_model.predict_sentiment(texts)

            # Add results to dataframe
            for i, result in enumerate(finbert_tone_results):
                results_df.loc[i, 'finbert_tone_label'] = result['predicted_label']
                results_df.loc[i, 'finbert_tone_score'] = result['predicted_score']
                results_df.loc[i, 'finbert_tone_positive'] = result['positive_score']
                results_df.loc[i, 'finbert_tone_neutral'] = result['neutral_score']
                results_df.loc[i, 'finbert_tone_negative'] = result['negative_score']
                results_df.loc[i, 'finbert_tone_error'] = result.get('error', False)

            print("✓ FinBERT-tone analysis complete")

        except Exception as e:
            print(f"❌ Error in FinBERT-tone analysis: {str(e)}")

    # Run ProsusAI FinBERT analysis
    if prosus_finbert_loaded:
        print("\n🤖 Running ProsusAI FinBERT analysis...")
        try:
            prosus_results = prosus_finbert_model.predict_sentiment(texts)

            # Add results to dataframe
            for i, result in enumerate(prosus_results):
                results_df.loc[i, 'prosus_label'] = result['predicted_label']
                results_df.loc[i, 'prosus_score'] = result['predicted_score']
                results_df.loc[i, 'prosus_positive'] = result['positive_score']
                results_df.loc[i, 'prosus_neutral'] = result['neutral_score']
                results_df.loc[i, 'prosus_negative'] = result['negative_score']
                results_df.loc[i, 'prosus_error'] = result.get('error', False)

            print("✓ ProsusAI FinBERT analysis complete")

        except Exception as e:
            print(f"❌ Error in ProsusAI FinBERT analysis: {str(e)}")

    print(f"Final shape: {results_df.shape}")
    return results_df

# Run sentiment analysis on all datasets
sentiment_jpm_q1_2025_df = run_sentiment_analysis_on_dataset(processed_jpm_q1_2025_df, "Q1 2025")
sentiment_jpm_q2_2025_df = run_sentiment_analysis_on_dataset(processed_jpm_q2_2025_df, "Q2 2025")
sentiment_jpm_multi_2025_df = run_sentiment_analysis_on_dataset(processed_jpm_multi_2025_df, "Multi 2025")



🔍 RUNNING SENTIMENT ANALYSIS - Q1 2025
--------------------------------------------------
Input shape: (578, 9)
Processing 578 text entries

🤖 Running FinBERT-tone analysis...
Processing 578 texts in batches of 16


Sentiment Analysis: 100%|██████████| 37/37 [01:28<00:00,  2.39s/it]


✓ FinBERT-tone analysis complete

🤖 Running ProsusAI FinBERT analysis...
Processing 578 texts in batches of 16


Sentiment Analysis: 100%|██████████| 37/37 [01:22<00:00,  2.22s/it]


✓ ProsusAI FinBERT analysis complete
Final shape: (578, 21)

🔍 RUNNING SENTIMENT ANALYSIS - Q2 2025
--------------------------------------------------
Input shape: (532, 9)
Processing 532 text entries

🤖 Running FinBERT-tone analysis...
Processing 532 texts in batches of 16


Sentiment Analysis: 100%|██████████| 34/34 [01:20<00:00,  2.37s/it]


✓ FinBERT-tone analysis complete

🤖 Running ProsusAI FinBERT analysis...
Processing 532 texts in batches of 16


Sentiment Analysis: 100%|██████████| 34/34 [01:18<00:00,  2.30s/it]


✓ ProsusAI FinBERT analysis complete
Final shape: (532, 21)

🔍 RUNNING SENTIMENT ANALYSIS - Multi 2025
--------------------------------------------------
Input shape: (1110, 9)
Processing 1110 text entries

🤖 Running FinBERT-tone analysis...
Processing 1110 texts in batches of 16


Sentiment Analysis: 100%|██████████| 70/70 [02:45<00:00,  2.37s/it]


✓ FinBERT-tone analysis complete

🤖 Running ProsusAI FinBERT analysis...
Processing 1110 texts in batches of 16


Sentiment Analysis: 100%|██████████| 70/70 [02:46<00:00,  2.37s/it]


✓ ProsusAI FinBERT analysis complete
Final shape: (1110, 21)


In [7]:
## Multi-level Sentiment Aggregation

def aggregate_sentence_to_qa_level(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate sentence-level sentiment to Q&A level."""
    if df is None or 'original_qa_id' not in df.columns:
        print("❌ Cannot aggregate - missing original_qa_id column")
        return None

    print("Aggregating sentence-level sentiment to Q&A level...")

    # Define aggregation functions
    agg_functions = {
        # Text info
        'text': lambda x: ' '.join(x),  # Reconstruct full text
        'speaker': 'first',
        'speaker_role': 'first',
        'quarter': 'first',
        'bank_code': 'first',

        # Sentence counts
        'sentence_length': ['count', 'mean', 'sum'],
        'sentence_word_count': ['mean', 'sum'],

        # FinBERT-tone aggregations
        'finbert_tone_score': ['mean', 'std', 'min', 'max'],
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',

        # ProsusAI aggregations
        'prosus_score': ['mean', 'std', 'min', 'max'],
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean'
    }

    # Apply aggregations
    qa_level_df = df.groupby('original_qa_id').agg(agg_functions).reset_index()

    # Flatten column names
    qa_level_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                           for col in qa_level_df.columns]

    # Determine overall sentiment labels at Q&A level
    # FinBERT-tone label (majority vote)
    finbert_labels = df.groupby('original_qa_id')['finbert_tone_label'].apply(
        lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'neutral'
    ).reset_index()
    qa_level_df = qa_level_df.merge(finbert_labels, left_on='original_qa_id', right_on='original_qa_id')
    qa_level_df.rename(columns={'finbert_tone_label': 'finbert_tone_qa_label'}, inplace=True)

    # ProsusAI label (majority vote)
    prosus_labels = df.groupby('original_qa_id')['prosus_label'].apply(
        lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'neutral'
    ).reset_index()
    qa_level_df = qa_level_df.merge(prosus_labels, left_on='original_qa_id', right_on='original_qa_id')
    qa_level_df.rename(columns={'prosus_label': 'prosus_qa_label'}, inplace=True)

    print(f"✓ Aggregated to Q&A level: {qa_level_df.shape}")
    return qa_level_df

def aggregate_by_speaker_role(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate sentiment by speaker role (analyst vs executive)."""
    if df is None or 'speaker_role' not in df.columns:
        print("❌ Cannot aggregate by speaker - missing speaker_role column")
        return None

    print("Aggregating sentiment by speaker role...")

    # Define aggregation functions for speaker-level analysis
    speaker_agg = {
        'text': 'count',  # Number of statements
        'sentence_length': 'mean',
        'sentence_word_count': 'mean',

        # FinBERT-tone by speaker
        'finbert_tone_score': ['mean', 'std'],
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',

        # ProsusAI by speaker
        'prosus_score': ['mean', 'std'],
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean'
    }

    # Group by quarter and speaker role
    if 'quarter' in df.columns:
        speaker_df = df.groupby(['quarter', 'speaker_role']).agg(speaker_agg).reset_index()
    else:
        speaker_df = df.groupby('speaker_role').agg(speaker_agg).reset_index()

    # Flatten column names
    speaker_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                          for col in speaker_df.columns]

    # Add sentiment label distribution by speaker
    if 'quarter' in df.columns:
        finbert_dist = df.groupby(['quarter', 'speaker_role'])['finbert_tone_label'].value_counts(normalize=True).unstack(fill_value=0)
        prosus_dist = df.groupby(['quarter', 'speaker_role'])['prosus_label'].value_counts(normalize=True).unstack(fill_value=0)
    else:
        finbert_dist = df.groupby('speaker_role')['finbert_tone_label'].value_counts(normalize=True).unstack(fill_value=0)
        prosus_dist = df.groupby('speaker_role')['prosus_label'].value_counts(normalize=True).unstack(fill_value=0)

    # Add distribution columns
    finbert_dist.columns = [f'finbert_tone_{col}_pct' for col in finbert_dist.columns]
    prosus_dist.columns = [f'prosus_{col}_pct' for col in prosus_dist.columns]

    # Merge distributions
    if 'quarter' in df.columns:
        speaker_df = speaker_df.set_index(['quarter', 'speaker_role'])
    else:
        speaker_df = speaker_df.set_index('speaker_role')

    speaker_df = speaker_df.join(finbert_dist).join(prosus_dist).reset_index()

    print(f"✓ Speaker-level aggregation complete: {speaker_df.shape}")
    return speaker_df

# Create multi-level aggregations
print("\n" + "="*60)
print("CREATING MULTI-LEVEL AGGREGATIONS")
print("="*60)

# Q&A level aggregations
qa_level_jpm_q1_2025_df = aggregate_sentence_to_qa_level(sentiment_jpm_q1_2025_df)
qa_level_jpm_q2_2025_df = aggregate_sentence_to_qa_level(sentiment_jpm_q2_2025_df)
qa_level_jpm_multi_2025_df = aggregate_sentence_to_qa_level(sentiment_jpm_multi_2025_df)

# Speaker level aggregations
speaker_level_jpm_q1_2025_df = aggregate_by_speaker_role(sentiment_jpm_q1_2025_df)
speaker_level_jpm_q2_2025_df = aggregate_by_speaker_role(sentiment_jpm_q2_2025_df)
speaker_level_jpm_multi_2025_df = aggregate_by_speaker_role(sentiment_jpm_multi_2025_df)



CREATING MULTI-LEVEL AGGREGATIONS
Aggregating sentence-level sentiment to Q&A level...
✓ Aggregated to Q&A level: (97, 27)
Aggregating sentence-level sentiment to Q&A level...
✓ Aggregated to Q&A level: (121, 27)
Aggregating sentence-level sentiment to Q&A level...
✓ Aggregated to Q&A level: (218, 27)
Aggregating sentiment by speaker role...
✓ Speaker-level aggregation complete: (3, 21)
Aggregating sentiment by speaker role...
✓ Speaker-level aggregation complete: (3, 21)
Aggregating sentiment by speaker role...
✓ Speaker-level aggregation complete: (6, 21)


In [8]:
## Topic-Conditional Sentiment Analysis

def extract_financial_topics(df: pd.DataFrame) -> pd.DataFrame:
    """Extract and categorize financial topics from text."""
    if df is None:
        return None

    print("Extracting financial topics for topic-conditional sentiment...")

    df = df.copy()

    # Define financial topic keywords
    topic_keywords = {
        'revenue': ['revenue', 'sales', 'income', 'earnings', 'turnover'],
        'profitability': ['profit', 'margin', 'profitability', 'ebitda', 'roi'],
        'risk': ['risk', 'credit', 'default', 'loss', 'provisions', 'regulatory'],
        'growth': ['growth', 'expansion', 'increase', 'development', 'strategic'],
        'costs': ['cost', 'expense', 'efficiency', 'reduction', 'savings'],
        'capital': ['capital', 'equity', 'debt', 'financing', 'leverage'],
        'market': ['market', 'competition', 'share', 'client', 'customer']
    }

    # Identify topics in each text
    for topic, keywords in topic_keywords.items():
        pattern = '|'.join(keywords)
        df[f'topic_{topic}'] = df['text'].str.lower().str.contains(pattern, regex=True, na=False)

    # Add primary topic (most prominent based on keyword count)
    topic_columns = [f'topic_{topic}' for topic in topic_keywords.keys()]
    df['primary_topic'] = df[topic_columns].idxmax(axis=1).str.replace('topic_', '')

    # If no topics found, mark as 'general'
    no_topics_mask = ~df[topic_columns].any(axis=1)
    df.loc[no_topics_mask, 'primary_topic'] = 'general'

    print(f"✓ Topic extraction complete. Topic distribution:")
    topic_dist = df['primary_topic'].value_counts()
    for topic, count in topic_dist.items():
        print(f"  {topic}: {count}")

    return df

# Add topic analysis
print("\n" + "="*50)
print("TOPIC-CONDITIONAL SENTIMENT ANALYSIS")
print("="*50)

sentiment_topics_jpm_q1_2025_df = extract_financial_topics(sentiment_jpm_q1_2025_df)
sentiment_topics_jpm_q2_2025_df = extract_financial_topics(sentiment_jpm_q2_2025_df)
sentiment_topics_jpm_multi_2025_df = extract_financial_topics(sentiment_jpm_multi_2025_df)

def analyze_sentiment_by_topic(df: pd.DataFrame) -> pd.DataFrame:
    """Analyze sentiment patterns by financial topic."""
    if df is None or 'primary_topic' not in df.columns:
        return None

    print("Analyzing sentiment by topic...")

    # Define aggregation by topic
    topic_agg = {
        'text': 'count',
        'finbert_tone_score': ['mean', 'std'],
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',
        'prosus_score': ['mean', 'std'],
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean'
    }

    # Group by topic and optionally quarter
    if 'quarter' in df.columns:
        topic_sentiment_df = df.groupby(['quarter', 'primary_topic']).agg(topic_agg).reset_index()
    else:
        topic_sentiment_df = df.groupby('primary_topic').agg(topic_agg).reset_index()

    # Flatten column names
    topic_sentiment_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                                  for col in topic_sentiment_df.columns]

    print(f"✓ Topic-sentiment analysis complete: {topic_sentiment_df.shape}")
    return topic_sentiment_df

# Create topic-sentiment analysis
topic_sentiment_jpm_q1_2025_df = analyze_sentiment_by_topic(sentiment_topics_jpm_q1_2025_df)
topic_sentiment_jpm_q2_2025_df = analyze_sentiment_by_topic(sentiment_topics_jpm_q2_2025_df)
topic_sentiment_jpm_multi_2025_df = analyze_sentiment_by_topic(sentiment_topics_jpm_multi_2025_df)



TOPIC-CONDITIONAL SENTIMENT ANALYSIS
Extracting financial topics for topic-conditional sentiment...
✓ Topic extraction complete. Topic distribution:
  general: 389
  market: 51
  risk: 37
  revenue: 36
  growth: 24
  capital: 17
  costs: 16
  profitability: 8
Extracting financial topics for topic-conditional sentiment...
✓ Topic extraction complete. Topic distribution:
  general: 358
  market: 52
  risk: 36
  growth: 27
  capital: 23
  revenue: 20
  profitability: 10
  costs: 6
Extracting financial topics for topic-conditional sentiment...
✓ Topic extraction complete. Topic distribution:
  general: 747
  market: 103
  risk: 73
  revenue: 56
  growth: 51
  capital: 40
  costs: 22
  profitability: 18
Analyzing sentiment by topic...
✓ Topic-sentiment analysis complete: (8, 13)
Analyzing sentiment by topic...
✓ Topic-sentiment analysis complete: (8, 13)
Analyzing sentiment by topic...
✓ Topic-sentiment analysis complete: (16, 13)


In [9]:
## Anomaly Detection in Sentiment Patterns

def detect_sentiment_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Detect anomalous sentiment patterns."""
    if df is None:
        return None

    print("Detecting sentiment anomalies...")

    df = df.copy()

    # Calculate z-scores for sentiment scores
    from scipy import stats

    # FinBERT-tone anomalies
    if 'finbert_tone_score' in df.columns:
        df['finbert_tone_zscore'] = np.abs(stats.zscore(df['finbert_tone_score'].fillna(0)))
        df['finbert_tone_anomaly'] = df['finbert_tone_zscore'] > 2.5

    # ProsusAI anomalies
    if 'prosus_score' in df.columns:
        df['prosus_zscore'] = np.abs(stats.zscore(df['prosus_score'].fillna(0)))
        df['prosus_anomaly'] = df['prosus_zscore'] > 2.5

    # Model disagreement anomalies
    if 'finbert_tone_label' in df.columns and 'prosus_label' in df.columns:
        df['model_disagreement'] = df['finbert_tone_label'] != df['prosus_label']

    # Text length anomalies
    if 'sentence_length' in df.columns:
        df['length_zscore'] = np.abs(stats.zscore(df['sentence_length']))
        df['length_anomaly'] = df['length_zscore'] > 3

    # Count anomalies
    anomaly_columns = [col for col in df.columns if 'anomaly' in col or 'disagreement' in col]
    if anomaly_columns:
        df['total_anomaly_flags'] = df[anomaly_columns].sum(axis=1)
        anomaly_count = df['total_anomaly_flags'].sum()
        print(f"  Detected {anomaly_count} total anomaly flags across {len(df)} records")

        # Show anomaly distribution
        for col in anomaly_columns:
            count = df[col].sum()
            pct = (count / len(df)) * 100
            print(f"  {col}: {count} ({pct:.1f}%)")

    return df

# Apply anomaly detection
print("\n" + "="*50)
print("ANOMALY DETECTION")
print("="*50)

anomaly_jpm_q1_2025_df = detect_sentiment_anomalies(sentiment_topics_jpm_q1_2025_df)
anomaly_jpm_q2_2025_df = detect_sentiment_anomalies(sentiment_topics_jpm_q2_2025_df)
anomaly_jpm_multi_2025_df = detect_sentiment_anomalies(sentiment_topics_jpm_multi_2025_df)



ANOMALY DETECTION
Detecting sentiment anomalies...
  Detected 202 total anomaly flags across 578 records
  finbert_tone_anomaly: 30 (5.2%)
  prosus_anomaly: 17 (2.9%)
  model_disagreement: 146 (25.3%)
  length_anomaly: 9 (1.6%)
Detecting sentiment anomalies...
  Detected 168 total anomaly flags across 532 records
  finbert_tone_anomaly: 28 (5.3%)
  prosus_anomaly: 12 (2.3%)
  model_disagreement: 119 (22.4%)
  length_anomaly: 9 (1.7%)
Detecting sentiment anomalies...
  Detected 370 total anomaly flags across 1110 records
  finbert_tone_anomaly: 57 (5.1%)
  prosus_anomaly: 30 (2.7%)
  model_disagreement: 265 (23.9%)
  length_anomaly: 18 (1.6%)


In [11]:
## Save Sentiment Analysis Results

def save_sentiment_results(df: pd.DataFrame, filename: str, description: str):
    """Save sentiment analysis results to multiple locations."""
    if df is None:
        print(f"❌ Cannot save {description} - dataset is None")
        return

    print(f"Saving {description}...")

    # Save to results directory
    results_path = results_sentiment_path / filename
    df.to_csv(results_path, index=False)
    print(f"  ✓ Results: {results_path}")

    # Save to colab for easy access
    colab_results_path = colab_base / "results/sentiment/jpm" / filename
    colab_results_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(colab_results_path, index=False)
    print(f"  ✓ Colab: {colab_results_path}")

print("\n" + "="*60)
print("SAVING SENTIMENT ANALYSIS RESULTS")
print("="*60)

# Save sentence-level results
save_sentiment_results(anomaly_jpm_q1_2025_df, "sentiment_sentence_jpm_q1_2025.csv", "Q1 2025 sentence-level sentiment")
save_sentiment_results(anomaly_jpm_q2_2025_df, "sentiment_sentence_jpm_q2_2025.csv", "Q2 2025 sentence-level sentiment")
save_sentiment_results(anomaly_jpm_multi_2025_df, "sentiment_sentence_jpm_multi_2025.csv", "Multi 2025 sentence-level sentiment")

# Save Q&A-level results
save_sentiment_results(qa_level_jpm_q1_2025_df, "sentiment_qa_jpm_q1_2025.csv", "Q1 2025 Q&A-level sentiment")
save_sentiment_results(qa_level_jpm_q2_2025_df, "sentiment_qa_jpm_q2_2025.csv", "Q2 2025 Q&A-level sentiment")
save_sentiment_results(qa_level_jpm_multi_2025_df, "sentiment_qa_jpm_multi_2025.csv", "Multi 2025 Q&A-level sentiment")

# Save speaker-level results
save_sentiment_results(speaker_level_jpm_q1_2025_df, "sentiment_speaker_jpm_q1_2025.csv", "Q1 2025 speaker-level sentiment")
save_sentiment_results(speaker_level_jpm_q2_2025_df, "sentiment_speaker_jpm_q2_2025.csv", "Q2 2025 speaker-level sentiment")
save_sentiment_results(speaker_level_jpm_multi_2025_df, "sentiment_speaker_jpm_multi_2025.csv", "Multi 2025 speaker-level sentiment")

# Save topic-sentiment results
save_sentiment_results(topic_sentiment_jpm_q1_2025_df, "sentiment_topic_jpm_q1_2025.csv", "Q1 2025 topic-sentiment")
save_sentiment_results(topic_sentiment_jpm_q2_2025_df, "sentiment_topic_jpm_q2_2025.csv", "Q2 2025 topic-sentiment")
save_sentiment_results(topic_sentiment_jpm_multi_2025_df, "sentiment_topic_jpm_multi_2025.csv", "Multi 2025 topic-sentiment")



SAVING SENTIMENT ANALYSIS RESULTS
Saving Q1 2025 sentence-level sentiment...
  ✓ Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_sentence_jpm_q1_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/results/sentiment/jpm/sentiment_sentence_jpm_q1_2025.csv
Saving Q2 2025 sentence-level sentiment...
  ✓ Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_sentence_jpm_q2_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/results/sentiment/jpm/sentiment_sentence_jpm_q2_2025.csv
Saving Multi 2025 sentence-level sentiment...
  ✓ Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_sentence_jpm_multi_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/results/sentiment/jpm/sentiment_sentence_jpm_multi_2025.csv
Saving Q1 2025 Q&A-level sentiment...
  ✓ Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_qa_jpm_q1_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/results/sentiment/jpm/sentim

In [12]:
## Sentiment Analysis Summary

def create_sentiment_summary():
    """Create comprehensive sentiment analysis summary."""
    summary = {
        "analysis_timestamp": pd.Timestamp.now().isoformat(),
        "bank_code": BANK_CODE,
        "models_used": [
            "yiyanghkust/finbert-tone",
            "ProsusAI/finbert"
        ],
        "analysis_levels": [
            "sentence_level",
            "qa_level",
            "speaker_level",
            "topic_conditional"
        ],
        "datasets_analyzed": {},
        "anomaly_detection": {
            "enabled": True,
            "thresholds": {
                "z_score": 2.5,
                "length_z_score": 3.0
            }
        }
    }

    # Add dataset information
    datasets_info = [
        ("sentiment_sentence_jpm_q1_2025", anomaly_jpm_q1_2025_df),
        ("sentiment_sentence_jpm_q2_2025", anomaly_jpm_q2_2025_df),
        ("sentiment_sentence_jpm_multi_2025", anomaly_jpm_multi_2025_df),
        ("sentiment_qa_jpm_q1_2025", qa_level_jpm_q1_2025_df),
        ("sentiment_qa_jpm_q2_2025", qa_level_jpm_q2_2025_df),
        ("sentiment_qa_jpm_multi_2025", qa_level_jpm_multi_2025_df),
        ("sentiment_speaker_jpm_q1_2025", speaker_level_jpm_q1_2025_df),
        ("sentiment_speaker_jpm_q2_2025", speaker_level_jpm_q2_2025_df),
        ("sentiment_speaker_jpm_multi_2025", speaker_level_jpm_multi_2025_df)
    ]

    for name, df in datasets_info:
        if df is not None:
            summary["datasets_analyzed"][name] = {
                "shape": df.shape,
                "memory_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            }

            # Add sentiment distribution if available
            if 'finbert_tone_label' in df.columns:
                finbert_dist = df['finbert_tone_label'].value_counts().to_dict()
                summary["datasets_analyzed"][name]["finbert_tone_distribution"] = finbert_dist

            if 'prosus_label' in df.columns:
                prosus_dist = df['prosus_label'].value_counts().to_dict()
                summary["datasets_analyzed"][name]["prosus_distribution"] = prosus_dist

    # Save summary
    summary_path = results_sentiment_path / "sentiment_analysis_summary.json"
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)

    return summary, summary_path

summary, summary_path = create_sentiment_summary()

print("\n" + "="*60)
print("SENTIMENT ANALYSIS COMPLETE")
print("="*60)

print(f"Summary report: {summary_path}")
print("\nResults generated:")
for dataset_name, info in summary["datasets_analyzed"].items():
    if info is not None:
        print(f"  ✓ {dataset_name}: {info['shape']} - {info['memory_mb']} MB")

total_records = sum(info['shape'][0] for info in summary["datasets_analyzed"].values() if info is not None)
total_memory = sum(info['memory_mb'] for info in summary["datasets_analyzed"].values() if info is not None)

print(f"\nTotal records processed: {total_records:,}")
print(f"Total memory usage: {total_memory:.2f} MB")

print("\nNext step: Run 05_model_comparison.ipynb to compare model performance")



SENTIMENT ANALYSIS COMPLETE
Summary report: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_analysis_summary.json

Results generated:
  ✓ sentiment_sentence_jpm_q1_2025: (578, 37) - 0.45 MB
  ✓ sentiment_sentence_jpm_q2_2025: (532, 37) - 0.41 MB
  ✓ sentiment_sentence_jpm_multi_2025: (1110, 37) - 0.86 MB
  ✓ sentiment_qa_jpm_q1_2025: (97, 27) - 0.14 MB
  ✓ sentiment_qa_jpm_q2_2025: (121, 27) - 0.13 MB
  ✓ sentiment_qa_jpm_multi_2025: (218, 27) - 0.27 MB
  ✓ sentiment_speaker_jpm_q1_2025: (3, 21) - 0.0 MB
  ✓ sentiment_speaker_jpm_q2_2025: (3, 21) - 0.0 MB
  ✓ sentiment_speaker_jpm_multi_2025: (6, 21) - 0.0 MB

Total records processed: 2,668
Total memory usage: 2.26 MB

Next step: Run 05_model_comparison.ipynb to compare model performance


In [13]:
## Research Questions Analysis Preview

def preview_research_questions(df: pd.DataFrame):
    """Preview analysis for key research questions."""
    if df is None:
        return

    print("\n📊 RESEARCH QUESTIONS PREVIEW")
    print("-" * 40)

    # Question 1: Do bankers and analysts show diverging sentiment?
    if 'speaker_role' in df.columns and 'finbert_tone_label' in df.columns:
        print("1. Banker vs Analyst Sentiment Divergence:")

        speaker_sentiment = df.groupby('speaker_role')['finbert_tone_label'].value_counts(normalize=True).unstack(fill_value=0)
        print(speaker_sentiment.round(3))

        # Calculate divergence metric
        if 'analyst' in speaker_sentiment.index and 'executive' in speaker_sentiment.index:
            analyst_pos = speaker_sentiment.loc['analyst', 'positive'] if 'positive' in speaker_sentiment.columns else 0
            exec_pos = speaker_sentiment.loc['executive', 'positive'] if 'positive' in speaker_sentiment.columns else 0
            divergence = abs(analyst_pos - exec_pos)
            print(f"  Positive sentiment divergence: {divergence:.3f}")

    # Question 2: Tone shift over time
    if 'quarter' in df.columns and 'finbert_tone_label' in df.columns:
        print("\n2. Tone Shift Over Time:")
        quarter_sentiment = df.groupby('quarter')['finbert_tone_label'].value_counts(normalize=True).unstack(fill_value=0)
        print(quarter_sentiment.round(3))

# Preview analysis on multi-quarter data
if anomaly_jpm_multi_2025_df is not None:
    preview_research_questions(anomaly_jpm_multi_2025_df)


📊 RESEARCH QUESTIONS PREVIEW
----------------------------------------
1. Banker vs Analyst Sentiment Divergence:
finbert_tone_label  negative  neutral  positive
speaker_role                                   
analyst                0.136    0.741     0.123
cfo                    0.136    0.649     0.215
executive              0.128    0.723     0.149
  Positive sentiment divergence: 0.025

2. Tone Shift Over Time:
finbert_tone_label  negative  neutral  positive
quarter                                        
q1_2025                0.161    0.676     0.163
q2_2025                0.103    0.714     0.182
