In [9]:
# 04_sentiment_analysis_jpm_enhanced.ipynb
# Purpose: Enhanced sentiment analysis with performance optimization and result improvement techniques
# Models: yiyanghkust/finbert-tone, ProsusAI/finbert + Enhanced techniques
# Input: processed datasets + manual validation results
# Output: Enhanced sentiment results with improved performance metrics

## Import Libraries

import pandas as pd
import numpy as np
import json
import torch
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Enhanced ML libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn.functional as F

# Enhanced text processing
import re
from textblob import TextBlob
import nltk
try:
    nltk.download('vader_lexicon', quiet=True)
    from nltk.sentiment import SentimentIntensityAnalyzer
except:
    print("NLTK VADER not available")

# Progress tracking
from tqdm import tqdm
tqdm.pandas()

# Statistical analysis
from scipy import stats
from scipy.stats import mode

# Location A: Google Drive (Primary drive)
from google.colab import drive
drive.mount("/content/drive")

# Load configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r") as f:
    config = json.load(f)

SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

# Set seeds for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)

print(f"Enhanced sentiment analysis for bank: {BANK_CODE.upper()}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enhanced sentiment analysis for bank: JPM
Device: CPU


In [10]:
## Define Paths

processed_data_path = drive_base / "data/processed/jpm"
manual_validation_path = drive_base / "data/manual_validation/jpm"
results_sentiment_path = drive_base / "results/sentiment/jpm"
models_path = drive_base / "models"

# Ensure directories exist
results_sentiment_path.mkdir(parents=True, exist_ok=True)
models_path.mkdir(parents=True, exist_ok=True)

In [11]:
## Load Data and Manual Validation Results

def load_processed_dataset(filename: str) -> pd.DataFrame:
    """Load processed dataset with error handling."""
    file_path = processed_data_path / filename
    if not file_path.exists():
        print(f"Warning: File not found: {file_path}")
        return None
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded {filename}: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading {filename}: {str(e)}")
        return None

def load_manual_validation_results() -> Tuple[pd.DataFrame, Dict]:
    """Load manual validation results."""
    # Load manual labels
    manual_labels_path = results_sentiment_path / "sentiment_sentence_jpm_multi_2025_validated.csv"
    manual_df = None
    if manual_labels_path.exists():
        try:
            manual_df = pd.read_csv(manual_labels_path)
            print(f"Loaded manual validation data: {manual_df.shape}")
        except Exception as e:
            print(f"Could not load manual validation data: {e}")

    # Load validation report
    validation_report_path = manual_validation_path / "manual_validation_report.json"
    validation_report = {}
    if validation_report_path.exists():
        try:
            with open(validation_report_path, 'r') as f:
                validation_report = json.load(f)
            print("Loaded validation report")
        except Exception as e:
            print(f"Could not load validation report: {e}")

    return manual_df, validation_report

print("Loading datasets...")
processed_jpm_q1_2025_df = load_processed_dataset("processed_jpm_q1_2025.csv")
processed_jpm_q2_2025_df = load_processed_dataset("processed_jpm_q2_2025.csv")
processed_jpm_multi_2025_df = load_processed_dataset("processed_jpm_multi_2025.csv")

# Load manual validation results
manual_labels_df, validation_report = load_manual_validation_results()

Loading datasets...
Loaded processed_jpm_q1_2025.csv: (578, 9)
Loaded processed_jpm_q2_2025.csv: (532, 9)
Loaded processed_jpm_multi_2025.csv: (1110, 9)
Loaded manual validation data: (1110, 41)
Loaded validation report


In [12]:
## Enhanced FinBERT Model with Confidence Calibration

class EnhancedFinBERTAnalyzer:
    """Enhanced FinBERT analyzer with confidence calibration and ensemble methods."""

    def __init__(self, model_name: str, device: str = None):
        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.confidence_calibrator = None

        # Enhanced configurations
        self.model_configs = {
            'yiyanghkust/finbert-tone': {
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16,
                'confidence_threshold': 0.6
            },
            'ProsusAI/finbert': {
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16,
                'confidence_threshold': 0.6
            }
        }

        self.config = self.model_configs.get(model_name, {})

    def load_model(self):
        """Load the FinBERT model with enhanced features."""
        try:
            print(f"Loading enhanced model: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

            # Move model to device
            self.model.to(self.device)
            self.model.eval()

            # Create pipeline
            self.pipeline = pipeline(
                'sentiment-analysis',
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == 'cuda' else -1,
                return_all_scores=True
            )

            print(f"Enhanced model loaded on {self.device}")
            return True

        except Exception as e:
            print(f"Error loading model {self.model_name}: {str(e)}")
            return False

    def calibrate_confidence(self, texts: List[str], true_labels: List[str] = None):
        """Calibrate confidence scores using manual labels if available."""
        if true_labels is None or len(true_labels) == 0:
            print("No manual labels available for confidence calibration")
            return

        print("Calibrating confidence scores...")

        # Get model predictions for calibration data
        predictions = self.predict_sentiment_enhanced(texts, batch_size=8)

        # Extract confidence scores and predictions
        pred_labels = [p['predicted_label'] for p in predictions]
        confidences = [p['predicted_score'] for p in predictions]

        # Calculate calibration metrics
        correct_mask = np.array(pred_labels) == np.array(true_labels)

        # Platt scaling for confidence calibration
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.linear_model import LogisticRegression

        # Simple confidence recalibration
        confidence_bins = np.linspace(0, 1, 11)
        bin_accuracies = []

        for i in range(len(confidence_bins) - 1):
            bin_mask = (np.array(confidences) >= confidence_bins[i]) & (np.array(confidences) < confidence_bins[i+1])
            if bin_mask.sum() > 0:
                bin_accuracy = correct_mask[bin_mask].mean()
                bin_accuracies.append(bin_accuracy)
            else:
                bin_accuracies.append(0.0)

        self.confidence_calibrator = {
            'bin_edges': confidence_bins,
            'bin_accuracies': bin_accuracies
        }

        print(f"Confidence calibration complete")

    def get_calibrated_confidence(self, raw_confidence: float) -> float:
        """Get calibrated confidence score."""
        if self.confidence_calibrator is None:
            return raw_confidence

        # Find the appropriate bin
        bin_edges = self.confidence_calibrator['bin_edges']
        bin_accuracies = self.confidence_calibrator['bin_accuracies']

        for i in range(len(bin_edges) - 1):
            if bin_edges[i] <= raw_confidence < bin_edges[i+1]:
                return bin_accuracies[i]

        return raw_confidence

    def predict_sentiment_enhanced(self, texts: List[str], batch_size: int = None) -> List[Dict]:
        """Enhanced sentiment prediction with additional features."""
        if self.pipeline is None:
            raise ValueError("Model not loaded. Call load_model() first.")

        batch_size = batch_size or self.config.get('batch_size', 16)
        results = []

        print(f"Processing {len(texts)} texts with enhanced features...")

        for i in tqdm(range(0, len(texts), batch_size), desc="Enhanced Sentiment Analysis"):
            batch = texts[i:i + batch_size]

            try:
                # Get base model predictions
                batch_results = self.pipeline(batch)

                # Process each result with enhancements
                for text_idx, text_result in enumerate(batch_results):
                    text = batch[text_idx]

                    # Convert to standard format
                    scores_dict = {item['label'].lower(): item['score'] for item in text_result}

                    # Get predicted label and score
                    predicted_label = max(scores_dict, key=scores_dict.get)
                    predicted_score = scores_dict[predicted_label]

                    # Apply confidence calibration if available
                    calibrated_score = self.get_calibrated_confidence(predicted_score)

                    # Calculate additional metrics
                    entropy = -sum(score * np.log(score + 1e-8) for score in scores_dict.values())
                    max_prob_diff = predicted_score - sorted(scores_dict.values(), reverse=True)[1]

                    # Text-based features
                    text_features = self.extract_text_features(text)

                    result = {
                        'text': text,
                        'predicted_label': predicted_label,
                        'predicted_score': predicted_score,
                        'calibrated_score': calibrated_score,
                        'positive_score': scores_dict.get('positive', 0.0),
                        'neutral_score': scores_dict.get('neutral', 0.0),
                        'negative_score': scores_dict.get('negative', 0.0),
                        'entropy': entropy,
                        'max_prob_diff': max_prob_diff,
                        'model_name': self.model_name,
                        **text_features
                    }

                    results.append(result)

            except Exception as e:
                print(f"Error processing batch {i//batch_size + 1}: {str(e)}")
                # Add placeholder results for failed batch
                for j in range(len(batch)):
                    results.append({
                        'text': batch[j],
                        'predicted_label': 'neutral',
                        'predicted_score': 0.33,
                        'calibrated_score': 0.33,
                        'positive_score': 0.33,
                        'neutral_score': 0.34,
                        'negative_score': 0.33,
                        'entropy': 1.0,
                        'max_prob_diff': 0.0,
                        'model_name': self.model_name,
                        'error': True
                    })

        return results

    def extract_text_features(self, text: str) -> Dict:
        """Extract additional text-based features."""
        features = {}

        # Basic text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())
        features['sentence_count'] = len(re.split(r'[.!?]+', text))

        # Financial keyword indicators
        positive_keywords = ['growth', 'profit', 'increase', 'strong', 'improved', 'positive', 'beat', 'exceed']
        negative_keywords = ['loss', 'decline', 'decrease', 'weak', 'poor', 'negative', 'miss', 'below']

        text_lower = text.lower()
        features['positive_keyword_count'] = sum(1 for kw in positive_keywords if kw in text_lower)
        features['negative_keyword_count'] = sum(1 for kw in negative_keywords if kw in text_lower)
        features['keyword_sentiment_score'] = features['positive_keyword_count'] - features['negative_keyword_count']

        # Textual patterns
        features['has_numbers'] = bool(re.search(r'\d', text))
        features['has_percentages'] = bool(re.search(r'\d+%', text))
        features['has_currency'] = bool(re.search(r'[$£€]\d', text))
        features['question_marks'] = text.count('?')
        features['exclamation_marks'] = text.count('!')

        return features


In [13]:
## Ensemble and Blended Scoring Methods

class SentimentEnsemble:
    """Ensemble methods for combining multiple sentiment analysis approaches."""

    def __init__(self):
        self.models = {}
        self.weights = {}
        self.vader_analyzer = None
        self.tfidf_vectorizer = None
        self.ml_classifier = None

        # Initialize VADER if available
        try:
            self.vader_analyzer = SentimentIntensityAnalyzer()
        except:
            print("VADER sentiment analyzer not available")

    def add_model(self, name: str, model: EnhancedFinBERTAnalyzer, weight: float = 1.0):
        """Add a model to the ensemble."""
        self.models[name] = model
        self.weights[name] = weight
        print(f"Added {name} to ensemble with weight {weight}")

    def train_ml_classifier(self, texts: List[str], labels: List[str]):
        """Train a simple ML classifier as backup."""
        print("Training backup ML classifier...")

        # Create TF-IDF features
        self.tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        X = self.tfidf_vectorizer.fit_transform(texts)

        # Train logistic regression
        self.ml_classifier = LogisticRegression(random_state=SEED, max_iter=1000)
        self.ml_classifier.fit(X, labels)

        print("ML classifier trained")

    def predict_ensemble(self, texts: List[str]) -> List[Dict]:
        """Generate ensemble predictions."""
        print(f"Generating ensemble predictions for {len(texts)} texts...")

        all_predictions = {}

        # Get predictions from all models
        for model_name, model in self.models.items():
            try:
                predictions = model.predict_sentiment_enhanced(texts)
                all_predictions[model_name] = predictions
                print(f"Got predictions from {model_name}")
            except Exception as e:
                print(f"Error getting predictions from {model_name}: {e}")

        # Get VADER predictions if available
        if self.vader_analyzer:
            vader_predictions = []
            for text in texts:
                scores = self.vader_analyzer.polarity_scores(text)
                compound = scores['compound']

                if compound >= 0.05:
                    label = 'positive'
                elif compound <= -0.05:
                    label = 'negative'
                else:
                    label = 'neutral'

                vader_predictions.append({
                    'predicted_label': label,
                    'predicted_score': abs(compound),
                    'compound_score': compound
                })

            all_predictions['vader'] = vader_predictions

        # Get ML classifier predictions if available
        if self.ml_classifier and self.tfidf_vectorizer:
            try:
                X = self.tfidf_vectorizer.transform(texts)
                ml_labels = self.ml_classifier.predict(X)
                ml_probas = self.ml_classifier.predict_proba(X)

                ml_predictions = []
                for i, label in enumerate(ml_labels):
                    max_prob = ml_probas[i].max()
                    ml_predictions.append({
                        'predicted_label': label,
                        'predicted_score': max_prob
                    })

                all_predictions['ml_classifier'] = ml_predictions
            except Exception as e:
                print(f"Error getting ML classifier predictions: {e}")

        # Combine predictions
        ensemble_results = []
        for i in range(len(texts)):
            # Collect predictions for this text
            text_predictions = {}
            text_scores = {}

            for model_name, predictions in all_predictions.items():
                if i < len(predictions):
                    pred = predictions[i]
                    text_predictions[model_name] = pred['predicted_label']
                    text_scores[model_name] = pred['predicted_score']

            # Weighted voting for final prediction
            label_votes = {}
            total_weight = 0

            for model_name, label in text_predictions.items():
                weight = self.weights.get(model_name, 1.0)
                confidence = text_scores.get(model_name, 0.5)

                # Weight by model weight and confidence
                vote_weight = weight * confidence

                if label not in label_votes:
                    label_votes[label] = 0
                label_votes[label] += vote_weight
                total_weight += vote_weight

            # Normalize votes
            if total_weight > 0:
                for label in label_votes:
                    label_votes[label] /= total_weight

            # Get final prediction
            if label_votes:
                final_label = max(label_votes, key=label_votes.get)
                final_confidence = label_votes[final_label]
            else:
                final_label = 'neutral'
                final_confidence = 0.33

            ensemble_result = {
                'text': texts[i],
                'ensemble_label': final_label,
                'ensemble_confidence': final_confidence,
                'individual_predictions': text_predictions,
                'individual_scores': text_scores,
                'label_votes': label_votes
            }

            ensemble_results.append(ensemble_result)

        return ensemble_results


In [14]:
## Enhanced Model Performance Evaluation

class EnhancedModelEvaluator:
    """Enhanced evaluation with comprehensive metrics."""

    def __init__(self):
        self.evaluation_results = {}

    def evaluate_model_performance(self, predictions: List[str], true_labels: List[str],
                                 confidences: List[float] = None, model_name: str = "model") -> Dict:
        """Comprehensive model evaluation."""

        if len(predictions) != len(true_labels):
            raise ValueError("Predictions and true labels must have same length")

        print(f"Evaluating {model_name} performance...")

        # Basic metrics
        accuracy = (np.array(predictions) == np.array(true_labels)).mean()

        # Classification report
        report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)

        # Per-class metrics
        f1_scores = {}
        precision_scores = {}
        recall_scores = {}

        for label in set(true_labels + predictions):
            if label in report:
                f1_scores[label] = report[label]['f1-score']
                precision_scores[label] = report[label]['precision']
                recall_scores[label] = report[label]['recall']

        # Weighted averages
        weighted_f1 = report['weighted avg']['f1-score']
        weighted_precision = report['weighted avg']['precision']
        weighted_recall = report['weighted avg']['recall']

        # Macro averages
        macro_f1 = report['macro avg']['f1-score']
        macro_precision = report['macro avg']['precision']
        macro_recall = report['macro avg']['recall']

        # Confidence-based metrics
        confidence_metrics = {}
        if confidences is not None:
            confidences = np.array(confidences)
            correct_mask = np.array(predictions) == np.array(true_labels)

            # Accuracy by confidence bins
            confidence_bins = np.linspace(0, 1, 6)  # 5 bins
            bin_accuracies = []
            bin_counts = []

            for i in range(len(confidence_bins) - 1):
                bin_mask = (confidences >= confidence_bins[i]) & (confidences < confidence_bins[i+1])
                if bin_mask.sum() > 0:
                    bin_accuracy = correct_mask[bin_mask].mean()
                    bin_accuracies.append(bin_accuracy)
                    bin_counts.append(bin_mask.sum())
                else:
                    bin_accuracies.append(0.0)
                    bin_counts.append(0)

            # Expected Calibration Error (ECE)
            ece = 0
            total_samples = len(predictions)
            for i in range(len(bin_accuracies)):
                if bin_counts[i] > 0:
                    bin_confidence = (confidence_bins[i] + confidence_bins[i+1]) / 2
                    ece += (bin_counts[i] / total_samples) * abs(bin_accuracies[i] - bin_confidence)

            confidence_metrics = {
                'confidence_bins': confidence_bins.tolist(),
                'bin_accuracies': bin_accuracies,
                'bin_counts': bin_counts,
                'expected_calibration_error': ece,
                'avg_confidence': confidences.mean(),
                'confidence_std': confidences.std()
            }

        evaluation_result = {
            'model_name': model_name,
            'sample_size': len(predictions),
            'accuracy': accuracy,
            'weighted_f1': weighted_f1,
            'weighted_precision': weighted_precision,
            'weighted_recall': weighted_recall,
            'macro_f1': macro_f1,
            'macro_precision': macro_precision,
            'macro_recall': macro_recall,
            'per_class_f1': f1_scores,
            'per_class_precision': precision_scores,
            'per_class_recall': recall_scores,
            'classification_report': report,
            'confidence_metrics': confidence_metrics
        }

        # Print key metrics
        print(f"  Accuracy: {accuracy:.3f}")
        print(f"  Weighted F1: {weighted_f1:.3f}")
        print(f"  Macro F1: {macro_f1:.3f}")
        if confidence_metrics:
            print(f"  Avg Confidence: {confidence_metrics['avg_confidence']:.3f}")
            print(f"  Calibration Error: {confidence_metrics['expected_calibration_error']:.3f}")

        self.evaluation_results[model_name] = evaluation_result
        return evaluation_result

    def compare_models(self) -> Dict:
        """Compare all evaluated models."""
        if not self.evaluation_results:
            return {}

        print("\nModel Comparison Summary:")
        print("=" * 50)

        comparison = {
            'model_rankings': {},
            'best_model': None,
            'performance_summary': {}
        }

        # Rank models by weighted F1
        f1_scores = {name: results['weighted_f1'] for name, results in self.evaluation_results.items()}
        ranked_models = sorted(f1_scores.items(), key=lambda x: x[1], reverse=True)

        comparison['model_rankings']['by_f1'] = ranked_models
        comparison['best_model'] = ranked_models[0][0] if ranked_models else None

        # Performance summary
        for model_name, results in self.evaluation_results.items():
            print(f"{model_name}:")
            print(f"  F1: {results['weighted_f1']:.3f}")
            print(f"  Precision: {results['weighted_precision']:.3f}")
            print(f"  Recall: {results['weighted_recall']:.3f}")
            print(f"  Accuracy: {results['accuracy']:.3f}")

            comparison['performance_summary'][model_name] = {
                'weighted_f1': results['weighted_f1'],
                'weighted_precision': results['weighted_precision'],
                'weighted_recall': results['weighted_recall'],
                'accuracy': results['accuracy']
            }

        if comparison['best_model']:
            print(f"\nBest performing model: {comparison['best_model']}")

        return comparison


In [15]:
## Initialize Enhanced Models

print("\n" + "=" * 60)
print("INITIALIZING ENHANCED MODELS")
print("=" * 60)

# Initialize enhanced analyzers
finbert_tone_enhanced = EnhancedFinBERTAnalyzer('yiyanghkust/finbert-tone')
prosus_finbert_enhanced = EnhancedFinBERTAnalyzer('ProsusAI/finbert')

# Load models
print("Loading enhanced FinBERT models...")
finbert_tone_loaded = finbert_tone_enhanced.load_model()
prosus_finbert_loaded = prosus_finbert_enhanced.load_model()

# Initialize ensemble
ensemble = SentimentEnsemble()
if finbert_tone_loaded:
    ensemble.add_model('finbert_tone', finbert_tone_enhanced, weight=1.0)
if prosus_finbert_loaded:
    ensemble.add_model('prosus_finbert', prosus_finbert_enhanced, weight=1.0)

# Initialize evaluator
evaluator = EnhancedModelEvaluator()



INITIALIZING ENHANCED MODELS
Loading enhanced FinBERT models...
Loading enhanced model: yiyanghkust/finbert-tone


Device set to use cpu


Enhanced model loaded on cpu
Loading enhanced model: ProsusAI/finbert


Device set to use cpu


Enhanced model loaded on cpu
Added finbert_tone to ensemble with weight 1.0
Added prosus_finbert to ensemble with weight 1.0


In [16]:
## Confidence Calibration with Manual Labels

if manual_labels_df is not None:
    print("\n" + "=" * 50)
    print("CONFIDENCE CALIBRATION")
    print("=" * 50)

    # Extract manually labeled data for calibration
    manual_mask = manual_labels_df['human_label'].notna() & (manual_labels_df['human_label'] != '')
    manual_calibration_df = manual_labels_df[manual_mask].copy()

    if len(manual_calibration_df) > 10:  # Need minimum samples for calibration
        calibration_texts = manual_calibration_df['text'].tolist()
        calibration_labels = manual_calibration_df['human_label'].tolist()

        # Calibrate both models
        if finbert_tone_loaded:
            finbert_tone_enhanced.calibrate_confidence(calibration_texts, calibration_labels)

        if prosus_finbert_loaded:
            prosus_finbert_enhanced.calibrate_confidence(calibration_texts, calibration_labels)

        # Train backup ML classifier
        ensemble.train_ml_classifier(calibration_texts, calibration_labels)

        print("Confidence calibration complete")
    else:
        print("Insufficient manual labels for confidence calibration")


CONFIDENCE CALIBRATION
Calibrating confidence scores...
Processing 200 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 25/25 [00:31<00:00,  1.25s/it]


Confidence calibration complete
Calibrating confidence scores...
Processing 200 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 25/25 [00:30<00:00,  1.20s/it]

Confidence calibration complete
Training backup ML classifier...
ML classifier trained
Confidence calibration complete





In [17]:
## Enhanced Sentiment Analysis on Datasets

def run_enhanced_sentiment_analysis(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Run enhanced sentiment analysis with all improvements."""
    if df is None:
        print(f"Cannot process {dataset_name} - dataset is None")
        return None

    print(f"\n🔬 ENHANCED SENTIMENT ANALYSIS - {dataset_name}")
    print("-" * 50)
    print(f"Input shape: {df.shape}")

    # Prepare text data
    texts = df['text'].fillna('').astype(str).tolist()
    print(f"Processing {len(texts)} text entries")

    results_df = df.copy()

    # Run enhanced FinBERT-tone analysis
    if finbert_tone_loaded:
        print("\n🤖 Running enhanced FinBERT-tone analysis...")
        try:
            finbert_results = finbert_tone_enhanced.predict_sentiment_enhanced(texts)

            # Add enhanced results to dataframe
            for i, result in enumerate(finbert_results):
                results_df.loc[i, 'finbert_tone_label'] = result['predicted_label']
                results_df.loc[i, 'finbert_tone_score'] = result['predicted_score']
                results_df.loc[i, 'finbert_tone_calibrated'] = result['calibrated_score']
                results_df.loc[i, 'finbert_tone_entropy'] = result['entropy']
                results_df.loc[i, 'finbert_tone_max_diff'] = result['max_prob_diff']
                results_df.loc[i, 'finbert_tone_positive'] = result['positive_score']
                results_df.loc[i, 'finbert_tone_neutral'] = result['neutral_score']
                results_df.loc[i, 'finbert_tone_negative'] = result['negative_score']

                # Add text features
                for feature_name, feature_value in result.items():
                    if feature_name.endswith('_count') or feature_name.startswith('has_') or feature_name.endswith('_score'):
                        if feature_name not in ['positive_score', 'neutral_score', 'negative_score', 'predicted_score', 'calibrated_score']:
                            results_df.loc[i, f'finbert_{feature_name}'] = feature_value

            print("Enhanced FinBERT-tone analysis complete")

        except Exception as e:
            print(f"Error in enhanced FinBERT-tone analysis: {str(e)}")

    # Run enhanced ProsusAI analysis
    if prosus_finbert_loaded:
        print("\n🤖 Running enhanced ProsusAI FinBERT analysis...")
        try:
            prosus_results = prosus_finbert_enhanced.predict_sentiment_enhanced(texts)

            # Add enhanced results to dataframe
            for i, result in enumerate(prosus_results):
                results_df.loc[i, 'prosus_label'] = result['predicted_label']
                results_df.loc[i, 'prosus_score'] = result['predicted_score']
                results_df.loc[i, 'prosus_calibrated'] = result['calibrated_score']
                results_df.loc[i, 'prosus_entropy'] = result['entropy']
                results_df.loc[i, 'prosus_max_diff'] = result['max_prob_diff']
                results_df.loc[i, 'prosus_positive'] = result['positive_score']
                results_df.loc[i, 'prosus_neutral'] = result['neutral_score']
                results_df.loc[i, 'prosus_negative'] = result['negative_score']

                # Add text features
                for feature_name, feature_value in result.items():
                    if feature_name.endswith('_count') or feature_name.startswith('has_') or feature_name.endswith('_score'):
                        if feature_name not in ['positive_score', 'neutral_score', 'negative_score', 'predicted_score', 'calibrated_score']:
                            results_df.loc[i, f'prosus_{feature_name}'] = feature_value

            print("Enhanced ProsusAI FinBERT analysis complete")

        except Exception as e:
            print(f"Error in enhanced ProsusAI FinBERT analysis: {str(e)}")

    # Run ensemble analysis
    print("\nRunning ensemble analysis...")
    try:
        ensemble_results = ensemble.predict_ensemble(texts)

        # Add ensemble results
        for i, result in enumerate(ensemble_results):
            results_df.loc[i, 'ensemble_label'] = result['ensemble_label']
            results_df.loc[i, 'ensemble_confidence'] = result['ensemble_confidence']

            # Add individual model votes
            for model_name, prediction in result['individual_predictions'].items():
                results_df.loc[i, f'vote_{model_name}'] = prediction

        print("Ensemble analysis complete")

    except Exception as e:
        print(f"Error in ensemble analysis: {str(e)}")

    print(f"Final enhanced shape: {results_df.shape}")
    return results_df

# Run enhanced sentiment analysis on all datasets
enhanced_jpm_q1_2025_df = run_enhanced_sentiment_analysis(processed_jpm_q1_2025_df, "Q1 2025")
enhanced_jpm_q2_2025_df = run_enhanced_sentiment_analysis(processed_jpm_q2_2025_df, "Q2 2025")
enhanced_jpm_multi_2025_df = run_enhanced_sentiment_analysis(processed_jpm_multi_2025_df, "Multi 2025")



🔬 ENHANCED SENTIMENT ANALYSIS - Q1 2025
--------------------------------------------------
Input shape: (578, 9)
Processing 578 text entries

🤖 Running enhanced FinBERT-tone analysis...
Processing 578 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 37/37 [02:01<00:00,  3.29s/it]


Enhanced FinBERT-tone analysis complete

🤖 Running enhanced ProsusAI FinBERT analysis...
Processing 578 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 37/37 [01:38<00:00,  2.67s/it]


Enhanced ProsusAI FinBERT analysis complete

Running ensemble analysis...
Generating ensemble predictions for 578 texts...
Processing 578 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 37/37 [01:29<00:00,  2.41s/it]


Got predictions from finbert_tone
Processing 578 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 37/37 [01:26<00:00,  2.34s/it]


Got predictions from prosus_finbert
Ensemble analysis complete
Final enhanced shape: (578, 47)

🔬 ENHANCED SENTIMENT ANALYSIS - Q2 2025
--------------------------------------------------
Input shape: (532, 9)
Processing 532 text entries

🤖 Running enhanced FinBERT-tone analysis...
Processing 532 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 34/34 [01:24<00:00,  2.48s/it]


Enhanced FinBERT-tone analysis complete

🤖 Running enhanced ProsusAI FinBERT analysis...
Processing 532 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 34/34 [01:21<00:00,  2.41s/it]


Enhanced ProsusAI FinBERT analysis complete

Running ensemble analysis...
Generating ensemble predictions for 532 texts...
Processing 532 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 34/34 [01:23<00:00,  2.46s/it]


Got predictions from finbert_tone
Processing 532 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 34/34 [01:19<00:00,  2.35s/it]


Got predictions from prosus_finbert
Ensemble analysis complete
Final enhanced shape: (532, 47)

🔬 ENHANCED SENTIMENT ANALYSIS - Multi 2025
--------------------------------------------------
Input shape: (1110, 9)
Processing 1110 text entries

🤖 Running enhanced FinBERT-tone analysis...
Processing 1110 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 70/70 [02:53<00:00,  2.47s/it]


Enhanced FinBERT-tone analysis complete

🤖 Running enhanced ProsusAI FinBERT analysis...
Processing 1110 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 70/70 [02:46<00:00,  2.38s/it]


Enhanced ProsusAI FinBERT analysis complete

Running ensemble analysis...
Generating ensemble predictions for 1110 texts...
Processing 1110 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 70/70 [02:51<00:00,  2.44s/it]


Got predictions from finbert_tone
Processing 1110 texts with enhanced features...


Enhanced Sentiment Analysis: 100%|██████████| 70/70 [02:47<00:00,  2.40s/it]


Got predictions from prosus_finbert
Ensemble analysis complete
Final enhanced shape: (1110, 47)


In [19]:
## Performance Evaluation with Manual Labels

def evaluate_models_with_manual_labels(df: pd.DataFrame, dataset_name: str):
    """Evaluate model performance using manual labels."""
    if df is None or manual_labels_df is None:
        print(f"Cannot evaluate {dataset_name} - missing data")
        return {}

    print(f"\nPerformance Evaluation - {dataset_name}")
    print("-" * 40)

    # Merge with manual labels
    manual_mask = manual_labels_df['human_label'].notna() & (manual_labels_df['human_label'] != '')
    manual_eval_df = manual_labels_df[manual_mask].copy()

    # Find overlapping records (by text or sentence_id)
    if 'sentence_id' in df.columns and 'sentence_id' in manual_eval_df.columns:
        eval_df = df.merge(manual_eval_df[['sentence_id', 'human_label', 'human_confidence']],
                          on='sentence_id', how='inner')
    else:
        # Fallback to text matching
        eval_df = df.merge(manual_eval_df[['text', 'human_label', 'human_confidence']],
                          on='text', how='inner')

    if len(eval_df) == 0:
        print("No overlapping records found for evaluation")
        return {}

    print(f"Evaluating on {len(eval_df)} manually labeled records")

    evaluation_results = {}

    # Evaluate FinBERT-tone
    if 'finbert_tone_label' in eval_df.columns:
        finbert_eval = evaluator.evaluate_model_performance(
            predictions=eval_df['finbert_tone_label'].tolist(),
            true_labels=eval_df['human_label'].tolist(),
            confidences=eval_df['finbert_tone_calibrated'].tolist() if 'finbert_tone_calibrated' in eval_df.columns else None,
            model_name='Enhanced FinBERT-tone'
        )
        evaluation_results['finbert_tone'] = finbert_eval

    # Evaluate ProsusAI
    if 'prosus_label' in eval_df.columns:
        prosus_eval = evaluator.evaluate_model_performance(
            predictions=eval_df['prosus_label'].tolist(),
            true_labels=eval_df['human_label'].tolist(),
            confidences=eval_df['prosus_calibrated'].tolist() if 'prosus_calibrated' in eval_df.columns else None,
            model_name='Enhanced ProsusAI'
        )
        evaluation_results['prosus'] = prosus_eval

    # Evaluate Ensemble
    if 'ensemble_label' in eval_df.columns:
        ensemble_eval = evaluator.evaluate_model_performance(
            predictions=eval_df['ensemble_label'].tolist(),
            true_labels=eval_df['human_label'].tolist(),
            confidences=eval_df['ensemble_confidence'].tolist() if 'ensemble_confidence' in eval_df.columns else None,
            model_name='Ensemble Model'
        )
        evaluation_results['ensemble'] = ensemble_eval

    return evaluation_results

# Evaluate models if manual labels available
performance_results = {}
if manual_labels_df is not None:
    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE EVALUATION")
    print("=" * 60)

    for dataset_name, df in [("Q1 2025", enhanced_jpm_q1_2025_df),
                             ("Q2 2025", enhanced_jpm_q2_2025_df),
                             ("Multi 2025", enhanced_jpm_multi_2025_df)]:
        if df is not None:
            eval_results = evaluate_models_with_manual_labels(df, dataset_name)
            performance_results[dataset_name] = eval_results

    # Compare all models
    model_comparison = evaluator.compare_models()


MODEL PERFORMANCE EVALUATION

Performance Evaluation - Q1 2025
----------------------------------------
Evaluating on 200 manually labeled records
Evaluating Enhanced FinBERT-tone performance...
  Accuracy: 0.895
  Weighted F1: 0.894
  Macro F1: 0.662
  Avg Confidence: 0.895
  Calibration Error: 0.022
Evaluating Enhanced ProsusAI performance...
  Accuracy: 0.855
  Weighted F1: 0.853
  Macro F1: 0.618
  Avg Confidence: 0.855
  Calibration Error: 0.014
Evaluating Ensemble Model performance...
  Accuracy: 0.920
  Weighted F1: 0.916
  Macro F1: 0.676
  Avg Confidence: 0.850
  Calibration Error: 0.071

Performance Evaluation - Q2 2025
----------------------------------------
Evaluating on 45 manually labeled records
Evaluating Enhanced FinBERT-tone performance...
  Accuracy: 0.578
  Weighted F1: 0.589
  Macro F1: 0.248
  Avg Confidence: 0.895
  Calibration Error: 0.327
Evaluating Enhanced ProsusAI performance...
  Accuracy: 0.689
  Weighted F1: 0.696
  Macro F1: 0.345
  Avg Confidence: 0.8

In [20]:
## Continue with Enhanced Multi-level Analysis

def enhanced_aggregate_sentence_to_qa_level(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced aggregation with additional metrics."""
    if df is None or 'original_qa_id' not in df.columns:
        print("Cannot aggregate - missing original_qa_id column")
        return None

    print("Enhanced aggregation to Q&A level...")

    # Enhanced aggregation functions
    agg_functions = {
        # Text info
        'text': lambda x: ' '.join(x),
        'speaker': 'first',
        'speaker_role': 'first',
        'quarter': 'first',
        'bank_code': 'first',

        # Sentence counts and stats
        'sentence_length': ['count', 'mean', 'sum', 'std'],
        'sentence_word_count': ['mean', 'sum', 'std'],

        # Enhanced FinBERT-tone metrics
        'finbert_tone_score': ['mean', 'std', 'min', 'max'],
        'finbert_tone_calibrated': ['mean', 'std'],
        'finbert_tone_entropy': ['mean', 'std'],
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',

        # Enhanced ProsusAI metrics
        'prosus_score': ['mean', 'std', 'min', 'max'],
        'prosus_calibrated': ['mean', 'std'],
        'prosus_entropy': ['mean', 'std'],
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean',

        # Ensemble metrics
        'ensemble_confidence': ['mean', 'std']
    }

    # Apply aggregations
    qa_level_df = df.groupby('original_qa_id').agg(agg_functions).reset_index()

    # Flatten column names
    qa_level_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                           for col in qa_level_df.columns]

    # Enhanced label determination using ensemble and confidence
    for model_prefix in ['finbert_tone', 'prosus', 'ensemble']:
        label_col = f'{model_prefix}_label'
        if label_col in df.columns:
            # Confidence-weighted majority vote
            def confidence_weighted_vote(group):
                if f'{model_prefix}_calibrated' in df.columns:
                    # Weight votes by calibrated confidence
                    labels = group[label_col]
                    weights = group[f'{model_prefix}_calibrated'].fillna(0.5)

                    # Calculate weighted votes
                    weighted_votes = {}
                    for label, weight in zip(labels, weights):
                        if label not in weighted_votes:
                            weighted_votes[label] = 0
                        weighted_votes[label] += weight

                    return max(weighted_votes, key=weighted_votes.get)
                else:
                    # Simple majority vote
                    return group[label_col].mode().iloc[0] if len(group[label_col].mode()) > 0 else 'neutral'

            qa_labels = df.groupby('original_qa_id').apply(confidence_weighted_vote).reset_index()
            qa_labels.columns = ['original_qa_id', f'{model_prefix}_qa_label']
            qa_level_df = qa_level_df.merge(qa_labels, on='original_qa_id')

    print(f"Enhanced Q&A level aggregation complete: {qa_level_df.shape}")
    return qa_level_df

def enhanced_aggregate_by_speaker_role(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced speaker-level aggregation."""
    if df is None or 'speaker_role' not in df.columns:
        print("Cannot aggregate by speaker - missing speaker_role column")
        return None

    print("Enhanced speaker role aggregation...")

    # Enhanced speaker aggregation
    speaker_agg = {
        'text': 'count',
        'sentence_length': ['mean', 'std'],
        'sentence_word_count': ['mean', 'std'],

        # Enhanced metrics for both models
        'finbert_tone_score': ['mean', 'std'],
        'finbert_tone_calibrated': ['mean', 'std'],
        'finbert_tone_entropy': ['mean', 'std'],
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',

        'prosus_score': ['mean', 'std'],
        'prosus_calibrated': ['mean', 'std'],
        'prosus_entropy': ['mean', 'std'],
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean',

        'ensemble_confidence': ['mean', 'std']
    }

    # Group by quarter and speaker role if quarter available
    if 'quarter' in df.columns:
        speaker_df = df.groupby(['quarter', 'speaker_role']).agg(speaker_agg).reset_index()
    else:
        speaker_df = df.groupby('speaker_role').agg(speaker_agg).reset_index()

    # Flatten column names
    speaker_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                          for col in speaker_df.columns]

    # Add enhanced label distributions
    label_columns = [col for col in df.columns if col.endswith('_label')]

    for label_col in label_columns:
        model_name = label_col.replace('_label', '')

        if 'quarter' in df.columns:
            label_dist = df.groupby(['quarter', 'speaker_role'])[label_col].value_counts(normalize=True).unstack(fill_value=0)
        else:
            label_dist = df.groupby('speaker_role')[label_col].value_counts(normalize=True).unstack(fill_value=0)

        # Add distribution columns
        label_dist.columns = [f'{model_name}_{col}_pct' for col in label_dist.columns]

        # Merge distributions
        if 'quarter' in df.columns:
            speaker_df = speaker_df.set_index(['quarter', 'speaker_role']).join(label_dist).reset_index()
        else:
            speaker_df = speaker_df.set_index('speaker_role').join(label_dist).reset_index()

    print(f"Enhanced speaker-level aggregation complete: {speaker_df.shape}")
    return speaker_df

# Create enhanced multi-level aggregations
print("\n" + "=" * 60)
print("ENHANCED MULTI-LEVEL AGGREGATIONS")
print("=" * 60)

# Enhanced Q&A level aggregations
enhanced_qa_level_q1_df = enhanced_aggregate_sentence_to_qa_level(enhanced_jpm_q1_2025_df)
enhanced_qa_level_q2_df = enhanced_aggregate_sentence_to_qa_level(enhanced_jpm_q2_2025_df)
enhanced_qa_level_multi_df = enhanced_aggregate_sentence_to_qa_level(enhanced_jpm_multi_2025_df)

# Enhanced speaker level aggregations
enhanced_speaker_level_q1_df = enhanced_aggregate_by_speaker_role(enhanced_jpm_q1_2025_df)
enhanced_speaker_level_q2_df = enhanced_aggregate_by_speaker_role(enhanced_jpm_q2_2025_df)
enhanced_speaker_level_multi_df = enhanced_aggregate_by_speaker_role(enhanced_jpm_multi_2025_df)



ENHANCED MULTI-LEVEL AGGREGATIONS
Enhanced aggregation to Q&A level...
Enhanced Q&A level aggregation complete: (97, 40)
Enhanced aggregation to Q&A level...
Enhanced Q&A level aggregation complete: (121, 40)
Enhanced aggregation to Q&A level...
Enhanced Q&A level aggregation complete: (218, 40)
Enhanced speaker role aggregation...
Enhanced speaker-level aggregation complete: (3, 36)
Enhanced speaker role aggregation...
Enhanced speaker-level aggregation complete: (3, 36)
Enhanced speaker role aggregation...
Enhanced speaker-level aggregation complete: (6, 36)


In [21]:
## Enhanced Topic-Conditional Sentiment Analysis

def enhanced_extract_financial_topics(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced financial topic extraction with more sophisticated categorization."""
    if df is None:
        return None

    print("Enhanced financial topic extraction...")

    df = df.copy()

    # Enhanced topic keywords with financial domain expertise
    enhanced_topic_keywords = {
        'revenue_growth': ['revenue growth', 'sales growth', 'income increase', 'earnings growth', 'top line growth'],
        'profitability': ['profit margin', 'profitability', 'ebitda', 'roi', 'return on equity', 'net income'],
        'credit_risk': ['credit risk', 'default rate', 'loan loss', 'provision', 'non-performing', 'charge-off'],
        'operational_risk': ['operational risk', 'compliance', 'regulatory', 'operational efficiency'],
        'market_risk': ['market risk', 'interest rate', 'trading', 'volatility', 'market conditions'],
        'capital_management': ['capital ratio', 'tier 1', 'leverage', 'capital adequacy', 'basel'],
        'digital_transformation': ['digital', 'technology', 'fintech', 'automation', 'innovation'],
        'customer_experience': ['customer satisfaction', 'client experience', 'customer acquisition', 'retention'],
        'regulatory_environment': ['regulation', 'regulatory change', 'compliance cost', 'regulatory capital'],
        'economic_outlook': ['economic environment', 'macro environment', 'economic outlook', 'recession']
    }

    # Identify topics using enhanced matching
    for topic, keywords in enhanced_topic_keywords.items():
        pattern = '|'.join([re.escape(kw) for kw in keywords])
        df[f'topic_{topic}'] = df['text'].str.lower().str.contains(pattern, regex=True, na=False)

    # Determine primary topic with confidence scoring
    topic_columns = [f'topic_{topic}' for topic in enhanced_topic_keywords.keys()]
    topic_scores = df[topic_columns].sum(axis=1)

    # Primary topic is the one with most keyword matches
    df['primary_topic'] = df[topic_columns].idxmax(axis=1).str.replace('topic_', '')
    df['topic_confidence'] = topic_scores / len(enhanced_topic_keywords)

    # If no clear topic, mark as 'general'
    no_topics_mask = topic_scores == 0
    df.loc[no_topics_mask, 'primary_topic'] = 'general'
    df.loc[no_topics_mask, 'topic_confidence'] = 0.0

    print(f"Enhanced topic extraction complete. Topic distribution:")
    topic_dist = df['primary_topic'].value_counts()
    for topic, count in topic_dist.items():
        print(f"  {topic}: {count}")

    return df

def enhanced_analyze_sentiment_by_topic(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced topic-sentiment analysis with confidence weighting."""
    if df is None or 'primary_topic' not in df.columns:
        return None

    print("Enhanced sentiment analysis by topic...")

    # Enhanced topic aggregation with confidence weighting
    topic_agg = {
        'text': 'count',
        'topic_confidence': 'mean',

        # Enhanced metrics for all models
        'finbert_tone_score': ['mean', 'std'],
        'finbert_tone_calibrated': ['mean', 'std'],
        'finbert_tone_entropy': 'mean',
        'finbert_tone_positive': 'mean',
        'finbert_tone_neutral': 'mean',
        'finbert_tone_negative': 'mean',

        'prosus_score': ['mean', 'std'],
        'prosus_calibrated': ['mean', 'std'],
        'prosus_entropy': 'mean',
        'prosus_positive': 'mean',
        'prosus_neutral': 'mean',
        'prosus_negative': 'mean',

        'ensemble_confidence': ['mean', 'std']
    }

    # Group by topic and optionally quarter
    if 'quarter' in df.columns:
        topic_sentiment_df = df.groupby(['quarter', 'primary_topic']).agg(topic_agg).reset_index()
    else:
        topic_sentiment_df = df.groupby('primary_topic').agg(topic_agg).reset_index()

    # Flatten column names
    topic_sentiment_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                                  for col in topic_sentiment_df.columns]

    print(f"Enhanced topic-sentiment analysis complete: {topic_sentiment_df.shape}")
    return topic_sentiment_df

# Apply enhanced topic analysis
print("\n" + "=" * 50)
print("ENHANCED TOPIC-CONDITIONAL SENTIMENT")
print("=" * 50)

enhanced_topics_q1_df = enhanced_extract_financial_topics(enhanced_jpm_q1_2025_df)
enhanced_topics_q2_df = enhanced_extract_financial_topics(enhanced_jpm_q2_2025_df)
enhanced_topics_multi_df = enhanced_extract_financial_topics(enhanced_jpm_multi_2025_df)

# Enhanced topic-sentiment analysis
enhanced_topic_sentiment_q1_df = enhanced_analyze_sentiment_by_topic(enhanced_topics_q1_df)
enhanced_topic_sentiment_q2_df = enhanced_analyze_sentiment_by_topic(enhanced_topics_q2_df)
enhanced_topic_sentiment_multi_df = enhanced_analyze_sentiment_by_topic(enhanced_topics_multi_df)



ENHANCED TOPIC-CONDITIONAL SENTIMENT
Enhanced financial topic extraction...
Enhanced topic extraction complete. Topic distribution:
  general: 505
  market_risk: 24
  economic_outlook: 13
  regulatory_environment: 9
  credit_risk: 7
  profitability: 6
  capital_management: 5
  operational_risk: 5
  digital_transformation: 4
Enhanced financial topic extraction...
Enhanced topic extraction complete. Topic distribution:
  general: 494
  market_risk: 9
  digital_transformation: 7
  capital_management: 5
  operational_risk: 5
  regulatory_environment: 4
  credit_risk: 3
  profitability: 2
  revenue_growth: 2
  economic_outlook: 1
Enhanced financial topic extraction...
Enhanced topic extraction complete. Topic distribution:
  general: 999
  market_risk: 33
  economic_outlook: 14
  regulatory_environment: 13
  digital_transformation: 11
  capital_management: 10
  operational_risk: 10
  credit_risk: 10
  profitability: 8
  revenue_growth: 2
Enhanced sentiment analysis by topic...
Enhanced top

In [22]:
## Enhanced Anomaly Detection

def enhanced_detect_sentiment_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced anomaly detection with multiple methods."""
    if df is None:
        return None

    print("Enhanced sentiment anomaly detection...")

    df = df.copy()

    # Enhanced anomaly detection methods

    # 1. Statistical anomalies (Z-scores)
    numeric_columns = ['finbert_tone_score', 'prosus_score', 'ensemble_confidence',
                      'finbert_tone_entropy', 'prosus_entropy']

    for col in numeric_columns:
        if col in df.columns:
            z_scores = np.abs(stats.zscore(df[col].fillna(df[col].mean())))
            df[f'{col}_zscore'] = z_scores
            df[f'{col}_anomaly'] = z_scores > 2.5

    # 2. Model disagreement anomalies
    if 'finbert_tone_label' in df.columns and 'prosus_label' in df.columns:
        df['model_disagreement'] = df['finbert_tone_label'] != df['prosus_label']

        # Severe disagreement (high confidence but different predictions)
        if 'finbert_tone_calibrated' in df.columns and 'prosus_calibrated' in df.columns:
            high_confidence_mask = (df['finbert_tone_calibrated'] > 0.8) & (df['prosus_calibrated'] > 0.8)
            df['severe_disagreement'] = df['model_disagreement'] & high_confidence_mask

    # 3. Ensemble vs individual model anomalies
    if 'ensemble_label' in df.columns:
        for model_col in ['finbert_tone_label', 'prosus_label']:
            if model_col in df.columns:
                df[f'ensemble_{model_col}_disagreement'] = df['ensemble_label'] != df[model_col]

    # 4. Confidence calibration anomalies
    for model in ['finbert_tone', 'prosus']:
        score_col = f'{model}_score'
        calibrated_col = f'{model}_calibrated'
        if score_col in df.columns and calibrated_col in df.columns:
            confidence_diff = np.abs(df[score_col] - df[calibrated_col])
            df[f'{model}_calibration_diff'] = confidence_diff
            df[f'{model}_calibration_anomaly'] = confidence_diff > 0.3

    # 5. Text-based anomalies
    if 'sentence_length' in df.columns:
        length_z = np.abs(stats.zscore(df['sentence_length']))
        df['length_anomaly'] = length_z > 3

    # 6. Topic-sentiment mismatch anomalies
    if 'primary_topic' in df.columns:
        # Define expected sentiment patterns for topics
        topic_sentiment_expectations = {
            'revenue_growth': 'positive',
            'profitability': 'positive',
            'credit_risk': 'negative',
            'operational_risk': 'negative',
            'market_risk': 'negative'
        }

        df['topic_sentiment_mismatch'] = False
        for topic, expected_sentiment in topic_sentiment_expectations.items():
            topic_mask = df['primary_topic'] == topic
            if 'ensemble_label' in df.columns:
                mismatch_mask = topic_mask & (df['ensemble_label'] != expected_sentiment)
                df.loc[mismatch_mask, 'topic_sentiment_mismatch'] = True

    # Count total anomalies
    anomaly_columns = [col for col in df.columns if 'anomaly' in col or 'disagreement' in col or 'mismatch' in col]
    if anomaly_columns:
        df['total_anomaly_flags'] = df[anomaly_columns].sum(axis=1)
        anomaly_count = df['total_anomaly_flags'].sum()
        print(f"  Detected {anomaly_count} total anomaly flags across {len(df)} records")

        # Enhanced anomaly reporting
        for col in anomaly_columns:
            count = df[col].sum()
            pct = (count / len(df)) * 100
            print(f"  {col}: {count} ({pct:.1f}%)")

    return df

# Apply enhanced anomaly detection
print("\n" + "=" * 50)
print("ENHANCED ANOMALY DETECTION")
print("=" * 50)

enhanced_anomaly_q1_df = enhanced_detect_sentiment_anomalies(enhanced_topics_q1_df)
enhanced_anomaly_q2_df = enhanced_detect_sentiment_anomalies(enhanced_topics_q2_df)
enhanced_anomaly_multi_df = enhanced_detect_sentiment_anomalies(enhanced_topics_multi_df)



ENHANCED ANOMALY DETECTION
Enhanced sentiment anomaly detection...
  Detected 491 total anomaly flags across 578 records
  finbert_tone_score_anomaly: 30 (5.2%)
  prosus_score_anomaly: 17 (2.9%)
  ensemble_confidence_anomaly: 3 (0.5%)
  finbert_tone_entropy_anomaly: 9 (1.6%)
  prosus_entropy_anomaly: 8 (1.4%)
  model_disagreement: 146 (25.3%)
  severe_disagreement: 73 (12.6%)
  ensemble_finbert_tone_label_disagreement: 78 (13.5%)
  ensemble_prosus_label_disagreement: 68 (11.8%)
  finbert_tone_calibration_anomaly: 15 (2.6%)
  prosus_calibration_anomaly: 1 (0.2%)
  length_anomaly: 9 (1.6%)
  topic_sentiment_mismatch: 34 (5.9%)
Enhanced sentiment anomaly detection...
  Detected 422 total anomaly flags across 532 records
  finbert_tone_score_anomaly: 28 (5.3%)
  prosus_score_anomaly: 12 (2.3%)
  ensemble_confidence_anomaly: 2 (0.4%)
  finbert_tone_entropy_anomaly: 26 (4.9%)
  prosus_entropy_anomaly: 7 (1.3%)
  model_disagreement: 119 (22.4%)
  severe_disagreement: 66 (12.4%)
  ensemble_fi

In [23]:
## Save Enhanced Results

def save_enhanced_sentiment_results(df: pd.DataFrame, filename: str, description: str):
    """Save enhanced sentiment analysis results."""
    if df is None:
        print(f"Cannot save {description} - dataset is None")
        return

    print(f"Saving {description}...")

    # Save to results directory
    results_path = results_sentiment_path / filename
    df.to_csv(results_path, index=False)
    print(f"  Results: {results_path}")

    # Save to colab for easy access
    colab_results_path = colab_base / "results/sentiment/jpm" / filename
    colab_results_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(colab_results_path, index=False)
    print(f"  Colab: {colab_results_path}")

print("\n" + "=" * 60)
print("SAVING ENHANCED RESULTS")
print("=" * 60)

# Save enhanced sentence-level results
save_enhanced_sentiment_results(enhanced_anomaly_q1_df, "enhanced_sentiment_sentence_jpm_q1_2025.csv", "Enhanced Q1 sentence-level")
save_enhanced_sentiment_results(enhanced_anomaly_q2_df, "enhanced_sentiment_sentence_jpm_q2_2025.csv", "Enhanced Q2 sentence-level")
save_enhanced_sentiment_results(enhanced_anomaly_multi_df, "enhanced_sentiment_sentence_jpm_multi_2025.csv", "Enhanced Multi sentence-level")

# Save enhanced Q&A-level results
save_enhanced_sentiment_results(enhanced_qa_level_q1_df, "enhanced_sentiment_qa_jpm_q1_2025.csv", "Enhanced Q1 Q&A-level")
save_enhanced_sentiment_results(enhanced_qa_level_q2_df, "enhanced_sentiment_qa_jpm_q2_2025.csv", "Enhanced Q2 Q&A-level")
save_enhanced_sentiment_results(enhanced_qa_level_multi_df, "enhanced_sentiment_qa_jpm_multi_2025.csv", "Enhanced Multi Q&A-level")

# Save enhanced speaker-level results
save_enhanced_sentiment_results(enhanced_speaker_level_q1_df, "enhanced_sentiment_speaker_jpm_q1_2025.csv", "Enhanced Q1 speaker-level")
save_enhanced_sentiment_results(enhanced_speaker_level_q2_df, "enhanced_sentiment_speaker_jmp_q2_2025.csv", "Enhanced Q2 speaker-level")
save_enhanced_sentiment_results(enhanced_speaker_level_multi_df, "enhanced_sentiment_speaker_jpm_multi_2025.csv", "Enhanced Multi speaker-level")

# Save enhanced topic-sentiment results
save_enhanced_sentiment_results(enhanced_topic_sentiment_q1_df, "enhanced_sentiment_topic_jpm_q1_2025.csv", "Enhanced Q1 topic-sentiment")
save_enhanced_sentiment_results(enhanced_topic_sentiment_q2_df, "enhanced_sentiment_topic_jpm_q2_2025.csv", "Enhanced Q2 topic-sentiment")
save_enhanced_sentiment_results(enhanced_topic_sentiment_multi_df, "enhanced_sentiment_topic_jpm_multi_2025.csv", "Enhanced Multi topic-sentiment")

# Save performance evaluation results
if performance_results:
    performance_path = results_sentiment_path / "enhanced_performance_evaluation.json"
    with open(performance_path, 'w') as f:
        json.dump(performance_results, f, indent=2, default=str)
    print(f"Performance evaluation: {performance_path}")



SAVING ENHANCED RESULTS
Saving Enhanced Q1 sentence-level...
  Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_q1_2025.csv
  Colab: /content/cam_ds_ai_project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_q1_2025.csv
Saving Enhanced Q2 sentence-level...
  Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_q2_2025.csv
  Colab: /content/cam_ds_ai_project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_q2_2025.csv
Saving Enhanced Multi sentence-level...
  Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_multi_2025.csv
  Colab: /content/cam_ds_ai_project/results/sentiment/jpm/enhanced_sentiment_sentence_jpm_multi_2025.csv
Saving Enhanced Q1 Q&A-level...
  Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/enhanced_sentiment_qa_jpm_q1_2025.csv
  Colab: /content/cam_ds_ai_project/results/sentime

In [24]:
## Enhanced Summary Report

def create_enhanced_summary():
    """Create comprehensive enhanced analysis summary."""
    summary = {
        "analysis_timestamp": pd.Timestamp.now().isoformat(),
        "bank_code": BANK_CODE,
        "enhanced_features": [
            "confidence_calibration",
            "ensemble_methods",
            "enhanced_topic_extraction",
            "advanced_anomaly_detection",
            "performance_optimization",
            "text_feature_engineering"
        ],
        "models_used": [
            "Enhanced yiyanghkust/finbert-tone",
            "Enhanced ProsusAI/finbert",
            "Ensemble Model",
            "VADER Sentiment",
            "ML Classifier Backup"
        ],
        "analysis_levels": [
            "enhanced_sentence_level",
            "enhanced_qa_level",
            "enhanced_speaker_level",
            "enhanced_topic_conditional"
        ],
        "performance_improvements": {},
        "datasets_analyzed": {},
        "enhancement_metrics": {}
    }

    # Add dataset information
    enhanced_datasets_info = [
        ("enhanced_sentence_jpm_q1_2025", enhanced_anomaly_q1_df),
        ("enhanced_sentence_jpm_q2_2025", enhanced_anomaly_q2_df),
        ("enhanced_sentence_jpm_multi_2025", enhanced_anomaly_multi_df),
        ("enhanced_qa_jpm_q1_2025", enhanced_qa_level_q1_df),
        ("enhanced_qa_jpm_q2_2025", enhanced_qa_level_q2_df),
        ("enhanced_qa_jpm_multi_2025", enhanced_qa_level_multi_df)
    ]

    for name, df in enhanced_datasets_info:
        if df is not None:
            # Basic info
            summary["datasets_analyzed"][name] = {
                "shape": df.shape,
                "memory_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            }

            # Enhanced features count
            enhanced_columns = [col for col in df.columns if any(x in col for x in
                              ['calibrated', 'entropy', 'ensemble', 'enhanced', 'anomaly'])]
            summary["datasets_analyzed"][name]["enhanced_features_count"] = len(enhanced_columns)

            # Performance metrics if available
            if name in performance_results:
                summary["datasets_analyzed"][name]["performance_metrics"] = performance_results[name]

    # Add enhancement metrics
    if manual_labels_df is not None:
        manual_count = validation_report.get('manually_labeled_count', 0)
        summary["enhancement_metrics"] = {
            "manual_labels_used": manual_count,
            "confidence_calibration_enabled": manual_count > 10,
            "ensemble_models_count": len(ensemble.models),
            "enhanced_topics_count": len([col for col in enhanced_anomaly_multi_df.columns
                                        if col.startswith('topic_') and col.endswith('_count')]) if enhanced_anomaly_multi_df is not None else 0
        }

    # Performance improvements
    if performance_results and evaluator.evaluation_results:
        best_model_info = evaluator.compare_models()
        if best_model_info.get('best_model'):
            summary["performance_improvements"] = {
                "best_performing_model": best_model_info['best_model'],
                "model_rankings": best_model_info.get('model_rankings', {}),
                "performance_summary": best_model_info.get('performance_summary', {})
            }

    # Save summary
    summary_path = results_sentiment_path / "enhanced_sentiment_analysis_summary.json"
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2, default=str)

    return summary, summary_path

enhanced_summary, enhanced_summary_path = create_enhanced_summary()

print("\n" + "=" * 60)
print("ENHANCED SENTIMENT ANALYSIS COMPLETE")
print("=" * 60)

print(f"Summary report: {enhanced_summary_path}")
print("\nEnhanced results generated:")
for dataset_name, info in enhanced_summary["datasets_analyzed"].items():
    if info is not None:
        enhanced_features = info.get('enhanced_features_count', 0)
        print(f"  {dataset_name}: {info['shape']} - {info['memory_mb']} MB ({enhanced_features} enhanced features)")

total_records = sum(info['shape'][0] for info in enhanced_summary["datasets_analyzed"].values() if info is not None)
total_memory = sum(info['memory_mb'] for info in enhanced_summary["datasets_analyzed"].values() if info is not None)

print(f"\nTotal enhanced records: {total_records:,}")
print(f"Total memory usage: {total_memory:.2f} MB")

# Performance summary
if enhanced_summary.get("performance_improvements"):
    best_model = enhanced_summary["performance_improvements"].get("best_performing_model")
    if best_model:
        print(f"\nBest performing model: {best_model}")

        # Show performance metrics for best model
        perf_summary = enhanced_summary["performance_improvements"].get("performance_summary", {})
        if best_model in perf_summary:
            metrics = perf_summary[best_model]
            print(f"  F1-Score: {metrics.get('weighted_f1', 0):.3f}")
            print(f"  Precision: {metrics.get('weighted_precision', 0):.3f}")
            print(f"  Recall: {metrics.get('weighted_recall', 0):.3f}")
            print(f"  Accuracy: {metrics.get('accuracy', 0):.3f}")

# Enhancement summary
enhancement_metrics = enhanced_summary.get("enhancement_metrics", {})
if enhancement_metrics:
    print(f"\nEnhancement Features:")
    print(f"  Manual labels utilized: {enhancement_metrics.get('manual_labels_used', 0)}")
    print(f"  Confidence calibration: {'Enabled' if enhancement_metrics.get('confidence_calibration_enabled') else 'Disabled'}")
    print(f"  Ensemble models: {enhancement_metrics.get('ensemble_models_count', 0)}")
    print(f"  Enhanced topics: {enhancement_metrics.get('enhanced_topics_count', 0)}")

print(f"\nKey Enhancements Applied:")
for feature in enhanced_summary.get("enhanced_features", []):
    print(f"  ✓ {feature.replace('_', ' ').title()}")

print(f"\nNext step: Run 04b_model_finetuning.ipynb for fine-tuning with manual labels")
print(f"          Then continue to 05_model_comparison_jpm_enhanced.ipynb")



Model Comparison Summary:
Enhanced FinBERT-tone:
  F1: 0.894
  Precision: 0.897
  Recall: 0.895
  Accuracy: 0.895
Enhanced ProsusAI:
  F1: 0.853
  Precision: 0.855
  Recall: 0.855
  Accuracy: 0.855
Ensemble Model:
  F1: 0.916
  Precision: 0.918
  Recall: 0.920
  Accuracy: 0.920

Best performing model: Ensemble Model

ENHANCED SENTIMENT ANALYSIS COMPLETE
Summary report: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/enhanced_sentiment_analysis_summary.json

Enhanced results generated:
  enhanced_sentence_jpm_q1_2025: (578, 80) - 0.8 MB (20 enhanced features)
  enhanced_sentence_jpm_q2_2025: (532, 80) - 0.73 MB (20 enhanced features)
  enhanced_sentence_jpm_multi_2025: (1110, 80) - 1.53 MB (20 enhanced features)
  enhanced_qa_jpm_q1_2025: (97, 40) - 0.15 MB (11 enhanced features)
  enhanced_qa_jpm_q2_2025: (121, 40) - 0.15 MB (11 enhanced features)
  enhanced_qa_jpm_multi_2025: (218, 40) - 0.31 MB (11 enhanced features)

Total enhanced records: 2,656
Total memory usage: 

In [25]:
## Research Questions Analysis Preview

def preview_enhanced_research_questions(df: pd.DataFrame):
    """Preview enhanced analysis for key research questions."""
    if df is None:
        return

    print("\nENHanced Research Questions Preview")
    print("-" * 40)

    # Question 1: Enhanced banker vs analyst sentiment divergence
    if 'speaker_role' in df.columns and 'ensemble_label' in df.columns:
        print("1. Enhanced Banker vs Analyst Sentiment (Ensemble Model):")

        speaker_sentiment = df.groupby('speaker_role')['ensemble_label'].value_counts(normalize=True).unstack(fill_value=0)
        print(speaker_sentiment.round(3))

        # Enhanced divergence with confidence weighting
        if 'ensemble_confidence' in df.columns:
            weighted_sentiment = df.groupby('speaker_role').apply(
                lambda x: (x['ensemble_label'] == 'positive').sum() * x['ensemble_confidence'].mean()
            )
            print(f"  Confidence-weighted positive sentiment:")
            for role, score in weighted_sentiment.items():
                print(f"    {role}: {score:.3f}")

    # Question 2: Enhanced temporal analysis with calibrated confidence
    if 'quarter' in df.columns and 'finbert_tone_calibrated' in df.columns:
        print("\n2. Enhanced Temporal Analysis (Calibrated Confidence):")

        quarter_confidence = df.groupby('quarter')['finbert_tone_calibrated'].mean()
        print("  Average calibrated confidence by quarter:")
        for quarter, conf in quarter_confidence.items():
            print(f"    {quarter}: {conf:.3f}")

    # Question 3: Topic-specific sentiment patterns
    if 'primary_topic' in df.columns and 'ensemble_label' in df.columns:
        print("\n3. Topic-Specific Sentiment Patterns:")

        topic_sentiment = df.groupby('primary_topic')['ensemble_label'].value_counts(normalize=True).unstack(fill_value=0)

        # Show top 5 topics by volume
        topic_counts = df['primary_topic'].value_counts().head(5)
        for topic in topic_counts.index:
            if topic in topic_sentiment.index:
                pos_pct = topic_sentiment.loc[topic, 'positive'] if 'positive' in topic_sentiment.columns else 0
                print(f"    {topic}: {pos_pct:.1%} positive")

# Preview enhanced analysis
if enhanced_anomaly_multi_df is not None:
    preview_enhanced_research_questions(enhanced_anomaly_multi_df)

print(f"\n💡 Enhanced sentiment analysis with performance optimization complete!")
print(f"   Ready for fine-tuning in next notebook: 04b_model_finetuning.ipynb")


ENHanced Research Questions Preview
----------------------------------------
1. Enhanced Banker vs Analyst Sentiment (Ensemble Model):
ensemble_label  negative  neutral  positive
speaker_role                               
analyst            0.045    0.868     0.086
cfo                0.068    0.736     0.196
executive          0.068    0.843     0.089
  Confidence-weighted positive sentiment:
    analyst: 17.122
    cfo: 79.352
    executive: 28.260

2. Enhanced Temporal Analysis (Calibrated Confidence):
  Average calibrated confidence by quarter:
    q1_2025: 0.888
    q2_2025: 0.898

3. Topic-Specific Sentiment Patterns:
    general: 13.1% positive
    market_risk: 18.2% positive
    economic_outlook: 7.1% positive
    regulatory_environment: 15.4% positive
    digital_transformation: 45.5% positive

💡 Enhanced sentiment analysis with performance optimization complete!
   Ready for fine-tuning in next notebook: 04b_model_finetuning.ipynb
