In [2]:
# 04_sentiment_analysis.ipynb
# Purpose: Enhanced sentiment analysis with 4 models on JP Morgan and HSBC data
# Banks: JP Morgan (JPM) and HSBC
# Models: FinBERT (yiyanghkust), FinBERT (ProsusAI), DistilRoBERTa, CardiffNLP (Twitter-RoBERTa)
# Input: Processed datasets + manual validation results
# Output: Comprehensive sentiment results with 4-model analysis

## Import Libraries

import pandas as pd
import numpy as np
import json
import torch
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Enhanced ML libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
import torch.nn.functional as F

# Progress tracking
from tqdm import tqdm
tqdm.pandas()

# Statistical analysis
from scipy import stats
import itertools

# Google Colab
from google.colab import drive
drive.mount("/content/drive")

# Load enhanced configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_config.json")
with open(config_path, "r") as f:
    enhanced_config = json.load(f)

SEED = enhanced_config["SEED"]
BANKS = enhanced_config["BANKS"]
QUARTERS = enhanced_config["QUARTERS"]
MODELS = enhanced_config["MODELS"]
drive_base = Path(enhanced_config["drive_base"])
colab_base = Path(enhanced_config["colab_base"])

# Set seeds for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)

print(f"Enhanced sentiment analysis for banks: {', '.join([bank.upper() for bank in BANKS])}")
print(f"Models: {len(MODELS)} sentiment analysis models")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

Mounted at /content/drive
Enhanced sentiment analysis for banks: JPM, HSBC
Models: 4 sentiment analysis models
Device: CPU


In [3]:
## Define Enhanced Paths

sentiment_paths = {}
for bank in BANKS:
    sentiment_paths[bank] = {
        "processed_data": drive_base / f"data/processed/{bank}",
        "manual_validation": drive_base / f"data/manual_validation/{bank}",
        "results_sentiment": drive_base / f"results/sentiment/{bank}",
        "models": drive_base / f"models"
    }

    # Ensure results directory exists
    sentiment_paths[bank]["results_sentiment"].mkdir(parents=True, exist_ok=True)

In [4]:
## Enhanced Data Loading

def load_processed_datasets_enhanced():
    """Load all processed datasets for 4-model sentiment analysis."""
    print(f"\n{'='*60}")
    print("LOADING PROCESSED DATASETS FOR 4-MODEL ANALYSIS")
    print(f"{'='*60}")

    processed_datasets = {}

    for bank in BANKS:
        print(f"\n📂 Loading {bank.upper()} processed datasets...")
        processed_datasets[bank] = {}

        # Load sentence-level datasets (primary for sentiment analysis)
        dataset_files = [
            f"processed_{bank}_q1_2025_sentence_level.csv",
            f"processed_{bank}_q2_2025_sentence_level.csv",
            f"processed_{bank}_combined_sentence_level.csv"
        ]

        for filename in dataset_files:
            file_path = sentiment_paths[bank]["processed_data"] / filename

            if file_path.exists():
                try:
                    df = pd.read_csv(file_path)

                    # Determine dataset type
                    if "q1_2025" in filename:
                        dataset_key = "q1_2025"
                    elif "q2_2025" in filename:
                        dataset_key = "q2_2025"
                    else:
                        dataset_key = "combined"

                    processed_datasets[bank][dataset_key] = df
                    print(f"  ✅ {dataset_key}: {df.shape}")

                    # Check model readiness columns
                    model_ready_cols = [col for col in df.columns if col.startswith('ready_for_')]
                    if model_ready_cols:
                        print(f"    Model readiness columns: {len(model_ready_cols)}")

                except Exception as e:
                    print(f"  ❌ Error loading {filename}: {str(e)}")
            else:
                print(f"  ⚠️ File not found: {filename}")

    return processed_datasets

def load_manual_validation_results_enhanced():
    """Load manual validation results for all banks."""
    print(f"\n📋 Loading manual validation results...")

    manual_validation_data = {}

    for bank in BANKS:
        bank_validation = {}

        # Load validated manual labels
        validated_labels_path = sentiment_paths[bank]["results_sentiment"] / f"manual_labels_{bank}_validated.csv"
        if validated_labels_path.exists():
            try:
                df = pd.read_csv(validated_labels_path)
                bank_validation["labels"] = df
                print(f"  ✅ {bank.upper()} manual labels: {df.shape}")
            except Exception as e:
                print(f"  ❌ Error loading {bank.upper()} manual labels: {e}")

        # Load validation report
        validation_report_path = sentiment_paths[bank]["manual_validation"] / f"validation_report_{bank}.json"
        if validation_report_path.exists():
            try:
                with open(validation_report_path, 'r') as f:
                    report = json.load(f)
                bank_validation["report"] = report
                print(f"  ✅ {bank.upper()} validation report loaded")
            except Exception as e:
                print(f"  ❌ Error loading {bank.upper()} validation report: {e}")

        manual_validation_data[bank] = bank_validation

    return manual_validation_data

# Load processed datasets and manual validation
processed_datasets = load_processed_datasets_enhanced()
manual_validation_data = load_manual_validation_results_enhanced()



LOADING PROCESSED DATASETS FOR 4-MODEL ANALYSIS

📂 Loading JPM processed datasets...
  ✅ q1_2025: (313, 21)
    Model readiness columns: 4
  ✅ q2_2025: (440, 21)
    Model readiness columns: 4
  ✅ combined: (752, 21)
    Model readiness columns: 4

📂 Loading HSBC processed datasets...
  ✅ q1_2025: (300, 21)
    Model readiness columns: 4
  ✅ q2_2025: (340, 21)
    Model readiness columns: 4
  ✅ combined: (640, 21)
    Model readiness columns: 4

📋 Loading manual validation results...
  ✅ JPM manual labels: (1121, 27)
  ✅ JPM validation report loaded
  ✅ HSBC manual labels: (858, 27)
  ✅ HSBC validation report loaded


In [5]:
## Enhanced 4-Model Sentiment Analyzer

class Enhanced4ModelAnalyzer:
    """Enhanced sentiment analyzer for 4 financial models."""

    def __init__(self):
        self.models = {}
        self.tokenizers = {}
        self.pipelines = {}
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Enhanced model configurations
        self.model_configs = {
            'finbert_yiyanghkust': {
                'model_name': 'yiyanghkust/finbert-tone',
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            },
            'finbert_prosusai': {
                'model_name': 'ProsusAI/finbert',
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            },
            'distilroberta': {
                'model_name': 'j-hartmann/emotion-english-distilroberta-base',
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            },
            'cardiffnlp_roberta': {
                'model_name': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
                'labels': ['positive', 'neutral', 'negative'],
                'max_length': 512,
                'batch_size': 16
            }
        }

    def load_all_models(self):
        """Load all 4 sentiment models."""
        print(f"\n🤖 LOADING 4 SENTIMENT MODELS")
        print("-" * 40)

        loaded_models = []
        failed_models = []

        for model_key, config in self.model_configs.items():
            try:
                print(f"Loading {model_key}: {config['model_name']}")

                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

                # Load model
                model = AutoModelForSequenceClassification.from_pretrained(
                    config['model_name'],
                    return_dict=True
                )

                # Move to device
                model.to(self.device)
                model.eval()

                # Create pipeline
                pipeline_obj = pipeline(
                    'sentiment-analysis',
                    model=model,
                    tokenizer=tokenizer,
                    device=0 if self.device == 'cuda' else -1,
                    return_all_scores=True
                )

                # Store components
                self.models[model_key] = model
                self.tokenizers[model_key] = tokenizer
                self.pipelines[model_key] = pipeline_obj

                loaded_models.append(model_key)
                print(f"  ✅ {model_key} loaded successfully")

            except Exception as e:
                failed_models.append((model_key, str(e)))
                print(f"  ❌ {model_key} failed: {str(e)}")

        print(f"\nModel loading summary:")
        print(f"  Loaded: {len(loaded_models)}/{len(self.model_configs)}")
        print(f"  Failed: {len(failed_models)}")

        if failed_models:
            print(f"  Failed models:")
            for model_key, error in failed_models:
                print(f"    {model_key}: {error}")

        return loaded_models, failed_models

    def predict_sentiment_enhanced(self, texts: List[str], model_key: str) -> List[Dict]:
        """Enhanced sentiment prediction with additional features."""
        if model_key not in self.pipelines:
            raise ValueError(f"Model {model_key} not loaded")

        pipeline_obj = self.pipelines[model_key]
        config = self.model_configs[model_key]
        batch_size = config.get('batch_size', 16)

        results = []

        print(f"  Processing {len(texts)} texts with {model_key}...")

        for i in tqdm(range(0, len(texts), batch_size), desc=f"{model_key}"):
            batch = texts[i:i + batch_size]

            try:
                # Get predictions
                batch_results = pipeline_obj(batch)

                # Process results
                for text_idx, text_result in enumerate(batch_results):
                    text = batch[text_idx]

                    # Handle different output formats
                    if isinstance(text_result, list):
                        # Multiple scores format
                        scores_dict = {item['label'].lower(): item['score'] for item in text_result}
                    else:
                        # Single prediction format
                        label = text_result['label'].lower()
                        score = text_result['score']
                        scores_dict = {label: score}

                        # Fill in missing scores
                        for expected_label in config['labels']:
                            if expected_label not in scores_dict:
                                scores_dict[expected_label] = 0.0

                    # Normalize label names
                    normalized_scores = {}
                    for label, score in scores_dict.items():
                        # Map various label formats to standard
                        if 'pos' in label.lower() or label.lower() == 'label_2':
                            normalized_scores['positive'] = score
                        elif 'neg' in label.lower() or label.lower() == 'label_0':
                            normalized_scores['negative'] = score
                        elif 'neu' in label.lower() or label.lower() == 'label_1':
                            normalized_scores['neutral'] = score
                        else:
                            normalized_scores[label.lower()] = score

                    # Ensure all required labels exist
                    for required_label in ['positive', 'neutral', 'negative']:
                        if required_label not in normalized_scores:
                            normalized_scores[required_label] = 0.0

                    # Get predicted label and score
                    predicted_label = max(normalized_scores, key=normalized_scores.get)
                    predicted_score = normalized_scores[predicted_label]

                    # Calculate additional metrics
                    entropy = -sum(score * np.log(score + 1e-8) for score in normalized_scores.values())
                    sorted_scores = sorted(normalized_scores.values(), reverse=True)
                    max_prob_diff = sorted_scores[0] - sorted_scores[1] if len(sorted_scores) > 1 else 1.0

                    result = {
                        'text': text,
                        'model': model_key,
                        'predicted_label': predicted_label,
                        'predicted_score': predicted_score,
                        'positive_score': normalized_scores.get('positive', 0.0),
                        'neutral_score': normalized_scores.get('neutral', 0.0),
                        'negative_score': normalized_scores.get('negative', 0.0),
                        'entropy': entropy,
                        'max_prob_diff': max_prob_diff,
                        'prediction_confidence': predicted_score
                    }

                    results.append(result)

            except Exception as e:
                print(f"    Error processing batch {i//batch_size + 1}: {str(e)}")
                # Add placeholder results for failed batch
                for j in range(len(batch)):
                    results.append({
                        'text': batch[j],
                        'model': model_key,
                        'predicted_label': 'neutral',
                        'predicted_score': 0.33,
                        'positive_score': 0.33,
                        'neutral_score': 0.34,
                        'negative_score': 0.33,
                        'entropy': 1.0,
                        'max_prob_diff': 0.0,
                        'prediction_confidence': 0.33,
                        'error': True
                    })

        return results

# Initialize enhanced analyzer
analyzer = Enhanced4ModelAnalyzer()
loaded_models, failed_models = analyzer.load_all_models()



🤖 LOADING 4 SENTIMENT MODELS
----------------------------------------
Loading finbert_yiyanghkust: yiyanghkust/finbert-tone


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


  ✅ finbert_yiyanghkust loaded successfully
Loading finbert_prosusai: ProsusAI/finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cpu


  ✅ finbert_prosusai loaded successfully
Loading distilroberta: j-hartmann/emotion-english-distilroberta-base


tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Device set to use cpu


  ✅ distilroberta loaded successfully
Loading cardiffnlp_roberta: cardiffnlp/twitter-roberta-base-sentiment-latest


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


  ✅ cardiffnlp_roberta loaded successfully

Model loading summary:
  Loaded: 4/4
  Failed: 0


In [6]:
## Enhanced Sentiment Analysis Pipeline

def run_enhanced_sentiment_analysis(df: pd.DataFrame, dataset_name: str, bank_code: str) -> pd.DataFrame:
    """Run enhanced sentiment analysis with all 4 models."""
    if df is None or len(df) == 0:
        print(f"❌ Cannot process {dataset_name} for {bank_code.upper()} - empty dataset")
        return None

    print(f"\n🔬 [{bank_code.upper()}] ENHANCED 4-MODEL SENTIMENT ANALYSIS - {dataset_name.upper()}")
    print("-" * 60)
    print(f"Input shape: {df.shape}")

    # Prepare text data
    if 'text' not in df.columns:
        print(f"❌ Text column not found in {dataset_name}")
        return df

    texts = df['text'].fillna('').astype(str).tolist()
    valid_texts = [text for text in texts if len(str(text).strip()) > 0]

    print(f"Processing {len(valid_texts)} valid text entries")

    results_df = df.copy()

    # Run analysis with each loaded model
    for model_key in loaded_models:
        print(f"\n🤖 Running {model_key} analysis...")

        try:
            model_results = analyzer.predict_sentiment_enhanced(texts, model_key)

            # Add results to dataframe
            for i, result in enumerate(model_results):
                if i < len(results_df):
                    # Add model-specific columns
                    results_df.loc[i, f'{model_key}_label'] = result['predicted_label']
                    results_df.loc[i, f'{model_key}_score'] = result['predicted_score']
                    results_df.loc[i, f'{model_key}_positive'] = result['positive_score']
                    results_df.loc[i, f'{model_key}_neutral'] = result['neutral_score']
                    results_df.loc[i, f'{model_key}_negative'] = result['negative_score']
                    results_df.loc[i, f'{model_key}_entropy'] = result['entropy']
                    results_df.loc[i, f'{model_key}_confidence'] = result['prediction_confidence']
                    results_df.loc[i, f'{model_key}_max_diff'] = result['max_prob_diff']

                    # Mark if there was an error
                    if result.get('error', False):
                        results_df.loc[i, f'{model_key}_error'] = True

            print(f"  ✅ {model_key} analysis complete")

        except Exception as e:
            print(f"  ❌ {model_key} analysis failed: {str(e)}")

    print(f"\nFinal enhanced shape: {results_df.shape}")
    return results_df

# Run enhanced sentiment analysis on all datasets
print(f"\n{'='*60}")
print("RUNNING ENHANCED 4-MODEL SENTIMENT ANALYSIS")
print(f"{'='*60}")

sentiment_results = {}

for bank in BANKS:
    sentiment_results[bank] = {}

    if bank in processed_datasets:
        print(f"\n🏦 Processing {bank.upper()} datasets...")

        for dataset_type, df in processed_datasets[bank].items():
            if df is not None:
                sentiment_results[bank][dataset_type] = run_enhanced_sentiment_analysis(
                    df, dataset_type, bank
                )



RUNNING ENHANCED 4-MODEL SENTIMENT ANALYSIS

🏦 Processing JPM datasets...

🔬 [JPM] ENHANCED 4-MODEL SENTIMENT ANALYSIS - Q1_2025
------------------------------------------------------------
Input shape: (313, 21)
Processing 313 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 313 texts with finbert_yiyanghkust...


finbert_yiyanghkust:   5%|▌         | 1/20 [00:00<00:08,  2.35it/s]

    Error processing batch 1: The size of tensor a (1083) must match the size of tensor b (512) at non-singleton dimension 1


finbert_yiyanghkust: 100%|██████████| 20/20 [00:51<00:00,  2.57s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 313 texts with finbert_prosusai...


finbert_prosusai:   0%|          | 0/20 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1115 > 512). Running this sequence through the model will result in indexing errors


    Error processing batch 1: The size of tensor a (1115) must match the size of tensor b (512) at non-singleton dimension 1


finbert_prosusai: 100%|██████████| 20/20 [00:50<00:00,  2.51s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 313 texts with distilroberta...


distilroberta:   0%|          | 0/20 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1090 > 512). Running this sequence through the model will result in indexing errors


    Error processing batch 1: The expanded size of the tensor (1090) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1090].  Tensor sizes: [1, 514]


distilroberta: 100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 313 texts with cardiffnlp_roberta...


cardiffnlp_roberta:   0%|          | 0/20 [00:00<?, ?it/s]

    Error processing batch 1: The expanded size of the tensor (1090) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1090].  Tensor sizes: [1, 514]


cardiffnlp_roberta: 100%|██████████| 20/20 [00:47<00:00,  2.40s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (313, 57)

🔬 [JPM] ENHANCED 4-MODEL SENTIMENT ANALYSIS - Q2_2025
------------------------------------------------------------
Input shape: (440, 21)
Processing 440 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 440 texts with finbert_yiyanghkust...


finbert_yiyanghkust: 100%|██████████| 28/28 [01:10<00:00,  2.53s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 440 texts with finbert_prosusai...


finbert_prosusai: 100%|██████████| 28/28 [01:10<00:00,  2.53s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 440 texts with distilroberta...


distilroberta: 100%|██████████| 28/28 [00:35<00:00,  1.28s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 440 texts with cardiffnlp_roberta...


cardiffnlp_roberta: 100%|██████████| 28/28 [01:08<00:00,  2.45s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (440, 53)

🔬 [JPM] ENHANCED 4-MODEL SENTIMENT ANALYSIS - COMBINED
------------------------------------------------------------
Input shape: (752, 21)
Processing 752 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 752 texts with finbert_yiyanghkust...


finbert_yiyanghkust:   0%|          | 0/47 [00:00<?, ?it/s]

    Error processing batch 1: The size of tensor a (1083) must match the size of tensor b (512) at non-singleton dimension 1


finbert_yiyanghkust: 100%|██████████| 47/47 [02:02<00:00,  2.60s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 752 texts with finbert_prosusai...


finbert_prosusai:   0%|          | 0/47 [00:00<?, ?it/s]

    Error processing batch 1: The size of tensor a (1115) must match the size of tensor b (512) at non-singleton dimension 1


finbert_prosusai: 100%|██████████| 47/47 [02:02<00:00,  2.60s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 752 texts with distilroberta...


distilroberta:   0%|          | 0/47 [00:00<?, ?it/s]

    Error processing batch 1: The expanded size of the tensor (1090) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1090].  Tensor sizes: [1, 514]


distilroberta: 100%|██████████| 47/47 [01:00<00:00,  1.28s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 752 texts with cardiffnlp_roberta...


cardiffnlp_roberta:   0%|          | 0/47 [00:00<?, ?it/s]

    Error processing batch 1: The expanded size of the tensor (1090) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1090].  Tensor sizes: [1, 514]


cardiffnlp_roberta: 100%|██████████| 47/47 [02:00<00:00,  2.56s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (752, 57)

🏦 Processing HSBC datasets...

🔬 [HSBC] ENHANCED 4-MODEL SENTIMENT ANALYSIS - Q1_2025
------------------------------------------------------------
Input shape: (300, 21)
Processing 300 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 300 texts with finbert_yiyanghkust...


finbert_yiyanghkust: 100%|██████████| 19/19 [00:57<00:00,  3.05s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 300 texts with finbert_prosusai...


finbert_prosusai: 100%|██████████| 19/19 [01:01<00:00,  3.22s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 300 texts with distilroberta...


distilroberta: 100%|██████████| 19/19 [00:29<00:00,  1.56s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 300 texts with cardiffnlp_roberta...


cardiffnlp_roberta: 100%|██████████| 19/19 [01:00<00:00,  3.18s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (300, 53)

🔬 [HSBC] ENHANCED 4-MODEL SENTIMENT ANALYSIS - Q2_2025
------------------------------------------------------------
Input shape: (340, 21)
Processing 340 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 340 texts with finbert_yiyanghkust...


finbert_yiyanghkust:   0%|          | 0/22 [00:00<?, ?it/s]

    Error processing batch 1: The size of tensor a (1015) must match the size of tensor b (512) at non-singleton dimension 1


finbert_yiyanghkust: 100%|██████████| 22/22 [00:53<00:00,  2.43s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 340 texts with finbert_prosusai...


finbert_prosusai:   0%|          | 0/22 [00:00<?, ?it/s]

    Error processing batch 1: The size of tensor a (1029) must match the size of tensor b (512) at non-singleton dimension 1


finbert_prosusai: 100%|██████████| 22/22 [00:51<00:00,  2.35s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 340 texts with distilroberta...


distilroberta:   0%|          | 0/22 [00:00<?, ?it/s]

    Error processing batch 1: The expanded size of the tensor (1036) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1036].  Tensor sizes: [1, 514]


distilroberta: 100%|██████████| 22/22 [00:26<00:00,  1.21s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 340 texts with cardiffnlp_roberta...


cardiffnlp_roberta:   0%|          | 0/22 [00:00<?, ?it/s]

    Error processing batch 1: The expanded size of the tensor (1036) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1036].  Tensor sizes: [1, 514]


cardiffnlp_roberta: 100%|██████████| 22/22 [00:50<00:00,  2.29s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (340, 57)

🔬 [HSBC] ENHANCED 4-MODEL SENTIMENT ANALYSIS - COMBINED
------------------------------------------------------------
Input shape: (640, 21)
Processing 640 valid text entries

🤖 Running finbert_yiyanghkust analysis...
  Processing 640 texts with finbert_yiyanghkust...


finbert_yiyanghkust:  48%|████▊     | 19/40 [01:00<01:13,  3.52s/it]

    Error processing batch 19: The size of tensor a (1015) must match the size of tensor b (512) at non-singleton dimension 1


finbert_yiyanghkust: 100%|██████████| 40/40 [01:51<00:00,  2.78s/it]


  ✅ finbert_yiyanghkust analysis complete

🤖 Running finbert_prosusai analysis...
  Processing 640 texts with finbert_prosusai...


finbert_prosusai:  48%|████▊     | 19/40 [01:00<01:12,  3.45s/it]

    Error processing batch 19: The size of tensor a (1029) must match the size of tensor b (512) at non-singleton dimension 1


finbert_prosusai: 100%|██████████| 40/40 [01:52<00:00,  2.80s/it]


  ✅ finbert_prosusai analysis complete

🤖 Running distilroberta analysis...
  Processing 640 texts with distilroberta...


distilroberta:  48%|████▊     | 19/40 [00:30<00:39,  1.88s/it]

    Error processing batch 19: The expanded size of the tensor (1036) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1036].  Tensor sizes: [1, 514]


distilroberta: 100%|██████████| 40/40 [00:57<00:00,  1.43s/it]


  ✅ distilroberta analysis complete

🤖 Running cardiffnlp_roberta analysis...
  Processing 640 texts with cardiffnlp_roberta...


cardiffnlp_roberta:  48%|████▊     | 19/40 [00:59<01:10,  3.36s/it]

    Error processing batch 19: The expanded size of the tensor (1036) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1036].  Tensor sizes: [1, 514]


cardiffnlp_roberta: 100%|██████████| 40/40 [01:51<00:00,  2.78s/it]


  ✅ cardiffnlp_roberta analysis complete

Final enhanced shape: (640, 57)


In [7]:
## Enhanced Multi-level Analysis

def enhanced_aggregate_sentence_to_qa_level(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Enhanced aggregation from sentence to Q&A level with 4-model support."""
    if df is None or 'original_qa_id' not in df.columns:
        print(f"❌ Cannot aggregate {bank_code.upper()} - missing original_qa_id column")
        return None

    print(f"📊 [{bank_code.upper()}] Enhanced Q&A level aggregation...")

    # Enhanced aggregation functions for all models
    agg_functions = {
        # Basic info
        'text': lambda x: ' '.join(x),
        'speaker': 'first',
        'speaker_role': 'first',
        'quarter': 'first',
        'bank_code': 'first',
        'sentence_length': ['count', 'mean', 'sum'],
        'sentence_word_count': ['mean', 'sum']
    }

    # Add aggregation for all loaded models
    for model_key in loaded_models:
        model_cols = {
            f'{model_key}_score': ['mean', 'std', 'min', 'max'],
            f'{model_key}_positive': 'mean',
            f'{model_key}_neutral': 'mean',
            f'{model_key}_negative': 'mean',
            f'{model_key}_entropy': 'mean',
            f'{model_key}_confidence': 'mean'
        }
        agg_functions.update(model_cols)

    # Apply aggregations
    qa_level_df = df.groupby('original_qa_id').agg(agg_functions).reset_index()

    # Flatten column names
    qa_level_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                           for col in qa_level_df.columns]

    # Enhanced label determination using confidence-weighted voting
    for model_key in loaded_models:
        label_col = f'{model_key}_label'
        confidence_col = f'{model_key}_confidence'

        if label_col in df.columns:
            def confidence_weighted_vote(group):
                if confidence_col in df.columns:
                    # Weight votes by confidence
                    labels = group[label_col]
                    weights = group[confidence_col].fillna(0.5)

                    weighted_votes = {}
                    for label, weight in zip(labels, weights):
                        if label not in weighted_votes:
                            weighted_votes[label] = 0
                        weighted_votes[label] += weight

                    return max(weighted_votes, key=weighted_votes.get) if weighted_votes else 'neutral'
                else:
                    # Simple majority vote
                    return group[label_col].mode().iloc[0] if len(group[label_col].mode()) > 0 else 'neutral'

            qa_labels = df.groupby('original_qa_id').apply(confidence_weighted_vote).reset_index()
            qa_labels.columns = ['original_qa_id', f'{model_key}_qa_label']
            qa_level_df = qa_level_df.merge(qa_labels, on='original_qa_id')

    print(f"  Enhanced Q&A aggregation complete: {qa_level_df.shape}")
    return qa_level_df

def enhanced_aggregate_by_speaker_role(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Enhanced speaker-level aggregation with 4-model support."""
    if df is None or 'speaker_role' not in df.columns:
        print(f"❌ Cannot aggregate by speaker for {bank_code.upper()} - missing speaker_role")
        return None

    print(f"👥 [{bank_code.upper()}] Enhanced speaker role aggregation...")

    # Enhanced speaker aggregation for all models
    speaker_agg = {
        'text': 'count',
        'sentence_length': ['mean', 'std'],
        'sentence_word_count': ['mean', 'std']
    }

    # Add aggregation for all loaded models
    for model_key in loaded_models:
        model_agg = {
            f'{model_key}_score': ['mean', 'std'],
            f'{model_key}_positive': 'mean',
            f'{model_key}_neutral': 'mean',
            f'{model_key}_negative': 'mean',
            f'{model_key}_entropy': 'mean',
            f'{model_key}_confidence': 'mean'
        }
        speaker_agg.update(model_agg)

    # Group by quarter and speaker role if quarter available
    if 'quarter' in df.columns:
        speaker_df = df.groupby(['quarter', 'speaker_role']).agg(speaker_agg).reset_index()
    else:
        speaker_df = df.groupby('speaker_role').agg(speaker_agg).reset_index()

    # Flatten column names
    speaker_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                          for col in speaker_df.columns]

    print(f"  Enhanced speaker aggregation complete: {speaker_df.shape}")
    return speaker_df

# Create enhanced multi-level aggregations
print(f"\n{'='*60}")
print("ENHANCED MULTI-LEVEL AGGREGATIONS")
print(f"{'='*60}")

qa_level_results = {}
speaker_level_results = {}

for bank in BANKS:
    qa_level_results[bank] = {}
    speaker_level_results[bank] = {}

    if bank in sentiment_results:
        print(f"\n📊 Creating aggregations for {bank.upper()}...")

        for dataset_type, df in sentiment_results[bank].items():
            if df is not None:
                # Q&A level aggregation
                qa_agg = enhanced_aggregate_sentence_to_qa_level(df, bank)
                if qa_agg is not None:
                    qa_level_results[bank][dataset_type] = qa_agg

                # Speaker level aggregation
                speaker_agg = enhanced_aggregate_by_speaker_role(df, bank)
                if speaker_agg is not None:
                    speaker_level_results[bank][dataset_type] = speaker_agg


ENHANCED MULTI-LEVEL AGGREGATIONS

📊 Creating aggregations for JPM...
📊 [JPM] Enhanced Q&A level aggregation...
  Enhanced Q&A aggregation complete: (105, 51)
👥 [JPM] Enhanced speaker role aggregation...
  Enhanced speaker aggregation complete: (1, 35)
📊 [JPM] Enhanced Q&A level aggregation...
  Enhanced Q&A aggregation complete: (140, 51)
👥 [JPM] Enhanced speaker role aggregation...
  Enhanced speaker aggregation complete: (2, 35)
📊 [JPM] Enhanced Q&A level aggregation...
  Enhanced Q&A aggregation complete: (244, 51)
👥 [JPM] Enhanced speaker role aggregation...
  Enhanced speaker aggregation complete: (3, 35)

📊 Creating aggregations for HSBC...
📊 [HSBC] Enhanced Q&A level aggregation...
  Enhanced Q&A aggregation complete: (49, 51)
👥 [HSBC] Enhanced speaker role aggregation...
  Enhanced speaker aggregation complete: (1, 35)
📊 [HSBC] Enhanced Q&A level aggregation...
  Enhanced Q&A aggregation complete: (30, 51)
👥 [HSBC] Enhanced speaker role aggregation...
  Enhanced speaker aggre

In [8]:
## Enhanced Topic-Conditional Sentiment Analysis

import re

def enhanced_extract_financial_topics(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Enhanced financial topic extraction with banking domain expertise."""
    if df is None:
        return None

    print(f"🏷️ [{bank_code.upper()}] Enhanced financial topic extraction...")

    df = df.copy()

    # Enhanced banking/financial topic keywords
    enhanced_topic_keywords = {
        'revenue_growth': ['revenue', 'income', 'top line', 'sales', 'earnings growth'],
        'net_income': ['net income', 'profit', 'bottom line', 'earnings'],
        'credit_risk': ['credit risk', 'loan loss', 'provision', 'default', 'npl', 'charge-off'],
        'capital_adequacy': ['capital ratio', 'tier 1', 'cet1', 'leverage', 'basel'],
        'interest_rates': ['interest rate', 'net interest', 'nim', 'yield', 'spread'],
        'trading_revenue': ['trading', 'market making', 'sales trading', 'ficc'],
        'operational_efficiency': ['efficiency', 'cost income', 'overhead', 'expenses'],
        'digital_banking': ['digital', 'technology', 'fintech', 'mobile', 'online'],
        'regulatory': ['regulation', 'compliance', 'regulatory', 'oversight'],
        'market_conditions': ['market', 'economic', 'environment', 'outlook']
    }

    # Identify topics
    for topic, keywords in enhanced_topic_keywords.items():
        pattern = '|'.join([re.escape(kw) for kw in keywords])
        df[f'topic_{topic}'] = df['text'].str.lower().str.contains(pattern, regex=True, na=False)

    # Determine primary topic
    topic_columns = [f'topic_{topic}' for topic in enhanced_topic_keywords.keys()]
    df['topic_score'] = df[topic_columns].sum(axis=1)
    df['primary_topic'] = df[topic_columns].idxmax(axis=1).str.replace('topic_', '')

    # Mark general topics
    no_topics_mask = df['topic_score'] == 0
    df.loc[no_topics_mask, 'primary_topic'] = 'general'

    print(f"  Topic distribution:")
    topic_dist = df['primary_topic'].value_counts()
    for topic, count in topic_dist.items():
        print(f"    {topic}: {count}")

    return df

# Apply enhanced topic analysis
print(f"\n{'='*60}")
print("ENHANCED TOPIC-CONDITIONAL SENTIMENT")
print(f"{'='*60}")

topic_enhanced_results = {}

for bank in BANKS:
    topic_enhanced_results[bank] = {}

    if bank in sentiment_results:
        print(f"\n🏷️ Adding topics for {bank.upper()}...")

        for dataset_type, df in sentiment_results[bank].items():
            if df is not None:
                topic_df = enhanced_extract_financial_topics(df, bank)
                topic_enhanced_results[bank][dataset_type] = topic_df



ENHANCED TOPIC-CONDITIONAL SENTIMENT

🏷️ Adding topics for JPM...
🏷️ [JPM] Enhanced financial topic extraction...
  Topic distribution:
    general: 242
    market_conditions: 30
    revenue_growth: 8
    regulatory: 6
    interest_rates: 6
    net_income: 5
    capital_adequacy: 5
    trading_revenue: 5
    credit_risk: 4
    digital_banking: 2
🏷️ [JPM] Enhanced financial topic extraction...
  Topic distribution:
    general: 353
    market_conditions: 33
    revenue_growth: 19
    regulatory: 8
    capital_adequacy: 8
    digital_banking: 5
    interest_rates: 4
    trading_revenue: 4
    net_income: 3
    credit_risk: 2
    operational_efficiency: 1
🏷️ [JPM] Enhanced financial topic extraction...
  Topic distribution:
    general: 594
    market_conditions: 63
    revenue_growth: 27
    regulatory: 14
    capital_adequacy: 13
    interest_rates: 10
    trading_revenue: 9
    net_income: 8
    digital_banking: 7
    credit_risk: 6
    operational_efficiency: 1

🏷️ Adding topics for 

In [9]:
## Enhanced Anomaly Detection

def enhanced_detect_sentiment_anomalies(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Enhanced anomaly detection across 4 models."""
    if df is None:
        return None

    print(f"🔍 [{bank_code.upper()}] Enhanced anomaly detection...")

    df = df.copy()

    # Model disagreement analysis
    model_predictions = []
    for model_key in loaded_models:
        label_col = f'{model_key}_label'
        if label_col in df.columns:
            model_predictions.append(label_col)

    if len(model_predictions) >= 2:
        # Calculate inter-model agreement
        agreement_counts = []
        for i in range(len(df)):
            row_predictions = [df.loc[i, col] for col in model_predictions if pd.notna(df.loc[i, col])]
            if row_predictions:
                most_common = max(set(row_predictions), key=row_predictions.count)
                agreement_count = row_predictions.count(most_common)
                agreement_counts.append(agreement_count / len(row_predictions))
            else:
                agreement_counts.append(0)

        df['model_agreement_rate'] = agreement_counts
        df['model_disagreement'] = df['model_agreement_rate'] < 0.5

        # High confidence disagreement
        confidence_cols = [f'{model_key}_confidence' for model_key in loaded_models]
        available_conf_cols = [col for col in confidence_cols if col in df.columns]

        if available_conf_cols:
            df['avg_confidence'] = df[available_conf_cols].mean(axis=1)
            df['high_confidence_disagreement'] = (
                (df['model_disagreement']) & (df['avg_confidence'] > 0.8)
            )

    # Statistical anomalies
    for model_key in loaded_models:
        score_col = f'{model_key}_score'
        if score_col in df.columns:
            z_scores = np.abs(stats.zscore(df[score_col].fillna(df[score_col].mean())))
            df[f'{model_key}_score_anomaly'] = z_scores > 2.5

    # Count total anomalies
    anomaly_columns = [col for col in df.columns if 'anomaly' in col or 'disagreement' in col]
    if anomaly_columns:
        df['total_anomaly_flags'] = df[anomaly_columns].sum(axis=1)

        anomaly_count = (df['total_anomaly_flags'] > 0).sum()
        print(f"  Detected {anomaly_count} records with anomalies ({anomaly_count/len(df)*100:.1f}%)")

    return df

# Apply enhanced anomaly detection
anomaly_enhanced_results = {}

for bank in BANKS:
    anomaly_enhanced_results[bank] = {}

    if bank in topic_enhanced_results:
        print(f"\n🔍 Anomaly detection for {bank.upper()}...")

        for dataset_type, df in topic_enhanced_results[bank].items():
            if df is not None:
                anomaly_df = enhanced_detect_sentiment_anomalies(df, bank)
                anomaly_enhanced_results[bank][dataset_type] = anomaly_df



🔍 Anomaly detection for JPM...
🔍 [JPM] Enhanced anomaly detection...
  Detected 18 records with anomalies (5.8%)
🔍 [JPM] Enhanced anomaly detection...
  Detected 45 records with anomalies (10.2%)
🔍 [JPM] Enhanced anomaly detection...
  Detected 44 records with anomalies (5.9%)

🔍 Anomaly detection for HSBC...
🔍 [HSBC] Enhanced anomaly detection...
  Detected 31 records with anomalies (10.3%)
🔍 [HSBC] Enhanced anomaly detection...
  Detected 17 records with anomalies (5.0%)
🔍 [HSBC] Enhanced anomaly detection...
  Detected 40 records with anomalies (6.2%)


In [10]:
## Save Enhanced Results

def save_enhanced_sentiment_results():
    """Save all enhanced sentiment analysis results."""
    print(f"\n{'='*60}")
    print("SAVING ENHANCED SENTIMENT RESULTS")
    print(f"{'='*60}")

    saved_files = {}

    for bank in BANKS:
        saved_files[bank] = {}

        print(f"\n💾 Saving {bank.upper()} results...")

        # Save sentence-level results
        if bank in anomaly_enhanced_results:
            for dataset_type, df in anomaly_enhanced_results[bank].items():
                if df is not None:
                    filename = f"enhanced_4model_sentiment_{bank}_{dataset_type}_sentence.csv"
                    file_path = sentiment_paths[bank]["results_sentiment"] / filename

                    df.to_csv(file_path, index=False)
                    saved_files[bank][f"sentence_{dataset_type}"] = {
                        "path": str(file_path),
                        "shape": df.shape,
                        "models": loaded_models
                    }
                    print(f"  ✅ Sentence {dataset_type}: {filename} ({df.shape})")

        # Save Q&A-level results
        if bank in qa_level_results:
            for dataset_type, df in qa_level_results[bank].items():
                if df is not None:
                    filename = f"enhanced_4model_sentiment_{bank}_{dataset_type}_qa.csv"
                    file_path = sentiment_paths[bank]["results_sentiment"] / filename

                    df.to_csv(file_path, index=False)
                    saved_files[bank][f"qa_{dataset_type}"] = {
                        "path": str(file_path),
                        "shape": df.shape,
                        "models": loaded_models
                    }
                    print(f"  ✅ Q&A {dataset_type}: {filename} ({df.shape})")

        # Save speaker-level results
        if bank in speaker_level_results:
            for dataset_type, df in speaker_level_results[bank].items():
                if df is not None:
                    filename = f"enhanced_4model_sentiment_{bank}_{dataset_type}_speaker.csv"
                    file_path = sentiment_paths[bank]["results_sentiment"] / filename

                    df.to_csv(file_path, index=False)
                    saved_files[bank][f"speaker_{dataset_type}"] = {
                        "path": str(file_path),
                        "shape": df.shape,
                        "models": loaded_models
                    }
                    print(f"  ✅ Speaker {dataset_type}: {filename} ({df.shape})")

    return saved_files

# Save all enhanced results
saved_sentiment_files = save_enhanced_sentiment_results()



SAVING ENHANCED SENTIMENT RESULTS

💾 Saving JPM results...
  ✅ Sentence q1_2025: enhanced_4model_sentiment_jpm_q1_2025_sentence.csv ((313, 78))
  ✅ Sentence q2_2025: enhanced_4model_sentiment_jpm_q2_2025_sentence.csv ((440, 74))
  ✅ Sentence combined: enhanced_4model_sentiment_jpm_combined_sentence.csv ((752, 78))
  ✅ Q&A q1_2025: enhanced_4model_sentiment_jpm_q1_2025_qa.csv ((105, 51))
  ✅ Q&A q2_2025: enhanced_4model_sentiment_jpm_q2_2025_qa.csv ((140, 51))
  ✅ Q&A combined: enhanced_4model_sentiment_jpm_combined_qa.csv ((244, 51))
  ✅ Speaker q1_2025: enhanced_4model_sentiment_jpm_q1_2025_speaker.csv ((1, 35))
  ✅ Speaker q2_2025: enhanced_4model_sentiment_jpm_q2_2025_speaker.csv ((2, 35))
  ✅ Speaker combined: enhanced_4model_sentiment_jpm_combined_speaker.csv ((3, 35))

💾 Saving HSBC results...
  ✅ Sentence q1_2025: enhanced_4model_sentiment_hsbc_q1_2025_sentence.csv ((300, 74))
  ✅ Sentence q2_2025: enhanced_4model_sentiment_hsbc_q2_2025_sentence.csv ((340, 78))
  ✅ Sentence com

In [11]:
## Enhanced Summary Report

def create_enhanced_sentiment_summary():
    """Create comprehensive enhanced sentiment analysis summary."""
    summary = {
        "analysis_timestamp": pd.Timestamp.now().isoformat(),
        "banks_analyzed": BANKS,
        "models_used": loaded_models,
        "failed_models": failed_models,
        "analysis_levels": ["sentence", "qa", "speaker", "topic"],
        "datasets_processed": {},
        "model_performance": {},
        "anomaly_detection": {},
        "cross_bank_comparison": {}
    }

    # Dataset processing summary
    total_records = 0
    for bank in BANKS:
        bank_summary = {
            "datasets": len(saved_sentiment_files.get(bank, {})),
            "total_records": 0,
            "model_coverage": {}
        }

        if bank in anomaly_enhanced_results:
            for dataset_type, df in anomaly_enhanced_results[bank].items():
                if df is not None:
                    bank_summary["total_records"] += len(df)
                    total_records += len(df)

                    # Model coverage
                    for model_key in loaded_models:
                        label_col = f'{model_key}_label'
                        if label_col in df.columns:
                            coverage = df[label_col].notna().sum() / len(df)
                            if model_key not in bank_summary["model_coverage"]:
                                bank_summary["model_coverage"][model_key] = []
                            bank_summary["model_coverage"][model_key].append(coverage)

        # Average model coverage
        for model_key in bank_summary["model_coverage"]:
            bank_summary["model_coverage"][model_key] = np.mean(bank_summary["model_coverage"][model_key])

        summary["datasets_processed"][bank] = bank_summary

    # Model performance across banks
    for model_key in loaded_models:
        model_stats = {
            "total_predictions": 0,
            "avg_confidence": 0,
            "label_distribution": {"positive": 0, "neutral": 0, "negative": 0}
        }

        all_confidences = []
        all_labels = []

        for bank in BANKS:
            if bank in anomaly_enhanced_results:
                for dataset_type, df in anomaly_enhanced_results[bank].items():
                    if df is not None:
                        conf_col = f'{model_key}_confidence'
                        label_col = f'{model_key}_label'

                        if conf_col in df.columns:
                            confidences = df[conf_col].dropna()
                            all_confidences.extend(confidences.tolist())

                        if label_col in df.columns:
                            labels = df[label_col].dropna()
                            all_labels.extend(labels.tolist())

        if all_confidences:
            model_stats["avg_confidence"] = np.mean(all_confidences)

        if all_labels:
            model_stats["total_predictions"] = len(all_labels)
            label_counts = pd.Series(all_labels).value_counts()
            for label in ["positive", "neutral", "negative"]:
                model_stats["label_distribution"][label] = label_counts.get(label, 0)

        summary["model_performance"][model_key] = model_stats

    # Anomaly detection summary
    total_anomalies = 0
    for bank in BANKS:
        if bank in anomaly_enhanced_results:
            for dataset_type, df in anomaly_enhanced_results[bank].items():
                if df is not None and 'total_anomaly_flags' in df.columns:
                    total_anomalies += (df['total_anomaly_flags'] > 0).sum()

    summary["anomaly_detection"] = {
        "total_anomalies_detected": total_anomalies,
        "anomaly_rate": total_anomalies / total_records if total_records > 0 else 0
    }

    return summary

# Create enhanced summary
enhanced_sentiment_summary = create_enhanced_sentiment_summary()

# Save summary
summary_path = drive_base / "configs" / "enhanced_sentiment_analysis_summary.json"
with open(summary_path, "w") as f:
    json.dump(enhanced_sentiment_summary, f, indent=2, default=str)

print(f"Enhanced sentiment summary saved: {summary_path}")


Enhanced sentiment summary saved: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_sentiment_analysis_summary.json


In [12]:
## Final Summary

print(f"\n{'='*60}")
print("ENHANCED 4-MODEL SENTIMENT ANALYSIS COMPLETE")
print(f"{'='*60}")

# Summary statistics
total_banks = len(BANKS)
successful_models = len(loaded_models)
total_files = sum(len(bank_files) for bank_files in saved_sentiment_files.values())
total_records = sum(
    enhanced_sentiment_summary["datasets_processed"][bank]["total_records"]
    for bank in BANKS
    if bank in enhanced_sentiment_summary["datasets_processed"]
)

print(f"📊 Analysis Summary:")
print(f"  Banks analyzed: {total_banks} ({', '.join([b.upper() for b in BANKS])})")
print(f"  Models successfully loaded: {successful_models}/{len(MODELS)}")
print(f"  Total files generated: {total_files}")
print(f"  Total records processed: {total_records:,}")

print(f"\n🤖 Model Performance:")
for model_key in loaded_models:
    model_stats = enhanced_sentiment_summary["model_performance"][model_key]
    predictions = model_stats["total_predictions"]
    avg_conf = model_stats["avg_confidence"]
    print(f"  {model_key}:")
    print(f"    Predictions: {predictions:,}")
    print(f"    Avg confidence: {avg_conf:.3f}")

print(f"\n🏦 Bank Coverage:")
for bank in BANKS:
    if bank in enhanced_sentiment_summary["datasets_processed"]:
        bank_stats = enhanced_sentiment_summary["datasets_processed"][bank]
        records = bank_stats["total_records"]
        datasets = bank_stats["datasets"]
        print(f"  {bank.upper()}: {records:,} records across {datasets} datasets")

if failed_models:
    print(f"\n⚠️ Failed Models:")
    for model_key, error in failed_models:
        print(f"  {model_key}: {error}")

print(f"\n🔍 Anomaly Detection:")
anomaly_stats = enhanced_sentiment_summary["anomaly_detection"]
total_anomalies = anomaly_stats["total_anomalies_detected"]
anomaly_rate = anomaly_stats["anomaly_rate"]
print(f"  Anomalies detected: {total_anomalies:,} ({anomaly_rate:.1%})")

print(f"\n🚀 Next Steps:")
print(f"  1. Run 04b_model_finetuning.ipynb for enhanced fine-tuning")
print(f"  2. Continue to 05_model_comparison.ipynb for comprehensive comparison")
print(f"  3. Enhanced sentiment data ready for all analysis levels")

print(f"\n🎉 Enhanced 4-model sentiment analysis complete!")
print(f"   Multi-bank, multi-model, multi-level analysis framework operational")


ENHANCED 4-MODEL SENTIMENT ANALYSIS COMPLETE
📊 Analysis Summary:
  Banks analyzed: 2 (JPM, HSBC)
  Models successfully loaded: 4/4
  Total files generated: 18
  Total records processed: 2,785

🤖 Model Performance:
  finbert_yiyanghkust:
    Predictions: 2,785
    Avg confidence: 0.934
  finbert_prosusai:
    Predictions: 2,785
    Avg confidence: 0.808
  distilroberta:
    Predictions: 2,785
    Avg confidence: 0.813
  cardiffnlp_roberta:
    Predictions: 2,785
    Avg confidence: 0.736

🏦 Bank Coverage:
  JPM: 1,505 records across 9 datasets
  HSBC: 1,280 records across 9 datasets

🔍 Anomaly Detection:
  Anomalies detected: 195 (7.0%)

🚀 Next Steps:
  1. Run 04b_model_finetuning.ipynb for enhanced fine-tuning
  2. Continue to 05_model_comparison.ipynb for comprehensive comparison
  3. Enhanced sentiment data ready for all analysis levels

🎉 Enhanced 4-model sentiment analysis complete!
   Multi-bank, multi-model, multi-level analysis framework operational
