In [1]:
# 05_model_comparison.ipynb
# Purpose: Compare FinBERT models and analyze performance differences
# Input: sentiment analysis results from 04_sentiment_analysis.ipynb
# Output: model comparison metrics, agreement analysis, disagreement patterns

## Import Libraries

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Location A: Google Drive (Primary drive)
from google.colab import drive
drive.mount("/content/drive")


# Load configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r") as f:
    config = json.load(f)

SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

print(f"Model comparison analysis for bank: {BANK_CODE.upper()}")

## Define Paths

results_sentiment_path = drive_base / "results/sentiment/jpm"
results_comparison_path = drive_base / "results/comparison/jpm"
viz_path = drive_base / "outputs/visualizations/jpm"

# Ensure directories exist
results_comparison_path.mkdir(parents=True, exist_ok=True)
viz_path.mkdir(parents=True, exist_ok=True)


Mounted at /content/drive
Model comparison analysis for bank: JPM


In [2]:

## Load Sentiment Analysis Results

def load_sentiment_results(filename: str) -> pd.DataFrame:
    """Load sentiment analysis results with error handling."""
    file_path = results_sentiment_path / filename

    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return None

    try:
        df = pd.read_csv(file_path)
        print(f"✓ Loaded {filename}: {df.shape}")
        return df
    except Exception as e:
        print(f"❌ Error loading {filename}: {str(e)}")
        return None

print("Loading sentiment analysis results...")

# Load sentence-level results
sentiment_jpm_q1_2025_df = load_sentiment_results("sentiment_sentence_jpm_q1_2025.csv")
sentiment_jpm_q2_2025_df = load_sentiment_results("sentiment_sentence_jpm_q2_2025.csv")
sentiment_jpm_multi_2025_df = load_sentiment_results("sentiment_sentence_jpm_multi_2025.csv")

# Load aggregated results
qa_level_jpm_multi_2025_df = load_sentiment_results("sentiment_qa_jpm_multi_2025.csv")
speaker_level_jpm_multi_2025_df = load_sentiment_results("sentiment_speaker_jpm_multi_2025.csv")
topic_level_jpm_multi_2025_df = load_sentiment_results("sentiment_topic_jpm_multi_2025.csv")


Loading sentiment analysis results...
✓ Loaded sentiment_sentence_jpm_q1_2025.csv: (578, 37)
✓ Loaded sentiment_sentence_jpm_q2_2025.csv: (532, 37)
✓ Loaded sentiment_sentence_jpm_multi_2025.csv: (1110, 37)
✓ Loaded sentiment_qa_jpm_multi_2025.csv: (218, 27)
✓ Loaded sentiment_speaker_jpm_multi_2025.csv: (6, 21)
✓ Loaded sentiment_topic_jpm_multi_2025.csv: (16, 13)


In [3]:
## Model Agreement Analysis

class ModelComparisonAnalyzer:
    """Analyzer for comparing FinBERT model performance and agreement."""

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.finbert_tone_col = 'finbert_tone_label'
        self.prosus_col = 'prosus_label'
        self.finbert_score_col = 'finbert_tone_score'
        self.prosus_score_col = 'prosus_score'

    def calculate_agreement_metrics(self) -> Dict:
        """Calculate inter-model agreement metrics."""
        if self.df is None:
            return {}

        # Filter out rows with missing predictions
        valid_df = self.df.dropna(subset=[self.finbert_tone_col, self.prosus_col])

        if len(valid_df) == 0:
            print("❌ No valid predictions found for comparison")
            return {}

        print(f"Calculating agreement metrics for {len(valid_df)} predictions...")

        # Basic agreement rate
        agreement = valid_df[self.finbert_tone_col] == valid_df[self.prosus_col]
        agreement_rate = agreement.mean()

        # Cohen's Kappa (accounting for chance agreement)
        kappa = cohen_kappa_score(
            valid_df[self.finbert_tone_col],
            valid_df[self.prosus_col]
        )

        # Correlation between confidence scores
        score_correlation = None
        if self.finbert_score_col in valid_df.columns and self.prosus_score_col in valid_df.columns:
            finbert_scores = valid_df[self.finbert_score_col].fillna(0)
            prosus_scores = valid_df[self.prosus_score_col].fillna(0)

            pearson_r, pearson_p = pearsonr(finbert_scores, prosus_scores)
            spearman_r, spearman_p = spearmanr(finbert_scores, prosus_scores)

            score_correlation = {
                'pearson': {'r': pearson_r, 'p_value': pearson_p},
                'spearman': {'r': spearman_r, 'p_value': spearman_p}
            }

        # Agreement by sentiment class
        agreement_by_class = {}
        for sentiment in ['positive', 'neutral', 'negative']:
            mask = valid_df[self.finbert_tone_col] == sentiment
            if mask.sum() > 0:
                class_agreement = (
                    valid_df.loc[mask, self.finbert_tone_col] ==
                    valid_df.loc[mask, self.prosus_col]
                ).mean()
                agreement_by_class[sentiment] = class_agreement

        metrics = {
            'total_predictions': len(valid_df),
            'agreement_rate': agreement_rate,
            'cohen_kappa': kappa,
            'score_correlation': score_correlation,
            'agreement_by_class': agreement_by_class
        }

        return metrics

    def analyze_distribution_differences(self) -> Dict:
        """Analyze sentiment distribution differences between models."""
        if self.df is None:
            return {}

        valid_df = self.df.dropna(subset=[self.finbert_tone_col, self.prosus_col])

        if len(valid_df) == 0:
            return {}

        print("Analyzing sentiment distribution differences...")

        # Distribution for each model
        finbert_dist = valid_df[self.finbert_tone_col].value_counts(normalize=True).sort_index()
        prosus_dist = valid_df[self.prosus_col].value_counts(normalize=True).sort_index()

        # Ensure same index
        all_sentiments = set(finbert_dist.index).union(set(prosus_dist.index))
        finbert_dist = finbert_dist.reindex(all_sentiments, fill_value=0)
        prosus_dist = prosus_dist.reindex(all_sentiments, fill_value=0)

        # Chi-square test for distribution similarity
        try:
            chi2_stat, chi2_p = stats.chisquare(finbert_dist.values, prosus_dist.values)
        except:
            chi2_stat, chi2_p = None, None

        # Distribution differences
        dist_diff = (finbert_dist - prosus_dist).abs()

        distribution_analysis = {
            'finbert_tone_distribution': finbert_dist.to_dict(),
            'prosus_distribution': prosus_dist.to_dict(),
            'distribution_differences': dist_diff.to_dict(),
            'chi_square_test': {
                'statistic': chi2_stat,
                'p_value': chi2_p
            } if chi2_stat is not None else None,
            'max_difference': dist_diff.max(),
            'total_variation_distance': dist_diff.sum() / 2
        }

        return distribution_analysis

    def analyze_confidence_patterns(self) -> Dict:
        """Analyze confidence score patterns."""
        if self.df is None:
            return {}

        valid_df = self.df.dropna(subset=[self.finbert_score_col, self.prosus_score_col])

        if len(valid_df) == 0:
            return {}

        print("Analyzing confidence score patterns...")

        # Confidence statistics by model
        finbert_confidence = {
            'mean': valid_df[self.finbert_score_col].mean(),
            'std': valid_df[self.finbert_score_col].std(),
            'median': valid_df[self.finbert_score_col].median(),
            'min': valid_df[self.finbert_score_col].min(),
            'max': valid_df[self.finbert_score_col].max()
        }

        prosus_confidence = {
            'mean': valid_df[self.prosus_score_col].mean(),
            'std': valid_df[self.prosus_score_col].std(),
            'median': valid_df[self.prosus_score_col].median(),
            'min': valid_df[self.prosus_score_col].min(),
            'max': valid_df[self.prosus_score_col].max()
        }

        # Confidence by sentiment class
        finbert_conf_by_class = {}
        prosus_conf_by_class = {}

        for sentiment in ['positive', 'neutral', 'negative']:
            finbert_mask = valid_df[self.finbert_tone_col] == sentiment
            prosus_mask = valid_df[self.prosus_col] == sentiment

            if finbert_mask.sum() > 0:
                finbert_conf_by_class[sentiment] = valid_df.loc[finbert_mask, self.finbert_score_col].mean()

            if prosus_mask.sum() > 0:
                prosus_conf_by_class[sentiment] = valid_df.loc[prosus_mask, self.prosus_score_col].mean()

        # Confidence difference analysis
        conf_diff = valid_df[self.finbert_score_col] - valid_df[self.prosus_score_col]

        confidence_analysis = {
            'finbert_tone_confidence': finbert_confidence,
            'prosus_confidence': prosus_confidence,
            'finbert_conf_by_class': finbert_conf_by_class,
            'prosus_conf_by_class': prosus_conf_by_class,
            'confidence_difference': {
                'mean': conf_diff.mean(),
                'std': conf_diff.std(),
                'median': conf_diff.median()
            }
        }

        return confidence_analysis

    def analyze_disagreement_patterns(self) -> Dict:
        """Analyze patterns in model disagreements."""
        if self.df is None:
            return {}

        valid_df = self.df.dropna(subset=[self.finbert_tone_col, self.prosus_col])

        if len(valid_df) == 0:
            return {}

        print("Analyzing disagreement patterns...")

        # Find disagreements
        disagreements = valid_df[valid_df[self.finbert_tone_col] != valid_df[self.prosus_col]].copy()

        if len(disagreements) == 0:
            return {'total_disagreements': 0}

        # Disagreement patterns
        disagreement_patterns = disagreements.groupby([self.finbert_tone_col, self.prosus_col]).size()

        # Most common disagreements
        top_disagreements = disagreement_patterns.nlargest(5).to_dict()

        # Disagreement by text characteristics
        disagreement_analysis = {
            'total_disagreements': len(disagreements),
            'disagreement_rate': len(disagreements) / len(valid_df),
            'disagreement_patterns': disagreement_patterns.to_dict(),
            'top_disagreement_pairs': top_disagreements
        }

        # Add text length analysis if available
        if 'sentence_length' in disagreements.columns:
            disagreement_analysis['avg_disagreement_text_length'] = disagreements['sentence_length'].mean()
            agreement_df = valid_df[valid_df[self.finbert_tone_col] == valid_df[self.prosus_col]]
            disagreement_analysis['avg_agreement_text_length'] = agreement_df['sentence_length'].mean()

        # Add topic analysis if available
        if 'primary_topic' in disagreements.columns:
            topic_disagreements = disagreements['primary_topic'].value_counts(normalize=True)
            disagreement_analysis['disagreement_by_topic'] = topic_disagreements.to_dict()

        return disagreement_analysis

# Run comprehensive model comparison
print("\n" + "="*60)
print("MODEL COMPARISON ANALYSIS")
print("="*60)

# Analyze each dataset
comparison_results = {}

datasets_to_analyze = [
    ("Q1 2025", sentiment_jpm_q1_2025_df),
    ("Q2 2025", sentiment_jpm_q2_2025_df),
    ("Multi 2025", sentiment_jpm_multi_2025_df)
]

for dataset_name, df in datasets_to_analyze:
    if df is not None:
        print(f"\n🔍 ANALYZING {dataset_name}")
        print("-" * 40)

        analyzer = ModelComparisonAnalyzer(df)

        # Calculate all metrics
        agreement_metrics = analyzer.calculate_agreement_metrics()
        distribution_analysis = analyzer.analyze_distribution_differences()
        confidence_analysis = analyzer.analyze_confidence_patterns()
        disagreement_analysis = analyzer.analyze_disagreement_patterns()

        comparison_results[dataset_name] = {
            'agreement_metrics': agreement_metrics,
            'distribution_analysis': distribution_analysis,
            'confidence_analysis': confidence_analysis,
            'disagreement_analysis': disagreement_analysis
        }

        # Print key results
        if agreement_metrics:
            print(f"  Agreement rate: {agreement_metrics['agreement_rate']:.3f}")
            print(f"  Cohen's Kappa: {agreement_metrics['cohen_kappa']:.3f}")
            if agreement_metrics['score_correlation']:
                print(f"  Score correlation (Pearson): {agreement_metrics['score_correlation']['pearson']['r']:.3f}")

        if disagreement_analysis:
            print(f"  Disagreement rate: {disagreement_analysis['disagreement_rate']:.3f}")



MODEL COMPARISON ANALYSIS

🔍 ANALYZING Q1 2025
----------------------------------------
Calculating agreement metrics for 578 predictions...
Analyzing sentiment distribution differences...
Analyzing confidence score patterns...
Analyzing disagreement patterns...
  Agreement rate: 0.747
  Cohen's Kappa: 0.442
  Score correlation (Pearson): 0.188
  Disagreement rate: 0.253

🔍 ANALYZING Q2 2025
----------------------------------------
Calculating agreement metrics for 532 predictions...
Analyzing sentiment distribution differences...
Analyzing confidence score patterns...
Analyzing disagreement patterns...
  Agreement rate: 0.776
  Cohen's Kappa: 0.419
  Score correlation (Pearson): 0.205
  Disagreement rate: 0.224

🔍 ANALYZING Multi 2025
----------------------------------------
Calculating agreement metrics for 1110 predictions...
Analyzing sentiment distribution differences...
Analyzing confidence score patterns...
Analyzing disagreement patterns...
  Agreement rate: 0.761
  Cohen's Ka

In [5]:
## Financial Context Metrics

def analyze_financial_context_performance(df: pd.DataFrame, dataset_name: str) -> Dict:
    """Analyze model performance in financial context."""
    if df is None:
        return {}

    print(f"Analyzing financial context performance for {dataset_name}...")

    # Financial keywords that should correlate with sentiment
    positive_financial_keywords = [
        'growth', 'profit', 'increase', 'strong', 'improved', 'positive',
        'success', 'beat', 'outperform', 'exceed'
    ]

    negative_financial_keywords = [
        'loss', 'decline', 'decrease', 'weak', 'poor', 'negative',
        'miss', 'underperform', 'below', 'concern', 'risk'
    ]

    # Create keyword-based expected sentiment
    df_analysis = df.copy()

    # Count positive and negative keywords
    text_col = 'text' if 'text' in df.columns else 'text_clean'
    if text_col not in df.columns:
        return {}

    df_analysis['positive_keywords'] = df_analysis[text_col].str.lower().str.count(
        '|'.join(positive_financial_keywords)
    )
    df_analysis['negative_keywords'] = df_analysis[text_col].str.lower().str.count(
        '|'.join(negative_financial_keywords)
    )

    # Expected sentiment based on keyword balance
    df_analysis['keyword_sentiment_score'] = (
        df_analysis['positive_keywords'] - df_analysis['negative_keywords']
    )

    # Categorize expected sentiment
    df_analysis['expected_sentiment'] = 'neutral'
    df_analysis.loc[df_analysis['keyword_sentiment_score'] > 0, 'expected_sentiment'] = 'positive'
    df_analysis.loc[df_analysis['keyword_sentiment_score'] < 0, 'expected_sentiment'] = 'negative'

    # Calculate alignment with keyword-based expectations
    finbert_alignment = (
        df_analysis['finbert_tone_label'] == df_analysis['expected_sentiment']
    ).mean()

    prosus_alignment = (
        df_analysis['prosus_label'] == df_analysis['expected_sentiment']
    ).mean()

    # Analyze performance by financial topic
    topic_performance = {}
    if 'primary_topic' in df_analysis.columns:
        for topic in df_analysis['primary_topic'].unique():
            topic_mask = df_analysis['primary_topic'] == topic
            topic_df = df_analysis[topic_mask]

            if len(topic_df) > 10:  # Only analyze topics with sufficient data
                # Model agreement within topic
                topic_agreement = (
                    topic_df['finbert_tone_label'] == topic_df['prosus_label']
                ).mean()

                # Keyword alignment within topic
                finbert_topic_alignment = (
                    topic_df['finbert_tone_label'] == topic_df['expected_sentiment']
                ).mean()

                prosus_topic_alignment = (
                    topic_df['prosus_label'] == topic_df['expected_sentiment']
                ).mean()

                topic_performance[topic] = {
                    'sample_size': len(topic_df),
                    'model_agreement': topic_agreement,
                    'finbert_keyword_alignment': finbert_topic_alignment,
                    'prosus_keyword_alignment': prosus_topic_alignment
                }

    financial_context_metrics = {
        'keyword_based_alignment': {
            'finbert_tone': finbert_alignment,
            'prosus': prosus_alignment
        },
        'topic_performance': topic_performance,
        'keyword_distribution': {
            'texts_with_positive_keywords': (df_analysis['positive_keywords'] > 0).sum(),
            'texts_with_negative_keywords': (df_analysis['negative_keywords'] > 0).sum(),
            'texts_with_mixed_keywords': (
                (df_analysis['positive_keywords'] > 0) &
                (df_analysis['negative_keywords'] > 0)
            ).sum()
        }
    }

    return financial_context_metrics


In [6]:
# Analyze financial context performance
print("\n" + "="*50)
print("FINANCIAL CONTEXT ANALYSIS")
print("="*50)

for dataset_name, df in datasets_to_analyze:
    if df is not None:
        financial_metrics = analyze_financial_context_performance(df, dataset_name)
        comparison_results[dataset_name]['financial_context_metrics'] = financial_metrics

        if financial_metrics and 'keyword_based_alignment' in financial_metrics:
            finbert_align = financial_metrics['keyword_based_alignment']['finbert_tone']
            prosus_align = financial_metrics['keyword_based_alignment']['prosus']
            print(f"  {dataset_name} - FinBERT keyword alignment: {finbert_align:.3f}")
            print(f"  {dataset_name} - ProsusAI keyword alignment: {prosus_align:.3f}")



FINANCIAL CONTEXT ANALYSIS
Analyzing financial context performance for Q1 2025...
  Q1 2025 - FinBERT keyword alignment: 0.687
  Q1 2025 - ProsusAI keyword alignment: 0.735
Analyzing financial context performance for Q2 2025...
  Q2 2025 - FinBERT keyword alignment: 0.722
  Q2 2025 - ProsusAI keyword alignment: 0.789
Analyzing financial context performance for Multi 2025...
  Multi 2025 - FinBERT keyword alignment: 0.704
  Multi 2025 - ProsusAI keyword alignment: 0.761


In [7]:
## Volatility Correlation Analysis (Placeholder)

def analyze_volatility_correlation(df: pd.DataFrame, dataset_name: str) -> Dict:
    """Analyze correlation with market volatility (placeholder for future stock data)."""
    print(f"Placeholder: Volatility correlation analysis for {dataset_name}")

    # This would integrate with stock price data when available
    # For now, we create a framework for future integration

    volatility_metrics = {
        'note': 'Placeholder for stock price correlation analysis',
        'framework_ready': True,
        'required_data': [
            'Stock price data for JPM during Q1 and Q2 2025',
            'Market volatility indices',
            'Earnings announcement dates'
        ],
        'analysis_approach': [
            'Calculate sentiment scores around earnings dates',
            'Correlate with stock price movements',
            'Analyze sentiment-volatility relationship',
            'Compare model sensitivity to market-relevant sentiment'
        ]
    }

    return volatility_metrics

# Add volatility analysis placeholder
for dataset_name in comparison_results:
    volatility_metrics = analyze_volatility_correlation(None, dataset_name)
    comparison_results[dataset_name]['volatility_correlation'] = volatility_metrics


Placeholder: Volatility correlation analysis for Q1 2025
Placeholder: Volatility correlation analysis for Q2 2025
Placeholder: Volatility correlation analysis for Multi 2025


In [8]:
## Qualitative Disagreement Analysis

def perform_qualitative_disagreement_analysis(df: pd.DataFrame, dataset_name: str, sample_size: int = 20) -> Dict:
    """Perform qualitative analysis of model disagreements."""
    if df is None:
        return {}

    print(f"Performing qualitative disagreement analysis for {dataset_name}...")

    # Find disagreements
    disagreements = df[df['finbert_tone_label'] != df['prosus_label']].copy()

    if len(disagreements) == 0:
        return {'note': 'No disagreements found'}

    # Sample for qualitative analysis
    sample_disagreements = disagreements.sample(
        min(sample_size, len(disagreements)),
        random_state=SEED
    )

    # Categorize disagreement types
    disagreement_types = sample_disagreements.groupby([
        'finbert_tone_label', 'prosus_label'
    ]).size().to_dict()

    # Extract text examples for each disagreement type
    examples = {}
    for (finbert_pred, prosus_pred), count in disagreement_types.items():
        mask = (
            (sample_disagreements['finbert_tone_label'] == finbert_pred) &
            (sample_disagreements['prosus_label'] == prosus_pred)
        )

        examples_for_type = sample_disagreements[mask]['text'].head(3).tolist()

        examples[f"{finbert_pred}_vs_{prosus_pred}"] = {
            'count': count,
            'examples': examples_for_type
        }

    # Analyze text characteristics of disagreements
    text_characteristics = {}
    if 'sentence_length' in disagreements.columns:
        text_characteristics['avg_length'] = disagreements['sentence_length'].mean()
        text_characteristics['length_std'] = disagreements['sentence_length'].std()

    if 'sentence_word_count' in disagreements.columns:
        text_characteristics['avg_words'] = disagreements['sentence_word_count'].mean()
        text_characteristics['word_std'] = disagreements['sentence_word_count'].std()

    qualitative_analysis = {
        'total_disagreements_analyzed': len(sample_disagreements),
        'disagreement_types': disagreement_types,
        'text_examples': examples,
        'text_characteristics': text_characteristics
    }

    return qualitative_analysis

# Perform qualitative analysis
print("\n" + "="*50)
print("QUALITATIVE DISAGREEMENT ANALYSIS")
print("="*50)

for dataset_name, df in datasets_to_analyze:
    if df is not None:
        qualitative_analysis = perform_qualitative_disagreement_analysis(df, dataset_name)
        comparison_results[dataset_name]['qualitative_analysis'] = qualitative_analysis

        if 'disagreement_types' in qualitative_analysis:
            print(f"\n{dataset_name} - Top disagreement patterns:")
            for pattern, count in list(qualitative_analysis['disagreement_types'].items())[:3]:
                print(f"  {pattern}: {count} cases")



QUALITATIVE DISAGREEMENT ANALYSIS
Performing qualitative disagreement analysis for Q1 2025...

Q1 2025 - Top disagreement patterns:
  ('negative', 'neutral'): 9 cases
  ('neutral', 'negative'): 1 cases
  ('neutral', 'positive'): 4 cases
Performing qualitative disagreement analysis for Q2 2025...

Q2 2025 - Top disagreement patterns:
  ('negative', 'neutral'): 3 cases
  ('neutral', 'negative'): 1 cases
  ('neutral', 'positive'): 5 cases
Performing qualitative disagreement analysis for Multi 2025...

Multi 2025 - Top disagreement patterns:
  ('negative', 'neutral'): 7 cases
  ('negative', 'positive'): 2 cases
  ('neutral', 'positive'): 4 cases


In [9]:
## Research Questions Analysis

def analyze_research_questions(df: pd.DataFrame) -> Dict:
    """Analyze key research questions using model comparison results."""
    if df is None:
        return {}

    print("Analyzing key research questions...")

    research_analysis = {}

    # Question 1: Do bankers and analysts show diverging sentiment?
    if 'speaker_role' in df.columns:
        speaker_sentiment_analysis = {}

        for model_col in ['finbert_tone_label', 'prosus_label']:
            if model_col in df.columns:
                model_name = 'finbert_tone' if 'finbert' in model_col else 'prosus'

                # Calculate sentiment distribution by speaker role
                speaker_dist = df.groupby('speaker_role')[model_col].value_counts(normalize=True).unstack(fill_value=0)

                # Calculate divergence metrics
                if 'analyst' in speaker_dist.index and 'executive' in speaker_dist.index:
                    analyst_sentiment = speaker_dist.loc['analyst']
                    exec_sentiment = speaker_dist.loc['executive']

                    # Total variation distance
                    divergence = 0.5 * (analyst_sentiment - exec_sentiment).abs().sum()

                    # Positive sentiment difference
                    pos_diff = analyst_sentiment.get('positive', 0) - exec_sentiment.get('positive', 0)

                    speaker_sentiment_analysis[model_name] = {
                        'analyst_distribution': analyst_sentiment.to_dict(),
                        'executive_distribution': exec_sentiment.to_dict(),
                        'total_variation_distance': divergence,
                        'positive_sentiment_difference': pos_diff
                    }

        research_analysis['speaker_divergence'] = speaker_sentiment_analysis

    # Question 2: Tone shift over time
    if 'quarter' in df.columns:
        temporal_analysis = {}

        for model_col in ['finbert_tone_label', 'prosus_label']:
            if model_col in df.columns:
                model_name = 'finbert_tone' if 'finbert' in model_col else 'prosus'

                # Calculate sentiment by quarter
                quarter_sentiment = df.groupby('quarter')[model_col].value_counts(normalize=True).unstack(fill_value=0)

                # Calculate temporal changes
                if len(quarter_sentiment) >= 2:
                    quarters = sorted(quarter_sentiment.index)
                    q1_sentiment = quarter_sentiment.loc[quarters[0]]
                    q2_sentiment = quarter_sentiment.loc[quarters[-1]]

                    # Sentiment shift metrics
                    sentiment_shift = q2_sentiment - q1_sentiment

                    temporal_analysis[model_name] = {
                        'q1_distribution': q1_sentiment.to_dict(),
                        'q2_distribution': q2_sentiment.to_dict(),
                        'sentiment_shift': sentiment_shift.to_dict(),
                        'positive_change': sentiment_shift.get('positive', 0)
                    }

        research_analysis['temporal_shifts'] = temporal_analysis

    # Question 3: Model consistency across contexts
    model_consistency = {}

    # Agreement rate by topic
    if 'primary_topic' in df.columns:
        topic_agreement = {}
        for topic in df['primary_topic'].unique():
            topic_mask = df['primary_topic'] == topic
            topic_df = df[topic_mask]

            if len(topic_df) > 5:
                agreement_rate = (
                    topic_df['finbert_tone_label'] == topic_df['prosus_label']
                ).mean()
                topic_agreement[topic] = agreement_rate

        model_consistency['agreement_by_topic'] = topic_agreement

    # Agreement rate by speaker
    if 'speaker_role' in df.columns:
        speaker_agreement = {}
        for speaker in df['speaker_role'].unique():
            speaker_mask = df['speaker_role'] == speaker
            speaker_df = df[speaker_mask]

            if len(speaker_df) > 5:
                agreement_rate = (
                    speaker_df['finbert_tone_label'] == speaker_df['prosus_label']
                ).mean()
                speaker_agreement[speaker] = agreement_rate

        model_consistency['agreement_by_speaker'] = speaker_agreement

    research_analysis['model_consistency'] = model_consistency

    return research_analysis

# Analyze research questions
print("\n" + "="*60)
print("RESEARCH QUESTIONS ANALYSIS")
print("="*60)

if sentiment_jpm_multi_2025_df is not None:
    research_results = analyze_research_questions(sentiment_jpm_multi_2025_df)
    comparison_results['research_questions'] = research_results

    # Print key findings
    if 'speaker_divergence' in research_results:
        print("👥 Speaker Sentiment Divergence:")
        for model, analysis in research_results['speaker_divergence'].items():
            pos_diff = analysis.get('positive_sentiment_difference', 0)
            print(f"  {model}: Analyst vs Executive positive sentiment difference = {pos_diff:.3f}")

    if 'temporal_shifts' in research_results:
        print("\n📈 Temporal Sentiment Shifts:")
        for model, analysis in research_results['temporal_shifts'].items():
            pos_change = analysis.get('positive_change', 0)
            print(f"  {model}: Q1 to Q2 positive sentiment change = {pos_change:.3f}")

    if 'model_consistency' in research_results:
        print("\n🤝 Model Agreement by Context:")
        if 'agreement_by_topic' in research_results['model_consistency']:
            topic_agreements = research_results['model_consistency']['agreement_by_topic']
            for topic, agreement in sorted(topic_agreements.items(), key=lambda x: x[1], reverse=True)[:3]:
                print(f"  {topic}: {agreement:.3f} agreement rate")



RESEARCH QUESTIONS ANALYSIS
Analyzing key research questions...
👥 Speaker Sentiment Divergence:
  finbert_tone: Analyst vs Executive positive sentiment difference = -0.025
  prosus: Analyst vs Executive positive sentiment difference = 0.021

📈 Temporal Sentiment Shifts:
  finbert_tone: Q1 to Q2 positive sentiment change = 0.020
  prosus: Q1 to Q2 positive sentiment change = -0.030

🤝 Model Agreement by Context:
  capital: 0.825 agreement rate
  general: 0.787 agreement rate
  revenue: 0.786 agreement rate


In [11]:
## Save Comparison Results

import json
import pandas as pd

def make_json_serializable(obj):
    """Recursively convert dict keys to JSON-safe strings, flattening tuples."""
    if isinstance(obj, dict):
        new_dict = {}
        for k, v in obj.items():
            if isinstance(k, tuple):
                # Join tuple elements into a single string
                key_str = "_".join(map(str, k))
            else:
                key_str = str(k)
            new_dict[key_str] = make_json_serializable(v)
        return new_dict
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    else:
        return obj


def save_comparison_results():
    """Save all comparison results to files."""

    # Convert results to JSON-safe format
    safe_comparison_results = make_json_serializable(comparison_results)

    # Save main comparison results
    comparison_path = results_comparison_path / "model_comparison_results.json"
    with open(comparison_path, "w") as f:
        json.dump(safe_comparison_results, f, indent=2, default=str)
    print(f"✓ Saved comparison results to: {comparison_path}")

    # Create summary report
    summary_report = {
        "analysis_timestamp": pd.Timestamp.now().isoformat(),
        "bank_code": BANK_CODE,
        "models_compared": ["yiyanghkust/finbert-tone", "ProsusAI/finbert"],
        "datasets_analyzed": list(comparison_results.keys()),
        "key_metrics": {}
    }

    # Extract key metrics for summary
    for dataset_name, results in comparison_results.items():
        if dataset_name == 'research_questions':
            continue

        if 'agreement_metrics' in results and results['agreement_metrics']:
            summary_report["key_metrics"][dataset_name] = {
                "agreement_rate": results['agreement_metrics'].get('agreement_rate', 0),
                "cohen_kappa": results['agreement_metrics'].get('cohen_kappa', 0),
                "total_predictions": results['agreement_metrics'].get('total_predictions', 0)
            }

    # Save summary (also make safe)
    summary_path = results_comparison_path / "comparison_summary.json"
    with open(summary_path, "w") as f:
        json.dump(make_json_serializable(summary_report), f, indent=2, default=str)
    print(f"✓ Saved summary report to: {summary_path}")

    return comparison_path, summary_path


print("\n" + "="*60)
print("SAVING COMPARISON RESULTS")
print("="*60)

comparison_path, summary_path = save_comparison_results()



SAVING COMPARISON RESULTS
✓ Saved comparison results to: /content/drive/MyDrive/CAM_DS_AI_Project/results/comparison/jpm/model_comparison_results.json
✓ Saved summary report to: /content/drive/MyDrive/CAM_DS_AI_Project/results/comparison/jpm/comparison_summary.json


In [12]:
## Final Model Comparison Summary

print("\n" + "="*60)
print("MODEL COMPARISON ANALYSIS COMPLETE")
print("="*60)

print(f"Results saved to: {results_comparison_path}")

# Overall model performance summary
total_predictions = 0
total_agreements = 0
avg_kappa = 0
kappa_count = 0

for dataset_name, results in comparison_results.items():
    if dataset_name == 'research_questions':
        continue

    if 'agreement_metrics' in results and results['agreement_metrics']:
        metrics = results['agreement_metrics']
        predictions = metrics.get('total_predictions', 0)
        agreement_rate = metrics.get('agreement_rate', 0)
        kappa = metrics.get('cohen_kappa', 0)

        total_predictions += predictions
        total_agreements += predictions * agreement_rate

        if kappa != 0:
            avg_kappa += kappa
            kappa_count += 1

if total_predictions > 0:
    overall_agreement = total_agreements / total_predictions
    print(f"\n📊 OVERALL MODEL PERFORMANCE:")
    print(f"  Total predictions analyzed: {total_predictions:,}")
    print(f"  Overall agreement rate: {overall_agreement:.3f}")

    if kappa_count > 0:
        avg_kappa = avg_kappa / kappa_count
        print(f"  Average Cohen's Kappa: {avg_kappa:.3f}")

# Interpretation guide
print(f"\n📋 INTERPRETATION GUIDE:")
print(f"  Agreement Rate: {overall_agreement:.1%} of predictions match between models")
print(f"  Cohen's Kappa: {avg_kappa:.3f} " +
      ("(Strong agreement)" if avg_kappa > 0.8 else
       "(Moderate agreement)" if avg_kappa > 0.6 else
       "(Fair agreement)" if avg_kappa > 0.4 else "(Poor agreement)"))

print(f"\nNext step: Run 06_results_visualization.ipynb to create visualizations and final report")

# Export key metrics for visualization
key_metrics_for_viz = {
    'overall_agreement_rate': overall_agreement if total_predictions > 0 else 0,
    'overall_kappa': avg_kappa if kappa_count > 0 else 0,
    'total_predictions': total_predictions,
    'datasets_analyzed': [name for name in comparison_results.keys() if name != 'research_questions']
}

viz_metrics_path = results_comparison_path / "key_metrics_for_visualization.json"
with open(viz_metrics_path, "w") as f:
    json.dump(key_metrics_for_viz, f, indent=2)

print(f"✓ Key metrics for visualization saved to: {viz_metrics_path}")


MODEL COMPARISON ANALYSIS COMPLETE
Results saved to: /content/drive/MyDrive/CAM_DS_AI_Project/results/comparison/jpm

📊 OVERALL MODEL PERFORMANCE:
  Total predictions analyzed: 2,220
  Overall agreement rate: 0.761
  Average Cohen's Kappa: 0.431

📋 INTERPRETATION GUIDE:
  Agreement Rate: 76.1% of predictions match between models
  Cohen's Kappa: 0.431 (Fair agreement)

Next step: Run 06_results_visualization.ipynb to create visualizations and final report
✓ Key metrics for visualization saved to: /content/drive/MyDrive/CAM_DS_AI_Project/results/comparison/jpm/key_metrics_for_visualization.json
