In [8]:
# 05_model_comparison_fixed.ipynb
# Purpose: Fixed model comparison framework for 4 models across JP Morgan and HSBC
# Banks: JP Morgan (JPM) and HSBC
# Models: FinBERT (yiyanghkust), FinBERT (ProsusAI), DistilRoBERTa, CardiffNLP (Twitter-RoBERTa)
# Input: Enhanced sentiment results (agreement-based analysis since manual validation doesn't match)
# Output: Comprehensive model comparison with agreement metrics and financial context insights

print("="*70)
print("FIXED MODEL COMPARISON FRAMEWORK")
print("Agreement-Based Analysis Without Manual Validation")
print("="*70)

## Import Libraries

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Enhanced statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr, wilcoxon, mannwhitneyu, chi2_contingency
from sklearn.metrics import (
    confusion_matrix, classification_report, cohen_kappa_score,
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    balanced_accuracy_score
)
import itertools

# Google Colab
from google.colab import drive
drive.mount("/content/drive")

# Load enhanced configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_config.json")
with open(config_path, "r") as f:
    enhanced_config = json.load(f)

SEED = enhanced_config["SEED"]
BANKS = enhanced_config["BANKS"]
QUARTERS = enhanced_config["QUARTERS"]
MODELS = enhanced_config["MODELS"]
drive_base = Path(enhanced_config["drive_base"])
colab_base = Path(enhanced_config["colab_base"])

print(f"Fixed model comparison for banks: {', '.join([bank.upper() for bank in BANKS])}")
print(f"Models to compare: {len(MODELS)} models")


FIXED MODEL COMPARISON FRAMEWORK
Agreement-Based Analysis Without Manual Validation
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fixed model comparison for banks: JPM, HSBC
Models to compare: 4 models


In [9]:
## Define Enhanced Paths

comparison_paths = {}
for bank in BANKS:
    comparison_paths[bank] = {
        "results_sentiment": drive_base / f"results/sentiment/{bank}",
        "results_finetuning": drive_base / f"results/finetuning/{bank}",
        "results_comparison": drive_base / f"results/comparison/{bank}",
        "manual_validation": drive_base / f"data/manual_validation/{bank}"
    }

    # Ensure comparison results directory exists
    comparison_paths[bank]["results_comparison"].mkdir(parents=True, exist_ok=True)


In [10]:
## Enhanced Data Loading

def load_enhanced_sentiment_results():
    """Load enhanced sentiment analysis results for all banks."""
    print(f"\n{'='*60}")
    print("LOADING ENHANCED SENTIMENT RESULTS")
    print(f"{'='*60}")

    sentiment_results = {}

    for bank in BANKS:
        print(f"\n📂 Loading {bank.upper()} sentiment results...")
        sentiment_results[bank] = {}

        # Load sentence-level results (primary for comparison)
        result_files = [
            f"enhanced_4model_sentiment_{bank}_q1_2025_sentence.csv",
            f"enhanced_4model_sentiment_{bank}_q2_2025_sentence.csv",
            f"enhanced_4model_sentiment_{bank}_combined_sentence.csv"
        ]

        for filename in result_files:
            file_path = comparison_paths[bank]["results_sentiment"] / filename

            if file_path.exists():
                try:
                    df = pd.read_csv(file_path)

                    # Determine dataset type
                    if "q1_2025" in filename:
                        dataset_key = "q1_2025"
                    elif "q2_2025" in filename:
                        dataset_key = "q2_2025"
                    else:
                        dataset_key = "combined"

                    sentiment_results[bank][dataset_key] = df
                    print(f"  ✅ {dataset_key}: {df.shape}")

                    # Check available model columns
                    model_columns = [col for col in df.columns if any(model in col for model in MODELS.keys())]
                    label_columns = [col for col in df.columns if col.endswith('_label')]
                    print(f"    Total model columns: {len(model_columns)}")
                    print(f"    Label columns: {len(label_columns)} - {label_columns}")

                except Exception as e:
                    print(f"  ❌ Error loading {filename}: {str(e)}")
            else:
                print(f"  ⚠️ File not found: {filename}")

    return sentiment_results

def load_finetuning_results():
    """Load fine-tuning results for comparison analysis."""
    print(f"\n📋 Loading fine-tuning results...")

    finetuning_results = {}

    for bank in BANKS:
        result_path = comparison_paths[bank]["results_finetuning"] / f"enhanced_finetuning_results_{bank}.json"

        if result_path.exists():
            try:
                with open(result_path, 'r') as f:
                    results = json.load(f)
                finetuning_results[bank] = results
                print(f"  ✅ {bank.upper()} fine-tuning results loaded")
            except Exception as e:
                print(f"  ❌ Error loading {bank.upper()} fine-tuning results: {e}")
        else:
            print(f"  ⚠️ No fine-tuning results found for {bank.upper()}")

    return finetuning_results

# Load all data
sentiment_results = load_enhanced_sentiment_results()
finetuning_results = load_finetuning_results()


LOADING ENHANCED SENTIMENT RESULTS

📂 Loading JPM sentiment results...
  ✅ q1_2025: (313, 78)
    Total model columns: 44
    Label columns: 4 - ['finbert_yiyanghkust_label', 'finbert_prosusai_label', 'distilroberta_label', 'cardiffnlp_roberta_label']
  ✅ q2_2025: (440, 74)
    Total model columns: 40
    Label columns: 4 - ['finbert_yiyanghkust_label', 'finbert_prosusai_label', 'distilroberta_label', 'cardiffnlp_roberta_label']
  ✅ combined: (752, 78)
    Total model columns: 44
    Label columns: 4 - ['finbert_yiyanghkust_label', 'finbert_prosusai_label', 'distilroberta_label', 'cardiffnlp_roberta_label']

📂 Loading HSBC sentiment results...
  ✅ q1_2025: (300, 74)
    Total model columns: 40
    Label columns: 4 - ['finbert_yiyanghkust_label', 'finbert_prosusai_label', 'distilroberta_label', 'cardiffnlp_roberta_label']
  ✅ q2_2025: (340, 78)
    Total model columns: 44
    Label columns: 4 - ['finbert_yiyanghkust_label', 'finbert_prosusai_label', 'distilroberta_label', 'cardiffnlp_r

In [11]:
## Fixed Model Comparison Framework

class FixedModelComparator:
    """Fixed model comparison framework that works with available sentiment data."""

    def __init__(self, sentiment_results: Dict, finetuning_results: Dict):
        self.sentiment_results = sentiment_results
        self.finetuning_results = finetuning_results
        self.models = list(MODELS.keys())
        self.banks = BANKS
        self.comparison_results = {}

        print(f"Fixed comparator initialized:")
        print(f"  Banks: {len(self.banks)}")
        print(f"  Models: {len(self.models)}")
        print(f"  Datasets per bank: {len(sentiment_results.get(BANKS[0], {})) if BANKS else 0}")

    def identify_available_models(self, df: pd.DataFrame) -> List[str]:
        """Identify available model predictions in dataset."""
        available_models = []

        for model_key in self.models:
            label_col = f'{model_key}_label'
            if label_col in df.columns:
                # Check if column has actual predictions
                non_null_count = df[label_col].notna().sum()
                if non_null_count > 0:
                    available_models.append(model_key)

        return available_models

    def calculate_comprehensive_agreement_metrics(self) -> Dict:
        """Calculate comprehensive agreement metrics between all model pairs across banks."""
        print("\n🤝 Calculating comprehensive agreement metrics...")

        agreement_results = {}

        for bank in self.banks:
            bank_agreements = {}

            if bank in self.sentiment_results:
                for dataset_type, df in self.sentiment_results[bank].items():
                    if df is None or len(df) == 0:
                        continue

                    available_models = self.identify_available_models(df)
                    print(f"  [{bank.upper()}] {dataset_type}: {len(available_models)} models available")

                    if len(available_models) < 2:
                        continue

                    dataset_agreements = {}

                    # Pairwise model comparisons
                    for model1, model2 in itertools.combinations(available_models, 2):
                        pair_name = f"{model1}_vs_{model2}"

                        label_col1 = f"{model1}_label"
                        label_col2 = f"{model2}_label"

                        # Filter valid predictions
                        valid_mask = (
                            df[label_col1].notna() &
                            df[label_col2].notna()
                        )
                        valid_df = df[valid_mask]

                        if len(valid_df) == 0:
                            continue

                        labels1 = valid_df[label_col1].values
                        labels2 = valid_df[label_col2].values

                        # Basic agreement
                        agreement_rate = (labels1 == labels2).mean()

                        # Cohen's Kappa
                        try:
                            kappa = cohen_kappa_score(labels1, labels2)
                        except:
                            kappa = 0.0

                        # Confidence correlation if available
                        score_col1 = f"{model1}_confidence"
                        score_col2 = f"{model2}_confidence"

                        confidence_correlation = None
                        if score_col1 in valid_df.columns and score_col2 in valid_df.columns:
                            scores1 = valid_df[score_col1].fillna(0.5)
                            scores2 = valid_df[score_col2].fillna(0.5)

                            try:
                                pearson_r, pearson_p = pearsonr(scores1, scores2)
                                confidence_correlation = {
                                    'pearson_r': pearson_r,
                                    'pearson_p': pearson_p
                                }
                            except:
                                confidence_correlation = None

                        # Class-wise agreement
                        class_agreement = {}
                        unique_labels = set(labels1) | set(labels2)
                        for label in unique_labels:
                            mask = (labels1 == label) | (labels2 == label)
                            if mask.sum() > 0:
                                class_agreement[label] = (labels1[mask] == labels2[mask]).mean()

                        # Statistical significance test
                        try:
                            contingency_table = confusion_matrix(labels1, labels2)
                            chi2, p_value, dof, expected = chi2_contingency(contingency_table)
                            statistical_test = {
                                'chi2': chi2,
                                'p_value': p_value,
                                'significant': p_value < 0.05
                            }
                        except:
                            statistical_test = None

                        dataset_agreements[pair_name] = {
                            'sample_size': len(valid_df),
                            'agreement_rate': agreement_rate,
                            'cohen_kappa': kappa,
                            'confidence_correlation': confidence_correlation,
                            'class_agreement': class_agreement,
                            'statistical_test': statistical_test
                        }

                        print(f"    {pair_name}: {agreement_rate:.3f} agreement, κ={kappa:.3f}")

                    bank_agreements[dataset_type] = dataset_agreements

            agreement_results[bank] = bank_agreements

        return agreement_results

    def analyze_financial_context_performance(self) -> Dict:
        """Analyze model performance in financial context."""
        print("\n💰 Analyzing financial context performance...")

        financial_context_results = {}

        # Enhanced financial indicators
        financial_indicators = {
            'bullish_indicators': [
                'growth', 'profit', 'increase', 'strong', 'improved', 'positive',
                'beat', 'exceed', 'outperform', 'robust', 'solid', 'gains', 'higher',
                'expansion', 'successful', 'achieving', 'strength'
            ],
            'bearish_indicators': [
                'loss', 'decline', 'decrease', 'weak', 'poor', 'negative',
                'miss', 'underperform', 'below', 'concern', 'risk', 'lower',
                'challenges', 'pressure', 'difficult', 'disappointing'
            ],
            'neutral_indicators': [
                'stable', 'maintain', 'steady', 'consistent', 'unchanged', 'flat',
                'comparable', 'similar', 'continuation', 'ongoing'
            ]
        }

        for bank in self.banks:
            bank_financial_results = {}

            if bank not in self.sentiment_results:
                continue

            for dataset_type, df in self.sentiment_results[bank].items():
                if df is None or 'text' not in df.columns:
                    continue

                print(f"  [{bank.upper()}] {dataset_type}: Financial context analysis")

                # Calculate financial sentiment scores
                df_analysis = df.copy()

                for indicator_type, keywords in financial_indicators.items():
                    pattern = '|'.join([f'\\b{kw}\\b' for kw in keywords])
                    df_analysis[f'{indicator_type}_count'] = df_analysis['text'].str.lower().str.count(pattern)

                # Create expected sentiment based on financial indicators
                df_analysis['financial_sentiment_score'] = (
                    df_analysis['bullish_indicators_count'] * 1 +
                    df_analysis['neutral_indicators_count'] * 0 +
                    df_analysis['bearish_indicators_count'] * (-1)
                )

                # Categorize expected sentiment
                df_analysis['expected_financial_sentiment'] = 'neutral'
                df_analysis.loc[df_analysis['financial_sentiment_score'] > 0, 'expected_financial_sentiment'] = 'positive'
                df_analysis.loc[df_analysis['financial_sentiment_score'] < 0, 'expected_financial_sentiment'] = 'negative'

                # Analyze model alignment with financial context
                dataset_financial_results = {}
                available_models = self.identify_available_models(df_analysis)

                for model_key in available_models:
                    label_col = f"{model_key}_label"

                    valid_mask = (
                        df_analysis[label_col].notna() &
                        df_analysis['expected_financial_sentiment'].notna()
                    )
                    valid_df = df_analysis[valid_mask]

                    if len(valid_df) == 0:
                        continue

                    # Calculate alignment with financial context
                    financial_alignment = (
                        valid_df[label_col] == valid_df['expected_financial_sentiment']
                    ).mean()

                    # Strong signal alignment
                    strong_signals_mask = np.abs(valid_df['financial_sentiment_score']) >= 2
                    strong_signal_alignment = None
                    if strong_signals_mask.sum() > 0:
                        strong_signal_alignment = (
                            valid_df.loc[strong_signals_mask, label_col] ==
                            valid_df.loc[strong_signals_mask, 'expected_financial_sentiment']
                        ).mean()

                    dataset_financial_results[model_key] = {
                        'overall_financial_alignment': financial_alignment,
                        'strong_signal_alignment': strong_signal_alignment,
                        'sample_size': len(valid_df),
                        'strong_signals_count': strong_signals_mask.sum()
                    }

                    print(f"    {model_key}: {financial_alignment:.3f} financial alignment")

                bank_financial_results[dataset_type] = dataset_financial_results

            financial_context_results[bank] = bank_financial_results

        return financial_context_results

    def create_agreement_based_rankings(self, agreement_results: Dict, financial_results: Dict) -> Dict:
        """Create model rankings based on agreement metrics and financial alignment."""
        print("\n🏆 Ranking models based on agreement and financial alignment...")

        model_scores = {}

        # Initialize scores for each model
        for model in self.models:
            model_scores[model] = {
                'agreement_scores': [],
                'financial_alignment_scores': [],
                'kappa_scores': [],
                'confidence_correlations': []
            }

        # Collect agreement scores
        for bank, bank_data in agreement_results.items():
            for dataset, dataset_data in bank_data.items():
                for pair_name, metrics in dataset_data.items():
                    # Extract model names from pair
                    if '_vs_' in pair_name:
                        model1, model2 = pair_name.split('_vs_')
                        agreement_rate = metrics.get('agreement_rate', 0)
                        kappa = metrics.get('cohen_kappa', 0)

                        # Add scores to both models
                        model_scores[model1]['agreement_scores'].append(agreement_rate)
                        model_scores[model1]['kappa_scores'].append(kappa)
                        model_scores[model2]['agreement_scores'].append(agreement_rate)
                        model_scores[model2]['kappa_scores'].append(kappa)

                        # Confidence correlation if available
                        conf_corr = metrics.get('confidence_correlation')
                        if conf_corr and 'pearson_r' in conf_corr:
                            model_scores[model1]['confidence_correlations'].append(abs(conf_corr['pearson_r']))
                            model_scores[model2]['confidence_correlations'].append(abs(conf_corr['pearson_r']))

        # Collect financial alignment scores
        for bank, bank_data in financial_results.items():
            for dataset, dataset_data in bank_data.items():
                for model, metrics in dataset_data.items():
                    alignment = metrics.get('overall_financial_alignment', 0)
                    model_scores[model]['financial_alignment_scores'].append(alignment)

        # Calculate final rankings
        final_rankings = {}

        for model, scores in model_scores.items():
            # Calculate averages
            avg_agreement = np.mean(scores['agreement_scores']) if scores['agreement_scores'] else 0
            avg_kappa = np.mean(scores['kappa_scores']) if scores['kappa_scores'] else 0
            avg_financial = np.mean(scores['financial_alignment_scores']) if scores['financial_alignment_scores'] else 0
            avg_conf_corr = np.mean(scores['confidence_correlations']) if scores['confidence_correlations'] else 0

            # Create composite score (adjusted weights)
            composite_score = (
                avg_agreement * 0.35 +      # Inter-model agreement
                avg_kappa * 0.25 +          # Statistical agreement quality
                avg_financial * 0.30 +      # Financial context alignment
                avg_conf_corr * 0.10        # Confidence correlation
            )

            final_rankings[model] = {
                'average_agreement': avg_agreement,
                'average_kappa': avg_kappa,
                'average_financial_alignment': avg_financial,
                'average_confidence_correlation': avg_conf_corr,
                'composite_score': composite_score,
                'sample_counts': {
                    'agreement_pairs': len(scores['agreement_scores']),
                    'financial_datasets': len(scores['financial_alignment_scores']),
                    'confidence_pairs': len(scores['confidence_correlations'])
                }
            }

        # Sort by composite score
        sorted_rankings = sorted(final_rankings.items(), key=lambda x: x[1]['composite_score'], reverse=True)

        print("Rankings by composite score:")
        for i, (model, metrics) in enumerate(sorted_rankings):
            print(f"  {i+1}. {model}:")
            print(f"     Composite Score: {metrics['composite_score']:.3f}")
            print(f"     Agreement: {metrics['average_agreement']:.3f}")
            print(f"     Kappa: {metrics['average_kappa']:.3f}")
            print(f"     Financial Alignment: {metrics['average_financial_alignment']:.3f}")
            print(f"     Confidence Correlation: {metrics['average_confidence_correlation']:.3f}")

        return {
            'rankings_by_composite': sorted_rankings,
            'detailed_scores': final_rankings,
            'best_model': sorted_rankings[0][0] if sorted_rankings else None
        }

    def analyze_cross_bank_consistency(self, agreement_results: Dict, financial_results: Dict) -> Dict:
        """Analyze model consistency across banks."""
        print("\n🏦 Analyzing cross-bank consistency...")

        cross_bank_results = {}

        # Analyze agreement consistency
        model_bank_agreement = {}
        for bank, bank_data in agreement_results.items():
            for dataset, dataset_data in bank_data.items():
                for pair_name, metrics in dataset_data.items():
                    if '_vs_' in pair_name:
                        models = pair_name.split('_vs_')
                        agreement = metrics.get('agreement_rate', 0)
                        for model in models:
                            if model not in model_bank_agreement:
                                model_bank_agreement[model] = {}
                            if bank not in model_bank_agreement[model]:
                                model_bank_agreement[model][bank] = []
                            model_bank_agreement[model][bank].append(agreement)

        # Analyze financial alignment consistency
        model_bank_financial = {}
        for bank, bank_data in financial_results.items():
            for dataset, dataset_data in bank_data.items():
                for model, metrics in dataset_data.items():
                    alignment = metrics.get('overall_financial_alignment', 0)
                    if model not in model_bank_financial:
                        model_bank_financial[model] = {}
                    if bank not in model_bank_financial[model]:
                        model_bank_financial[model][bank] = []
                    model_bank_financial[model][bank].append(alignment)

        # Calculate consistency scores
        for model in self.models:
            agreement_scores = []
            financial_scores = []

            # Collect scores across banks
            if model in model_bank_agreement:
                for bank_scores in model_bank_agreement[model].values():
                    agreement_scores.extend(bank_scores)

            if model in model_bank_financial:
                for bank_scores in model_bank_financial[model].values():
                    financial_scores.extend(bank_scores)

            # Calculate consistency (1 - coefficient of variation)
            agreement_consistency = 1 - (np.std(agreement_scores) / (np.mean(agreement_scores) + 1e-8)) if agreement_scores else 0
            financial_consistency = 1 - (np.std(financial_scores) / (np.mean(financial_scores) + 1e-8)) if financial_scores else 0

            cross_bank_results[model] = {
                'agreement_consistency': max(0, agreement_consistency),  # Ensure non-negative
                'financial_consistency': max(0, financial_consistency),
                'agreement_scores_count': len(agreement_scores),
                'financial_scores_count': len(financial_scores),
                'avg_agreement': np.mean(agreement_scores) if agreement_scores else 0,
                'avg_financial': np.mean(financial_scores) if financial_scores else 0
            }

            print(f"  {model}: Agreement consistency={agreement_consistency:.3f}, Financial consistency={financial_consistency:.3f}")

        return cross_bank_results

    def run_fixed_comparison(self) -> Dict:
        """Run fixed model comparison analysis."""
        print(f"\n{'='*60}")
        print("RUNNING FIXED MODEL COMPARISON")
        print(f"{'='*60}")

        # Calculate metrics that work with available data
        agreement_metrics = self.calculate_comprehensive_agreement_metrics()
        financial_context_analysis = self.analyze_financial_context_performance()

        # Create rankings based on available metrics
        model_rankings = self.create_agreement_based_rankings(agreement_metrics, financial_context_analysis)

        # Analyze cross-bank consistency
        cross_bank_analysis = self.analyze_cross_bank_consistency(agreement_metrics, financial_context_analysis)

        self.comparison_results = {
            'agreement_metrics': agreement_metrics,
            'financial_context_analysis': financial_context_analysis,
            'model_rankings': model_rankings,
            'cross_bank_analysis': cross_bank_analysis
        }

        return self.comparison_results

# Initialize fixed comparator
comparator = FixedModelComparator(sentiment_results, finetuning_results)

# Run fixed comparison
comparison_results = comparator.run_fixed_comparison()


Fixed comparator initialized:
  Banks: 2
  Models: 4
  Datasets per bank: 3

RUNNING FIXED MODEL COMPARISON

🤝 Calculating comprehensive agreement metrics...
  [JPM] q1_2025: 4 models available
    finbert_yiyanghkust_vs_finbert_prosusai: 0.789 agreement, κ=0.406
    finbert_yiyanghkust_vs_distilroberta: 0.636 agreement, κ=0.050
    finbert_yiyanghkust_vs_cardiffnlp_roberta: 0.671 agreement, κ=0.309
    finbert_prosusai_vs_distilroberta: 0.741 agreement, κ=0.050
    finbert_prosusai_vs_cardiffnlp_roberta: 0.709 agreement, κ=0.292
    distilroberta_vs_cardiffnlp_roberta: 0.591 agreement, κ=0.076
  [JPM] q2_2025: 4 models available
    finbert_yiyanghkust_vs_finbert_prosusai: 0.782 agreement, κ=0.423
    finbert_yiyanghkust_vs_distilroberta: 0.666 agreement, κ=0.106
    finbert_yiyanghkust_vs_cardiffnlp_roberta: 0.693 agreement, κ=0.361
    finbert_prosusai_vs_distilroberta: 0.741 agreement, κ=0.117
    finbert_prosusai_vs_cardiffnlp_roberta: 0.686 agreement, κ=0.284
    distilroberta_vs

In [12]:
## Enhanced Research Questions Analysis

def analyze_research_questions() -> Dict:
    """Analyze research questions with available data."""
    print(f"\n{'='*60}")
    print("RESEARCH QUESTIONS ANALYSIS")
    print(f"{'='*60}")

    research_results = {}

    # Question 1: Do bankers and analysts show diverging sentiment?
    print("\n1. 📊 Banker vs Analyst Sentiment Divergence")

    speaker_analysis = {}
    for bank in BANKS:
        if bank in sentiment_results:
            for dataset_type, df in sentiment_results[bank].items():
                if df is None or 'speaker_role' not in df.columns:
                    continue

                print(f"  [{bank.upper()}] {dataset_type}:")

                available_models = comparator.identify_available_models(df)
                for model_key in available_models:
                    label_col = f'{model_key}_label'

                    # Filter to analysts and executives
                    speaker_mask = df['speaker_role'].isin(['analyst', 'executive', 'ceo', 'cfo'])
                    speaker_df = df[speaker_mask]

                    if len(speaker_df) == 0:
                        continue

                    # Calculate sentiment distributions
                    try:
                        speaker_sentiment = speaker_df.groupby('speaker_role')[label_col].value_counts(normalize=True).unstack(fill_value=0)

                        if len(speaker_sentiment) >= 2:
                            # Statistical test for difference
                            contingency_table = speaker_df.groupby(['speaker_role', label_col]).size().unstack(fill_value=0)
                            chi2, p_value, dof, expected = chi2_contingency(contingency_table)

                            print(f"    {model_key}: χ²={chi2:.3f}, p={p_value:.3f}")

                            speaker_analysis[f"{bank}_{dataset_type}_{model_key}"] = {
                                'speaker_distributions': speaker_sentiment.to_dict(),
                                'chi2_statistic': chi2,
                                'p_value': p_value,
                                'significant_divergence': p_value < 0.05
                            }
                    except Exception as e:
                        print(f"    {model_key}: Analysis failed - {e}")

    research_results['speaker_divergence'] = speaker_analysis

    # Question 2: Has tone shifted over time?
    print("\n2. ⏰ Temporal Tone Analysis")

    temporal_analysis = {}
    for bank in BANKS:
        if bank in sentiment_results:
            # Compare Q1 vs Q2 if both available
            if 'q1_2025' in sentiment_results[bank] and 'q2_2025' in sentiment_results[bank]:
                q1_df = sentiment_results[bank]['q1_2025']
                q2_df = sentiment_results[bank]['q2_2025']

                if q1_df is not None and q2_df is not None:
                    print(f"  [{bank.upper()}] Q1 vs Q2 comparison:")

                    available_models = set(comparator.identify_available_models(q1_df)) & set(comparator.identify_available_models(q2_df))

                    for model_key in available_models:
                        label_col = f'{model_key}_label'

                        q1_sentiment = q1_df[label_col].value_counts(normalize=True)
                        q2_sentiment = q2_df[label_col].value_counts(normalize=True)

                        # Calculate sentiment shift
                        sentiment_shift = {}
                        for label in ['positive', 'neutral', 'negative']:
                            q1_pct = q1_sentiment.get(label, 0)
                            q2_pct = q2_sentiment.get(label, 0)
                            sentiment_shift[label] = q2_pct - q1_pct

                        # Total variation distance
                        tvd = 0.5 * sum(abs(sentiment_shift[label]) for label in sentiment_shift)

                        print(f"    {model_key}: TVD={tvd:.3f}")

                        temporal_analysis[f"{bank}_{model_key}"] = {
                            'q1_distribution': q1_sentiment.to_dict(),
                            'q2_distribution': q2_sentiment.to_dict(),
                            'sentiment_shift': sentiment_shift,
                            'total_variation_distance': tvd
                        }

    research_results['temporal_shifts'] = temporal_analysis

    # Question 3: Cross-bank tone comparison
    print("\n3. 🏦 Cross-Bank Tone Comparison")

    cross_bank_comparison = {}
    if len(BANKS) >= 2:
        # Compare sentiment distributions across banks
        for model_key in MODELS.keys():
            bank_distributions = {}

            for bank in BANKS:
                if bank in sentiment_results and 'combined' in sentiment_results[bank]:
                    df = sentiment_results[bank]['combined']
                    if df is not None:
                        label_col = f'{model_key}_label'
                        if label_col in df.columns:
                            sentiment_dist = df[label_col].value_counts(normalize=True)
                            bank_distributions[bank] = sentiment_dist.to_dict()

            if len(bank_distributions) >= 2:
                # Calculate pairwise differences
                bank_pairs = list(itertools.combinations(bank_distributions.keys(), 2))

                pairwise_comparisons = {}
                for bank1, bank2 in bank_pairs:
                    dist1 = bank_distributions[bank1]
                    dist2 = bank_distributions[bank2]

                    # Total variation distance
                    labels = set(dist1.keys()) | set(dist2.keys())
                    tvd = 0.5 * sum(abs(dist1.get(label, 0) - dist2.get(label, 0)) for label in labels)

                    pairwise_comparisons[f"{bank1}_vs_{bank2}"] = {
                        'bank1_distribution': dist1,
                        'bank2_distribution': dist2,
                        'total_variation_distance': tvd
                    }

                    print(f"  {model_key} - {bank1.upper()} vs {bank2.upper()}: TVD={tvd:.3f}")

                cross_bank_comparison[model_key] = {
                    'bank_distributions': bank_distributions,
                    'pairwise_comparisons': pairwise_comparisons
                }

    research_results['cross_bank_comparison'] = cross_bank_comparison

    return research_results

# Run research questions analysis
research_questions_results = analyze_research_questions()


RESEARCH QUESTIONS ANALYSIS

1. 📊 Banker vs Analyst Sentiment Divergence
  [JPM] q1_2025:
  [JPM] q2_2025:
  [JPM] combined:
  [HSBC] q1_2025:
  [HSBC] q2_2025:
  [HSBC] combined:

2. ⏰ Temporal Tone Analysis
  [JPM] Q1 vs Q2 comparison:
    finbert_yiyanghkust: TVD=0.069
    cardiffnlp_roberta: TVD=0.047
    finbert_prosusai: TVD=0.065
    distilroberta: TVD=0.008
  [HSBC] Q1 vs Q2 comparison:
    finbert_yiyanghkust: TVD=0.044
    cardiffnlp_roberta: TVD=0.053
    finbert_prosusai: TVD=0.002
    distilroberta: TVD=0.050

3. 🏦 Cross-Bank Tone Comparison
  finbert_yiyanghkust - JPM vs HSBC: TVD=0.143
  finbert_prosusai - JPM vs HSBC: TVD=0.249
  distilroberta - JPM vs HSBC: TVD=0.051
  cardiffnlp_roberta - JPM vs HSBC: TVD=0.108


In [13]:
## Model Characteristics Analysis

def analyze_model_characteristics():
    """Analyze what each model is good at based on available data."""
    print(f"\n{'='*60}")
    print("MODEL CHARACTERISTICS ANALYSIS")
    print(f"{'='*60}")

    # Best for inter-model agreement
    print("\n🤝 Model Agreement Rankings:")
    agreement_scores = {}
    for bank, bank_data in comparison_results['agreement_metrics'].items():
        for dataset, dataset_data in bank_data.items():
            for pair_name, metrics in dataset_data.items():
                if '_vs_' in pair_name:
                    models = pair_name.split('_vs_')
                    agreement = metrics.get('agreement_rate', 0)
                    for model in models:
                        if model not in agreement_scores:
                            agreement_scores[model] = []
                        agreement_scores[model].append(agreement)

    for model, scores in sorted(agreement_scores.items(), key=lambda x: np.mean(x[1]), reverse=True):
        print(f"  {model}: {np.mean(scores):.3f} avg agreement")

    # Best for financial context
    print("\n💰 Financial Context Rankings:")
    financial_scores = {}
    for bank, bank_data in comparison_results['financial_context_analysis'].items():
        for dataset, dataset_data in bank_data.items():
            for model, metrics in dataset_data.items():
                alignment = metrics.get('overall_financial_alignment', 0)
                if model not in financial_scores:
                    financial_scores[model] = []
                financial_scores[model].append(alignment)

    for model, scores in sorted(financial_scores.items(), key=lambda x: np.mean(x[1]), reverse=True):
        print(f"  {model}: {np.mean(scores):.3f} avg financial alignment")

    # Model pairing analysis
    print("\n🔗 Best Model Pairs (Highest Agreement):")
    all_pairs = []
    for bank, bank_data in comparison_results['agreement_metrics'].items():
        for dataset, dataset_data in bank_data.items():
            for pair_name, metrics in dataset_data.items():
                if '_vs_' in pair_name:
                    agreement = metrics.get('agreement_rate', 0)
                    kappa = metrics.get('cohen_kappa', 0)
                    all_pairs.append((pair_name, agreement, kappa))

    # Sort by agreement rate
    top_pairs = sorted(all_pairs, key=lambda x: x[1], reverse=True)[:5]
    for pair, agreement, kappa in top_pairs:
        print(f"  {pair}: {agreement:.3f} agreement, κ={kappa:.3f}")

    # Cross-bank consistency
    print("\n🏦 Cross-Bank Consistency:")
    for model, metrics in comparison_results['cross_bank_analysis'].items():
        agreement_consistency = metrics.get('agreement_consistency', 0)
        financial_consistency = metrics.get('financial_consistency', 0)
        print(f"  {model}: Agreement={agreement_consistency:.3f}, Financial={financial_consistency:.3f}")

analyze_model_characteristics()



MODEL CHARACTERISTICS ANALYSIS

🤝 Model Agreement Rankings:
  finbert_prosusai: 0.719 avg agreement
  finbert_yiyanghkust: 0.708 avg agreement
  cardiffnlp_roberta: 0.681 avg agreement
  distilroberta: 0.610 avg agreement

💰 Financial Context Rankings:
  finbert_prosusai: 0.718 avg financial alignment
  distilroberta: 0.711 avg financial alignment
  finbert_yiyanghkust: 0.697 avg financial alignment
  cardiffnlp_roberta: 0.609 avg financial alignment

🔗 Best Model Pairs (Highest Agreement):
  finbert_yiyanghkust_vs_finbert_prosusai: 0.815 agreement, κ=0.655
  finbert_yiyanghkust_vs_cardiffnlp_roberta: 0.806 agreement, κ=0.622
  finbert_yiyanghkust_vs_finbert_prosusai: 0.798 agreement, κ=0.622
  finbert_yiyanghkust_vs_finbert_prosusai: 0.789 agreement, κ=0.406
  finbert_yiyanghkust_vs_finbert_prosusai: 0.785 agreement, κ=0.418

🏦 Cross-Bank Consistency:
  finbert_yiyanghkust: Agreement=0.879, Financial=0.977
  finbert_prosusai: Agreement=0.875, Financial=0.918
  distilroberta: Agreemen

In [14]:
## Save Enhanced Comparison Results

def save_enhanced_comparison_results():
    """Save all enhanced comparison results."""
    print(f"\n{'='*60}")
    print("SAVING ENHANCED COMPARISON RESULTS")
    print(f"{'='*60}")

    # Compile comprehensive results
    comprehensive_results = {
        'timestamp': pd.Timestamp.now().isoformat(),
        'banks_analyzed': BANKS,
        'models_compared': list(MODELS.keys()),
        'analysis_type': 'agreement_based_without_manual_validation',
        'analysis_components': {
            'model_comparison': comparison_results,
            'research_questions': research_questions_results,
            'finetuning_integration': finetuning_results
        },
        'summary_statistics': {
            'total_models_compared': len(MODELS),
            'total_banks_analyzed': len(BANKS),
            'best_model': comparison_results.get('model_rankings', {}).get('best_model'),
            'analysis_timestamp': pd.Timestamp.now().isoformat()
        }
    }

    # Save main results for each bank
    for bank in BANKS:
        results_file = comparison_paths[bank]["results_comparison"] / f"fixed_model_comparison_{bank}.json"
        try:
            with open(results_file, 'w') as f:
                json.dump(comprehensive_results, f, indent=2, default=str)
            print(f"  ✅ {bank.upper()} comparison results: {results_file}")
        except Exception as e:
            print(f"  ❌ Failed to save {bank.upper()} results: {e}")

    # Save consolidated multi-bank results
    consolidated_file = drive_base / "results" / "fixed_multi_bank_model_comparison.json"
    consolidated_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(consolidated_file, 'w') as f:
            json.dump(comprehensive_results, f, indent=2, default=str)
        print(f"  ✅ Consolidated comparison: {consolidated_file}")
    except Exception as e:
        print(f"  ❌ Failed to save consolidated results: {e}")

    return comprehensive_results

# Save enhanced comparison results
saved_comparison_results = save_enhanced_comparison_results()


SAVING ENHANCED COMPARISON RESULTS
  ✅ JPM comparison results: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/results/comparison/jpm/fixed_model_comparison_jpm.json
  ✅ HSBC comparison results: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/results/comparison/hsbc/fixed_model_comparison_hsbc.json
  ✅ Consolidated comparison: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/results/fixed_multi_bank_model_comparison.json


In [15]:
## Final Summary

print(f"\n{'='*60}")
print("FIXED MODEL COMPARISON COMPLETE")
print(f"{'='*60}")

# Performance summary
if 'model_rankings' in comparison_results and 'best_model' in comparison_results['model_rankings']:
    best_model = comparison_results['model_rankings']['best_model']
    print(f"\n🏆 Best Performing Model: {best_model}")

    # Show rankings
    rankings = comparison_results['model_rankings']['rankings_by_composite']
    print(f"\nFinal Rankings:")
    for i, (model, metrics) in enumerate(rankings):
        print(f"  {i+1}. {model}: {metrics['composite_score']:.3f} composite score")

# Model agreement summary
if 'agreement_metrics' in comparison_results:
    print(f"\n🤝 Model Agreement Summary:")
    all_agreements = []

    for bank, bank_data in comparison_results['agreement_metrics'].items():
        for dataset_type, dataset_data in bank_data.items():
            for pair, metrics in dataset_data.items():
                agreement_rate = metrics.get('agreement_rate', 0)
                all_agreements.append(agreement_rate)

    if all_agreements:
        avg_agreement = np.mean(all_agreements)
        print(f"  Average inter-model agreement: {avg_agreement:.3f}")

# Research questions summary
print(f"\n📋 Research Questions Summary:")

if 'speaker_divergence' in research_questions_results:
    significant_divergences = sum(
        1 for analysis in research_questions_results['speaker_divergence'].values()
        if analysis.get('significant_divergence', False)
    )
    total_tests = len(research_questions_results['speaker_divergence'])
    if total_tests > 0:
        print(f"  Banker vs Analyst divergence: {significant_divergences}/{total_tests} tests show significant differences")

if 'temporal_shifts' in research_questions_results:
    temporal_shifts = research_questions_results['temporal_shifts'].values()
    if temporal_shifts:
        avg_temporal_shift = np.mean([
            analysis.get('total_variation_distance', 0)
            for analysis in temporal_shifts
        ])
        print(f"  Average temporal shift (TVD): {avg_temporal_shift:.3f}")

if 'cross_bank_comparison' in research_questions_results:
    cross_bank_diffs = []
    for model_data in research_questions_results['cross_bank_comparison'].values():
        for pair_data in model_data.get('pairwise_comparisons', {}).values():
            cross_bank_diffs.append(pair_data.get('total_variation_distance', 0))

    if cross_bank_diffs:
        avg_cross_bank_diff = np.mean(cross_bank_diffs)
        print(f"  Average cross-bank difference (TVD): {avg_cross_bank_diff:.3f}")

# File summary
print(f"\n📁 Files Generated:")
print(f"  Fixed comparison results: Saved for all {len(BANKS)} banks")
print(f"  Agreement-based analysis: Complete")
print(f"  Research analysis: Statistical testing complete")

print(f"\n🚀 Fixed model comparison framework complete!")
print(f"   Analysis based on inter-model agreement and financial context")


FIXED MODEL COMPARISON COMPLETE

🏆 Best Performing Model: finbert_prosusai

Final Rankings:
  1. finbert_prosusai: 0.582 composite score
  2. finbert_yiyanghkust: 0.577 composite score
  3. cardiffnlp_roberta: 0.526 composite score
  4. distilroberta: 0.472 composite score

🤝 Model Agreement Summary:
  Average inter-model agreement: 0.680

📋 Research Questions Summary:
  Average temporal shift (TVD): 0.042
  Average cross-bank difference (TVD): 0.138

📁 Files Generated:
  Fixed comparison results: Saved for all 2 banks
  Agreement-based analysis: Complete
  Research analysis: Statistical testing complete

🚀 Fixed model comparison framework complete!
   Analysis based on inter-model agreement and financial context
