In [None]:
# 05_model_comparison_jpm_enhanced.ipynb
# Purpose: Enhanced model comparison with fine-tuned models and comprehensive evaluation
# Input: Enhanced sentiment analysis results + fine-tuned models
# Output: Comprehensive model comparison with performance metrics and insights

## Import Libraries

# Standard library
import itertools
import json
import re
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Core data / plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Enhanced statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr, wilcoxon, mannwhitneyu
from sklearn.metrics import (
    confusion_matrix, classification_report, cohen_kappa_score,
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    average_precision_score, balanced_accuracy_score
)

# Advanced model evaluation
from sklearn.calibration import calibration_curve
# `sklearn.model_selection` does not export `bootstrap_resample` (this import
# raised ImportError and aborted the whole cell). Bootstrap resampling is
# provided by `sklearn.utils.resample`, so fall back to it under the old name.
try:
    from sklearn.model_selection import bootstrap_resample
except ImportError:
    from sklearn.utils import resample as bootstrap_resample

# Visualization enhancements
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Statistical tests
from scipy.stats import chi2_contingency, fisher_exact

# Location A: Google Drive (Primary drive)
from google.colab import drive
drive.mount("/content/drive")

# The project configuration is a JSON file stored on the mounted drive.
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with config_path.open("r") as f:
    config = json.load(f)

# Settings read from the configuration file.
SEED, BANK_CODE = config["SEED"], config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

print(f"Enhanced model comparison analysis for bank: {BANK_CODE.upper()}")

ImportError: cannot import name 'bootstrap_resample' from 'sklearn.model_selection' (/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/__init__.py)

In [None]:
## Define Paths

# Input and output locations, all rooted at the Drive base directory.
results_sentiment_path = drive_base.joinpath("results/sentiment/jpm")
results_comparison_path = drive_base.joinpath("results/comparison/jpm")
viz_path = drive_base.joinpath("outputs/visualizations/jpm")
models_path = drive_base.joinpath("models")
finetuned_models_path = models_path.joinpath("finetuned")

# Ensure directories exist (only the folders this notebook writes to)
for _output_dir in (results_comparison_path, viz_path):
    _output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
## Load Enhanced Results and Fine-tuned Models

def load_enhanced_sentiment_results(
    filename: str, base_dir: Optional[Path] = None
) -> Optional[pd.DataFrame]:
    """Load one enhanced sentiment-analysis CSV.

    Args:
        filename: Name of the CSV file inside ``base_dir``.
        base_dir: Directory to read from. When omitted, the notebook-level
            ``results_sentiment_path`` is used.

    Returns:
        The parsed DataFrame, or ``None`` when the file does not exist or
        cannot be read. A message is printed in either failure case, so
        callers must check for ``None`` before using the result.
    """
    directory = results_sentiment_path if base_dir is None else Path(base_dir)
    file_path = directory / filename

    if not file_path.exists():
        print(f"Warning: File not found: {file_path}")
        return None

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the notebook
        # continue with whatever inputs could be loaded.
        print(f"Error loading {filename}: {str(e)}")
        return None

    print(f"Loaded {filename}: {df.shape}")
    return df

def _read_json_if_present(path: Path, description: str) -> Dict:
    """Return the parsed JSON at ``path``, or ``{}`` if it is missing or unreadable."""
    if not path.exists():
        return {}
    try:
        with open(path, 'r') as f:
            payload = json.load(f)
    except Exception as e:
        # Best-effort: a broken artefact must not stop the comparison.
        print(f"Could not load {description}: {e}")
        return {}
    print(f"Loaded {description}")
    return payload


def load_finetuning_results(base_dir: Optional[Path] = None) -> Tuple[Dict, Dict]:
    """Load fine-tuning results and the model registry.

    Args:
        base_dir: Directory holding ``finetuning_results.json`` and
            ``model_registry.json``. When omitted, the notebook-level
            ``finetuned_models_path`` is used.

    Returns:
        ``(finetuning_results, model_registry)``. Each element is ``{}`` when
        its file is absent or cannot be parsed.
    """
    directory = finetuned_models_path if base_dir is None else Path(base_dir)

    finetuning_results = _read_json_if_present(
        directory / "finetuning_results.json", "fine-tuning results"
    )
    model_registry = _read_json_if_present(
        directory / "model_registry.json", "model registry"
    )
    return finetuning_results, model_registry

print("Loading enhanced sentiment analysis results...")

# Sentence-level result files (Q1, Q2 and the combined file), loaded in this order
enhanced_sentiment_q1_df, enhanced_sentiment_q2_df, enhanced_sentiment_multi_df = map(
    load_enhanced_sentiment_results,
    (
        "enhanced_sentiment_sentence_jpm_q1_2025.csv",
        "enhanced_sentiment_sentence_jpm_q2_2025.csv",
        "enhanced_sentiment_sentence_jpm_multi_2025.csv",
    ),
)

# Aggregated result files (Q&A, speaker and topic level), loaded in this order
enhanced_qa_level_df, enhanced_speaker_level_df, enhanced_topic_level_df = map(
    load_enhanced_sentiment_results,
    (
        "enhanced_sentiment_qa_jpm_multi_2025.csv",
        "enhanced_sentiment_speaker_jpm_multi_2025.csv",
        "enhanced_sentiment_topic_jpm_multi_2025.csv",
    ),
)

# Fine-tuning artefacts: results dictionary and model registry
finetuning_results, model_registry = load_finetuning_results()


In [None]:
## Enhanced Model Comparison Framework

class EnhancedModelComparator:
    """Pairwise comparison of model predictions held in one DataFrame.

    A model named ``m`` contributes an ``m_label`` column and, optionally,
    ``m_score`` and/or ``m_calibrated`` confidence columns.
    ``evaluate_calibration_quality`` additionally reads ``human_label``.
    """

    def __init__(self, df: Optional[pd.DataFrame]):
        """Store ``df`` and detect which model columns it contains.

        Args:
            df: Table of predictions. ``None`` produces a comparator with no
                models, whose analysis methods return empty results.
        """
        self.df = df
        self.models = self.identify_available_models()
        self.comparison_results = {}

    def identify_available_models(self) -> List[str]:
        """Identify all available model predictions in the dataset.

        Returns:
            Model names (label column name without the ``_label`` suffix) for
            the three standard models that are present, followed by every
            column ending in ``_finetuned_label``.
        """
        if self.df is None:
            return []

        model_columns = []

        # Standard models
        for model in ('finbert_tone', 'prosus', 'ensemble'):
            if f"{model}_label" in self.df.columns:
                model_columns.append(model)

        # Check for fine-tuned model results
        for col in self.df.columns:
            if col.endswith('_finetuned_label'):
                model_columns.append(col.replace('_finetuned_label', '_finetuned'))

        print(f"Available models for comparison: {model_columns}")
        return model_columns

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _score_column(self, model: str) -> str:
        """Return the confidence column for ``model``, preferring the calibrated one."""
        calibrated = f"{model}_calibrated"
        return calibrated if calibrated in self.df.columns else f"{model}_score"

    def _iter_valid_pairs(self):
        """Yield ``(model1, model2, label_col1, label_col2, valid_df)`` per model pair.

        ``valid_df`` holds the rows where both models produced a label. Pairs
        with a missing label column or no jointly labelled rows are skipped.
        """
        for model1, model2 in itertools.combinations(self.models, 2):
            label_col1 = f"{model1}_label"
            label_col2 = f"{model2}_label"

            if label_col1 not in self.df.columns or label_col2 not in self.df.columns:
                continue

            valid_mask = self.df[label_col1].notna() & self.df[label_col2].notna()
            valid_df = self.df[valid_mask]

            if len(valid_df) == 0:
                continue

            yield model1, model2, label_col1, label_col2, valid_df

    @staticmethod
    def _score_correlation(valid_df: pd.DataFrame, score_col1: str,
                           score_col2: str) -> Optional[Dict]:
        """Pearson and Spearman correlation between two confidence columns.

        Returns ``None`` when either column is absent or when fewer than two
        rows are available (``pearsonr`` raises ``ValueError`` in that case).
        Missing confidences are imputed with 0.5 before correlating.
        """
        if score_col1 not in valid_df.columns or score_col2 not in valid_df.columns:
            return None
        if len(valid_df) < 2:
            return None

        scores1 = valid_df[score_col1].fillna(0.5)
        scores2 = valid_df[score_col2].fillna(0.5)

        pearson_r, pearson_p = pearsonr(scores1, scores2)
        spearman_r, spearman_p = spearmanr(scores1, scores2)

        return {
            'pearson': {'r': pearson_r, 'p_value': pearson_p},
            'spearman': {'r': spearman_r, 'p_value': spearman_p}
        }

    @staticmethod
    def _expected_calibration_error(predictions: np.ndarray, true_labels: np.ndarray,
                                    confidences: np.ndarray, n_bins: int = 10):
        """Expected Calibration Error over ``n_bins`` equal-width bins on (0, 1].

        Bins are left-open, so a confidence of exactly 0 (or NaN) falls into
        no bin while still counting towards the total used for bin weights.
        """
        bin_boundaries = np.linspace(0, 1, n_bins + 1)

        ece = 0
        for bin_lower, bin_upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
            in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
            prop_in_bin = in_bin.mean()

            if prop_in_bin > 0:
                accuracy_in_bin = (predictions[in_bin] == true_labels[in_bin]).mean()
                avg_confidence_in_bin = confidences[in_bin].mean()
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

    # ------------------------------------------------------------------
    # Analyses
    # ------------------------------------------------------------------

    def calculate_comprehensive_agreement_metrics(self) -> Dict:
        """Calculate comprehensive agreement metrics between all model pairs.

        Returns:
            ``{"<model1>_vs_<model2>": {...}}`` with the sample size, raw
            agreement rate, Cohen's kappa, confidence correlations (or
            ``None``), per-class agreement conditioned on model1's label, and
            a chi-square test on the pair's confusion matrix. Empty when fewer
            than two models are available.
        """
        if len(self.models) < 2:
            return {}

        print("Calculating comprehensive agreement metrics...")

        agreement_results = {}

        for model1, model2, label_col1, label_col2, valid_df in self._iter_valid_pairs():
            pair_name = f"{model1}_vs_{model2}"

            labels1 = valid_df[label_col1].values
            labels2 = valid_df[label_col2].values

            # Basic agreement
            agreement_rate = (labels1 == labels2).mean()

            # Cohen's Kappa
            kappa = cohen_kappa_score(labels1, labels2)

            # Confidence correlations (None when not computable)
            score_correlation = self._score_correlation(
                valid_df, self._score_column(model1), self._score_column(model2)
            )

            # Class-wise agreement: among rows model1 assigned to `label`,
            # the share where model2 assigned the same label.
            class_agreement = {}
            for label in set(labels1) | set(labels2):
                mask1 = labels1 == label
                if mask1.sum() > 0:
                    class_agreement[label] = (labels1[mask1] == labels2[mask1]).mean()

            # Statistical significance test; failures are recorded, not raised.
            try:
                contingency_table = confusion_matrix(labels1, labels2)
                chi2, p_value, dof, expected = chi2_contingency(contingency_table)

                statistical_test = {
                    'test': 'chi_square',
                    'statistic': chi2,
                    'p_value': p_value,
                    'degrees_of_freedom': dof
                }
            except Exception as e:
                statistical_test = {'error': str(e)}

            agreement_results[pair_name] = {
                'model1': model1,
                'model2': model2,
                'sample_size': len(valid_df),
                'agreement_rate': agreement_rate,
                'cohen_kappa': kappa,
                'score_correlation': score_correlation,
                'class_agreement': class_agreement,
                'statistical_test': statistical_test
            }

            print(f"  {pair_name}: Agreement={agreement_rate:.3f}, Kappa={kappa:.3f}")

        return agreement_results

    def analyze_prediction_distributions(self) -> Dict:
        """Analyze prediction distributions across models.

        Returns:
            Per model: the normalized label distribution, its entropy (natural
            log), summary statistics for each available confidence column, and
            the number of non-null predictions.
        """
        print("Analyzing prediction distributions...")

        distribution_results = {}

        for model in self.models:
            label_col = f"{model}_label"
            if label_col not in self.df.columns:
                continue

            # Distribution
            distribution = self.df[label_col].value_counts(normalize=True).sort_index()

            # Entropy (uncertainty measure); the epsilon guards against log(0)
            entropy = -sum(p * np.log(p + 1e-8) for p in distribution.values)

            # Confidence statistics
            confidence_stats = {}
            for score_col in (f"{model}_score", f"{model}_calibrated"):
                if score_col not in self.df.columns:
                    continue
                scores = self.df[score_col].dropna()
                if len(scores) > 0:
                    confidence_stats[score_col] = {
                        'mean': scores.mean(),
                        'std': scores.std(),
                        'median': scores.median(),
                        'min': scores.min(),
                        'max': scores.max()
                    }

            distribution_results[model] = {
                'distribution': distribution.to_dict(),
                'entropy': entropy,
                'confidence_stats': confidence_stats,
                'total_predictions': self.df[label_col].notna().sum()
            }

        return distribution_results

    def evaluate_calibration_quality(self) -> Dict:
        """Evaluate confidence calibration quality for each model.

        Requires a ``human_label`` column. For every model and each of its
        available confidence columns the Expected Calibration Error is
        computed on rows that have both a human label and a model label.

        Returns:
            ``{model: {score_col: {"expected_calibration_error", "sample_size"}}}``
            or ``{}`` when no ground truth is available. A failure for one
            score column is stored as ``{"error": "..."}``.
        """
        print("Evaluating confidence calibration...")

        calibration_results = {}

        if self.df is None:
            return {}

        # Need ground truth for calibration evaluation
        if 'human_label' not in self.df.columns:
            print("No ground truth labels available for calibration evaluation")
            return {}

        eval_df = self.df[self.df['human_label'].notna()]

        if len(eval_df) == 0:
            return {}

        for model in self.models:
            label_col = f"{model}_label"

            if label_col not in eval_df.columns:
                continue

            model_eval_df = eval_df[eval_df[label_col].notna()]

            if len(model_eval_df) == 0:
                continue

            calibration_results[model] = {}

            for score_col in (f"{model}_calibrated", f"{model}_score"):
                if score_col not in model_eval_df.columns:
                    continue

                try:
                    ece = self._expected_calibration_error(
                        model_eval_df[label_col].values,
                        model_eval_df['human_label'].values,
                        model_eval_df[score_col].values,
                        n_bins=10,
                    )
                    calibration_results[model][score_col] = {
                        'expected_calibration_error': ece,
                        'sample_size': len(model_eval_df)
                    }
                except Exception as e:
                    calibration_results[model][score_col] = {'error': str(e)}

        return calibration_results

    def analyze_disagreement_patterns(self) -> Dict:
        """Deep analysis of model disagreement patterns.

        Returns:
            Per model pair with at least one disagreement: counts and rate,
            the (label1, label2) pattern counts, and, when the corresponding
            columns exist, breakdowns by ``speaker_role`` and
            ``primary_topic``, a ``sentence_length`` comparison, and a
            confidence summary. Empty when fewer than two models exist.
        """
        print("Analyzing disagreement patterns...")

        if len(self.models) < 2:
            return {}

        disagreement_results = {}

        for model1, model2, label_col1, label_col2, valid_df in self._iter_valid_pairs():
            pair_name = f"{model1}_vs_{model2}"

            # Find disagreements
            disagreement_mask = valid_df[label_col1] != valid_df[label_col2]
            disagreements = valid_df[disagreement_mask]

            if len(disagreements) == 0:
                continue

            # Disagreement patterns, keyed by (model1 label, model2 label)
            disagreement_patterns = disagreements.groupby([label_col1, label_col2]).size()

            disagreement_analysis = {
                'total_disagreements': len(disagreements),
                'disagreement_rate': len(disagreements) / len(valid_df),
                'disagreement_patterns': disagreement_patterns.to_dict()
            }

            # By speaker role
            if 'speaker_role' in disagreements.columns:
                disagreement_analysis['by_speaker_role'] = (
                    disagreements['speaker_role'].value_counts().to_dict()
                )

            # By topic
            if 'primary_topic' in disagreements.columns:
                disagreement_analysis['by_topic'] = (
                    disagreements['primary_topic'].value_counts().to_dict()
                )

            # By text length
            if 'sentence_length' in disagreements.columns:
                avg_disagreement_length = disagreements['sentence_length'].mean()
                avg_agreement_length = valid_df[~disagreement_mask]['sentence_length'].mean()

                disagreement_analysis['text_length_analysis'] = {
                    'avg_disagreement_length': avg_disagreement_length,
                    'avg_agreement_length': avg_agreement_length,
                    'length_difference': avg_disagreement_length - avg_agreement_length
                }

            # Confidence analysis for disagreements
            score_col1 = self._score_column(model1)
            score_col2 = self._score_column(model2)

            if score_col1 in disagreements.columns and score_col2 in disagreements.columns:
                disagreement_analysis['confidence_analysis'] = {
                    'avg_confidence_model1': disagreements[score_col1].mean(),
                    'avg_confidence_model2': disagreements[score_col2].mean(),
                    # Both models were confident (> 0.8) yet disagreed
                    'high_confidence_disagreements': (
                        (disagreements[score_col1] > 0.8) &
                        (disagreements[score_col2] > 0.8)
                    ).sum()
                }

            disagreement_results[pair_name] = disagreement_analysis

        return disagreement_results

# Initialize enhanced comparator
print("\n" + "="*60)
print("ENHANCED MODEL COMPARISON ANALYSIS")
print("="*60)

# Pre-initialise every result so that later cells can reference these names
# even when the multi-quarter input file could not be loaded.
enhanced_comparator = None
agreement_results = {}
distribution_results = {}
calibration_results = {}
disagreement_results = {}

if enhanced_sentiment_multi_df is not None:
    enhanced_comparator = EnhancedModelComparator(enhanced_sentiment_multi_df)

    # Run comprehensive analysis
    print("\nRunning comprehensive model comparison...")

    # Agreement metrics
    agreement_results = enhanced_comparator.calculate_comprehensive_agreement_metrics()

    # Distribution analysis
    distribution_results = enhanced_comparator.analyze_prediction_distributions()

    # Calibration quality
    calibration_results = enhanced_comparator.evaluate_calibration_quality()

    # Disagreement patterns
    disagreement_results = enhanced_comparator.analyze_disagreement_patterns()

    # Keep all four analyses on the comparator under stable keys
    enhanced_comparator.comparison_results = {
        'agreement_metrics': agreement_results,
        'distribution_analysis': distribution_results,
        'calibration_quality': calibration_results,
        'disagreement_patterns': disagreement_results
    }


In [None]:
## Performance Evaluation with Manual Labels

class PerformanceEvaluator:
    """Comprehensive performance evaluation against manual labels.

    Every column ending in ``_label`` (other than ``human_label``) is treated
    as a model's predictions and scored against ``human_label`` on the rows
    where a human label exists.
    """

    def __init__(self, df: Optional[pd.DataFrame]):
        """Keep ``df`` and extract the manually labelled subset.

        Args:
            df: Table with prediction columns and, ideally, ``human_label``.
                When it is ``None`` or lacks ``human_label``, ``eval_df`` is
                ``None`` and evaluation returns empty results.
        """
        self.df = df
        self.evaluation_results = {}

        # Filter to manually labeled data
        if df is not None and 'human_label' in df.columns:
            self.eval_df = df[df['human_label'].notna()].copy()
            print(f"Performance evaluation on {len(self.eval_df)} manually labeled records")
        else:
            self.eval_df = None
            print("No manual labels available for performance evaluation")

    def evaluate_all_models(self) -> Dict:
        """Evaluate all available models against manual labels.

        The results are also stored on ``self.evaluation_results`` so that
        ``rank_models_by_performance`` can be called directly afterwards.

        Returns:
            ``{model_name: metrics_dict}``; empty when no labelled rows exist.
        """
        if self.eval_df is None or len(self.eval_df) == 0:
            return {}

        print("Evaluating all models against manual labels...")

        evaluation_results = {}

        # Identify available models
        model_columns = [col for col in self.eval_df.columns if col.endswith('_label')]
        model_names = [col.replace('_label', '') for col in model_columns]

        for model_name in model_names:
            if model_name == 'human':  # Skip human labels
                continue

            label_col = f"{model_name}_label"

            if label_col not in self.eval_df.columns:
                continue

            # Filter valid predictions
            valid_eval_df = self.eval_df[self.eval_df[label_col].notna()]

            if len(valid_eval_df) == 0:
                continue

            true_labels = valid_eval_df['human_label'].values
            pred_labels = valid_eval_df[label_col].values

            # Get confidence scores, preferring the calibrated column
            confidence_scores = None
            for score_col in (f"{model_name}_calibrated", f"{model_name}_score"):
                if score_col in valid_eval_df.columns:
                    confidence_scores = valid_eval_df[score_col].values
                    break

            # Calculate comprehensive metrics
            model_evaluation = self.calculate_comprehensive_metrics(
                true_labels, pred_labels, confidence_scores, model_name
            )

            evaluation_results[model_name] = model_evaluation

            # Print key metrics
            print(f"  {model_name}:")
            print(f"    Accuracy: {model_evaluation['accuracy']:.3f}")
            print(f"    F1 (weighted): {model_evaluation['f1_weighted']:.3f}")
            print(f"    F1 (macro): {model_evaluation['f1_macro']:.3f}")

        # Ranking reads from the instance; without this assignment it would
        # always see the empty dict created in __init__.
        self.evaluation_results = evaluation_results

        return evaluation_results

    def calculate_comprehensive_metrics(self, true_labels: np.ndarray, pred_labels: np.ndarray,
                                      confidence_scores: np.ndarray = None, model_name: str = "") -> Dict:
        """Calculate comprehensive evaluation metrics.

        Args:
            true_labels: Ground-truth labels.
            pred_labels: Predicted labels, aligned with ``true_labels``.
            confidence_scores: Optional per-prediction confidences; when given,
                a ``confidence_metrics`` entry is added to the result.
            model_name: Name stored in the result for reference.

        Returns:
            Dict with accuracy, balanced accuracy, macro/weighted
            precision/recall/F1, Cohen's kappa, the confusion matrix, the
            classification report and per-class metrics.
        """
        # Basic metrics
        accuracy = accuracy_score(true_labels, pred_labels)
        balanced_accuracy = balanced_accuracy_score(true_labels, pred_labels)

        # Precision, Recall, F1
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            true_labels, pred_labels, average='macro', zero_division=0
        )
        precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
            true_labels, pred_labels, average='weighted', zero_division=0
        )

        # Per-class metrics
        precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
            true_labels, pred_labels, average=None, zero_division=0
        )

        # Confusion matrix
        cm = confusion_matrix(true_labels, pred_labels)

        # Classification report
        class_report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)

        # Cohen's Kappa
        kappa = cohen_kappa_score(true_labels, pred_labels)

        evaluation_result = {
            'model_name': model_name,
            'sample_size': len(true_labels),
            'accuracy': accuracy,
            'balanced_accuracy': balanced_accuracy,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'f1_macro': f1_macro,
            'precision_weighted': precision_weighted,
            'recall_weighted': recall_weighted,
            'f1_weighted': f1_weighted,
            'cohen_kappa': kappa,
            'confusion_matrix': cm.tolist(),
            'classification_report': class_report,
            'per_class_metrics': {
                'precision': precision_per_class.tolist(),
                'recall': recall_per_class.tolist(),
                'f1': f1_per_class.tolist(),
                'support': support.tolist()
            }
        }

        # Confidence-based metrics
        if confidence_scores is not None:
            evaluation_result['confidence_metrics'] = self.calculate_confidence_metrics(
                true_labels, pred_labels, confidence_scores
            )

        return evaluation_result

    def calculate_confidence_metrics(self, true_labels: np.ndarray, pred_labels: np.ndarray,
                                   confidence_scores: np.ndarray) -> Dict:
        """Calculate confidence-based evaluation metrics.

        Returns:
            Accuracy and coverage at confidence thresholds 0.5-0.9, the
            Expected Calibration Error over 10 equal-width bins, the mean and
            standard deviation of the confidences, and the per-bin
            calibration curve (empty bins are reported as zeros).
        """
        correct_predictions = (true_labels == pred_labels)

        # Accuracy at different confidence thresholds
        accuracy_at_threshold = {}
        coverage_at_threshold = {}

        for threshold in (0.5, 0.6, 0.7, 0.8, 0.9):
            high_conf_mask = confidence_scores >= threshold
            if high_conf_mask.sum() > 0:
                accuracy_at_threshold[threshold] = correct_predictions[high_conf_mask].mean()
                coverage_at_threshold[threshold] = high_conf_mask.mean()
            else:
                accuracy_at_threshold[threshold] = None
                coverage_at_threshold[threshold] = 0.0

        # Expected Calibration Error (ECE)
        n_bins = 10
        bin_boundaries = np.linspace(0, 1, n_bins + 1)

        ece = 0
        bin_accuracies = []
        bin_confidences = []
        bin_counts = []

        # Bins are left-open: a confidence of exactly 0 falls into no bin.
        for bin_lower, bin_upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
            in_bin = (confidence_scores > bin_lower) & (confidence_scores <= bin_upper)
            prop_in_bin = in_bin.mean()

            if prop_in_bin > 0:
                accuracy_in_bin = correct_predictions[in_bin].mean()
                avg_confidence_in_bin = confidence_scores[in_bin].mean()
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

                bin_accuracies.append(accuracy_in_bin)
                bin_confidences.append(avg_confidence_in_bin)
                bin_counts.append(in_bin.sum())
            else:
                bin_accuracies.append(0.0)
                bin_confidences.append(0.0)
                bin_counts.append(0)

        return {
            'accuracy_at_threshold': accuracy_at_threshold,
            'coverage_at_threshold': coverage_at_threshold,
            'expected_calibration_error': ece,
            'average_confidence': confidence_scores.mean(),
            'confidence_std': confidence_scores.std(),
            'calibration_curve': {
                'bin_accuracies': bin_accuracies,
                'bin_confidences': bin_confidences,
                'bin_counts': bin_counts
            }
        }

    def rank_models_by_performance(self) -> Dict:
        """Rank models by various performance metrics.

        Reads ``self.evaluation_results`` (populated by
        ``evaluate_all_models``).

        Returns:
            ``rankings_by_criterion`` (descending ``(model, score)`` lists),
            ``overall_ranking`` (ascending average rank across criteria) and
            ``best_model``; ``{}`` when nothing has been evaluated.
        """
        if not self.evaluation_results:
            return {}

        print("Ranking models by performance...")

        # Different ranking criteria
        ranking_criteria = ['accuracy', 'f1_weighted', 'f1_macro', 'cohen_kappa', 'balanced_accuracy']

        rankings = {}

        for criterion in ranking_criteria:
            model_scores = [
                (model_name, results[criterion])
                for model_name, results in self.evaluation_results.items()
                if criterion in results
            ]

            # Sort by score (descending)
            model_scores.sort(key=lambda x: x[1], reverse=True)
            rankings[criterion] = model_scores

            print(f"\nRanking by {criterion}:")
            for i, (model_name, score) in enumerate(model_scores[:5]):  # Top 5
                print(f"  {i+1}. {model_name}: {score:.3f}")

        # 1-indexed position of each model within each criterion's ranking
        rank_lookup = {
            criterion: {name: position + 1 for position, (name, _) in enumerate(ranking)}
            for criterion, ranking in rankings.items()
        }

        # Overall ranking (average rank across criteria)
        model_avg_ranks = {}
        for model_name in self.evaluation_results.keys():
            ranks = [
                rank_lookup[criterion][model_name]
                for criterion in ranking_criteria
                if model_name in rank_lookup[criterion]
            ]
            if ranks:
                model_avg_ranks[model_name] = np.mean(ranks)

        # Sort by average rank (lower is better)
        overall_ranking = sorted(model_avg_ranks.items(), key=lambda x: x[1])

        print(f"\nOverall ranking (average across criteria):")
        for i, (model_name, avg_rank) in enumerate(overall_ranking):
            print(f"  {i+1}. {model_name}: {avg_rank:.1f}")

        return {
            'rankings_by_criterion': rankings,
            'overall_ranking': overall_ranking,
            'best_model': overall_ranking[0][0] if overall_ranking else None
        }

# Run performance evaluation
performance_evaluator = None
performance_evaluation_results = {}
model_rankings = {}  # stays empty when no input data is available

if enhanced_sentiment_multi_df is not None:
    performance_evaluator = PerformanceEvaluator(enhanced_sentiment_multi_df)
    performance_evaluation_results = performance_evaluator.evaluate_all_models()

    # rank_models_by_performance() reads evaluation_results from the instance,
    # so the results must be attached before ranking is requested.
    performance_evaluator.evaluation_results = performance_evaluation_results
    model_rankings = performance_evaluator.rank_models_by_performance()


In [None]:
## Enhanced Financial Context Analysis

def analyze_financial_context_performance_enhanced(df: pd.DataFrame) -> Dict:
    """Enhanced analysis of model performance in financial context.

    A keyword heuristic derives an "expected" sentiment for each row of
    ``df['text']`` (bullish keyword hits minus bearish keyword hits), and each
    model's ``*_label`` column is compared with it.

    Args:
        df: Table with a ``text`` column and one ``<model>_label`` column per
            model. ``human_label`` is ignored. May be ``None``.

    Returns:
        ``{model_name: {...}}`` with the overall alignment rate, alignment on
        strong signals (``|score| >= 2``, ``None`` if there are none),
        per-topic alignment for topics with at least 5 rows, the sample size
        and the number of strong-signal rows. ``{}`` when ``df`` is ``None``
        or has no ``text`` column.
    """
    if df is None:
        return {}

    print("\nEnhanced financial context analysis...")

    if 'text' not in df.columns:
        print("Warning: no 'text' column available; skipping financial context analysis")
        return {}

    # Enhanced financial keyword analysis
    financial_indicators = {
        'bullish_indicators': [
            'growth', 'profit', 'increase', 'strong', 'improved', 'positive',
            'beat', 'exceed', 'outperform', 'robust', 'solid', 'expansion'
        ],
        'bearish_indicators': [
            'loss', 'decline', 'decrease', 'weak', 'poor', 'negative',
            'miss', 'underperform', 'below', 'concern', 'risk', 'challenge'
        ],
        'neutral_indicators': [
            'stable', 'maintain', 'steady', 'consistent', 'unchanged', 'flat'
        ]
    }

    # Calculate financial sentiment scores
    df_analysis = df.copy()

    # Missing text counts as "no keyword hits" instead of propagating NaN.
    lowered_text = df_analysis['text'].fillna('').astype(str).str.lower()

    # NOTE(review): keywords are matched as substrings (no word boundaries),
    # so e.g. 'flat' also matches inside 'inflation' - confirm this is intended.
    for indicator_type, keywords in financial_indicators.items():
        pattern = '|'.join(re.escape(kw) for kw in keywords)
        df_analysis[f'{indicator_type}_count'] = lowered_text.str.count(pattern)

    # Create expected sentiment based on financial indicators
    df_analysis['financial_sentiment_score'] = (
        df_analysis['bullish_indicators_count'] * 1 +
        df_analysis['neutral_indicators_count'] * 0 +
        df_analysis['bearish_indicators_count'] * (-1)
    )

    # Categorize expected sentiment
    df_analysis['expected_financial_sentiment'] = 'neutral'
    df_analysis.loc[df_analysis['financial_sentiment_score'] > 0, 'expected_financial_sentiment'] = 'positive'
    df_analysis.loc[df_analysis['financial_sentiment_score'] < 0, 'expected_financial_sentiment'] = 'negative'

    # Analyze model alignment with financial context
    financial_context_results = {}

    # Identify available models
    model_columns = [col for col in df_analysis.columns if col.endswith('_label')]
    model_names = [col.replace('_label', '') for col in model_columns if col.replace('_label', '') != 'human']

    for model_name in model_names:
        label_col = f"{model_name}_label"

        if label_col not in df_analysis.columns:
            continue

        valid_mask = (
            df_analysis[label_col].notna() &
            df_analysis['expected_financial_sentiment'].notna()
        )
        valid_df = df_analysis[valid_mask]

        if len(valid_df) == 0:
            continue

        # Calculate alignment with financial context
        financial_alignment = (
            valid_df[label_col] == valid_df['expected_financial_sentiment']
        ).mean()

        # Analyze by financial sentiment strength
        strong_signals_df = valid_df[np.abs(valid_df['financial_sentiment_score']) >= 2]

        strong_signal_alignment = None
        if len(strong_signals_df) > 0:
            strong_signal_alignment = (
                strong_signals_df[label_col] == strong_signals_df['expected_financial_sentiment']
            ).mean()

        # Topic-specific financial alignment
        topic_financial_alignment = {}
        if 'primary_topic' in valid_df.columns:
            for topic in valid_df['primary_topic'].unique():
                topic_df = valid_df[valid_df['primary_topic'] == topic]

                if len(topic_df) >= 5:  # Minimum sample size
                    topic_financial_alignment[topic] = (
                        topic_df[label_col] == topic_df['expected_financial_sentiment']
                    ).mean()

        financial_context_results[model_name] = {
            'overall_financial_alignment': financial_alignment,
            'strong_signal_alignment': strong_signal_alignment,
            'topic_financial_alignment': topic_financial_alignment,
            'sample_size': len(valid_df),
            'strong_signals_count': len(strong_signals_df)
        }

        print(f"  {model_name} financial alignment: {financial_alignment:.3f}")

    return financial_context_results

# Financial-context alignment is only computed when the multi-quarter
# sentence-level results were loaded; otherwise the result stays empty.
financial_context_results = (
    analyze_financial_context_performance_enhanced(enhanced_sentiment_multi_df)
    if enhanced_sentiment_multi_df is not None
    else {}
)


In [None]:
## Research Questions Analysis Enhanced

def _label_model_names(df: pd.DataFrame) -> List[str]:
    """Return the model name for every ``*_label`` column except ``human_label``."""
    return [
        col.replace('_label', '')
        for col in df.columns
        if col.endswith('_label') and col.replace('_label', '') != 'human'
    ]


def _score_column_for(df: pd.DataFrame, model_name: str) -> str:
    """Pick ``<model>_calibrated`` when that column exists, else ``<model>_score``."""
    calibrated_col = f"{model_name}_calibrated"
    return calibrated_col if calibrated_col in df.columns else f"{model_name}_score"


def _analyze_speaker_divergence(df: pd.DataFrame, model_names: List[str]) -> Dict:
    """Compare analyst and executive label distributions for each model.

    Requires a ``speaker_role`` column in ``df``. A model is only reported when
    both the 'analyst' and the 'executive' role have at least one labelled row.
    """
    speaker_analysis = {}

    for model_name in model_names:
        label_col = f"{model_name}_label"
        if label_col not in df.columns:
            continue
        score_col = _score_column_for(df, model_name)

        # Keep rows where both the model label and the speaker role are known
        valid_df = df[df[label_col].notna() & df['speaker_role'].notna()]
        if len(valid_df) == 0:
            continue

        # Rows: speaker role, columns: label, values: share of that role's rows
        speaker_dist = (
            valid_df.groupby('speaker_role')[label_col]
            .value_counts(normalize=True)
            .unstack(fill_value=0)
        )
        if 'analyst' not in speaker_dist.index or 'executive' not in speaker_dist.index:
            continue

        analyst_sentiment = speaker_dist.loc['analyst']
        exec_sentiment = speaker_dist.loc['executive']

        # Chi-square test of independence between speaker role and label.
        # NOTE(review): the contingency table contains every speaker role
        # present in the data, not only analyst/executive - confirm intended.
        try:
            contingency_table = (
                valid_df.groupby(['speaker_role', label_col]).size().unstack(fill_value=0)
            )
            chi2, p_value, _dof, _expected = chi2_contingency(contingency_table)
            statistical_test = {
                'test': 'chi_square',
                'statistic': chi2,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
        except Exception as e:
            # Best effort: record why the test failed instead of aborting the analysis
            statistical_test = {'error': str(e)}

        # Total variation distance between the two label distributions
        tvd = 0.5 * (analyst_sentiment - exec_sentiment).abs().sum()

        # Score-weighted comparison (only when the score column is available)
        confidence_analysis = {}
        if score_col in valid_df.columns:
            analyst_df = valid_df[valid_df['speaker_role'] == 'analyst']
            exec_df = valid_df[valid_df['speaker_role'] == 'executive']

            if len(analyst_df) > 0 and len(exec_df) > 0:
                # Sum of scores on 'positive' rows divided by ALL rows of the role
                analyst_weighted_pos = (
                    (analyst_df[label_col] == 'positive') * analyst_df[score_col]
                ).sum() / len(analyst_df)
                exec_weighted_pos = (
                    (exec_df[label_col] == 'positive') * exec_df[score_col]
                ).sum() / len(exec_df)

                confidence_analysis = {
                    'analyst_avg_confidence': analyst_df[score_col].mean(),
                    'executive_avg_confidence': exec_df[score_col].mean(),
                    'analyst_weighted_positive': analyst_weighted_pos,
                    'executive_weighted_positive': exec_weighted_pos,
                    'confidence_weighted_divergence': abs(analyst_weighted_pos - exec_weighted_pos)
                }

        speaker_analysis[model_name] = {
            'analyst_distribution': analyst_sentiment.to_dict(),
            'executive_distribution': exec_sentiment.to_dict(),
            'total_variation_distance': tvd,
            'statistical_test': statistical_test,
            'confidence_analysis': confidence_analysis
        }

    return speaker_analysis


def _analyze_temporal_shifts(df: pd.DataFrame, model_names: List[str]) -> Dict:
    """Compare each model's label distribution in the first vs the last quarter.

    Requires a ``quarter`` column in ``df``. "First" and "last" are the extremes
    of ``sorted()`` over the quarter values; the result keys call them q1 and q2.
    """
    temporal_analysis = {}

    for model_name in model_names:
        label_col = f"{model_name}_label"
        if label_col not in df.columns:
            continue
        score_col = _score_column_for(df, model_name)

        valid_df = df[df[label_col].notna() & df['quarter'].notna()]
        if len(valid_df) == 0:
            continue

        # Rows: quarter, columns: label, values: share of that quarter's rows
        quarter_sentiment = (
            valid_df.groupby('quarter')[label_col]
            .value_counts(normalize=True)
            .unstack(fill_value=0)
        )
        if len(quarter_sentiment) < 2:
            continue  # a shift needs at least two distinct quarters

        quarters = sorted(quarter_sentiment.index)
        first_quarter, last_quarter = quarters[0], quarters[-1]
        q1_sentiment = quarter_sentiment.loc[first_quarter]
        q2_sentiment = quarter_sentiment.loc[last_quarter]

        # Per-label change in share, and the sum of its absolute values
        sentiment_shift = q2_sentiment - q1_sentiment
        shift_magnitude = sentiment_shift.abs().sum()

        confidence_temporal = {}
        if score_col in valid_df.columns:
            q1_df = valid_df[valid_df['quarter'] == first_quarter]
            q2_df = valid_df[valid_df['quarter'] == last_quarter]

            if len(q1_df) > 0 and len(q2_df) > 0:
                q1_avg_conf = q1_df[score_col].mean()
                q2_avg_conf = q2_df[score_col].mean()
                confidence_temporal = {
                    'q1_avg_confidence': q1_avg_conf,
                    'q2_avg_confidence': q2_avg_conf,
                    'confidence_change': q2_avg_conf - q1_avg_conf
                }

        temporal_analysis[model_name] = {
            'q1_distribution': q1_sentiment.to_dict(),
            'q2_distribution': q2_sentiment.to_dict(),
            'sentiment_shift': sentiment_shift.to_dict(),
            'shift_magnitude': shift_magnitude,
            'confidence_temporal': confidence_temporal
        }

    return temporal_analysis


def _analyze_model_consistency(df: pd.DataFrame, model_names: List[str]) -> Dict:
    """Compute label agreement for every pair of models, overall and per context."""
    consistency_analysis = {}
    context_columns = ['speaker_role', 'primary_topic', 'quarter']

    # With fewer than two models there are no pairs and the result stays empty
    for model1, model2 in itertools.combinations(model_names, 2):
        label_col1 = f"{model1}_label"
        label_col2 = f"{model2}_label"
        if label_col1 not in df.columns or label_col2 not in df.columns:
            continue

        # Only rows labelled by both models can agree or disagree
        valid_df = df[df[label_col1].notna() & df[label_col2].notna()]
        if len(valid_df) == 0:
            continue

        pair_analysis = {
            'overall_agreement': (valid_df[label_col1] == valid_df[label_col2]).mean()
        }

        for context_col in context_columns:
            if context_col not in valid_df.columns:
                continue

            context_agreement = {}
            for context_value in valid_df[context_col].unique():
                context_df = valid_df[valid_df[context_col] == context_value]
                if len(context_df) >= 5:  # Minimum sample size
                    context_agreement[str(context_value)] = (
                        context_df[label_col1] == context_df[label_col2]
                    ).mean()

            pair_analysis[f'agreement_by_{context_col}'] = context_agreement

        consistency_analysis[f"{model1}_vs_{model2}"] = pair_analysis

    return consistency_analysis


def analyze_enhanced_research_questions(df: pd.DataFrame) -> Dict:
    """Enhanced analysis of key research questions with statistical rigor.

    Models are discovered from the ``*_label`` columns of ``df`` (``human_label``
    is excluded). Discovery happens once, up front, so the temporal and
    consistency analyses also work when ``speaker_role`` is absent.

    Args:
        df: Sentiment results with one ``<model>_label`` column per model and,
            optionally, ``<model>_calibrated`` / ``<model>_score``,
            ``speaker_role``, ``quarter`` and ``primary_topic`` columns.

    Returns:
        ``{}`` when ``df`` is None. Otherwise a dict with:
        - ``'speaker_divergence'``: only when ``speaker_role`` is a column
        - ``'temporal_shifts'``: only when ``quarter`` is a column
        - ``'model_consistency'``: always present, possibly empty
    """
    if df is None:
        return {}

    print("\nEnhanced research questions analysis...")

    model_names = _label_model_names(df)
    research_results = {}

    # Question 1: Enhanced banker vs analyst sentiment divergence
    if 'speaker_role' in df.columns:
        print("1. Enhanced Speaker Sentiment Divergence Analysis")
        research_results['speaker_divergence'] = _analyze_speaker_divergence(df, model_names)

    # Question 2: Enhanced temporal sentiment analysis
    if 'quarter' in df.columns:
        print("2. Enhanced Temporal Sentiment Analysis")
        research_results['temporal_shifts'] = _analyze_temporal_shifts(df, model_names)

    # Question 3: Enhanced model consistency across contexts
    print("3. Enhanced Model Consistency Analysis")
    research_results['model_consistency'] = _analyze_model_consistency(df, model_names)

    return research_results

# Run enhanced research questions analysis
# Bind an empty dict first so the name always exists for later cells (which
# test it with `if research_questions_results:`), even when no sentiment frame
# was loaded and the analysis is skipped.
research_questions_results = {}
if enhanced_sentiment_multi_df is not None:
    research_questions_results = analyze_enhanced_research_questions(enhanced_sentiment_multi_df)


In [None]:
## Save Enhanced Comparison Results

def save_enhanced_comparison_results():
    """Save all enhanced comparison results as JSON files.

    Reads these notebook-level names: ``BANK_CODE``, ``finetuning_results``,
    ``enhanced_comparator``, ``performance_evaluation_results``,
    ``financial_context_results``, ``research_questions_results``,
    ``results_comparison_path`` and, when defined, ``model_rankings``.

    Files written under ``results_comparison_path``:
        - ``enhanced_model_comparison_results.json``: always
        - ``enhanced_performance_summary.json``: when performance results exist
        - ``best_model_recommendation.json``: when ``model_rankings`` names a best model
    """

    def _write_json(path, payload):
        # default=str stringifies values json cannot encode natively
        with open(path, 'w') as f:
            json.dump(payload, f, indent=2, default=str)

    print("\n" + "="*60)
    print("SAVING ENHANCED COMPARISON RESULTS")
    print("="*60)

    # `model_rankings` is created by another cell and may be missing. It has to
    # be looked up in the module globals: inside a function `locals()` only
    # holds the function's own variables, so `'model_rankings' in locals()`
    # was always False and the rankings were silently dropped.
    rankings = globals().get('model_rankings') or {}
    best_model = rankings.get('best_model')

    # Make sure the target directory exists before the first write
    results_comparison_path.mkdir(parents=True, exist_ok=True)

    # Compile comprehensive results
    comprehensive_results = {
        'timestamp': pd.Timestamp.now().isoformat(),
        'bank_code': BANK_CODE,
        'enhanced_features': True,
        'fine_tuned_models_included': bool(finetuning_results),
        'analysis_components': {
            'enhanced_model_comparison': enhanced_comparator.comparison_results if enhanced_comparator else {},
            'performance_evaluation': performance_evaluation_results,
            'model_rankings': rankings,
            'financial_context_analysis': financial_context_results,
            'research_questions_analysis': research_questions_results,
            'fine_tuning_results': finetuning_results
        }
    }

    # Save main results
    main_results_path = results_comparison_path / "enhanced_model_comparison_results.json"
    _write_json(main_results_path, comprehensive_results)
    print(f"Enhanced comparison results: {main_results_path}")

    # Save performance summary (headline metrics only, one entry per model)
    if performance_evaluation_results:
        performance_summary = {
            model_name: {
                'accuracy': results.get('accuracy', 0),
                'f1_weighted': results.get('f1_weighted', 0),
                'f1_macro': results.get('f1_macro', 0),
                'sample_size': results.get('sample_size', 0)
            }
            for model_name, results in performance_evaluation_results.items()
        }

        performance_path = results_comparison_path / "enhanced_performance_summary.json"
        _write_json(performance_path, performance_summary)
        print(f"Performance summary: {performance_path}")

    # Save best model recommendation
    if best_model:
        best_model_recommendation = {
            'best_model': best_model,
            'ranking_criteria': list(rankings.get('rankings_by_criterion', {}).keys()),
            'performance_metrics': performance_evaluation_results.get(best_model, {}),
            'recommendation_timestamp': pd.Timestamp.now().isoformat()
        }

        recommendation_path = results_comparison_path / "best_model_recommendation.json"
        _write_json(recommendation_path, best_model_recommendation)
        print(f"Best model recommendation: {recommendation_path}")

# Save all results
# Takes no arguments: the function reads the result objects built by the
# earlier cells directly from the notebook namespace.
save_enhanced_comparison_results()


In [None]:
## Enhanced Summary and Insights

# Console report of the results gathered by the earlier cells. Every section
# is guarded, so a missing or empty result set is skipped rather than raising.
print("\n" + "="*60)
print("ENHANCED MODEL COMPARISON COMPLETE")
print("="*60)

# Performance summary: one fixed-width row per evaluated model
if performance_evaluation_results:
    print("\nModel Performance Summary:")
    print(f"{'Model':<20} {'Accuracy':<10} {'F1-Weighted':<12} {'F1-Macro':<10} {'Samples':<8}")
    print(f"{'-'*60}")

    for model_name, results in performance_evaluation_results.items():
        accuracy = results.get('accuracy', 0)
        f1_weighted = results.get('f1_weighted', 0)
        f1_macro = results.get('f1_macro', 0)
        sample_size = results.get('sample_size', 0)

        print(f"{model_name:<20} {accuracy:<10.3f} {f1_weighted:<12.3f} {f1_macro:<10.3f} {sample_size:<8}")

# Best model summary (model_rankings only exists if the ranking cell ran;
# at cell top level locals() is the notebook namespace)
if 'model_rankings' in locals() and model_rankings.get('best_model'):
    best_model = model_rankings['best_model']
    print(f"\nBest Performing Model: {best_model}")

    if best_model in performance_evaluation_results:
        best_results = performance_evaluation_results[best_model]
        print(f"  Accuracy: {best_results.get('accuracy', 0):.3f}")
        print(f"  F1-Weighted: {best_results.get('f1_weighted', 0):.3f}")
        print(f"  F1-Macro: {best_results.get('f1_macro', 0):.3f}")
        print(f"  Cohen's Kappa: {best_results.get('cohen_kappa', 0):.3f}")

# Model agreement summary
if enhanced_comparator and enhanced_comparator.comparison_results.get('agreement_metrics'):
    agreement_results = enhanced_comparator.comparison_results['agreement_metrics']
    print("\nModel Agreement Summary:")

    for pair_name, metrics in agreement_results.items():
        agreement_rate = metrics.get('agreement_rate', 0)
        kappa = metrics.get('cohen_kappa', 0)
        print(f"  {pair_name}: Agreement={agreement_rate:.3f}, Kappa={kappa:.3f}")

# Financial context insights
if financial_context_results:
    print("\nFinancial Context Alignment:")
    for model_name, results in financial_context_results.items():
        alignment = results.get('overall_financial_alignment', 0)
        sample_size = results.get('sample_size', 0)
        print(f"  {model_name}: {alignment:.3f} ({sample_size} samples)")

# Research questions insights
if research_questions_results:
    print("\nKey Research Insights:")

    # Speaker divergence
    if 'speaker_divergence' in research_questions_results:
        print("  Speaker Sentiment Divergence:")
        for model_name, analysis in research_questions_results['speaker_divergence'].items():
            tvd = analysis.get('total_variation_distance', 0)
            print(f"    {model_name}: TVD={tvd:.3f}")

    # Temporal shifts
    if 'temporal_shifts' in research_questions_results:
        print("  Temporal Sentiment Shifts:")
        for model_name, analysis in research_questions_results['temporal_shifts'].items():
            shift_magnitude = analysis.get('shift_magnitude', 0)
            print(f"    {model_name}: Shift Magnitude={shift_magnitude:.3f}")

# Fine-tuning impact
if finetuning_results and 'model_comparison' in finetuning_results:
    print("\nFine-tuning Impact:")
    for model_name, comparison in finetuning_results['model_comparison'].items():
        if 'f1_improvement' in comparison:
            improvement = comparison['f1_improvement']
            # The '+' format flag emits the correct sign for gains and
            # regressions alike; a literal '+' prefix would render a negative
            # delta as "+-0.012".
            print(f"  {model_name}: F1 improvement = {improvement:+.3f}")

print(f"\nEnhanced comparison analysis saved to: {results_comparison_path}")
print("Ready for enhanced visualization in 06_results_visualization_jpm_enhanced.ipynb")


In [None]:
## Export Key Metrics for Visualization

# Container for the exported metrics; every section starts as an empty dict.
# NOTE(review): 'research_insights' and 'fine_tuning_impact' are never filled
# in this cell, so they are written out empty - confirm whether that is intended.
enhanced_viz_metrics = {
    section: {}
    for section in (
        'performance_summary',
        'model_agreement',
        'financial_context',
        'research_insights',
        'fine_tuning_impact',
    )
}

# Headline performance metrics per model (missing metrics default to 0)
if performance_evaluation_results:
    enhanced_viz_metrics['performance_summary'] = {
        name: {
            metric: model_results.get(metric, 0)
            for metric in ('accuracy', 'f1_weighted', 'f1_macro', 'sample_size')
        }
        for name, model_results in performance_evaluation_results.items()
    }

# Pairwise agreement between models
if enhanced_comparator and enhanced_comparator.comparison_results.get('agreement_metrics'):
    enhanced_viz_metrics['model_agreement'] = {
        pair: {
            metric: pair_metrics.get(metric, 0)
            for metric in ('agreement_rate', 'cohen_kappa')
        }
        for pair, pair_metrics in enhanced_comparator.comparison_results['agreement_metrics'].items()
    }

# Overall financial-context alignment, a single number per model
if financial_context_results:
    enhanced_viz_metrics['financial_context'] = dict(
        (name, context.get('overall_financial_alignment', 0))
        for name, context in financial_context_results.items()
    )

# Best model entry is only added when a ranking with a winner is available
if 'model_rankings' in locals() and model_rankings.get('best_model'):
    enhanced_viz_metrics['best_model'] = dict(
        name=model_rankings['best_model'],
        metrics=performance_evaluation_results.get(model_rankings['best_model'], {}),
    )

# Write the metrics next to the other comparison outputs
viz_metrics_path = results_comparison_path / "enhanced_viz_metrics.json"
with open(viz_metrics_path, 'w') as f:
    json.dump(enhanced_viz_metrics, f, indent=2, default=str)

print(f"Enhanced visualization metrics saved: {viz_metrics_path}")
print("\nEnhanced model comparison analysis complete!")