In [1]:
# 03b_manual_validation.ipynb
# Purpose: Enhanced manual validation for 4 models across JP Morgan and HSBC datasets
# Banks: JP Morgan (JPM) and HSBC
# Models: FinBERT (yiyanghkust), FinBERT (ProsusAI), DistilRoBERTa, CardiffNLP (Twitter-RoBERTa)
# Input: Manual labels for both banks + processed datasets
# Output: Validated manual labels for 4-model fine-tuning

## Import Libraries

import pandas as pd
import numpy as np
import json
import requests
import io
import csv
import tempfile
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Statistical analysis
from scipy import stats
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns

# Google Colab
from google.colab import drive
drive.mount("/content/drive")

# Load enhanced configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_config.json")
with open(config_path, "r") as f:
    enhanced_config = json.load(f)

SEED = enhanced_config["SEED"]
BANKS = enhanced_config["BANKS"]
QUARTERS = enhanced_config["QUARTERS"]
MODELS = enhanced_config["MODELS"]
drive_base = Path(enhanced_config["drive_base"])
colab_base = Path(enhanced_config["colab_base"])
data_urls = enhanced_config["data_urls"]

print(f"Enhanced manual validation for banks: {', '.join([bank.upper() for bank in BANKS])}")
print(f"Target models: {len(MODELS)} models for validation")


Mounted at /content/drive
Enhanced manual validation for banks: JPM, HSBC
Target models: 4 models for validation


In [2]:
## Define Enhanced Paths

manual_validation_paths = {}
for bank in BANKS:
    manual_validation_paths[bank] = {
        "manual_validation": drive_base / f"data/manual_validation/{bank}",
        "processed_data": drive_base / f"data/processed/{bank}",
        "results_sentiment": drive_base / f"results/sentiment/{bank}"
    }

    # Ensure directories exist
    for path in manual_validation_paths[bank].values():
        path.mkdir(parents=True, exist_ok=True)

## Enhanced Manual Labels Download

def extract_file_id_from_drive_url(url: str) -> str:
    """Extract file ID from Google Drive sharing URL."""
    if "drive.google.com" in url and "/file/d/" in url:
        return url.split("/file/d/")[1].split("/")[0]
    return None

def download_manual_labels_enhanced(bank_code: str) -> Path:
    """Download manually labeled dataset for specific bank."""
    print(f"\n📥 Downloading {bank_code.upper()} manual labels...")

    if bank_code not in data_urls or "manual_labels" not in data_urls[bank_code]:
        print(f"❌ No manual labels URL configured for {bank_code.upper()}")
        return None

    manual_labels_url = data_urls[bank_code]["manual_labels"]
    file_id = extract_file_id_from_drive_url(manual_labels_url)

    if not file_id:
        print(f"❌ Could not extract file ID from URL for {bank_code.upper()}")
        return None

    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    filename = f"manual_labels_{bank_code}_multi_2025.csv"

    try:
        print(f"  Downloading from Google Drive...")
        response = requests.get(download_url, timeout=300)
        response.raise_for_status()

        # Save to manual validation directory
        manual_file_path = manual_validation_paths[bank_code]["manual_validation"] / filename
        with open(manual_file_path, 'wb') as f:
            f.write(response.content)

        print(f"  ✅ Downloaded: {manual_file_path}")
        return manual_file_path

    except Exception as e:
        print(f"  ❌ Error downloading manual labels for {bank_code.upper()}: {str(e)}")
        return None

# Download manual labels for all banks
manual_label_paths = {}
for bank in BANKS:
    manual_label_paths[bank] = download_manual_labels_enhanced(bank)



📥 Downloading JPM manual labels...
  Downloading from Google Drive...
  ✅ Downloaded: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/manual_labels_jpm_multi_2025.csv

📥 Downloading HSBC manual labels...
  Downloading from Google Drive...
  ✅ Downloaded: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/hsbc/manual_labels_hsbc_multi_2025.csv


In [3]:
## Enhanced Manual Labels Loading

def load_manual_labels_enhanced(file_path: Path, bank_code: str) -> pd.DataFrame:
    """Enhanced loading of manually labeled data with robust error handling."""
    if not file_path or not file_path.exists():
        print(f"❌ Manual labels file not found for {bank_code.upper()}")
        return None

    print(f"\n📂 Loading {bank_code.upper()} manual labels...")

    try:
        # Multiple approaches for robust CSV loading
        approaches = [
            # Standard approach
            lambda: pd.read_csv(file_path, encoding='utf-8'),

            # Skip bad lines
            lambda: pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip'),

            # Alternative encodings
            lambda: pd.read_csv(file_path, encoding='latin-1'),
            lambda: pd.read_csv(file_path, encoding='cp1252'),

            # Flexible quoting
            lambda: pd.read_csv(file_path, encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')
        ]

        for i, approach in enumerate(approaches):
            try:
                df = approach()
                print(f"  ✅ Loaded with approach {i+1}: {df.shape}")
                return df
            except Exception as e:
                print(f"  Approach {i+1} failed: {str(e)}")
                continue

        # Manual cleanup approach
        print(f"  Attempting manual cleanup...")

        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()

        print(f"  File has {len(lines)} lines")

        # Identify expected columns from header
        if lines:
            header_line = lines[0].strip()
            expected_columns = len(header_line.split(','))
            print(f"  Expected columns: {expected_columns}")

            # Clean problematic lines
            cleaned_lines = [lines[0]]  # Keep header

            for i, line in enumerate(lines[1:], 1):
                field_count = line.count(',') + 1
                if field_count <= expected_columns * 1.5:  # Allow some tolerance
                    cleaned_lines.append(line)
                else:
                    print(f"    Skipping line {i+1} with {field_count} fields")

            # Save cleaned version and load
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv', encoding='utf-8') as tmp_file:
                tmp_file.writelines(cleaned_lines)
                tmp_path = tmp_file.name

            try:
                df = pd.read_csv(tmp_path, encoding='utf-8')
                os.unlink(tmp_path)  # Clean up
                print(f"  ✅ Manual cleanup successful: {df.shape}")
                return df
            except Exception as e:
                os.unlink(tmp_path)
                print(f"  ❌ Manual cleanup failed: {e}")

        print(f"  ❌ All loading approaches failed for {bank_code.upper()}")
        return None

    except Exception as e:
        print(f"❌ Error loading manual labels for {bank_code.upper()}: {str(e)}")
        return None

# Load manual labels for all banks
manual_labels_dfs = {}
for bank in BANKS:
    if manual_label_paths[bank]:
        manual_labels_dfs[bank] = load_manual_labels_enhanced(manual_label_paths[bank], bank)


📂 Loading JPM manual labels...
  ✅ Loaded with approach 1: (1121, 27)

📂 Loading HSBC manual labels...
  ✅ Loaded with approach 1: (858, 27)


In [4]:
## Enhanced Manual Labels Validation

def validate_manual_labels_enhanced(df: pd.DataFrame, bank_code: str) -> Dict:
    """Enhanced validation of manual labels structure and quality."""
    if df is None:
        return {"error": f"No manual labels data for {bank_code.upper()}"}

    print(f"\n🔍 [{bank_code.upper()}] ENHANCED MANUAL LABELS VALIDATION")
    print("-" * 50)

    validation_report = {
        'bank_code': bank_code,
        'total_records': len(df),
        'manually_labeled_count': 0,
        'label_distribution': {},
        'confidence_stats': {},
        'annotator_info': {},
        'data_quality': {},
        'model_compatibility': {}
    }

    print(f"Total records: {len(df):,}")
    print(f"Columns: {list(df.columns)}")

    # Check for required columns with alternatives
    required_columns = ['human_label', 'human_confidence', 'annotation_notes', 'annotator_id']
    missing_columns = []
    found_alternatives = {}

    # Alternative column mappings
    alternative_mappings = {
        'human_label': ['label', 'sentiment', 'manual_label', 'sentiment_label'],
        'human_confidence': ['confidence', 'manual_confidence', 'certainty', 'score'],
        'annotation_notes': ['notes', 'comments', 'remarks', 'annotation_comment'],
        'annotator_id': ['annotator', 'user_id', 'reviewer', 'labeler_id']
    }

    for req_col in required_columns:
        if req_col not in df.columns:
            # Look for alternatives
            for alt_col in alternative_mappings.get(req_col, []):
                if alt_col in df.columns:
                    found_alternatives[req_col] = alt_col
                    df = df.rename(columns={alt_col: req_col})
                    print(f"  Mapped '{alt_col}' → '{req_col}'")
                    break
            else:
                missing_columns.append(req_col)

    if missing_columns:
        print(f"  ⚠️ Missing columns: {missing_columns}")
        validation_report['missing_columns'] = missing_columns

    # Count manually labeled records
    if 'human_label' in df.columns:
        manually_labeled_mask = df['human_label'].notna() & (df['human_label'] != '')
        manually_labeled_df = df[manually_labeled_mask].copy()
        validation_report['manually_labeled_count'] = len(manually_labeled_df)

        print(f"Manually labeled: {len(manually_labeled_df):,} ({len(manually_labeled_df)/len(df)*100:.1f}%)")

        if len(manually_labeled_df) > 0:
            # Label distribution analysis
            label_dist = manually_labeled_df['human_label'].value_counts()
            validation_report['label_distribution'] = label_dist.to_dict()

            print(f"Label distribution:")
            for label, count in label_dist.items():
                pct = (count / len(manually_labeled_df)) * 100
                print(f"  {label}: {count} ({pct:.1f}%)")

            # Enhanced label validation for 4 models
            expected_labels = ['positive', 'negative', 'neutral']
            valid_labels = set(manually_labeled_df['human_label'].unique())
            unexpected_labels = valid_labels - set(expected_labels)

            if unexpected_labels:
                print(f"  ⚠️ Unexpected labels found: {unexpected_labels}")
                validation_report['unexpected_labels'] = list(unexpected_labels)

            # Model compatibility check
            for model_key in MODELS.keys():
                model_compatible = len(valid_labels.intersection(expected_labels)) > 0
                validation_report['model_compatibility'][model_key] = {
                    'compatible': model_compatible,
                    'supported_labels': list(valid_labels.intersection(expected_labels))
                }

            # Confidence analysis
            if 'human_confidence' in manually_labeled_df.columns:
                confidence_scores = manually_labeled_df['human_confidence'].dropna()
                if len(confidence_scores) > 0:
                    validation_report['confidence_stats'] = {
                        'count': len(confidence_scores),
                        'mean': confidence_scores.mean(),
                        'std': confidence_scores.std(),
                        'min': confidence_scores.min(),
                        'max': confidence_scores.max(),
                        'median': confidence_scores.median()
                    }

                    print(f"Confidence scores:")
                    print(f"  Count: {len(confidence_scores)}")
                    print(f"  Mean: {confidence_scores.mean():.3f}")
                    print(f"  Range: {confidence_scores.min():.3f} - {confidence_scores.max():.3f}")

                    # Confidence quality assessment
                    high_confidence = (confidence_scores >= 0.8).sum()
                    low_confidence = (confidence_scores < 0.5).sum()

                    print(f"  High confidence (≥0.8): {high_confidence} ({high_confidence/len(confidence_scores)*100:.1f}%)")
                    print(f"  Low confidence (<0.5): {low_confidence} ({low_confidence/len(confidence_scores)*100:.1f}%)")

            # Annotator analysis
            if 'annotator_id' in manually_labeled_df.columns:
                annotator_dist = manually_labeled_df['annotator_id'].value_counts()
                validation_report['annotator_info'] = {
                    'unique_annotators': len(annotator_dist),
                    'annotations_per_annotator': annotator_dist.to_dict()
                }

                print(f"Annotator information:")
                print(f"  Unique annotators: {len(annotator_dist)}")
                for annotator, count in annotator_dist.head().items():
                    print(f"  {annotator}: {count} annotations")

                # Inter-annotator reliability (if multiple annotators)
                if len(annotator_dist) > 1:
                    print(f"  Multi-annotator dataset detected")
                    validation_report['multi_annotator'] = True

    # Data quality assessment
    quality_issues = []

    # Missing confidence scores
    if 'human_confidence' in df.columns:
        missing_confidence = df['human_confidence'].isna().sum()
        if missing_confidence > 0:
            quality_issues.append(f"Missing confidence: {missing_confidence}")

    # Missing annotation notes
    if 'annotation_notes' in df.columns:
        missing_notes = df['annotation_notes'].isna().sum()
        if missing_notes > 0:
            quality_issues.append(f"Missing notes: {missing_notes}")

    # Text matching capability
    text_columns = ['text', 'content', 'sentence', 'statement']
    text_col_found = None
    for col in text_columns:
        if col in df.columns:
            text_col_found = col
            break

    if text_col_found:
        validation_report['text_column'] = text_col_found
        print(f"  Text column found: {text_col_found}")
    else:
        quality_issues.append("No text column found for matching")

    validation_report['data_quality']['issues'] = quality_issues

    if quality_issues:
        print(f"Data quality issues:")
        for issue in quality_issues:
            print(f"  ⚠️ {issue}")
    else:
        print(f"✅ No major data quality issues")

    return validation_report

# Validate manual labels for all banks
validation_reports = {}
for bank in BANKS:
    if bank in manual_labels_dfs and manual_labels_dfs[bank] is not None:
        validation_reports[bank] = validate_manual_labels_enhanced(manual_labels_dfs[bank], bank)



🔍 [JPM] ENHANCED MANUAL LABELS VALIDATION
--------------------------------------------------
Total records: 1,121
Columns: ['original_qa_id', 'sentence_id', 'text', 'speaker', 'speaker_role', 'quarter', 'bank_code', 'dataset_id', 'sentence_length', 'sentence_word_count', 'sentence_index_in_qa', 'text_quality_score', 'financial_relevance_score', 'has_revenue_mentions', 'has_risk_mentions', 'has_growth_mentions', 'has_performance_mentions', 'ready_for_finbert', 'ready_for_distilroberta', 'ready_for_cardiffnlp', 'has_financial_numbers', 'has_financial_entities', 'sentence_complexity', 'human_label', 'human_confidence', 'annotation_notes', 'annotator_id']
Manually labeled: 49 (4.4%)
Label distribution:
  neutral: 19 (38.8%)
  positive: 19 (38.8%)
  negative: 11 (22.4%)
Confidence scores:
  Count: 49
  Mean: 3.939
  Range: 2.000 - 5.000
  High confidence (≥0.8): 49 (100.0%)
  Low confidence (<0.5): 0 (0.0%)
Annotator information:
  Unique annotators: 1
  Annotator 1: 49 annotations
  Text 

In [5]:
## Enhanced Model Comparison Analysis

def compare_manual_vs_model_predictions_enhanced(bank_code: str) -> Dict:
    """Enhanced comparison of manual labels with existing model predictions for 4 models."""
    if bank_code not in manual_labels_dfs or manual_labels_dfs[bank_code] is None:
        return {"error": f"No manual labels for {bank_code.upper()}"}

    print(f"\n🔍 [{bank_code.upper()}] MANUAL VS 4 MODELS COMPARISON")
    print("-" * 50)

    manual_df = manual_labels_dfs[bank_code]

    # Filter to manually labeled records
    manually_labeled_mask = manual_df['human_label'].notna() & (manual_df['human_label'] != '')
    manual_eval_df = manual_df[manually_labeled_mask].copy()

    if len(manual_eval_df) == 0:
        print("❌ No manually labeled records for comparison")
        return {"error": "No manually labeled records"}

    print(f"Evaluating {len(manual_eval_df)} manually labeled records")

    # Load processed data to check for existing model predictions
    processed_file = drive_base / f"data/processed/{bank_code}" / f"processed_{bank_code}_combined_sentence_level.csv"

    comparison_results = {}

    if processed_file.exists():
        try:
            processed_df = pd.read_csv(processed_file)
            print(f"Loaded processed data: {processed_df.shape}")

            # Try to merge with manual labels
            merge_columns = ['text', 'sentence_id', 'original_qa_id']
            merged_df = None

            for merge_col in merge_columns:
                if merge_col in manual_eval_df.columns and merge_col in processed_df.columns:
                    try:
                        merged_df = manual_eval_df.merge(
                            processed_df,
                            on=merge_col,
                            how='inner',
                            suffixes=('_manual', '_processed')
                        )
                        if len(merged_df) > 0:
                            print(f"  Merged on '{merge_col}': {len(merged_df)} records")
                            break
                    except Exception as e:
                        print(f"  Merge on '{merge_col}' failed: {e}")

            if merged_df is not None and len(merged_df) > 0:
                # Check for model predictions in processed data
                model_mappings = {
                    'finbert_yiyanghkust': ['finbert_tone_label', 'finbert_label'],
                    'finbert_prosusai': ['prosus_label', 'prosus_finbert_label'],
                    'distilroberta': ['distilroberta_label', 'roberta_label'],
                    'cardiffnlp_roberta': ['cardiff_label', 'twitter_roberta_label']
                }

                for model_key, possible_cols in model_mappings.items():
                    model_col = None
                    for col in possible_cols:
                        if col in merged_df.columns:
                            model_col = col
                            break

                    if model_col:
                        comparison_results[model_key] = analyze_model_vs_manual_enhanced(
                            merged_df, model_col, 'human_label', model_key, bank_code
                        )
                    else:
                        print(f"  ⚠️ No predictions found for {model_key}")
                        comparison_results[model_key] = {"error": "No predictions found"}
            else:
                print("  ❌ Could not merge manual labels with processed data")

        except Exception as e:
            print(f"  ❌ Error loading processed data: {e}")
    else:
        print(f"  ⚠️ No processed data found at {processed_file}")

    # If no existing predictions, create placeholder analysis
    if not comparison_results:
        print("  Creating placeholder analysis for 4 models...")
        for model_key in MODELS.keys():
            comparison_results[model_key] = {
                "status": "no_predictions_available",
                "manual_label_count": len(manual_eval_df),
                "ready_for_training": True
            }

    return comparison_results

# Compare manual labels with model predictions for all banks
model_comparison_results = {}
for bank in BANKS:
    model_comparison_results[bank] = compare_manual_vs_model_predictions_enhanced(bank)

def analyze_model_vs_manual_enhanced(df: pd.DataFrame, model_col: str, manual_col: str,
                                   model_name: str, bank_code: str) -> Dict:
    """Enhanced analysis of agreement between model predictions and manual labels."""

    # Filter valid comparisons
    valid_mask = df[model_col].notna() & df[manual_col].notna()
    comparison_df = df[valid_mask].copy()

    if len(comparison_df) == 0:
        return {'error': f'No valid comparisons for {model_name}'}

    print(f"  {model_name}: {len(comparison_df)} comparisons")

    # Calculate agreement
    agreement = (comparison_df[model_col] == comparison_df[manual_col]).mean()
    print(f"    Agreement: {agreement:.3f}")

    # Classification report
    try:
        report = classification_report(
            comparison_df[manual_col],
            comparison_df[model_col],
            output_dict=True,
            zero_division=0
        )

        print(f"    F1-Score: {report['weighted avg']['f1-score']:.3f}")

    except Exception as e:
        print(f"    Classification report failed: {e}")
        report = {}

    # Confusion matrix analysis
    try:
        labels = sorted(list(set(comparison_df[manual_col].unique()) | set(comparison_df[model_col].unique())))
        cm = confusion_matrix(comparison_df[manual_col], comparison_df[model_col], labels=labels)

        print(f"    Labels analyzed: {labels}")

    except Exception as e:
        print(f"    Confusion matrix failed: {e}")
        cm = None
        labels = []

    # Cohen's Kappa
    try:
        kappa = cohen_kappa_score(comparison_df[manual_col], comparison_df[model_col])
        print(f"    Cohen's Kappa: {kappa:.3f}")
    except Exception as e:
        print(f"    Kappa calculation failed: {e}")
        kappa = None

    return {
        'model_name': model_name,
        'bank_code': bank_code,
        'total_comparisons': len(comparison_df),
        'agreement_rate': agreement,
        'classification_report': report,
        'confusion_matrix': cm.tolist() if cm is not None else None,
        'confusion_matrix_labels': labels,
        'cohen_kappa': kappa
    }



🔍 [JPM] MANUAL VS 4 MODELS COMPARISON
--------------------------------------------------
Evaluating 49 manually labeled records
Loaded processed data: (752, 21)
  Merged on 'original_qa_id': 49 records
  ⚠️ No predictions found for finbert_yiyanghkust
  ⚠️ No predictions found for finbert_prosusai
  ⚠️ No predictions found for distilroberta
  ⚠️ No predictions found for cardiffnlp_roberta

🔍 [HSBC] MANUAL VS 4 MODELS COMPARISON
--------------------------------------------------
Evaluating 69 manually labeled records
Loaded processed data: (640, 21)
  Merged on 'text': 1 records
  ⚠️ No predictions found for finbert_yiyanghkust
  ⚠️ No predictions found for finbert_prosusai
  ⚠️ No predictions found for distilroberta
  ⚠️ No predictions found for cardiffnlp_roberta


In [6]:
## Enhanced Training/Validation Split Preparation

def prepare_enhanced_training_data(bank_code: str) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """Prepare manually labeled data for 4-model fine-tuning."""
    if bank_code not in manual_labels_dfs or manual_labels_dfs[bank_code] is None:
        return None, None, {"error": f"No manual labels for {bank_code.upper()}"}

    print(f"\n🎯 [{bank_code.upper()}] PREPARING ENHANCED TRAINING DATA")
    print("-" * 50)

    manual_df = manual_labels_dfs[bank_code]

    # Filter to manually labeled records
    manually_labeled_mask = manual_df['human_label'].notna() & (manual_df['human_label'] != '')
    manual_clean_df = manual_df[manually_labeled_mask].copy()

    if len(manual_clean_df) == 0:
        print("❌ No manually labeled data available")
        return None, None, {"error": "No manually labeled data"}

    print(f"Total manually labeled records: {len(manual_clean_df)}")

    # Enhanced stratified split
    from sklearn.model_selection import train_test_split

    # Create stratification groups
    stratify_col = 'human_label'

    if 'human_confidence' in manual_clean_df.columns:
        # Add confidence-based stratification
        manual_clean_df['confidence_group'] = pd.cut(
            manual_clean_df['human_confidence'].fillna(0.5),
            bins=[0, 0.6, 0.8, 1.0],
            labels=['low', 'medium', 'high'],
            include_lowest=True
        )

        manual_clean_df['stratify_group'] = (
            manual_clean_df['human_label'].astype(str) + '_' +
            manual_clean_df['confidence_group'].astype(str)
        )
        stratify_col = 'stratify_group'

    # Enhanced split (80/20 for training/validation)
    try:
        train_df, val_df = train_test_split(
            manual_clean_df,
            test_size=0.2,
            random_state=SEED,
            stratify=manual_clean_df[stratify_col]
        )

        print(f"Training set: {len(train_df)} records")
        print(f"Validation set: {len(val_df)} records")

    except ValueError as e:
        print(f"Stratification failed ({e}), using random split")
        train_df, val_df = train_test_split(
            manual_clean_df,
            test_size=0.2,
            random_state=SEED
        )

        print(f"Training set: {len(train_df)} records")
        print(f"Validation set: {len(val_df)} records")

    # Enhanced analysis
    split_analysis = {
        'bank_code': bank_code,
        'total_manual_labels': len(manual_clean_df),
        'train_size': len(train_df),
        'val_size': len(val_df),
        'train_label_dist': train_df['human_label'].value_counts().to_dict(),
        'val_label_dist': val_df['human_label'].value_counts().to_dict(),
        'model_readiness': {}
    }

    # Model-specific readiness assessment
    for model_key in MODELS.keys():
        # Check text requirements for each model
        text_col = 'text' if 'text' in train_df.columns else None

        if text_col:
            # Model-specific text length requirements
            model_requirements = {
                'finbert_yiyanghkust': {'min_length': 20, 'min_words': 4},
                'finbert_prosusai': {'min_length': 20, 'min_words': 4},
                'distilroberta': {'min_length': 10, 'min_words': 2},
                'cardiffnlp_roberta': {'min_length': 10, 'min_words': 2}
            }

            if model_key in model_requirements:
                req = model_requirements[model_key]

                train_ready = (
                    (train_df[text_col].str.len() >= req['min_length']) &
                    (train_df[text_col].str.split().str.len() >= req['min_words'])
                ).sum()

                val_ready = (
                    (val_df[text_col].str.len() >= req['min_length']) &
                    (val_df[text_col].str.split().str.len() >= req['min_words'])
                ).sum()

                split_analysis['model_readiness'][model_key] = {
                    'train_ready': train_ready,
                    'val_ready': val_ready,
                    'train_ready_pct': train_ready / len(train_df) * 100,
                    'val_ready_pct': val_ready / len(val_df) * 100
                }

    # Print detailed analysis
    print(f"\nTraining Label Distribution:")
    for label, count in split_analysis['train_label_dist'].items():
        pct = (count / len(train_df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    print(f"\nValidation Label Distribution:")
    for label, count in split_analysis['val_label_dist'].items():
        pct = (count / len(val_df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    print(f"\nModel Readiness:")
    for model_key, readiness in split_analysis['model_readiness'].items():
        print(f"  {model_key}:")
        print(f"    Train: {readiness['train_ready']}/{len(train_df)} ({readiness['train_ready_pct']:.1f}%)")
        print(f"    Val: {readiness['val_ready']}/{len(val_df)} ({readiness['val_ready_pct']:.1f}%)")

    return train_df, val_df, split_analysis

# Prepare training data for all banks
training_splits = {}
for bank in BANKS:
    train_df, val_df, split_analysis = prepare_enhanced_training_data(bank)
    training_splits[bank] = {
        'train_df': train_df,
        'val_df': val_df,
        'split_analysis': split_analysis
    }



🎯 [JPM] PREPARING ENHANCED TRAINING DATA
--------------------------------------------------
Total manually labeled records: 49
Training set: 39 records
Validation set: 10 records

Training Label Distribution:
  positive: 15 (38.5%)
  neutral: 15 (38.5%)
  negative: 9 (23.1%)

Validation Label Distribution:
  neutral: 4 (40.0%)
  positive: 4 (40.0%)
  negative: 2 (20.0%)

Model Readiness:
  finbert_yiyanghkust:
    Train: 39/39 (100.0%)
    Val: 10/10 (100.0%)
  finbert_prosusai:
    Train: 39/39 (100.0%)
    Val: 10/10 (100.0%)
  distilroberta:
    Train: 39/39 (100.0%)
    Val: 10/10 (100.0%)
  cardiffnlp_roberta:
    Train: 39/39 (100.0%)
    Val: 10/10 (100.0%)

🎯 [HSBC] PREPARING ENHANCED TRAINING DATA
--------------------------------------------------
Total manually labeled records: 69
Training set: 55 records
Validation set: 14 records

Training Label Distribution:
  positive: 31 (56.4%)
  neutral: 16 (29.1%)
  negative: 8 (14.5%)

Validation Label Distribution:
  positive: 8 (5

In [7]:
## Enhanced Quality Assessment

def assess_enhanced_manual_data_quality() -> Dict:
    """Comprehensive quality assessment across all banks and models."""
    print(f"\n🏆 ENHANCED MULTI-BANK QUALITY ASSESSMENT")
    print("=" * 60)

    assessment = {
        'overall_quality': 'unknown',
        'recommendations': [],
        'fine_tuning_readiness': {},
        'bank_summaries': {},
        'model_compatibility': {},
        'cross_bank_analysis': {}
    }

    total_manual_labels = 0
    banks_with_sufficient_data = 0
    models_ready_count = {model: 0 for model in MODELS.keys()}

    # Bank-by-bank assessment
    for bank in BANKS:
        bank_assessment = {
            'manual_label_count': 0,
            'label_distribution': {},
            'model_readiness': {},
            'quality_score': 0,
            'recommendations': []
        }

        print(f"\n📊 [{bank.upper()}] Assessment:")

        if bank in validation_reports and 'manually_labeled_count' in validation_reports[bank]:
            manual_count = validation_reports[bank]['manually_labeled_count']
            bank_assessment['manual_label_count'] = manual_count
            total_manual_labels += manual_count

            print(f"  Manual labels: {manual_count}")

            # Assess quantity
            if manual_count >= 100:
                banks_with_sufficient_data += 1
                bank_assessment['quality_score'] += 3
                bank_assessment['recommendations'].append("Good dataset size for fine-tuning")
            elif manual_count >= 50:
                bank_assessment['quality_score'] += 2
                bank_assessment['recommendations'].append("Moderate dataset size - suitable with care")
            else:
                bank_assessment['quality_score'] += 1
                bank_assessment['recommendations'].append("Small dataset - consider data augmentation")

            # Assess label balance
            if 'label_distribution' in validation_reports[bank]:
                label_dist = validation_reports[bank]['label_distribution']
                bank_assessment['label_distribution'] = label_dist

                if label_dist:
                    label_counts = list(label_dist.values())
                    min_count = min(label_counts)
                    max_count = max(label_counts)
                    balance_ratio = min_count / max_count if max_count > 0 else 0

                    if balance_ratio >= 0.3:
                        bank_assessment['quality_score'] += 2
                        bank_assessment['recommendations'].append("Well-balanced labels")
                    elif balance_ratio >= 0.1:
                        bank_assessment['quality_score'] += 1
                        bank_assessment['recommendations'].append("Moderately balanced - usable")
                    else:
                        bank_assessment['recommendations'].append("Imbalanced labels - use class weights")

            # Model readiness assessment
            if bank in training_splits and training_splits[bank]['split_analysis']:
                split_analysis = training_splits[bank]['split_analysis']
                if 'model_readiness' in split_analysis:
                    bank_assessment['model_readiness'] = split_analysis['model_readiness']

                    for model_key, readiness in split_analysis['model_readiness'].items():
                        train_ready_pct = readiness.get('train_ready_pct', 0)
                        if train_ready_pct >= 80:
                            models_ready_count[model_key] += 1
                            print(f"    {model_key}: Ready ({train_ready_pct:.1f}%)")
                        else:
                            print(f"    {model_key}: Needs attention ({train_ready_pct:.1f}%)")

        else:
            bank_assessment['recommendations'].append("No manual labels available")

        # Overall bank quality
        if bank_assessment['quality_score'] >= 6:
            bank_quality = 'excellent'
            assessment['fine_tuning_readiness'][bank] = True
        elif bank_assessment['quality_score'] >= 4:
            bank_quality = 'good'
            assessment['fine_tuning_readiness'][bank] = True
        elif bank_assessment['quality_score'] >= 2:
            bank_quality = 'fair'
            assessment['fine_tuning_readiness'][bank] = True
        else:
            bank_quality = 'poor'
            assessment['fine_tuning_readiness'][bank] = False

        bank_assessment['overall_quality'] = bank_quality
        assessment['bank_summaries'][bank] = bank_assessment

        print(f"  Quality: {bank_quality}")
        print(f"  Fine-tuning ready: {assessment['fine_tuning_readiness'][bank]}")

    # Cross-bank analysis
    assessment['cross_bank_analysis'] = {
        'total_manual_labels': total_manual_labels,
        'banks_with_sufficient_data': banks_with_sufficient_data,
        'avg_labels_per_bank': total_manual_labels / len(BANKS) if len(BANKS) > 0 else 0
    }

    # Model compatibility across banks
    for model_key in MODELS.keys():
        ready_banks = models_ready_count[model_key]
        assessment['model_compatibility'][model_key] = {
            'ready_banks': ready_banks,
            'total_banks': len(BANKS),
            'compatibility_score': ready_banks / len(BANKS) if len(BANKS) > 0 else 0
        }

    # Overall recommendations
    if banks_with_sufficient_data == len(BANKS):
        assessment['overall_quality'] = 'excellent'
        assessment['recommendations'].append("All banks ready for fine-tuning")
    elif banks_with_sufficient_data >= len(BANKS) // 2:
        assessment['overall_quality'] = 'good'
        assessment['recommendations'].append("Most banks ready - proceed with available data")
    else:
        assessment['overall_quality'] = 'needs_improvement'
        assessment['recommendations'].append("Consider collecting more manual labels")

    # Model-specific recommendations
    for model_key, compatibility in assessment['model_compatibility'].items():
        if compatibility['compatibility_score'] >= 0.8:
            assessment['recommendations'].append(f"{model_key}: Ready for multi-bank fine-tuning")
        elif compatibility['compatibility_score'] >= 0.5:
            assessment['recommendations'].append(f"{model_key}: Partial readiness - focus on ready banks")
        else:
            assessment['recommendations'].append(f"{model_key}: Needs more preparation")

    # Print summary
    print(f"\n🏆 OVERALL ASSESSMENT:")
    print(f"  Quality: {assessment['overall_quality'].upper()}")
    print(f"  Total manual labels: {total_manual_labels:,}")
    print(f"  Banks with sufficient data: {banks_with_sufficient_data}/{len(BANKS)}")

    print(f"\n📋 RECOMMENDATIONS:")
    for rec in assessment['recommendations']:
        print(f"  • {rec}")

    return assessment

# Run enhanced quality assessment
enhanced_quality_assessment = assess_enhanced_manual_data_quality()


🏆 ENHANCED MULTI-BANK QUALITY ASSESSMENT

📊 [JPM] Assessment:
  Manual labels: 49
    finbert_yiyanghkust: Ready (100.0%)
    finbert_prosusai: Ready (100.0%)
    distilroberta: Ready (100.0%)
    cardiffnlp_roberta: Ready (100.0%)
  Quality: fair
  Fine-tuning ready: True

📊 [HSBC] Assessment:
  Manual labels: 69
    finbert_yiyanghkust: Ready (100.0%)
    finbert_prosusai: Ready (100.0%)
    distilroberta: Ready (100.0%)
    cardiffnlp_roberta: Ready (100.0%)
  Quality: fair
  Fine-tuning ready: True

🏆 OVERALL ASSESSMENT:
  Quality: NEEDS_IMPROVEMENT
  Total manual labels: 118
  Banks with sufficient data: 0/2

📋 RECOMMENDATIONS:
  • Consider collecting more manual labels
  • finbert_yiyanghkust: Ready for multi-bank fine-tuning
  • finbert_prosusai: Ready for multi-bank fine-tuning
  • distilroberta: Ready for multi-bank fine-tuning
  • cardiffnlp_roberta: Ready for multi-bank fine-tuning


In [8]:
## Save Enhanced Validation Results

def save_enhanced_validation_results():
    """Save all enhanced validation results and prepared data."""
    print(f"\n💾 SAVING ENHANCED VALIDATION RESULTS")
    print("=" * 50)

    saved_files = {}

    for bank in BANKS:
        bank_saved = {}

        print(f"\n📁 Saving {bank.upper()} validation results...")

        # Save validation report
        if bank in validation_reports:
            validation_path = manual_validation_paths[bank]["manual_validation"] / f"validation_report_{bank}.json"
            with open(validation_path, 'w') as f:
                json.dump(validation_reports[bank], f, indent=2, default=str)
            bank_saved['validation_report'] = str(validation_path)
            print(f"  ✅ Validation report: {validation_path}")

        # Save model comparison results
        if bank in model_comparison_results:
            comparison_path = manual_validation_paths[bank]["manual_validation"] / f"model_comparison_{bank}.json"
            with open(comparison_path, 'w') as f:
                json.dump(model_comparison_results[bank], f, indent=2, default=str)
            bank_saved['model_comparison'] = str(comparison_path)
            print(f"  ✅ Model comparison: {comparison_path}")

        # Save training/validation splits
        if bank in training_splits:
            splits = training_splits[bank]

            if splits['train_df'] is not None:
                train_path = manual_validation_paths[bank]["manual_validation"] / f"train_manual_labels_{bank}.csv"
                splits['train_df'].to_csv(train_path, index=False)
                bank_saved['train_data'] = str(train_path)
                print(f"  ✅ Training data: {train_path} ({len(splits['train_df'])} records)")

            if splits['val_df'] is not None:
                val_path = manual_validation_paths[bank]["manual_validation"] / f"val_manual_labels_{bank}.csv"
                splits['val_df'].to_csv(val_path, index=False)
                bank_saved['val_data'] = str(val_path)
                print(f"  ✅ Validation data: {val_path} ({len(splits['val_df'])} records)")

            # Save split analysis
            if splits['split_analysis']:
                analysis_path = manual_validation_paths[bank]["manual_validation"] / f"split_analysis_{bank}.json"
                with open(analysis_path, 'w') as f:
                    json.dump(splits['split_analysis'], f, indent=2, default=str)
                bank_saved['split_analysis'] = str(analysis_path)
                print(f"  ✅ Split analysis: {analysis_path}")

        # Save validated manual labels to results directory
        if bank in manual_labels_dfs and manual_labels_dfs[bank] is not None:
            validated_path = manual_validation_paths[bank]["results_sentiment"] / f"manual_labels_{bank}_validated.csv"
            manual_labels_dfs[bank].to_csv(validated_path, index=False)
            bank_saved['validated_labels'] = str(validated_path)
            print(f"  ✅ Validated labels: {validated_path}")

        saved_files[bank] = bank_saved

    # Save overall quality assessment
    overall_assessment_path = drive_base / "configs" / "enhanced_manual_validation_assessment.json"
    with open(overall_assessment_path, 'w') as f:
        json.dump(enhanced_quality_assessment, f, indent=2, default=str)
    print(f"\n✅ Overall assessment: {overall_assessment_path}")

    return saved_files

# Save all enhanced validation results
saved_validation_files = save_enhanced_validation_results()



💾 SAVING ENHANCED VALIDATION RESULTS

📁 Saving JPM validation results...
  ✅ Validation report: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/validation_report_jpm.json
  ✅ Model comparison: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/model_comparison_jpm.json
  ✅ Training data: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/train_manual_labels_jpm.csv (39 records)
  ✅ Validation data: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/val_manual_labels_jpm.csv (10 records)
  ✅ Split analysis: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/jpm/split_analysis_jpm.json
  ✅ Validated labels: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/results/sentiment/jpm/manual_labels_jpm_validated.csv

📁 Saving HSBC validation results...
  ✅ Validation report: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/manual_validation/hsbc/validation_repo

In [9]:
## Enhanced Summary and Next Steps

print(f"\n{'='*60}")
print("ENHANCED MANUAL VALIDATION COMPLETE")
print(f"{'='*60}")

# Summary statistics
total_manual_labels = sum(
    validation_reports[bank].get('manually_labeled_count', 0)
    for bank in BANKS if bank in validation_reports
)

banks_ready_for_finetuning = sum(
    1 for bank in BANKS
    if enhanced_quality_assessment['fine_tuning_readiness'].get(bank, False)
)

models_with_high_compatibility = sum(
    1 for model_compatibility in enhanced_quality_assessment['model_compatibility'].values()
    if model_compatibility.get('compatibility_score', 0) >= 0.8
)

print(f"📊 Validation Summary:")
print(f"  Banks processed: {len(BANKS)} ({', '.join([b.upper() for b in BANKS])})")
print(f"  Total manual labels: {total_manual_labels:,}")
print(f"  Banks ready for fine-tuning: {banks_ready_for_finetuning}/{len(BANKS)}")
print(f"  Models with high compatibility: {models_with_high_compatibility}/{len(MODELS)}")

print(f"\n🏦 Bank-specific Summary:")
for bank in BANKS:
    if bank in validation_reports:
        manual_count = validation_reports[bank].get('manually_labeled_count', 0)
        quality = enhanced_quality_assessment['bank_summaries'].get(bank, {}).get('overall_quality', 'unknown')
        ready = enhanced_quality_assessment['fine_tuning_readiness'].get(bank, False)

        print(f"  {bank.upper()}:")
        print(f"    Manual labels: {manual_count:,}")
        print(f"    Quality: {quality}")
        print(f"    Fine-tuning ready: {'Yes' if ready else 'No'}")

        # Show training/validation split if available
        if bank in training_splits and training_splits[bank]['train_df'] is not None:
            train_size = len(training_splits[bank]['train_df'])
            val_size = len(training_splits[bank]['val_df'])
            print(f"    Train/Val split: {train_size}/{val_size}")

print(f"\n🎯 Model Compatibility:")
for model_key in MODELS.keys():
    compatibility = enhanced_quality_assessment['model_compatibility'].get(model_key, {})
    ready_banks = compatibility.get('ready_banks', 0)
    score = compatibility.get('compatibility_score', 0)
    status = "Ready" if score >= 0.8 else "Partial" if score >= 0.5 else "Needs work"

    print(f"  {model_key}:")
    print(f"    Ready banks: {ready_banks}/{len(BANKS)}")
    print(f"    Compatibility: {score:.1%} - {status}")

print(f"\n📋 Key Recommendations:")
for rec in enhanced_quality_assessment['recommendations']:
    print(f"  • {rec}")

print(f"\n🚀 Next Steps:")
if enhanced_quality_assessment['overall_quality'] in ['excellent', 'good']:
    print(f"  1. ✅ Proceed to 04_sentiment_analysis.ipynb for 4-model analysis")
    print(f"  2. ✅ Continue to 04b_model_finetuning.ipynb for enhanced fine-tuning")
    print(f"  3. ✅ Manual validation data ready for all {len(MODELS)} models")
else:
    print(f"  1. ⚠️ Consider collecting more manual labels")
    print(f"  2. ⚠️ Review data quality issues before fine-tuning")
    print(f"  3. ✅ Continue with available data for baseline analysis")

print(f"\n📁 Files Created:")
total_files = sum(len(bank_files) for bank_files in saved_validation_files.values())
print(f"  Total files saved: {total_files}")
print(f"  Validation reports: Available for all banks")
print(f"  Training/validation splits: Ready for fine-tuning")

print(f"\n🎉 Enhanced manual validation process complete!")
print(f"   Multi-bank, multi-model validation framework established")
print(f"   Ready for advanced sentiment analysis with {len(MODELS)} models")


ENHANCED MANUAL VALIDATION COMPLETE
📊 Validation Summary:
  Banks processed: 2 (JPM, HSBC)
  Total manual labels: 118
  Banks ready for fine-tuning: 2/2
  Models with high compatibility: 4/4

🏦 Bank-specific Summary:
  JPM:
    Manual labels: 49
    Quality: fair
    Fine-tuning ready: Yes
    Train/Val split: 39/10
  HSBC:
    Manual labels: 69
    Quality: fair
    Fine-tuning ready: Yes
    Train/Val split: 55/14

🎯 Model Compatibility:
  finbert_yiyanghkust:
    Ready banks: 2/2
    Compatibility: 100.0% - Ready
  finbert_prosusai:
    Ready banks: 2/2
    Compatibility: 100.0% - Ready
  distilroberta:
    Ready banks: 2/2
    Compatibility: 100.0% - Ready
  cardiffnlp_roberta:
    Ready banks: 2/2
    Compatibility: 100.0% - Ready

📋 Key Recommendations:
  • Consider collecting more manual labels
  • finbert_yiyanghkust: Ready for multi-bank fine-tuning
  • finbert_prosusai: Ready for multi-bank fine-tuning
  • distilroberta: Ready for multi-bank fine-tuning
  • cardiffnlp_robert