In [1]:
# 03_clean_preprocess_enhanced.ipynb
# Purpose: Enhanced cleaning and preprocessing for both JP Morgan and HSBC data
# Banks: JP Morgan (JPM) and HSBC
# Quarters: Q1 2025, Q2 2025
# Models: 4 sentiment analysis models
# Input: Raw datasets from both banks
# Output: Clean and processed datasets ready for 4-model sentiment analysis

## Import Libraries

import pandas as pd
import numpy as np
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Enhanced text processing
import nltk
from textblob import TextBlob
import unicodedata

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
except:
    print("NLTK downloads failed - continuing without")

# Google Colab
from google.colab import drive
drive.mount("/content/drive")

# Load enhanced configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_config.json")
with open(config_path, "r") as f:
    enhanced_config = json.load(f)

# Load data registry
registry_path = Path(enhanced_config["drive_base"]) / "configs" / "enhanced_data_registry.json"
with open(registry_path, "r") as f:
    data_registry = json.load(f)

SEED = enhanced_config["SEED"]
BANKS = enhanced_config["BANKS"]
QUARTERS = enhanced_config["QUARTERS"]
MODELS = enhanced_config["MODELS"]
drive_base = Path(enhanced_config["drive_base"])
colab_base = Path(enhanced_config["colab_base"])

print(f"Enhanced cleaning and preprocessing for banks: {', '.join([bank.upper() for bank in BANKS])}")
print(f"Target models: {len(MODELS)} ({', '.join(MODELS.keys())})")


Mounted at /content/drive
Enhanced cleaning and preprocessing for banks: JPM, HSBC
Target models: 4 (finbert_yiyanghkust, finbert_prosusai, distilroberta, cardiffnlp_roberta)


In [2]:
## Enhanced Data Loading

def load_raw_datasets_enhanced():
    """Load all raw datasets for processing."""
    print(f"\n{'='*50}")
    print("LOADING RAW DATASETS")
    print(f"{'='*50}")

    raw_datasets = {}

    for bank in BANKS:
        print(f"\n📂 Loading {bank.upper()} datasets...")
        raw_datasets[bank] = {}

        # Load quarterly datasets
        for quarter in QUARTERS:
            filename = f"raw_{bank}_{quarter}_earnings_call.csv"
            file_path = drive_base / f"data/raw/{bank}" / filename

            if file_path.exists():
                try:
                    df = pd.read_csv(file_path)
                    raw_datasets[bank][quarter] = df
                    print(f"  ✅ {quarter}: {df.shape}")
                except Exception as e:
                    print(f"  ❌ {quarter}: Error loading - {str(e)}")
            else:
                print(f"  ❌ {quarter}: File not found - {file_path}")

        # Load combined dataset
        combined_filename = f"raw_{bank}_multi_2025_earnings_call.csv"
        combined_path = drive_base / f"data/raw/{bank}" / combined_filename

        if combined_path.exists():
            try:
                df = pd.read_csv(combined_path)
                raw_datasets[bank]["combined"] = df
                print(f"  ✅ Combined: {df.shape}")
            except Exception as e:
                print(f"  ❌ Combined: Error loading - {str(e)}")

        # Load manual labels if available
        manual_filename = f"raw_{bank}_manual_labels.csv"
        manual_path = drive_base / f"data/raw/{bank}" / manual_filename

        if manual_path.exists():
            try:
                df = pd.read_csv(manual_path)
                raw_datasets[bank]["manual_labels"] = df
                print(f"  ✅ Manual labels: {df.shape}")
            except Exception as e:
                print(f"  ❌ Manual labels: Error loading - {str(e)}")

    return raw_datasets

# Load all raw datasets
raw_datasets = load_raw_datasets_enhanced()



LOADING RAW DATASETS

📂 Loading JPM datasets...
  ✅ q1_2025: (112, 10)
  ✅ q2_2025: (149, 10)
  ✅ Combined: (261, 11)
  ✅ Manual labels: (1121, 27)

📂 Loading HSBC datasets...
  ✅ q1_2025: (49, 11)
  ✅ q2_2025: (30, 11)
  ✅ Combined: (79, 12)
  ✅ Manual labels: (858, 27)


In [3]:
## Enhanced Cleaning Functions

def standardize_column_names_enhanced(df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced column name standardization with financial domain mapping."""
    df = df.copy()

    # Convert to snake_case
    new_columns = {}
    for col in df.columns:
        # Remove special characters and convert to lowercase
        clean_col = re.sub(r'[^\w\s]', '', str(col).lower())
        # Replace spaces with underscores
        clean_col = re.sub(r'\s+', '_', clean_col.strip())
        # Remove multiple underscores
        clean_col = re.sub(r'_+', '_', clean_col)
        new_columns[col] = clean_col

    df = df.rename(columns=new_columns)

    # Enhanced financial domain mapping
    enhanced_column_mapping = {
        # Text content mappings
        'content': 'text',
        'transcript': 'text',
        'question': 'text',
        'answer': 'text',
        'response': 'text',
        'statement': 'text',

        # Speaker mappings
        'speaker_name': 'speaker',
        'participant': 'speaker',
        'person': 'speaker',
        'individual': 'speaker',

        # Role mappings
        'role': 'speaker_role',
        'position': 'speaker_role',
        'title': 'speaker_role',
        'function': 'speaker_role',

        # Timing mappings
        'time': 'timestamp',
        'datetime': 'timestamp',
        'sequence': 'order_id',
        'order': 'order_id',

        # Classification mappings
        'type': 'qa_type',
        'category': 'topic',
        'subject': 'topic',
        'theme': 'topic'
    }

    # Apply enhanced mapping
    mapping_applied = []
    for old_col, new_col in enhanced_column_mapping.items():
        if old_col in df.columns and new_col not in df.columns:
            df = df.rename(columns={old_col: new_col})
            mapping_applied.append(f"{old_col} → {new_col}")

    if mapping_applied:
        print(f"  Column mappings applied: {', '.join(mapping_applied)}")

    print(f"  Standardized columns: {list(df.columns)}")
    return df

def clean_speaker_names_enhanced(df: pd.DataFrame, speaker_col: str = 'speaker') -> pd.DataFrame:
    """Enhanced speaker name cleaning with financial role detection."""
    if speaker_col not in df.columns:
        print(f"  Warning: Speaker column '{speaker_col}' not found")
        return df

    df = df.copy()

    # Enhanced financial speaker role mapping
    enhanced_speaker_mapping = {
        # Executive roles
        r'.*\b(ceo|chief executive)\b.*': 'ceo',
        r'.*\b(cfo|chief financial)\b.*': 'cfo',
        r'.*\b(coo|chief operating)\b.*': 'coo',
        r'.*\b(cro|chief risk)\b.*': 'cro',
        r'.*\b(president)\b.*': 'president',
        r'.*\b(chairman|chair)\b.*': 'chairman',
        r'.*\b(director)\b.*': 'director',
        r'.*\b(head of|head)\b.*': 'head',

        # Analyst roles
        r'.*\b(analyst|research)\b.*': 'analyst',
        r'.*\b(equity research)\b.*': 'analyst',
        r'.*\b(investment banking)\b.*': 'analyst',
        r'.*\b(portfolio manager)\b.*': 'analyst',

        # Operational roles
        r'.*\b(operator|moderator)\b.*': 'operator',
        r'.*\b(facilitator|host)\b.*': 'operator',
        r'.*\b(coordinator)\b.*': 'operator',

        # Generic roles
        r'.*\b(management|executive)\b.*': 'executive',
        r'.*\b(representative|rep)\b.*': 'representative',
        r'.*\b(investor relations|ir)\b.*': 'investor_relations'
    }

    # Clean speaker names
    df[speaker_col] = df[speaker_col].astype(str).str.lower().str.strip()

    # Apply enhanced role detection
    df['speaker_role'] = 'other'  # Default

    for pattern, role in enhanced_speaker_mapping.items():
        mask = df[speaker_col].str.contains(pattern, regex=True, na=False)
        df.loc[mask, 'speaker_role'] = role

    # Also check original_role column if it exists
    if 'original_role' in df.columns:
        df['original_role'] = df['original_role'].astype(str).str.lower().str.strip()
        for pattern, role in enhanced_speaker_mapping.items():
            mask = df['original_role'].str.contains(pattern, regex=True, na=False)
            df.loc[mask, 'speaker_role'] = role

    # Role distribution
    role_counts = df['speaker_role'].value_counts()
    print(f"  Speaker roles detected:")
    for role, count in role_counts.items():
        print(f"    {role}: {count}")

    return df

def clean_text_content_enhanced(df: pd.DataFrame, text_col: str = 'text') -> pd.DataFrame:
    """Enhanced text cleaning optimized for financial content and 4 sentiment models."""
    if text_col not in df.columns:
        print(f"  Warning: Text column '{text_col}' not found")
        return df

    df = df.copy()
    print(f"  Cleaning text in column: {text_col}")

    # Convert to string and handle missing values
    df[text_col] = df[text_col].astype(str).fillna('')

    # Enhanced financial text cleaning
    def clean_financial_text(text):
        """Enhanced financial text cleaning function."""
        if pd.isna(text) or text == '' or text == 'nan':
            return ''

        text = str(text)

        # Remove common artifacts from earnings calls
        text = re.sub(r'\[.*?\]', '', text)  # [brackets]
        text = re.sub(r'\(.*?\)', '', text)  # (parentheses)
        text = re.sub(r'--+', ' ', text)     # Multiple dashes
        text = re.sub(r'\*+', '', text)      # Asterisks
        text = re.sub(r'#+', '', text)       # Hash symbols

        # Clean financial notation while preserving meaning
        text = re.sub(r'\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|thousand|bn|mn|k)\b',
                     r'$\1 \2', text, flags=re.IGNORECASE)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        # Preserve financial terminology
        # Remove very short or very long texts that might be artifacts
        if len(text) < 10 or len(text) > 10000:
            return ''

        return text

    # Apply enhanced cleaning
    df[f'{text_col}_clean'] = df[text_col].apply(clean_financial_text)

    # Calculate enhanced text statistics
    df[f'{text_col}_length'] = df[f'{text_col}_clean'].str.len()
    df[f'{text_col}_word_count'] = df[f'{text_col}_clean'].str.split().str.len()
    df[f'{text_col}_sentence_count'] = df[f'{text_col}_clean'].str.count(r'[.!?]+') + 1

    # Enhanced readability metrics for model analysis
    try:
        import textstat
        df[f'{text_col}_readability'] = df[f'{text_col}_clean'].apply(
            lambda x: textstat.flesch_reading_ease(x) if x and len(x) > 0 else 0
        )
    except ImportError:
        df[f'{text_col}_readability'] = 50  # Default moderate readability

    # Remove very short texts
    min_length = 15
    short_text_mask = df[f'{text_col}_length'] < min_length
    df.loc[short_text_mask, f'{text_col}_clean'] = ''

    # Calculate cleaning effectiveness
    original_lengths = df[text_col].str.len()
    clean_lengths = df[f'{text_col}_length'].fillna(0)
    avg_reduction = (original_lengths - clean_lengths).mean()

    valid_text_count = (df[f'{text_col}_clean'] != '').sum()

    print(f"    Average length reduction: {avg_reduction:.1f} characters")
    print(f"    Valid texts after cleaning: {valid_text_count}/{len(df)} ({valid_text_count/len(df)*100:.1f}%)")

    # Sample cleaned text
    valid_texts = df[df[f'{text_col}_clean'] != ''][f'{text_col}_clean']
    if len(valid_texts) > 0:
        sample_text = valid_texts.iloc[0]
        print(f"    Sample: '{sample_text[:150]}...'")

    return df

def add_enhanced_metadata(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Add enhanced metadata optimized for 4-model sentiment analysis."""
    df = df.copy()

    # Basic metadata
    df['qa_id'] = df.index
    df['bank_code'] = bank_code
    df['processing_timestamp'] = pd.Timestamp.now()

    # Enhanced validation flags for different models
    df['is_valid_for_analysis'] = True

    # Model-specific readiness flags
    for model_key in MODELS.keys():
        df[f'ready_for_{model_key}'] = True

    # Text validation
    if 'text_clean' in df.columns:
        # Basic text validation
        invalid_text_mask = (
            (df['text_clean'] == '') |
            (df['text_clean'].isna()) |
            (df['text_length'] < 15) |
            (df['text_word_count'] < 3)
        )

        df.loc[invalid_text_mask, 'is_valid_for_analysis'] = False

        # Model-specific validation
        # FinBERT models prefer longer, more structured text
        finbert_invalid = (df['text_length'] < 20) | (df['text_word_count'] < 4)
        df.loc[finbert_invalid, 'ready_for_finbert_yiyanghkust'] = False
        df.loc[finbert_invalid, 'ready_for_finbert_prosusai'] = False

        # RoBERTa models are more flexible
        roberta_invalid = (df['text_length'] < 10) | (df['text_word_count'] < 2)
        df.loc[roberta_invalid, 'ready_for_distilroberta'] = False
        df.loc[roberta_invalid, 'ready_for_cardiffnlp_roberta'] = False

    # Speaker role validation
    if 'speaker_role' in df.columns:
        # Flag records with unidentified speakers for special handling
        df['speaker_identified'] = df['speaker_role'] != 'other'

    # Enhanced financial context flags
    if 'text_clean' in df.columns:
        # Financial sentiment indicators
        financial_keywords = {
            'earnings_context': r'\b(earnings|revenue|profit|loss|income|ebitda)\b',
            'risk_context': r'\b(risk|exposure|provision|default|credit)\b',
            'growth_context': r'\b(growth|expansion|increase|improvement|strong)\b',
            'performance_context': r'\b(performance|results|metrics|kpi|target)\b'
        }

        for context_type, pattern in financial_keywords.items():
            df[f'has_{context_type}'] = df['text_clean'].str.contains(
                pattern, case=False, regex=True, na=False
            )

    # Summary statistics
    total_records = len(df)
    valid_records = df['is_valid_for_analysis'].sum()

    print(f"  Enhanced metadata added:")
    print(f"    Total records: {total_records}")
    print(f"    Valid for analysis: {valid_records} ({valid_records/total_records*100:.1f}%)")

    # Model readiness summary
    for model_key in MODELS.keys():
        ready_count = df[f'ready_for_{model_key}'].sum()
        print(f"    Ready for {model_key}: {ready_count} ({ready_count/total_records*100:.1f}%)")

    return df


In [4]:
## Enhanced Dataset Processing

def process_dataset_enhanced(df: pd.DataFrame, dataset_name: str, bank_code: str) -> pd.DataFrame:
    """Apply all enhanced cleaning steps to a dataset."""
    if df is None:
        print(f"❌ Cannot process {dataset_name} for {bank_code.upper()} - dataset is None")
        return None

    print(f"\n🧹 [{bank_code.upper()}] PROCESSING {dataset_name.upper()}")
    print("-" * 50)
    print(f"Input shape: {df.shape}")
    print(f"Input columns: {list(df.columns)}")

    # Step 1: Standardize column names
    df_clean = standardize_column_names_enhanced(df)

    # Step 2: Clean speaker names and detect roles
    df_clean = clean_speaker_names_enhanced(df_clean)

    # Step 3: Enhanced text cleaning
    df_clean = clean_text_content_enhanced(df_clean)

    # Step 4: Remove duplicates with enhanced detection
    original_count = len(df_clean)
    df_clean = df_clean.drop_duplicates()

    # Enhanced duplicate detection on cleaned text
    if 'text_clean' in df_clean.columns:
        df_clean = df_clean.drop_duplicates(subset=['text_clean'])

    removed_count = original_count - len(df_clean)
    print(f"  Removed {removed_count} duplicates ({removed_count/original_count*100:.1f}%)")

    # Step 5: Add enhanced metadata
    df_clean = add_enhanced_metadata(df_clean, bank_code)

    # Step 6: Filter for valid records but keep some invalid for debugging
    valid_df = df_clean[df_clean['is_valid_for_analysis']].copy()
    invalid_count = len(df_clean) - len(valid_df)

    print(f"Final shape: {valid_df.shape} (removed {invalid_count} invalid entries)")

    return valid_df

# Process all datasets for both banks
print(f"\n{'='*60}")
print("ENHANCED DATASET PROCESSING")
print(f"{'='*60}")

processed_datasets = {}

for bank in BANKS:
    processed_datasets[bank] = {}

    if bank in raw_datasets:
        print(f"\n📊 Processing {bank.upper()} datasets...")

        # Process quarterly datasets
        for quarter in QUARTERS:
            if quarter in raw_datasets[bank]:
                processed_datasets[bank][quarter] = process_dataset_enhanced(
                    raw_datasets[bank][quarter], quarter, bank
                )

        # Process combined dataset
        if "combined" in raw_datasets[bank]:
            processed_datasets[bank]["combined"] = process_dataset_enhanced(
                raw_datasets[bank]["combined"], "combined", bank
            )



ENHANCED DATASET PROCESSING

📊 Processing JPM datasets...

🧹 [JPM] PROCESSING Q1_2025
--------------------------------------------------
Input shape: (112, 10)
Input columns: ['section', 'question_number', 'answer_number', 'speaker_name', 'role', 'company', 'content', 'year', 'quarter', 'is_pleasantry']
  Column mappings applied: content → text, speaker_name → speaker, role → speaker_role
  Standardized columns: ['section', 'question_number', 'answer_number', 'speaker', 'speaker_role', 'company', 'text', 'year', 'quarter', 'is_pleasantry']
  Speaker roles detected:
    other: 112
  Cleaning text in column: text
    Average length reduction: 1.1 characters
    Valid texts after cleaning: 107/112 (95.5%)
    Sample: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 billion, EPS of $5.07 on revenue of $46 billion, with ...'
  Removed 5 duplicates (4.5%)
  Enhanced metadata added:
    Total records: 107
    Valid for analysis: 106 (99.1%)
   

In [5]:
## Enhanced Sentence-level Preprocessing for 4 Models

def preprocess_for_sentiment_models_enhanced(df: pd.DataFrame, bank_code: str) -> pd.DataFrame:
    """Enhanced preprocessing specifically optimized for 4 sentiment models."""
    if df is None:
        print(f"❌ Cannot preprocess for {bank_code.upper()} - dataset is None")
        return None

    print(f"\n🔬 [{bank_code.upper()}] SENTENCE-LEVEL PREPROCESSING FOR 4 MODELS")
    print("-" * 60)
    print(f"Input shape: {df.shape}")

    if 'text_clean' not in df.columns:
        print("❌ 'text_clean' column not found")
        return pd.DataFrame()

    sentences = []
    processed_count = 0

    for idx, row in df.iterrows():
        text_content = row.get('text_clean', '')

        # Skip invalid text
        if pd.isna(text_content) or str(text_content).strip() == '' or str(text_content) == 'nan':
            continue

        text_content = str(text_content).strip()

        if len(text_content) < 15:  # Minimum length for sentiment analysis
            continue

        # Enhanced sentence splitting for financial text
        # Split on multiple sentence terminators and financial patterns
        sentence_patterns = [
            r'[.!?]+\s+',  # Standard sentence endings
            r'[.!?]+$'
    ,    # End of text
            r';\s+',       # Semicolon separations
            r'\.\s+[A-Z]', # Period followed by capital letter
        ]

        # Use the first pattern that produces meaningful splits
        text_sentences = [text_content]  # Default to whole text

        for pattern in sentence_patterns:
            splits = re.split(pattern, text_content)
            if len(splits) > 1 and all(len(s.strip()) > 10 for s in splits if s.strip()):
                text_sentences = splits
                break

        sentence_count = 0
        for sent_idx, sentence in enumerate(text_sentences):
            sentence = sentence.strip()

            # Enhanced sentence validation for different models
            if len(sentence) < 15:  # Too short
                continue

            if len(sentence) > 5000:  # Too long - truncate
                sentence = sentence[:5000]

            # Model-specific preprocessing flags
            model_ready = {}

            # FinBERT models prefer structured financial language
            finbert_ready = (
                len(sentence) >= 20 and
                len(sentence.split()) >= 4 and
                any(word in sentence.lower() for word in [
                    'quarter', 'revenue', 'profit', 'growth', 'performance',
                    'market', 'business', 'financial', 'earnings', 'results'
                ])
            )
            model_ready['finbert_yiyanghkust'] = finbert_ready
            model_ready['finbert_prosusai'] = finbert_ready

            # RoBERTa models are more flexible
            roberta_ready = len(sentence) >= 10 and len(sentence.split()) >= 2
            model_ready['distilroberta'] = roberta_ready
            model_ready['cardiffnlp_roberta'] = roberta_ready

            # Enhanced financial context detection
            financial_indicators = {
                'has_financial_numbers': bool(re.search(r'\$\d+|\d+%|\d+\.\d+', sentence)),
                'has_temporal_ref': bool(re.search(r'\b(quarter|q[1-4]|year|month|2025)\b', sentence, re.I)),
                'has_performance_terms': bool(re.search(r'\b(increase|decrease|growth|decline|strong|weak)\b', sentence, re.I)),
                'has_financial_terms': bool(re.search(r'\b(revenue|profit|earnings|ebitda|margin|roi)\b', sentence, re.I))
            }

            sentence_record = {
                'original_qa_id': row.get('qa_id', idx),
                'sentence_id': f"{bank_code}_{row.get('qa_id', idx)}_{sent_idx}",
                'text': sentence,
                'speaker': row.get('speaker', ''),
                'speaker_role': row.get('speaker_role', ''),
                'quarter': row.get('quarter', ''),
                'bank_code': bank_code,
                'sentence_length': len(sentence),
                'sentence_word_count': len(sentence.split()),
                'sentence_order': sent_idx,
                'readability_score': row.get('text_readability', 50),

                # Model readiness flags
                **{f'ready_for_{model}': ready for model, ready in model_ready.items()},

                # Financial context flags
                **financial_indicators,

                # Processing metadata
                'processing_timestamp': pd.Timestamp.now().isoformat(),
                'preprocessing_version': 'enhanced_v1'
            }

            sentences.append(sentence_record)
            sentence_count += 1

        if sentence_count > 0:
            processed_count += 1

    sentences_df = pd.DataFrame(sentences)

    print(f"  Processed {processed_count}/{len(df)} Q&A pairs")
    print(f"  Created sentence-level dataset: {len(sentences_df)} sentences")

    if len(sentences_df) > 0:
        # Model readiness summary
        for model_key in MODELS.keys():
            ready_count = sentences_df[f'ready_for_{model_key}'].sum()
            print(f"    Ready for {model_key}: {ready_count} ({ready_count/len(sentences_df)*100:.1f}%)")

        # Financial context summary
        financial_stats = {
            'with_numbers': sentences_df['has_financial_numbers'].sum(),
            'with_temporal': sentences_df['has_temporal_ref'].sum(),
            'with_performance': sentences_df['has_performance_terms'].sum(),
            'with_financial_terms': sentences_df['has_financial_terms'].sum()
        }

        print(f"  Financial context analysis:")
        for context, count in financial_stats.items():
            print(f"    {context}: {count} ({count/len(sentences_df)*100:.1f}%)")

        # Sample sentences for different models
        print(f"  Sample sentences:")
        for model_key in list(MODELS.keys())[:2]:  # Show first 2 models
            model_ready_sentences = sentences_df[sentences_df[f'ready_for_{model_key}']]
            if len(model_ready_sentences) > 0:
                sample = model_ready_sentences.iloc[0]['text']
                print(f"    {model_key}: '{sample[:100]}...'")

    return sentences_df

# Create enhanced sentence-level datasets for 4 models
print(f"\n{'='*60}")
print("SENTENCE-LEVEL PREPROCESSING FOR 4 MODELS")
print(f"{'='*60}")

sentence_level_datasets = {}

for bank in BANKS:
    sentence_level_datasets[bank] = {}

    if bank in processed_datasets:
        print(f"\n🔬 Creating sentence-level datasets for {bank.upper()}...")

        # Process quarterly datasets
        for quarter in QUARTERS:
            if quarter in processed_datasets[bank] and processed_datasets[bank][quarter] is not None:
                sentence_level_datasets[bank][quarter] = preprocess_for_sentiment_models_enhanced(
                    processed_datasets[bank][quarter], bank
                )

        # Process combined dataset
        if "combined" in processed_datasets[bank] and processed_datasets[bank]["combined"] is not None:
            sentence_level_datasets[bank]["combined"] = preprocess_for_sentiment_models_enhanced(
                processed_datasets[bank]["combined"], bank
            )



SENTENCE-LEVEL PREPROCESSING FOR 4 MODELS

🔬 Creating sentence-level datasets for JPM...

🔬 [JPM] SENTENCE-LEVEL PREPROCESSING FOR 4 MODELS
------------------------------------------------------------
Input shape: (106, 28)
  Processed 105/106 Q&A pairs
  Created sentence-level dataset: 313 sentences
    Ready for finbert_yiyanghkust: 56 (17.9%)
    Ready for finbert_prosusai: 56 (17.9%)
    Ready for distilroberta: 313 (100.0%)
    Ready for cardiffnlp_roberta: 313 (100.0%)
  Financial context analysis:
    with_numbers: 16 (5.1%)
    with_temporal: 30 (9.6%)
    with_performance: 16 (5.1%)
    with_financial_terms: 13 (4.2%)
  Sample sentences:
    finbert_yiyanghkust: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'
    finbert_prosusai: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'

🔬 [JPM] SENTENCE-LEVEL PREPROCESSING FOR 4 MODELS
----------------------------------

In [6]:
## Enhanced Data Quality Assessment

def assess_enhanced_data_quality():
    """Comprehensive quality assessment for multi-bank, multi-model analysis."""
    print(f"\n{'='*60}")
    print("ENHANCED DATA QUALITY ASSESSMENT")
    print(f"{'='*60}")

    quality_report = {
        "processing_timestamp": pd.Timestamp.now().isoformat(),
        "banks_processed": {},
        "model_readiness": {},
        "financial_context_analysis": {},
        "overall_statistics": {}
    }

    total_qa_pairs = 0
    total_sentences = 0
    total_memory = 0

    for bank in BANKS:
        bank_stats = {
            "qa_level": {},
            "sentence_level": {},
            "model_readiness": {},
            "quality_issues": []
        }

        print(f"\n📊 [{bank.upper()}] Quality Assessment")
        print("-" * 40)

        # Assess Q&A level data
        if bank in processed_datasets:
            for dataset_type, df in processed_datasets[bank].items():
                if df is not None:
                    bank_stats["qa_level"][dataset_type] = {
                        "shape": df.shape,
                        "valid_records": df.get('is_valid_for_analysis', pd.Series([True] * len(df))).sum(),
                        "memory_mb": df.memory_usage(deep=True).sum() / 1024**2
                    }
                    total_qa_pairs += len(df)
                    total_memory += df.memory_usage(deep=True).sum()

        # Assess sentence-level data
        if bank in sentence_level_datasets:
            for dataset_type, df in sentence_level_datasets[bank].items():
                if df is not None:
                    bank_stats["sentence_level"][dataset_type] = {
                        "shape": df.shape,
                        "memory_mb": df.memory_usage(deep=True).sum() / 1024**2
                    }
                    total_sentences += len(df)
                    total_memory += df.memory_usage(deep=True).sum()

                    # Model readiness assessment
                    for model_key in MODELS.keys():
                        ready_col = f'ready_for_{model_key}'
                        if ready_col in df.columns:
                            ready_count = df[ready_col].sum()
                            ready_pct = ready_count / len(df) * 100

                            if model_key not in bank_stats["model_readiness"]:
                                bank_stats["model_readiness"][model_key] = {}

                            bank_stats["model_readiness"][model_key][dataset_type] = {
                                "ready_count": ready_count,
                                "ready_percentage": ready_pct
                            }

                            # Flag potential issues
                            if ready_pct < 80:
                                bank_stats["quality_issues"].append(
                                    f"{dataset_type}: Low {model_key} readiness ({ready_pct:.1f}%)"
                                )

        quality_report["banks_processed"][bank] = bank_stats

        # Print bank summary
        qa_datasets = len(bank_stats["qa_level"])
        sentence_datasets = len(bank_stats["sentence_level"])
        issues = len(bank_stats["quality_issues"])

        print(f"  Q&A level datasets: {qa_datasets}")
        print(f"  Sentence level datasets: {sentence_datasets}")
        print(f"  Quality issues: {issues}")

        if bank_stats["quality_issues"]:
            for issue in bank_stats["quality_issues"]:
                print(f"    - {issue}")
        else:
            print(f"    No major quality issues detected")

    # Overall model readiness summary
    print(f"\n🎯 Model Readiness Summary:")
    for model_key in MODELS.keys():
        total_ready = 0
        total_records = 0

        for bank in BANKS:
            if bank in quality_report["banks_processed"]:
                bank_readiness = quality_report["banks_processed"][bank]["model_readiness"]
                if model_key in bank_readiness:
                    for dataset_stats in bank_readiness[model_key].values():
                        total_ready += dataset_stats["ready_count"]
                        total_records += dataset_stats["ready_count"] / dataset_stats["ready_percentage"] * 100

        if total_records > 0:
            overall_readiness = total_ready / total_records * 100
            quality_report["model_readiness"][model_key] = {
                "overall_readiness_percentage": overall_readiness,
                "total_ready_records": total_ready
            }
            print(f"  {model_key}: {total_ready:,} records ready ({overall_readiness:.1f}%)")

    # Overall statistics
    quality_report["overall_statistics"] = {
        "total_qa_pairs": total_qa_pairs,
        "total_sentences": total_sentences,
        "total_memory_mb": total_memory / 1024**2,
        "banks_with_complete_data": len([
            bank for bank in BANKS
            if bank in sentence_level_datasets and
            len(sentence_level_datasets[bank]) > 0
        ]),
        "models_with_high_readiness": len([
            model for model, stats in quality_report["model_readiness"].items()
            if stats.get("overall_readiness_percentage", 0) >= 80
        ])
    }

    print(f"\n📈 Overall Statistics:")
    print(f"  Total Q&A pairs: {total_qa_pairs:,}")
    print(f"  Total sentences: {total_sentences:,}")
    print(f"  Total memory: {total_memory / 1024**2:.2f} MB")
    print(f"  Banks with complete data: {quality_report['overall_statistics']['banks_with_complete_data']}/{len(BANKS)}")
    print(f"  Models with >80% readiness: {quality_report['overall_statistics']['models_with_high_readiness']}/{len(MODELS)}")

    return quality_report

# Run enhanced quality assessment
enhanced_quality_report = assess_enhanced_data_quality()



ENHANCED DATA QUALITY ASSESSMENT

📊 [JPM] Quality Assessment
----------------------------------------
  Q&A level datasets: 3
  Sentence level datasets: 3
  Quality issues: 6
    - q1_2025: Low finbert_yiyanghkust readiness (17.9%)
    - q1_2025: Low finbert_prosusai readiness (17.9%)
    - q2_2025: Low finbert_yiyanghkust readiness (19.3%)
    - q2_2025: Low finbert_prosusai readiness (19.3%)
    - combined: Low finbert_yiyanghkust readiness (18.8%)
    - combined: Low finbert_prosusai readiness (18.8%)

📊 [HSBC] Quality Assessment
----------------------------------------
  Q&A level datasets: 3
  Sentence level datasets: 3
  Quality issues: 6
    - q1_2025: Low finbert_yiyanghkust readiness (40.3%)
    - q1_2025: Low finbert_prosusai readiness (40.3%)
    - q2_2025: Low finbert_yiyanghkust readiness (34.1%)
    - q2_2025: Low finbert_prosusai readiness (34.1%)
    - combined: Low finbert_yiyanghkust readiness (37.0%)
    - combined: Low finbert_prosusai readiness (37.0%)

🎯 Model Re

In [7]:
## Save Enhanced Clean and Processed Datasets

def save_enhanced_datasets():
    """Save all cleaned and processed datasets with enhanced organization."""
    print(f"\n{'='*60}")
    print("SAVING ENHANCED DATASETS")
    print(f"{'='*60}")

    saved_files = {}

    for bank in BANKS:
        saved_files[bank] = {}

        print(f"\n💾 Saving {bank.upper()} datasets...")

        # Save Q&A level cleaned datasets
        if bank in processed_datasets:
            for dataset_type, df in processed_datasets[bank].items():
                if df is not None:
                    filename = f"clean_{bank}_{dataset_type}_qa_level.csv"

                    # Save to multiple locations
                    drive_path = drive_base / f"data/clean/{bank}" / filename
                    colab_path = colab_base / f"data/clean/{bank}" / filename

                    drive_path.parent.mkdir(parents=True, exist_ok=True)
                    colab_path.parent.mkdir(parents=True, exist_ok=True)

                    df.to_csv(drive_path, index=False)
                    df.to_csv(colab_path, index=False)

                    saved_files[bank][f"qa_{dataset_type}"] = {
                        "drive_path": str(drive_path),
                        "colab_path": str(colab_path),
                        "shape": df.shape,
                        "type": "qa_level"
                    }

                    print(f"    Q&A {dataset_type}: {filename} ({df.shape})")

        # Save sentence level datasets for 4 models
        if bank in sentence_level_datasets:
            for dataset_type, df in sentence_level_datasets[bank].items():
                if df is not None:
                    filename = f"processed_{bank}_{dataset_type}_sentence_level.csv"

                    # Save to processed directory
                    drive_path = drive_base / f"data/processed/{bank}" / filename
                    colab_path = colab_base / f"data/processed/{bank}" / filename

                    drive_path.parent.mkdir(parents=True, exist_ok=True)
                    colab_path.parent.mkdir(parents=True, exist_ok=True)

                    df.to_csv(drive_path, index=False)
                    df.to_csv(colab_path, index=False)

                    saved_files[bank][f"sentence_{dataset_type}"] = {
                        "drive_path": str(drive_path),
                        "colab_path": str(colab_path),
                        "shape": df.shape,
                        "type": "sentence_level"
                    }

                    print(f"    Sentence {dataset_type}: {filename} ({df.shape})")

    return saved_files

# Save all enhanced datasets
saved_files = save_enhanced_datasets()



SAVING ENHANCED DATASETS

💾 Saving JPM datasets...
    Q&A q1_2025: clean_jpm_q1_2025_qa_level.csv ((106, 28))
    Q&A q2_2025: clean_jpm_q2_2025_qa_level.csv ((140, 28))
    Q&A combined: clean_jpm_combined_qa_level.csv ((245, 28))
    Sentence q1_2025: processed_jpm_q1_2025_sentence_level.csv ((313, 21))
    Sentence q2_2025: processed_jpm_q2_2025_sentence_level.csv ((440, 21))
    Sentence combined: processed_jpm_combined_sentence_level.csv ((752, 21))

💾 Saving HSBC datasets...
    Q&A q1_2025: clean_hsbc_q1_2025_qa_level.csv ((49, 29))
    Q&A q2_2025: clean_hsbc_q2_2025_qa_level.csv ((30, 29))
    Q&A combined: clean_hsbc_combined_qa_level.csv ((79, 29))
    Sentence q1_2025: processed_hsbc_q1_2025_sentence_level.csv ((300, 21))
    Sentence q2_2025: processed_hsbc_q2_2025_sentence_level.csv ((340, 21))
    Sentence combined: processed_hsbc_combined_sentence_level.csv ((640, 21))


In [8]:
## Save Enhanced Processing Report

enhanced_processing_report = {
    "processing_timestamp": pd.Timestamp.now().isoformat(),
    "banks_processed": BANKS,
    "models_configured": list(MODELS.keys()),
    "processing_stages": [
        "column_standardization",
        "speaker_role_detection",
        "enhanced_text_cleaning",
        "duplicate_removal",
        "metadata_enhancement",
        "sentence_level_preprocessing",
        "model_readiness_assessment"
    ],
    "datasets_created": saved_files,
    "quality_assessment": enhanced_quality_report,
    "model_readiness_summary": {
        model_key: stats for model_key, stats in enhanced_quality_report["model_readiness"].items()
    },
    "processing_statistics": {
        "total_files_saved": sum(len(bank_files) for bank_files in saved_files.values()),
        "total_qa_pairs": enhanced_quality_report["overall_statistics"]["total_qa_pairs"],
        "total_sentences": enhanced_quality_report["overall_statistics"]["total_sentences"],
        "total_memory_mb": enhanced_quality_report["overall_statistics"]["total_memory_mb"]
    }
}

# Save processing report
report_path = drive_base / "configs" / "enhanced_processing_report.json"
with open(report_path, "w") as f:
    json.dump(enhanced_processing_report, f, indent=2, default=str)

print(f"Enhanced processing report saved: {report_path}")

## Final Summary

print(f"\n{'='*60}")
print("ENHANCED CLEANING AND PREPROCESSING COMPLETE")
print(f"{'='*60}")

# Extract key statistics
total_files = sum(len(bank_files) for bank_files in saved_files.values())
total_qa_pairs = enhanced_quality_report["overall_statistics"]["total_qa_pairs"]
total_sentences = enhanced_quality_report["overall_statistics"]["total_sentences"]
total_memory = enhanced_quality_report["overall_statistics"]["total_memory_mb"]
high_readiness_models = enhanced_quality_report["overall_statistics"]["models_with_high_readiness"]

print(f"📊 Processing Summary:")
print(f"  Banks processed: {len(BANKS)} ({', '.join([b.upper() for b in BANKS])})")
print(f"  Files created: {total_files}")
print(f"  Q&A pairs processed: {total_qa_pairs:,}")
print(f"  Sentences created: {total_sentences:,}")
print(f"  Total memory: {total_memory:.2f} MB")
print(f"  Models with high readiness: {high_readiness_models}/{len(MODELS)}")

print(f"\n🏦 Bank-specific Summary:")
for bank in BANKS:
    if bank in saved_files:
        bank_files = len(saved_files[bank])
        qa_files = len([f for f in saved_files[bank].values() if f["type"] == "qa_level"])
        sentence_files = len([f for f in saved_files[bank].values() if f["type"] == "sentence_level"])

        print(f"  {bank.upper()}:")
        print(f"    Total files: {bank_files}")
        print(f"    Q&A level: {qa_files}")
        print(f"    Sentence level: {sentence_files}")

print(f"\n🎯 Model Readiness:")
for model_key in MODELS.keys():
    if model_key in enhanced_quality_report["model_readiness"]:
        readiness = enhanced_quality_report["model_readiness"][model_key]
        ready_records = readiness["total_ready_records"]
        ready_pct = readiness["overall_readiness_percentage"]
        status = "Ready" if ready_pct >= 80 else "Needs attention"
        print(f"  {model_key}: {ready_records:,} records ({ready_pct:.1f}%) - {status}")

print(f"\n🚀 Next Steps:")
print(f"  1. Run 03b_manual_validation.ipynb for manual label validation")
print(f"  2. Run 04_sentiment_analysis.ipynb for 4-model sentiment analysis")
print(f"  3. Enhanced datasets ready for all {len(MODELS)} models")
print(f"  4. Financial context features integrated for domain-specific analysis")

print(f"\nEnhanced preprocessing complete for multi-bank, multi-model analysis!")

Enhanced processing report saved: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_processing_report.json

ENHANCED CLEANING AND PREPROCESSING COMPLETE
📊 Processing Summary:
  Banks processed: 2 (JPM, HSBC)
  Files created: 12
  Q&A pairs processed: 649
  Sentences created: 2,785
  Total memory: 4.39 MB
  Models with high readiness: 2/4

🏦 Bank-specific Summary:
  JPM:
    Total files: 6
    Q&A level: 3
    Sentence level: 3
  HSBC:
    Total files: 6
    Q&A level: 3
    Sentence level: 3

🎯 Model Readiness:
  finbert_yiyanghkust: 756 records (27.1%) - Needs attention
  finbert_prosusai: 756 records (27.1%) - Needs attention
  distilroberta: 2,785 records (100.0%) - Ready
  cardiffnlp_roberta: 2,785 records (100.0%) - Ready

🚀 Next Steps:
  1. Run 03b_manual_validation.ipynb for manual label validation
  2. Run 04_sentiment_analysis.ipynb for 4-model sentiment analysis
  3. Enhanced datasets ready for all 4 models
  4. Financial context features integrated for domai