In [1]:
# 02_load_data_enhanced.ipynb
# Purpose: Load earnings call transcript data for both JP Morgan and HSBC
# Banks: JP Morgan (JPM) and HSBC
# Quarters: Q1 2025, Q2 2025
# Input: Google Drive CSV files for both banks
# Output: Enhanced raw datasets for both banks and quarters

## Import Libraries

import pandas as pd
import numpy as np
import json
import requests
import io
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Google Colab
from google.colab import drive

# Make Google Drive visible to this Colab runtime
drive.mount("/content/drive")

# Read the project-wide settings from the JSON file kept on Drive
config_path = Path("/content/drive/MyDrive") / "CAM_DS_AI_Project_Enhanced" / "configs" / "enhanced_config.json"
with config_path.open("r") as f:
    enhanced_config = json.load(f)

# Unpack the settings that later cells refer to by name
SEED, BANKS, QUARTERS, MODELS = (
    enhanced_config[key] for key in ("SEED", "BANKS", "QUARTERS", "MODELS")
)
drive_base, colab_base = (
    Path(enhanced_config[key]) for key in ("drive_base", "colab_base")
)
data_urls = enhanced_config["data_urls"]

bank_labels = ", ".join(bank.upper() for bank in BANKS)
print(f"Enhanced data loading for banks: {bank_labels}")
print(f"Quarters: {', '.join(QUARTERS)}")


Mounted at /content/drive
Enhanced data loading for banks: JPM, HSBC
Quarters: q1_2025, q2_2025


In [2]:
## Enhanced Helper Functions

def extract_file_id_from_drive_url(url: str) -> Optional[str]:
    """Extract the file ID from a Google Drive sharing URL.

    Two link shapes are recognised:

    * ``https://drive.google.com/file/d/<id>/view?...`` (the ID ends at the
      next ``/``, ``?`` or ``#``, so a link without a trailing path segment
      no longer drags its query string into the ID);
    * ``https://drive.google.com/open?id=<id>`` / ``.../uc?...&id=<id>``.

    Args:
        url: The sharing URL to parse.

    Returns:
        The file ID, or ``None`` when ``url`` is not a string, is not a
        drive.google.com link, or carries no recognisable ID.
    """
    import re  # local import: only this helper needs it

    if not isinstance(url, str) or "drive.google.com" not in url:
        return None

    match = re.search(r"/file/d/([^/?#]+)", url)
    if match is None:
        # Fall back to the query-string form of the link.
        match = re.search(r"[?&]id=([^&#]+)", url)

    return match.group(1) if match else None

def download_from_drive_enhanced(file_id: str, filename: str, bank_code: str) -> Tuple[Optional[Path], Optional[Path]]:
    """Download one Google Drive file and store it under the bank's raw-data folders.

    The file is written to ``data/raw/<bank_code>/<filename>`` beneath both the
    module-level ``drive_base`` and ``colab_base``; missing directories are
    created first.

    Args:
        file_id: Google Drive file ID.
        filename: Name to give the saved file.
        bank_code: Bank code used as the sub-folder name.

    Returns:
        ``(drive_path, colab_path)`` on success, ``(None, None)`` when the
        download or either write fails (the error is printed, not raised).
    """
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Bank-specific paths
    drive_path = drive_base / f"data/raw/{bank_code}" / filename
    colab_path = colab_base / f"data/raw/{bank_code}" / filename

    # Ensure directories exist
    drive_path.parent.mkdir(parents=True, exist_ok=True)
    colab_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        response = requests.get(download_url, timeout=300)
        response.raise_for_status()
        payload = response.content

        # Drive can answer 200 OK with an HTML page (sign-in or download
        # confirmation) instead of the file. Saving that page as a CSV would
        # look like a success here and only fail later when it is parsed.
        content_type = response.headers.get("Content-Type", "").lower()
        leading_bytes = payload.lstrip()[:15].lower()
        if "text/html" in content_type or leading_bytes.startswith((b"<!doctype html", b"<html")):
            raise ValueError(
                "Google Drive returned an HTML page instead of the file "
                "(check the sharing settings and file size)"
            )

        # Save to both locations
        drive_path.write_bytes(payload)
        colab_path.write_bytes(payload)

        print(f"✅ Downloaded {filename} for {bank_code.upper()}")
        print(f"  Drive: {drive_path}")
        print(f"  Colab: {colab_path}")

        return drive_path, colab_path

    except Exception as e:
        # Best-effort by design: report the problem and let the caller skip this file.
        print(f"❌ Error downloading {filename} for {bank_code.upper()}: {str(e)}")
        return None, None

def read_csv_safe_enhanced(path: Path, encoding='utf-8') -> Optional[pd.DataFrame]:
    """Read a CSV file, falling back through several text encodings.

    The requested ``encoding`` is tried first, then ``cp1252``, then
    ``latin-1``. ``latin-1`` (of which ``iso-8859-1`` is an alias) maps every
    possible byte, so it can never raise ``UnicodeDecodeError``; it therefore
    has to be the last resort, otherwise the stricter ``cp1252`` attempt would
    never run and Windows-encoded punctuation would be mis-decoded.

    Args:
        path: Location of the CSV file.
        encoding: First encoding to try.

    Returns:
        The loaded DataFrame, or ``None`` when the file cannot be read
        (the error is printed, not raised).
    """
    path = Path(path)
    # dict.fromkeys drops duplicates while keeping the order of the attempts.
    encodings_to_try = list(dict.fromkeys([encoding, 'cp1252', 'latin-1']))

    for enc in encodings_to_try:
        try:
            df = pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            # Wrong guess for this file: move on to the next encoding.
            continue
        except Exception as e:
            # Any other failure (missing file, parse error, ...) will not be
            # cured by a different encoding, so stop trying.
            print(f"❌ Error reading {path.name}: {str(e)}")
            break
        else:
            print(f"✅ Loaded {path.name}: {len(df):,} rows × {len(df.columns)} cols (encoding: {enc})")
            return df

    return None


In [3]:
## Download Enhanced Datasets

def download_bank_datasets(bank_code: str) -> Dict[str, Tuple[Path, Path]]:
    """Fetch every configured file for one bank.

    Looks the bank up in the module-level ``data_urls`` and downloads one file
    per entry of ``QUARTERS`` that has a URL, followed by the optional
    ``"manual_labels"`` file.

    Args:
        bank_code: Key of the bank in ``data_urls``.

    Returns:
        Mapping from quarter name (or ``"manual_labels"``) to the
        ``(drive_path, colab_path)`` pair of each file that downloaded
        successfully; failed downloads are left out.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"DOWNLOADING {bank_code.upper()} DATASETS")
    print(f"{banner}")

    bank_urls = data_urls[bank_code]

    # Work list entries: (result key, progress label, target filename, label used in the failure message)
    jobs = [
        (
            quarter,
            f"{quarter.upper()} data",
            f"{bank_code}-{quarter.replace('_', '')}-earnings-call-transcript.csv",
            quarter,
        )
        for quarter in QUARTERS
        if quarter in bank_urls
    ]
    if "manual_labels" in bank_urls:
        jobs.append(("manual_labels", "manual labels", f"{bank_code}_manual_labels.csv", "manual labels"))

    downloaded_files = {}
    for key, progress_label, filename, failure_label in jobs:
        print(f"\nDownloading {bank_code.upper()} {progress_label}...")

        file_id = extract_file_id_from_drive_url(bank_urls[key])
        if not file_id:
            print(f"❌ Could not extract file ID from {failure_label} URL")
            continue

        drive_path, colab_path = download_from_drive_enhanced(file_id, filename, bank_code)
        # Only record the file when both copies were written.
        if drive_path and colab_path:
            downloaded_files[key] = (drive_path, colab_path)

    return downloaded_files

# Fetch the files of every configured bank, keyed by bank code
all_downloaded_files = {bank: download_bank_datasets(bank) for bank in BANKS}



DOWNLOADING JPM DATASETS

Downloading JPM Q1_2025 data...
✅ Downloaded jpm-q12025-earnings-call-transcript.csv for JPM
  Drive: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/raw/jpm/jpm-q12025-earnings-call-transcript.csv
  Colab: /content/cam_ds_ai_enhanced/data/raw/jpm/jpm-q12025-earnings-call-transcript.csv

Downloading JPM Q2_2025 data...
✅ Downloaded jpm-q22025-earnings-call-transcript.csv for JPM
  Drive: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/raw/jpm/jpm-q22025-earnings-call-transcript.csv
  Colab: /content/cam_ds_ai_enhanced/data/raw/jpm/jpm-q22025-earnings-call-transcript.csv

Downloading JPM manual labels...
✅ Downloaded jpm_manual_labels.csv for JPM
  Drive: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/data/raw/jpm/jpm_manual_labels.csv
  Colab: /content/cam_ds_ai_enhanced/data/raw/jpm/jpm_manual_labels.csv

DOWNLOADING HSBC DATASETS

Downloading HSBC Q1_2025 data...
✅ Downloaded hsbc-q12025-earnings-call-transcript.csv for HSBC
  Drive: /conte

In [4]:
## Enhanced Data Loading and Validation

def load_and_validate_datasets():
    """Read every downloaded CSV into a DataFrame and print basic sanity checks.

    Iterates over the module-level ``BANKS`` and ``QUARTERS`` and reads the
    Drive copy of each file recorded in ``all_downloaded_files``. For
    transcripts it prints the shape, the average text length and the number of
    distinct speakers when the matching columns exist; for manual labels it
    prints the distribution of every column whose name contains "label".

    Returns:
        dict: ``{bank: {quarter or "manual_labels": DataFrame}}`` holding only
        the files that were read successfully.
    """
    print(f"\n{'='*60}")
    print("LOADING AND VALIDATING DATASETS")
    print(f"{'='*60}")

    loaded_datasets = {}

    for bank in BANKS:
        print(f"\n📊 Loading {bank.upper()} datasets...")
        loaded_datasets[bank] = {}

        bank_files = all_downloaded_files[bank]

        # Load quarterly data
        for quarter in QUARTERS:
            if quarter not in bank_files:
                continue

            # Only the Drive copy is read; the Colab copy is a mirror of it.
            drive_path, _ = bank_files[quarter]
            if not (drive_path and drive_path.exists()):
                print(f"  ❌ {quarter} file not found")
                continue

            df = read_csv_safe_enhanced(drive_path)
            if df is None:
                print(f"  ❌ Failed to load {quarter} data")
                continue

            loaded_datasets[bank][quarter] = df

            # Basic validation
            print(f"  {quarter.upper()}: {df.shape}")

            # The files do not share one schema: accept either column name, preferring the first.
            text_col = next((col for col in ('text', 'content') if col in df.columns), None)
            if text_col is not None:
                avg_text_length = df[text_col].astype(str).str.len().mean()
                print(f"    Average text length: {avg_text_length:.0f} characters")

            speaker_col = next((col for col in ('speaker', 'speaker_name') if col in df.columns), None)
            if speaker_col is not None:
                unique_speakers = df[speaker_col].nunique()
                print(f"    Unique speakers: {unique_speakers}")

        # Load manual labels if available
        if "manual_labels" in bank_files:
            drive_path, _ = bank_files["manual_labels"]

            if not (drive_path and drive_path.exists()):
                # Report a missing file the same way the quarterly branch does
                # instead of skipping it silently.
                print("  ❌ manual labels file not found")
            else:
                df = read_csv_safe_enhanced(drive_path)
                if df is None:
                    print(f"  ❌ Failed to load manual labels")
                else:
                    loaded_datasets[bank]["manual_labels"] = df
                    print(f"  Manual Labels: {df.shape}")

                    # Validate manual labels structure
                    label_cols = [col for col in df.columns if 'label' in col.lower()]
                    if label_cols:
                        print(f"    Label columns: {label_cols}")

                        # Check label distribution (columns without any value are skipped)
                        for label_col in label_cols:
                            if df[label_col].notna().sum() > 0:
                                dist = df[label_col].value_counts()
                                print(f"    {label_col} distribution: {dist.to_dict()}")

    return loaded_datasets

# Read every file recorded in all_downloaded_files; the result is keyed by bank, then by quarter / "manual_labels"
loaded_datasets = load_and_validate_datasets()



LOADING AND VALIDATING DATASETS

📊 Loading JPM datasets...
✅ Loaded jpm-q12025-earnings-call-transcript.csv: 112 rows × 10 cols (encoding: utf-8)
  Q1_2025: (112, 10)
    Average text length: 518 characters
    Unique speakers: 14
✅ Loaded jpm-q22025-earnings-call-transcript.csv: 149 rows × 10 cols (encoding: utf-8)
  Q2_2025: (149, 10)
    Average text length: 372 characters
    Unique speakers: 17
✅ Loaded jpm_manual_labels.csv: 1,121 rows × 27 cols (encoding: utf-8)
  Manual Labels: (1121, 27)
    Label columns: ['human_label']
    human_label distribution: {'neutral': 19, 'positive': 19, 'negative': 11}

📊 Loading HSBC datasets...
✅ Loaded hsbc-q12025-earnings-call-transcript.csv: 49 rows × 11 cols (encoding: utf-8)
  Q1_2025: (49, 11)
    Average text length: 1121 characters
    Unique speakers: 14
✅ Loaded hsbc-q22025-earnings-call-transcript.csv: 30 rows × 11 cols (encoding: utf-8)
  Q2_2025: (30, 11)
    Average text length: 1714 characters
    Unique speakers: 10
✅ Loaded hsb

In [5]:
## Enhanced Data Exploration

def explore_dataset_enhanced(df: pd.DataFrame, dataset_name: str, bank_code: str):
    """Print a plain-text profile of one dataset, labelled with its bank.

    The report covers shape, column names, per-column dtype and null share,
    memory footprint, length statistics for known free-text columns, the ten
    most frequent values of speaker/role columns, and a preview of the first
    three rows (at most five columns).

    Args:
        df: Dataset to describe; ``None`` only produces an error line.
        dataset_name: Label of the dataset (e.g. a quarter name).
        bank_code: Bank the dataset belongs to.

    Returns:
        None. Everything is written to stdout.
    """
    bank_label = bank_code.upper()
    if df is None:
        print(f"❌ {bank_label} {dataset_name} dataset is None")
        return

    print(f"\n📋 [{bank_label}] {dataset_name.upper()} DATASET EXPLORATION")
    print("-" * 50)

    # Shape and schema
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Per-column dtype and share of missing values
    print(f"\nColumn Types:")
    row_count = len(df)
    for col, dtype in df.dtypes.items():
        null_count = df[col].isnull().sum()
        null_pct = (null_count / row_count) * 100
        print(f"  {col}: {dtype} ({null_count} nulls, {null_pct:.1f}%)")

    # Memory footprint (deep=True also counts the Python strings held by object columns)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory usage: {memory_mb:.2f} MB")

    # Length statistics for free-text columns
    text_names = ('text', 'content', 'question', 'answer', 'transcript')
    text_columns = [col for col in df.columns if col.lower() in text_names]
    for text_col in text_columns:
        if df[text_col].dtype != 'object':
            continue

        text_data = df[text_col].dropna().astype(str)
        if len(text_data) == 0:
            continue

        char_lengths = text_data.str.len()
        word_counts = text_data.str.split().str.len()
        print(f"\n{text_col} Analysis:")
        print(f"  Valid entries: {len(text_data)}")
        print(f"  Avg length: {char_lengths.mean():.0f} chars")
        print(f"  Avg words: {word_counts.mean():.0f} words")
        print(f"  Length range: {char_lengths.min()}-{char_lengths.max()} chars")

        # Show the start of the first non-null entry as a sample
        print(f"  Sample: '{text_data.iloc[0][:100]}...'")

    # Most frequent values of speaker / role columns
    speaker_columns = [
        col for col in df.columns
        if any(tag in col.lower() for tag in ('speaker', 'role'))
    ]
    for speaker_col in speaker_columns:
        top_values = df[speaker_col].value_counts().head(10)
        print(f"\n{speaker_col} Distribution:")
        for speaker, count in top_values.items():
            print(f"  {speaker}: {count}")

    # Preview: first three rows, first five columns at most
    print(f"\nFirst 3 rows:")
    print(df[df.columns[:5]].head(3).to_string())

# Print an exploration report for every frame that was loaded, bank by bank
for bank in BANKS:
    for dataset_name, df in loaded_datasets[bank].items():
        if df is None:
            continue
        explore_dataset_enhanced(df, dataset_name, bank)



📋 [JPM] Q1_2025 DATASET EXPLORATION
--------------------------------------------------
Shape: (112, 10)
Columns: ['section', 'question_number', 'answer_number', 'speaker_name', 'role', 'company', 'content', 'year', 'quarter', 'is_pleasantry']

Column Types:
  section: object (0 nulls, 0.0%)
  question_number: float64 (1 nulls, 0.9%)
  answer_number: float64 (23 nulls, 20.5%)
  speaker_name: object (0 nulls, 0.0%)
  role: object (0 nulls, 0.0%)
  company: object (0 nulls, 0.0%)
  content: object (0 nulls, 0.0%)
  year: int64 (0 nulls, 0.0%)
  quarter: object (0 nulls, 0.0%)
  is_pleasantry: bool (0 nulls, 0.0%)

Memory usage: 0.13 MB

content Analysis:
  Valid entries: 112
  Avg length: 518 chars
  Avg words: 91 words
  Length range: 5-10011 chars
  Sample: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'

speaker_name Distribution:
  Jeremy Barnum: 37
  Jamie Dimon: 31
  Betsy L. Graseck: 7
  John McDonald: 5
  Jim Mitchell: 5
 

In [6]:
## Create Enhanced Combined Datasets

def create_combined_datasets():
    """Stack each bank's quarterly frames into one multi-quarter DataFrame.

    Reads the module-level ``loaded_datasets``, ``BANKS`` and ``QUARTERS``.
    Every quarterly frame is copied and tagged with ``quarter`` and
    ``bank_code`` columns before the frames are concatenated; columns that are
    missing from some quarters are added to those frames as ``None`` first.

    Returns:
        dict: ``{bank: combined DataFrame}`` for every bank that has at least
        one quarterly frame.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print("CREATING ENHANCED COMBINED DATASETS")
    print(f"{rule}")

    combined_datasets = {}

    for bank in BANKS:
        print(f"\n🔄 Creating combined dataset for {bank.upper()}...")

        bank_data = loaded_datasets[bank]

        # Collect a tagged copy of every quarter that was loaded
        quarterly_dfs = []
        for quarter in QUARTERS:
            if quarter not in bank_data or bank_data[quarter] is None:
                continue
            tagged = bank_data[quarter].copy()
            # Note: this replaces any 'quarter' column already present in the file.
            tagged['quarter'] = quarter
            tagged['bank_code'] = bank
            quarterly_dfs.append(tagged)
            print(f"  Added {quarter}: {tagged.shape}")

        if not quarterly_dfs:
            print(f"  ❌ No quarterly data found for {bank.upper()}")
            continue

        # Compare the schemas of the quarters
        column_sets = [set(frame.columns) for frame in quarterly_dfs]
        common_columns = set.intersection(*column_sets)
        all_unique_columns = set.union(*column_sets)

        print(f"  Common columns: {len(common_columns)}")
        print(f"  All unique columns: {len(all_unique_columns)}")

        column_differences = all_unique_columns - common_columns
        if column_differences:
            print(f"  Column differences: {column_differences}")

            # Give every frame the full set of columns before stacking
            for frame in quarterly_dfs:
                for col in column_differences:
                    if col not in frame.columns:
                        frame[col] = None

        # Stack the quarters
        combined_df = pd.concat(quarterly_dfs, ignore_index=True, sort=False)
        combined_datasets[bank] = combined_df

        print(f"  ✅ Combined dataset shape: {combined_df.shape}")

        # Show how many rows each quarter contributed
        quarter_dist = combined_df['quarter'].value_counts()
        print(f"  Quarter distribution: {quarter_dist.to_dict()}")

    return combined_datasets

# Build one concatenated DataFrame per bank from its quarterly frames (result keyed by bank code)
combined_datasets = create_combined_datasets()



CREATING ENHANCED COMBINED DATASETS

🔄 Creating combined dataset for JPM...
  Added q1_2025: (112, 11)
  Added q2_2025: (149, 11)
  Common columns: 11
  All unique columns: 11
  ✅ Combined dataset shape: (261, 11)
  Quarter distribution: {'q2_2025': 149, 'q1_2025': 112}

🔄 Creating combined dataset for HSBC...
  Added q1_2025: (49, 12)
  Added q2_2025: (30, 12)
  Common columns: 12
  All unique columns: 12
  ✅ Combined dataset shape: (79, 12)
  Quarter distribution: {'q1_2025': 49, 'q2_2025': 30}


In [7]:
## Enhanced Data Quality Assessment

def assess_data_quality_enhanced():
    """Summarise completeness and basic quality of every loaded dataset.

    Reads the module-level ``loaded_datasets``, ``BANKS`` and ``QUARTERS``.
    For each quarterly frame it counts records and memory, flags frames with
    more than 20% missing cells and frames without a text or speaker column;
    for manual labels it counts the non-null entries of the first column whose
    name contains "label". A summary is printed along the way.

    All counters are stored as plain Python numbers so that the report
    serialises to JSON as numbers rather than falling back to strings.

    Returns:
        dict: report with the keys ``banks_loaded``, ``bank_details``,
        ``overall_statistics`` and ``data_quality_flags``.
    """
    print(f"\n{'='*60}")
    print("ENHANCED DATA QUALITY ASSESSMENT")
    print(f"{'='*60}")

    quality_report = {
        "banks_loaded": len(loaded_datasets),
        "bank_details": {},
        "overall_statistics": {},
        "data_quality_flags": []
    }

    total_records = 0
    total_memory = 0

    for bank in BANKS:
        bank_stats = {
            "quarters_loaded": 0,
            "total_records": 0,
            "has_manual_labels": False,
            "data_issues": []
        }

        print(f"\n📊 [{bank.upper()}] Data Quality Assessment")
        print("-" * 40)

        if bank in loaded_datasets:
            bank_data = loaded_datasets[bank]

            # Check quarterly data
            for quarter in QUARTERS:
                if quarter in bank_data and bank_data[quarter] is not None:
                    df = bank_data[quarter]
                    bank_stats["quarters_loaded"] += 1
                    bank_stats["total_records"] += len(df)
                    total_records += len(df)
                    # int(): pandas returns a numpy integer here; keep the total a Python int
                    total_memory += int(df.memory_usage(deep=True).sum())

                    # Share of missing cells; an empty frame has no cells, so report 0% instead of dividing by zero
                    total_cells = df.size
                    missing_cells = int(df.isnull().sum().sum())
                    missing_pct = (missing_cells / total_cells) * 100 if total_cells else 0.0
                    if missing_pct > 20:
                        bank_stats["data_issues"].append(f"{quarter}: High missing data ({missing_pct:.1f}%)")

                    # Check for essential columns (either spelling of each is accepted)
                    has_text_col = any(col in df.columns for col in ('text', 'content'))
                    has_speaker_col = any(col in df.columns for col in ('speaker', 'speaker_name'))

                    if not has_text_col:
                        bank_stats["data_issues"].append(f"{quarter}: Missing text column")
                    if not has_speaker_col:
                        bank_stats["data_issues"].append(f"{quarter}: Missing speaker column")

                    print(f"  {quarter}: {len(df)} records, {missing_pct:.1f}% missing data")

            # Check manual labels
            if "manual_labels" in bank_data and bank_data["manual_labels"] is not None:
                bank_stats["has_manual_labels"] = True
                manual_df = bank_data["manual_labels"]

                # Count labeled records (first column whose name contains "label")
                label_cols = [col for col in manual_df.columns if 'label' in col.lower()]
                if label_cols:
                    # int(): a numpy integer is not JSON-serialisable, so a json.dump(..., default=str)
                    # of this report would otherwise store the count as a string.
                    labeled_count = int(manual_df[label_cols[0]].notna().sum())
                    print(f"  Manual labels: {labeled_count} labeled records")
                    bank_stats["labeled_records"] = labeled_count

        quality_report["bank_details"][bank] = bank_stats

        # Print bank summary
        print(f"  Quarters loaded: {bank_stats['quarters_loaded']}/{len(QUARTERS)}")
        print(f"  Total records: {bank_stats['total_records']:,}")
        print(f"  Has manual labels: {bank_stats['has_manual_labels']}")

        if bank_stats["data_issues"]:
            print(f"  Issues found: {len(bank_stats['data_issues'])}")
            for issue in bank_stats["data_issues"]:
                print(f"    - {issue}")
        else:
            print(f"  ✅ No major data quality issues")

    # Overall statistics
    all_bank_stats = list(quality_report["bank_details"].values())
    quality_report["overall_statistics"] = {
        "total_records": total_records,
        "total_memory_mb": total_memory / 1024**2,
        "banks_with_all_quarters": sum(1 for stats in all_bank_stats
                                      if stats["quarters_loaded"] == len(QUARTERS)),
        "banks_with_manual_labels": sum(1 for stats in all_bank_stats
                                       if stats["has_manual_labels"])
    }

    print(f"\n📈 Overall Statistics:")
    print(f"  Total records: {total_records:,}")
    print(f"  Total memory: {total_memory / 1024**2:.2f} MB")
    print(f"  Banks with all quarters: {quality_report['overall_statistics']['banks_with_all_quarters']}/{len(BANKS)}")
    print(f"  Banks with manual labels: {quality_report['overall_statistics']['banks_with_manual_labels']}/{len(BANKS)}")

    return quality_report

# Run the quality checks over loaded_datasets; the returned report is stored in the data registry and read by the final summary
quality_report = assess_data_quality_enhanced()



ENHANCED DATA QUALITY ASSESSMENT

📊 [JPM] Data Quality Assessment
----------------------------------------
  q1_2025: 112 records, 2.1% missing data
  q2_2025: 149 records, 3.0% missing data
  Manual labels: 49 labeled records
  Quarters loaded: 2/2
  Total records: 261
  Has manual labels: True
  ✅ No major data quality issues

📊 [HSBC] Data Quality Assessment
----------------------------------------
  q1_2025: 49 records, 10.2% missing data
  q2_2025: 30 records, 14.5% missing data
  Manual labels: 69 labeled records
  Quarters loaded: 2/2
  Total records: 79
  Has manual labels: True
  ✅ No major data quality issues

📈 Overall Statistics:
  Total records: 340
  Total memory: 0.50 MB
  Banks with all quarters: 2/2
  Banks with manual labels: 2/2


In [8]:
## Save Enhanced Raw Datasets

def _save_frame_to_both_locations(df: pd.DataFrame, bank: str, filename: str) -> Dict[str, object]:
    """Write ``df`` as CSV under ``data/raw/<bank>/`` in both the Drive and Colab trees.

    Returns a record with the two paths (as strings) and the frame's shape.
    """
    drive_path = drive_base / f"data/raw/{bank}" / filename
    colab_path = colab_base / f"data/raw/{bank}" / filename

    for target in (drive_path, colab_path):
        # Do not rely on an earlier step having created the folder.
        target.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(target, index=False)

    return {
        "drive_path": str(drive_path),
        "colab_path": str(colab_path),
        "shape": df.shape
    }


def save_enhanced_datasets():
    """Save all loaded datasets with enhanced organization.

    Writes every quarterly frame and manual-label frame found in the
    module-level ``loaded_datasets``, plus every frame in
    ``combined_datasets``, to both ``drive_base`` and ``colab_base``.

    Returns:
        dict: ``{bank: {quarter | "manual_labels" | "combined": record}}``
        where each record holds ``drive_path``, ``colab_path`` and ``shape``.
        Every bank in ``BANKS`` gets an entry, empty when nothing was saved.
    """
    print(f"\n{'='*60}")
    print("SAVING ENHANCED DATASETS")
    print(f"{'='*60}")

    saved_files = {}

    # Save individual quarterly datasets
    for bank in BANKS:
        print(f"\n💾 Saving {bank.upper()} datasets...")
        saved_files[bank] = {}

        if bank not in loaded_datasets:
            continue
        bank_data = loaded_datasets[bank]

        # Save quarterly datasets
        for quarter in QUARTERS:
            df = bank_data.get(quarter)
            if df is None:
                continue
            filename = f"raw_{bank}_{quarter}_earnings_call.csv"
            saved_files[bank][quarter] = _save_frame_to_both_locations(df, bank, filename)
            print(f"  ✅ {quarter}: {filename} ({df.shape})")

        # Save manual labels
        df = bank_data.get("manual_labels")
        if df is not None:
            filename = f"raw_{bank}_manual_labels.csv"
            saved_files[bank]["manual_labels"] = _save_frame_to_both_locations(df, bank, filename)
            print(f"  ✅ Manual labels: {filename} ({df.shape})")

    # Save combined datasets
    print(f"\n💾 Saving combined datasets...")
    for bank in BANKS:
        if bank in combined_datasets:
            df = combined_datasets[bank]
            filename = f"raw_{bank}_multi_2025_earnings_call.csv"
            # saved_files[bank] always exists: the loop above created it for every bank.
            saved_files[bank]["combined"] = _save_frame_to_both_locations(df, bank, filename)
            print(f"  ✅ {bank.upper()} combined: {filename} ({df.shape})")

    return saved_files

# Write the per-quarter, manual-label and combined CSVs to both Drive and Colab; the result records each file's paths and shape per bank
saved_files = save_enhanced_datasets()



SAVING ENHANCED DATASETS

💾 Saving JPM datasets...
  ✅ q1_2025: raw_jpm_q1_2025_earnings_call.csv ((112, 10))
  ✅ q2_2025: raw_jpm_q2_2025_earnings_call.csv ((149, 10))
  ✅ Manual labels: raw_jpm_manual_labels.csv ((1121, 27))

💾 Saving HSBC datasets...
  ✅ q1_2025: raw_hsbc_q1_2025_earnings_call.csv ((49, 11))
  ✅ q2_2025: raw_hsbc_q2_2025_earnings_call.csv ((30, 11))
  ✅ Manual labels: raw_hsbc_manual_labels.csv ((858, 27))

💾 Saving combined datasets...
  ✅ JPM combined: raw_jpm_multi_2025_earnings_call.csv ((261, 11))
  ✅ HSBC combined: raw_hsbc_multi_2025_earnings_call.csv ((79, 12))


In [9]:
## Create Enhanced Data Registry

# Headline counts derived from what was actually written to disk
loading_summary = {
    "total_files_saved": sum(map(len, saved_files.values())),
    "banks_successfully_loaded": sum(1 for bank in BANKS if saved_files.get(bank)),
    "quarters_loaded_per_bank": {
        bank: sum(1 for q in QUARTERS if q in saved_files.get(bank, {}))
        for bank in BANKS
    }
}

# Everything a later step needs to locate and interpret the raw data
enhanced_data_registry = {
    "creation_timestamp": pd.Timestamp.now().isoformat(),
    "banks": BANKS,
    "quarters": QUARTERS,
    "models_configured": list(MODELS.keys()),
    "saved_files": saved_files,
    "quality_report": quality_report,
    "data_urls": data_urls,
    "loading_summary": loading_summary
}

# Persist the registry next to the other config files on Drive
registry_path = drive_base / "configs" / "enhanced_data_registry.json"
with registry_path.open("w") as f:
    # default=str: values json cannot encode natively are written as their string form
    json.dump(enhanced_data_registry, f, indent=2, default=str)

print(f"✅ Enhanced data registry saved: {registry_path}")


✅ Enhanced data registry saved: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_data_registry.json


In [10]:
## Final Summary

rule = '=' * 60
print(f"\n{rule}")
print("ENHANCED DATA LOADING COMPLETE")
print(f"{rule}")

# Headline numbers, taken from the save results and the quality report
overall = quality_report["overall_statistics"]
total_files = sum(map(len, saved_files.values()))
successful_banks = sum(1 for bank in BANKS if saved_files.get(bank))
total_records = overall["total_records"]
total_memory = overall["total_memory_mb"]

print(f"📊 Loading Summary:")
print(f"  Banks processed: {successful_banks}/{len(BANKS)}")
print(f"  Files saved: {total_files}")
print(f"  Total records: {total_records:,}")
print(f"  Total memory: {total_memory:.2f} MB")

# One short status block per bank that has at least one saved file
print(f"\n🏦 Bank-specific Summary:")
for bank in BANKS:
    bank_saved = saved_files.get(bank)
    if not bank_saved:
        continue

    bank_quarters = sum(1 for q in QUARTERS if q in bank_saved)
    manual_mark = '✅' if "manual_labels" in bank_saved else '❌'
    combined_mark = '✅' if "combined" in bank_saved else '❌'

    print(f"  {bank.upper()}:")
    print(f"    Quarters: {bank_quarters}/{len(QUARTERS)}")
    print(f"    Manual labels: {manual_mark}")
    print(f"    Combined dataset: {combined_mark}")

print(f"\n🚀 Ready for next step: 03_clean_preprocess_enhanced.ipynb")
print(f"   Enhanced datasets loaded for {len(BANKS)} banks")
print(f"   {len(MODELS)} models configured for analysis")


ENHANCED DATA LOADING COMPLETE
📊 Loading Summary:
  Banks processed: 2/2
  Files saved: 8
  Total records: 340
  Total memory: 0.50 MB

🏦 Bank-specific Summary:
  JPM:
    Quarters: 2/2
    Manual labels: ✅
    Combined dataset: ✅
  HSBC:
    Quarters: 2/2
    Manual labels: ✅
    Combined dataset: ✅

🚀 Ready for next step: 03_clean_preprocess_enhanced.ipynb
   Enhanced datasets loaded for 2 banks
   4 models configured for analysis
