In [None]:
# FIXED 03_clean_preprocess.ipynb
# Purpose: Clean and preprocess JPM earnings call transcript data
# Input: raw_jpm_q1_2025_df, raw_jpm_q2_2025_df, raw_jpm_multi_2025_df
# Output: clean_jpm_q1_2025_df, clean_jpm_q2_2025_df, clean_jpm_multi_2025_df

## Import Libraries

import pandas as pd
import numpy as np
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Location A: Google Drive (Primary drive)
# Drive must be mounted before anything else: the config file opened below
# lives under the /content/drive mount point.
from google.colab import drive
drive.mount("/content/drive")


# Load configuration and data registry
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r") as f:
    config = json.load(f)

# The registry file sits directly under the Drive base directory named in the config.
registry_path = Path(config["drive_base"]) / "data_registry.json"
with open(registry_path, "r") as f:
    data_registry = json.load(f)

# Settings pulled out of the config dict for use by later cells.
# NOTE(review): SEED is read here but not referenced in the cells visible in this notebook.
SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

print(f"Cleaning and preprocessing data for bank: {BANK_CODE.upper()}")


Mounted at /content/drive
Cleaning and preprocessing data for bank: JPM


In [None]:
## Define Paths

# Per-stage data directories for the JPM transcripts, all rooted at the Drive base.
raw_data_path = drive_base / "data/raw/jpm"
clean_data_path = drive_base / "data/clean/jpm"
processed_data_path = drive_base / "data/processed/jpm"

# Only the two output directories are created; the raw directory is left as-is.
for output_dir in (clean_data_path, processed_data_path):
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
## Load Raw Data

def load_raw_dataset(dataset_key):
    """Load one raw dataset from the CSV path recorded in ``data_registry``.

    Every failure mode is reported with a printed message and a ``None``
    return, so callers only need a single ``is None`` check.

    Args:
        dataset_key: Key into the module-level ``data_registry`` dict. The
            entry's ``"loaded"``, ``"path"`` and ``"shape"`` fields are read.

    Returns:
        The DataFrame produced by ``pd.read_csv``, or ``None`` when the key is
        not registered, the entry is not flagged as loaded, no path is
        recorded, or the file does not exist.
    """
    entry = data_registry.get(dataset_key)
    if entry is None:
        # An unknown key used to raise KeyError while every other failure
        # returned None; report it the same way as the other failures.
        print(f"❌ {dataset_key} is not present in the data registry")
        return None

    if not entry.get("loaded"):
        print(f"❌ {dataset_key} was not successfully loaded in previous step")
        return None

    recorded_path = entry.get("path")
    if not recorded_path:
        print(f"❌ {dataset_key} has no file path recorded in the data registry")
        return None

    file_path = Path(recorded_path)
    if not file_path.exists():
        print(f"❌ {dataset_key} file not found: {file_path}")
        return None

    df = pd.read_csv(file_path)
    expected_shape = entry.get("shape")
    print(f"✓ Loaded {dataset_key}: {df.shape} (expected: {expected_shape})")

    # The registry stores the shape as a list while df.shape is a tuple, so
    # normalise both before comparing. A mismatch is only a warning: the data
    # is still returned for the caller to inspect.
    if expected_shape is not None and list(df.shape) != list(expected_shape):
        print(f"⚠️  {dataset_key} shape {df.shape} differs from registry shape {expected_shape}")

    return df

print("Loading raw datasets...")
# The registry keys match the variable names; datasets are loaded in this order.
raw_jpm_q1_2025_df, raw_jpm_q2_2025_df, raw_jpm_multi_2025_df = [
    load_raw_dataset(registry_key)
    for registry_key in (
        "raw_jpm_q1_2025_df",
        "raw_jpm_q2_2025_df",
        "raw_jpm_multi_2025_df",
    )
]


Loading raw datasets...
✓ Loaded raw_jpm_q1_2025_df: (112, 10) (expected: [112, 10])
✓ Loaded raw_jpm_q2_2025_df: (149, 10) (expected: [149, 10])
✓ Loaded raw_jpm_multi_2025_df: (261, 10) (expected: [261, 10])


In [None]:
## FIXED Data Cleaning Functions

def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names to snake_case and map them to the expected names.

    Each name is lower-cased, stripped of every character that is not a word
    character or whitespace, and has whitespace runs turned into a single
    underscore. Afterwards ``content`` → ``text``, ``speaker_name`` →
    ``speaker`` and ``role`` → ``original_role`` are applied.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with renamed columns. A mapping is skipped (with a
        printed warning) when its target name already exists, so the result
        never gains duplicate column labels from the mapping step.
    """
    df = df.copy()

    # Convert to snake_case
    new_columns = {}
    for col in df.columns:
        # Remove special characters and convert to lowercase
        clean_col = re.sub(r'[^\w\s]', '', str(col).lower())
        # Replace spaces with underscores
        clean_col = re.sub(r'\s+', '_', clean_col.strip())
        # Remove multiple underscores
        clean_col = re.sub(r'_+', '_', clean_col)
        new_columns[col] = clean_col

    df = df.rename(columns=new_columns)

    # Map column names to the names the rest of the pipeline expects
    column_mapping = {
        'content': 'text',  # Map 'content' to 'text'
        'speaker_name': 'speaker',  # Map 'speaker_name' to 'speaker'
        'role': 'original_role'  # Keep original role info
    }

    # Apply mapping if columns exist
    for old_col, new_col in column_mapping.items():
        if old_col not in df.columns:
            continue
        if new_col in df.columns:
            # Renaming onto an existing label would create two columns with the
            # same name, and df[new_col] would then return a DataFrame instead
            # of a Series. Keep the existing target column and leave the source.
            print(f"⚠️  Skipped mapping '{old_col}' → '{new_col}': '{new_col}' already exists")
            continue
        df = df.rename(columns={old_col: new_col})
        print(f"✓ Mapped '{old_col}' → '{new_col}'")

    print(f"✓ Column names standardized: {list(df.columns)}")

    return df

def clean_speaker_names(df: pd.DataFrame, speaker_col: str = 'speaker') -> pd.DataFrame:
    """Normalise speaker names and derive a coarse role label for each row.

    The speaker column (and ``original_role`` when present) is lower-cased and
    stripped, then scanned with a prioritised list of regex patterns. The
    result is written to ``f'{speaker_col}_role'``; rows matching nothing keep
    the default ``'other'``.

    Priority rules:
        * Patterns are applied from lowest to highest priority and a later
          match overwrites an earlier one.
        * The generic ``management`` / ``executive`` patterns come first, so a
          specific title such as "chief executive officer" resolves to
          ``ceo`` rather than being overwritten by ``executive``.
        * Matches found in ``original_role`` overwrite matches found in the
          speaker name.

    Args:
        df: Input frame; it is not modified.
        speaker_col: Name of the column holding speaker names.

    Returns:
        A copy of ``df`` with the normalised columns and the new role column,
        or ``df`` itself (unchanged) when ``speaker_col`` is missing.
    """
    if speaker_col not in df.columns:
        print(f"⚠️  Speaker column '{speaker_col}' not found. Available: {list(df.columns)}")
        return df

    df = df.copy()
    role_col = f'{speaker_col}_role'

    # (pattern, role) pairs ordered from lowest to highest priority.
    # The three-letter acronyms use word boundaries so that surnames such as
    # "Cooper" or "Cook" are not mistaken for a COO.
    role_patterns = [
        # Generic fallbacks (lowest priority)
        (r'management', 'executive'),
        (r'executive', 'executive'),

        # Analyst patterns
        (r'analyst', 'analyst'),
        (r'research', 'analyst'),

        # Executive patterns
        (r'\bceo\b', 'ceo'),
        (r'chief executive', 'ceo'),
        (r'\bcfo\b', 'cfo'),
        (r'chief financial', 'cfo'),
        (r'\bcoo\b', 'coo'),
        (r'president', 'president'),
        (r'chairman', 'chairman'),

        # Operator patterns
        (r'operator', 'operator'),
        (r'moderator', 'operator'),
    ]

    # Clean speaker names (missing values become the literal string 'nan')
    df[speaker_col] = df[speaker_col].astype(str).str.lower().str.strip()

    # Columns to scan, in increasing order of authority
    source_cols = [speaker_col]
    if 'original_role' in df.columns and speaker_col != 'original_role':
        df['original_role'] = df['original_role'].astype(str).str.lower().str.strip()
        source_cols.append('original_role')

    df[role_col] = 'other'  # Default value

    for source_col in source_cols:
        for pattern, role in role_patterns:
            mask = df[source_col].str.contains(pattern, regex=True, na=False)
            df.loc[mask, role_col] = role

    print(f"✓ Speaker roles assigned:")
    for role, count in df[role_col].value_counts().items():
        print(f"  {role}: {count}")

    return df

def clean_text_content(df: pd.DataFrame, text_col: str = 'text') -> pd.DataFrame:
    """Clean transcript text and add length statistics.

    Adds three columns derived from ``text_col``:
        * ``{text_col}_clean`` – text with ``[...]`` and ``(...)`` spans
          removed, runs of two or more dashes replaced by a space, whitespace
          collapsed and trimmed; empty results become ``pd.NA``.
        * ``{text_col}_length`` – character count of the cleaned text.
        * ``{text_col}_word_count`` – whitespace-separated token count.

    ``text_col`` itself is rewritten as strings with missing values replaced
    by ``''``. No rows are dropped here; short texts are only counted.

    Args:
        df: Input frame; it is not modified.
        text_col: Name of the column holding the raw text.

    Returns:
        A copy of ``df`` with the added columns, or ``df`` itself (unchanged)
        when ``text_col`` is missing.
    """
    if text_col not in df.columns:
        print(f"⚠️  Text column '{text_col}' not found. Available: {list(df.columns)}")
        return df

    df = df.copy()
    clean_col = f'{text_col}_clean'
    length_col = f'{text_col}_length'
    word_count_col = f'{text_col}_word_count'

    print(f"Cleaning text in column: {text_col}")

    # Blank out missing values BEFORE casting to str. Casting first turns
    # NaN/None into the literal strings 'nan'/'None', after which there is
    # nothing left for a fill to replace and the placeholders leak into the text.
    raw_text = df[text_col]
    df[text_col] = raw_text.astype(object).where(raw_text.notna(), '').astype(str)

    # Remove or replace common artifacts
    df[clean_col] = (
        df[text_col]
        .str.replace(r'\[.*?\]', '', regex=True)   # Remove [brackets]
        .str.replace(r'\(.*?\)', '', regex=True)   # Remove (parentheses)
        .str.replace(r'--+', ' ', regex=True)      # Replace dashes
        .str.replace(r'\s+', ' ', regex=True)      # Multiple spaces
        .str.strip()
        .replace('', pd.NA)                        # Empty strings count as missing
    )

    # Calculate text statistics
    df[length_col] = df[clean_col].str.len()
    df[word_count_col] = df[clean_col].str.split().str.len()

    # Count very short texts (likely artifacts); missing texts are not counted here
    min_length = 10
    short_text_mask = df[length_col] < min_length
    print(f"  Texts shorter than {min_length} chars: {short_text_mask.sum()}")

    # Calculate cleaning stats
    original_lengths = df[text_col].str.len()
    clean_lengths = df[length_col].fillna(0)
    avg_reduction = (original_lengths - clean_lengths).mean()
    print(f"  Average length reduction: {avg_reduction:.1f} characters")

    # Show sample of cleaned text
    valid_text_mask = df[clean_col].notna() & (df[length_col] >= min_length)
    if valid_text_mask.sum() > 0:
        sample_text = df.loc[valid_text_mask, clean_col].iloc[0]
        print(f"  Sample cleaned text: '{sample_text[:100]}...'")
        print(f"  Valid texts for processing: {valid_text_mask.sum()}/{len(df)}")
    else:
        print(f"  ❌ No valid texts found after cleaning!")

    return df

def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate rows, keeping the first occurrence.

    Two passes are made: exact duplicates across all columns, then (when a
    ``text_clean`` column exists) rows repeating an earlier ``text_clean``
    value regardless of the other columns.

    Args:
        df: Input frame; it is not modified.

    Returns:
        The de-duplicated frame with the surviving rows' original index labels.
    """
    original_count = len(df)

    # Remove exact duplicates
    df_dedup = df.drop_duplicates()

    # Remove duplicates based on text content if available
    if 'text_clean' in df.columns:
        df_dedup = df_dedup.drop_duplicates(subset=['text_clean'])

    removed_count = original_count - len(df_dedup)
    # Guard the percentage so an empty input reports 0.0% instead of dividing by zero.
    removed_pct = removed_count / original_count * 100 if original_count else 0.0
    print(f"✓ Removed {removed_count} duplicate entries ({removed_pct:.1f}%)")

    return df_dedup

def add_metadata_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Attach bookkeeping columns used by the later analysis steps.

    Columns written, in order: ``qa_id`` (0..n-1 by row position),
    ``timestamp`` (current time, only when the column is absent),
    ``bank_code`` (module-level ``BANK_CODE``) and ``is_valid``.

    A row is flagged invalid only when ``text_clean`` exists and is missing or
    shorter than 10 characters. The speaker role never affects validity, so
    rows labelled 'other' stay valid.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the metadata columns added.
    """
    enriched = df.copy()

    # Sequential row identifier
    enriched['qa_id'] = range(len(enriched))

    # Keep an existing timestamp column untouched
    if 'timestamp' not in enriched.columns:
        enriched['timestamp'] = pd.Timestamp.now()

    enriched['bank_code'] = BANK_CODE

    # Everything starts valid; text problems switch the flag off below
    enriched['is_valid'] = True
    if 'text_clean' in enriched.columns:
        cleaned_text = enriched['text_clean']
        unusable_text = cleaned_text.isna() | (cleaned_text.str.len() < 10)
        enriched.loc[unusable_text, 'is_valid'] = False

    invalid_count = (~enriched['is_valid']).sum()
    print(f"✓ Added metadata columns, marked {invalid_count} entries as invalid")

    return enriched

In [None]:
## Clean Individual Datasets

def clean_dataset(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Run the full cleaning pipeline on one dataset.

    Steps, in order: column-name standardisation, speaker normalisation,
    text cleaning, duplicate removal, metadata columns, and finally dropping
    every row whose ``is_valid`` flag is False.

    Args:
        df: Raw frame, or ``None`` when loading failed upstream.
        dataset_name: Label used in the printed progress report.

    Returns:
        A copy containing only the valid rows, or ``None`` when ``df`` is ``None``.
    """
    if df is None:
        print(f"❌ Cannot clean {dataset_name} - dataset is None")
        return None

    print(f"\n🧹 CLEANING {dataset_name}")
    print("-" * 40)
    print(f"Input shape: {df.shape}")
    print(f"Input columns: {list(df.columns)}")

    # Steps 1-5: each stage takes the previous stage's frame and returns a new one
    pipeline = (
        clean_column_names,
        clean_speaker_names,
        clean_text_content,
        remove_duplicates,
        add_metadata_columns,
    )
    staged = df
    for stage in pipeline:
        staged = stage(staged)

    # Step 6: keep only the rows that passed the validity check
    valid_df = staged[staged['is_valid']].copy()
    invalid_count = len(staged) - len(valid_df)

    print(f"Final shape: {valid_df.shape} (removed {invalid_count} invalid entries)")

    return valid_df

# Clean all datasets with enhanced debugging
clean_jpm_q1_2025_df, clean_jpm_q2_2025_df, clean_jpm_multi_2025_df = [
    clean_dataset(raw_frame, label)
    for raw_frame, label in (
        (raw_jpm_q1_2025_df, "Q1 2025"),
        (raw_jpm_q2_2025_df, "Q2 2025"),
        (raw_jpm_multi_2025_df, "Multi 2025"),
    )
]


🧹 CLEANING Q1 2025
----------------------------------------
Input shape: (112, 10)
Input columns: ['section', 'question_number', 'answer_number', 'speaker_name', 'role', 'company', 'content', 'year', 'quarter', 'is_pleasantry']
✓ Mapped 'content' → 'text'
✓ Mapped 'speaker_name' → 'speaker'
✓ Mapped 'role' → 'original_role'
✓ Column names standardized: ['section', 'question_number', 'answer_number', 'speaker', 'original_role', 'company', 'text', 'year', 'quarter', 'is_pleasantry']
✓ Speaker roles assigned:
  analyst: 44
  cfo: 37
  executive: 31
Cleaning text in column: text
  Texts shorter than 10 chars: 2
  Average length reduction: 1.0 characters
  Sample cleaned text: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'
  Valid texts for processing: 110/112
✓ Removed 1 duplicate entries (0.9%)
✓ Added metadata columns, marked 2 entries as invalid
Final shape: (109, 18) (removed 2 invalid entries)

🧹 CLEANING Q2 2025
------------

In [None]:
## FIXED Preprocessing for Sentiment Analysis

def preprocess_for_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Split each cleaned Q&A text into sentence-level rows for sentiment analysis.

    Sentences are cut at runs of ``.``, ``!`` or ``?`` that are followed by
    whitespace or the end of the text. A period inside a number such as
    "$14.6 billion" is therefore NOT treated as a sentence boundary. The
    terminators themselves are dropped, and only fragments longer than 20
    characters are kept.

    Args:
        df: Cleaned Q&A-level frame containing ``text_clean``, or ``None``.

    Returns:
        * ``None`` when ``df`` is ``None``.
        * An empty, column-less DataFrame when ``text_clean`` is missing.
        * Otherwise a frame with one row per kept sentence and the columns
          ``original_qa_id``, ``sentence_id``, ``text``, ``speaker``,
          ``speaker_role``, ``quarter``, ``bank_code``, ``sentence_length``
          and ``sentence_word_count``. These columns are present even when no
          sentence was produced.
    """
    if df is None:
        print("❌ Cannot preprocess - dataset is None")
        return None

    print(f"\n🔄 PREPROCESSING FOR SENTIMENT ANALYSIS")
    print(f"Input shape: {df.shape}")
    print(f"Available columns: {list(df.columns)}")

    # Check if text_clean column exists
    if 'text_clean' not in df.columns:
        print("❌ 'text_clean' column not found - cannot proceed with sentence splitting")
        return pd.DataFrame()  # Return empty DataFrame instead of None

    sentence_columns = [
        'original_qa_id', 'sentence_id', 'text', 'speaker', 'speaker_role',
        'quarter', 'bank_code', 'sentence_length', 'sentence_word_count',
    ]

    # A terminator only ends a sentence when whitespace or end-of-text follows;
    # splitting on every '.' would cut figures like "14.6" or "8.5%" in half.
    sentence_boundary = re.compile(r'[.!?]+(?=\s|$)')

    # Only fall back to the module-level BANK_CODE when the frame has no
    # bank_code column of its own.
    has_bank_code = 'bank_code' in df.columns

    sentences = []
    processed_count = 0

    for idx, row in df.iterrows():
        text_content = row['text_clean']

        # Skip rows without usable text
        if pd.isna(text_content):
            continue

        text_content = str(text_content).strip()
        if len(text_content) < 10:  # Also covers '' and the 'nan' placeholder
            continue

        qa_id = row.get('qa_id', idx)
        kept_for_row = 0

        # sent_idx counts every fragment, including skipped ones, so the ids
        # reflect the sentence's position in the original text.
        for sent_idx, sentence in enumerate(sentence_boundary.split(text_content)):
            sentence = sentence.strip()
            if len(sentence) <= 20:  # Only keep meaningful sentences
                continue
            sentences.append({
                'original_qa_id': qa_id,
                'sentence_id': f"{qa_id}_{sent_idx}",
                'text': sentence,
                'speaker': row.get('speaker', ''),
                'speaker_role': row.get('speaker_role', ''),
                'quarter': row.get('quarter', ''),
                'bank_code': row['bank_code'] if has_bank_code else BANK_CODE,
                'sentence_length': len(sentence),
                'sentence_word_count': len(sentence.split())
            })
            kept_for_row += 1

        if kept_for_row > 0:
            processed_count += 1

    # Passing the column list keeps the schema stable when no sentence was kept.
    sentences_df = pd.DataFrame(sentences, columns=sentence_columns)

    print(f"✓ Processed {processed_count}/{len(df)} Q&A pairs")
    print(f"✓ Created sentence-level dataset: {len(sentences_df)} sentences")

    if len(sentences_df) > 0:
        print(f"  Sample sentence: '{sentences_df.iloc[0]['text'][:100]}...'")
        print(f"  Average sentence length: {sentences_df['sentence_length'].mean():.1f} chars")
        print(f"  Average words per sentence: {sentences_df['sentence_word_count'].mean():.1f}")

    return sentences_df

# Create sentence-level datasets for sentiment analysis with enhanced processing
section_rule = "="*60
print("\n" + section_rule)
print("PREPROCESSING FOR SENTIMENT ANALYSIS")
print(section_rule)

# One sentence-level frame per cleaned Q&A frame, built in Q1, Q2, multi order.
processed_jpm_q1_2025_df, processed_jpm_q2_2025_df, processed_jpm_multi_2025_df = [
    preprocess_for_sentiment(clean_frame)
    for clean_frame in (
        clean_jpm_q1_2025_df,
        clean_jpm_q2_2025_df,
        clean_jpm_multi_2025_df,
    )
]


PREPROCESSING FOR SENTIMENT ANALYSIS

🔄 PREPROCESSING FOR SENTIMENT ANALYSIS
Input shape: (109, 18)
Available columns: ['section', 'question_number', 'answer_number', 'speaker', 'original_role', 'company', 'text', 'year', 'quarter', 'is_pleasantry', 'speaker_role', 'text_clean', 'text_length', 'text_word_count', 'qa_id', 'timestamp', 'bank_code', 'is_valid']
✓ Processed 97/109 Q&A pairs
✓ Created sentence-level dataset: 578 sentences
  Sample sentence: 'Thank you and good morning, everyone...'
  Average sentence length: 94.6 chars
  Average words per sentence: 17.0

🔄 PREPROCESSING FOR SENTIMENT ANALYSIS
Input shape: (145, 18)
Available columns: ['section', 'question_number', 'answer_number', 'speaker', 'original_role', 'company', 'text', 'year', 'quarter', 'is_pleasantry', 'speaker_role', 'text_clean', 'text_length', 'text_word_count', 'qa_id', 'timestamp', 'bank_code', 'is_valid']
✓ Processed 121/145 Q&A pairs
✓ Created sentence-level dataset: 532 sentences
  Sample sentence: 'names

In [None]:
## Enhanced Data Quality Assessment

def assess_data_quality(df: pd.DataFrame, dataset_name: str):
    """Print a data-quality report for one dataset.

    The report covers shape and memory use, per-column missing counts,
    length/word statistics for ``text_clean`` (when present), a validity count
    and sample for ``text`` (when present), and the ``speaker_role``
    distribution (when present). Nothing is returned and ``df`` is not changed.

    Args:
        df: Frame to inspect, or ``None``.
        dataset_name: Label printed in the report header.
    """
    if df is None:
        print(f"❌ Cannot assess {dataset_name} - dataset is None")
        return

    def usable(series):
        # A value counts as text when it is present, non-empty and not the 'nan' placeholder
        return series.notna() & (series != '') & (series != 'nan')

    row_total = len(df)

    print(f"\n📋 DATA QUALITY ASSESSMENT - {dataset_name}")
    print("-" * 50)

    # Basic stats
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Missing values
    missing = df.isnull().sum()
    if not missing.any():
        print("✓ No missing values")
    else:
        print(f"\nMissing values:")
        for col, count in missing[missing > 0].items():
            pct = (count / row_total) * 100
            print(f"  {col}: {count} ({pct:.1f}%)")

    # Text quality metrics
    if 'text_clean' in df.columns:
        valid_texts = df.loc[usable(df['text_clean']), 'text_clean']

        if len(valid_texts) == 0:
            print(f"\n❌ No valid text content found!")
        else:
            text_lengths = valid_texts.str.len()
            word_counts = valid_texts.str.split().str.len()

            print(f"\nText quality metrics ({len(valid_texts)} valid texts):")
            print(f"  Length - Mean: {text_lengths.mean():.0f}, Median: {text_lengths.median():.0f}")
            print(f"  Length - Min: {text_lengths.min()}, Max: {text_lengths.max()}")
            print(f"  Words - Mean: {word_counts.mean():.1f}, Median: {word_counts.median():.1f}")

            # Show sample text
            print(f"  Sample text: '{valid_texts.iloc[0][:100]}...'")

    # Check for 'text' column (should exist after processing)
    if 'text' in df.columns:
        text_ok = usable(df['text'])
        print(f"\nText column: {text_ok.sum()} valid entries")
        if text_ok.sum() > 0:
            first_text = df.loc[text_ok, 'text'].iloc[0]
            print(f"  Sample: '{first_text[:100]}...'")

    # Speaker distribution
    if 'speaker_role' in df.columns:
        print(f"\nSpeaker role distribution:")
        for role, count in df['speaker_role'].value_counts().items():
            pct = (count / row_total) * 100
            print(f"  {role}: {count} ({pct:.1f}%)")

# Assess all datasets
# Report on the Q&A-level frames first, then on the sentence-level frames.
for quality_frame, quality_label in (
    (clean_jpm_q1_2025_df, "Q1 2025 Clean"),
    (clean_jpm_q2_2025_df, "Q2 2025 Clean"),
    (clean_jpm_multi_2025_df, "Multi 2025 Clean"),
    (processed_jpm_q1_2025_df, "Q1 2025 Processed"),
    (processed_jpm_q2_2025_df, "Q2 2025 Processed"),
    (processed_jpm_multi_2025_df, "Multi 2025 Processed"),
):
    assess_data_quality(quality_frame, quality_label)


📋 DATA QUALITY ASSESSMENT - Q1 2025 Clean
--------------------------------------------------
Shape: (109, 18)
Memory usage: 0.32 MB

Missing values:
  question_number: 1 (0.9%)
  answer_number: 23 (21.1%)

Text quality metrics (109 valid texts):
  Length - Mean: 531, Median: 262
  Length - Min: 10, Max: 9998
  Words - Mean: 93.2, Median: 47.0
  Sample text: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'

Text column: 109 valid entries
  Sample: 'Thank you and good morning, everyone. Starting on page 1, the Firm reported net income of $14.6 bill...'

Speaker role distribution:
  analyst: 41 (37.6%)
  cfo: 37 (33.9%)
  executive: 31 (28.4%)

📋 DATA QUALITY ASSESSMENT - Q2 2025 Clean
--------------------------------------------------
Shape: (145, 18)
Memory usage: 0.27 MB

Missing values:
  question_number: 5 (3.4%)
  answer_number: 38 (26.2%)

Text quality metrics (145 valid texts):
  Length - Mean: 381, Median: 247
  Length - M

In [None]:
## Save Clean and Processed Datasets

def save_dataset_multiple_locations(df, filename, description):
    """Write a dataset as CSV to every configured location, best effort.

    Targets, in order: the Drive clean directory (always), the Drive processed
    directory (only when ``filename`` contains 'processed'), and the Colab
    clean directory (its parent folder is created if needed).

    Args:
        df: Frame to save; ``None`` or an empty frame is rejected.
        filename: Target file name, used unchanged in every location.
        description: Human-readable label for the printed messages.

    Returns:
        ``None`` when there is nothing to save; otherwise the list of paths
        written so far. If a write fails, the error is printed and the paths
        saved before the failure are still returned.
    """
    if df is None or len(df) == 0:
        print(f"❌ Cannot save {description} - dataset is None or empty")
        return None

    print(f"Saving {description}...")

    def destinations():
        # Evaluated lazily so each path is built only after the previous
        # write has finished. Items are (label, path, create_parent).
        yield "Clean", clean_data_path / filename, False
        if 'processed' in filename:
            yield "Processed", processed_data_path / filename, False
        yield "Colab", colab_base / "data/clean/jpm" / filename, True

    paths_saved = []

    try:
        for label, target, create_parent in destinations():
            if create_parent:
                target.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(target, index=False)
            paths_saved.append(str(target))
            print(f"  ✓ {label}: {target} ({len(df)} rows)")
    except Exception as e:
        # Best effort by design: report the failure and return what was written.
        print(f"❌ Error saving {description}: {str(e)}")

    return paths_saved

print("\n" + "="*60)
print("SAVING CLEAN AND PROCESSED DATASETS")
print("="*60)

# Save clean datasets (Q&A level)
# NOTE(review): the multi-period files were previously written with the
# misspelled prefix "jmp" ("clean_jmp_multi_2025.csv",
# "processed_jmp_multi_2025.csv"). They now follow the "jpm" naming used by
# every other file; update any reader that still opens the old names.
save_dataset_multiple_locations(clean_jpm_q1_2025_df, "clean_jpm_q1_2025.csv", "Q1 2025 clean data")
save_dataset_multiple_locations(clean_jpm_q2_2025_df, "clean_jpm_q2_2025.csv", "Q2 2025 clean data")
save_dataset_multiple_locations(clean_jpm_multi_2025_df, "clean_jpm_multi_2025.csv", "Multi 2025 clean data")

# Save processed datasets (sentence level) - only if they have data
for processed_frame, processed_filename, period_label in (
    (processed_jpm_q1_2025_df, "processed_jpm_q1_2025.csv", "Q1 2025"),
    (processed_jpm_q2_2025_df, "processed_jpm_q2_2025.csv", "Q2 2025"),
    (processed_jpm_multi_2025_df, "processed_jpm_multi_2025.csv", "Multi 2025"),
):
    if processed_frame is not None and len(processed_frame) > 0:
        save_dataset_multiple_locations(processed_frame, processed_filename, f"{period_label} processed data")
    else:
        print(f"❌ {period_label} processed data is empty - not saving")


SAVING CLEAN AND PROCESSED DATASETS
Saving Q1 2025 clean data...
  ✓ Clean: /content/drive/MyDrive/CAM_DS_AI_Project/data/clean/jpm/clean_jpm_q1_2025.csv (109 rows)
  ✓ Colab: /content/cam_ds_ai_project/data/clean/jpm/clean_jpm_q1_2025.csv (109 rows)
Saving Q2 2025 clean data...
  ✓ Clean: /content/drive/MyDrive/CAM_DS_AI_Project/data/clean/jpm/clean_jpm_q2_2025.csv (145 rows)
  ✓ Colab: /content/cam_ds_ai_project/data/clean/jpm/clean_jpm_q2_2025.csv (145 rows)
Saving Multi 2025 clean data...
  ✓ Clean: /content/drive/MyDrive/CAM_DS_AI_Project/data/clean/jpm/clean_jmp_multi_2025.csv (252 rows)
  ✓ Colab: /content/cam_ds_ai_project/data/clean/jpm/clean_jmp_multi_2025.csv (252 rows)
Saving Q1 2025 processed data...
  ✓ Clean: /content/drive/MyDrive/CAM_DS_AI_Project/data/clean/jpm/processed_jpm_q1_2025.csv (578 rows)
  ✓ Processed: /content/drive/MyDrive/CAM_DS_AI_Project/data/processed/jpm/processed_jpm_q1_2025.csv (578 rows)
  ✓ Colab: /content/cam_ds_ai_project/data/clean/jpm/process

In [None]:
## Final Status Report

print("\n" + "="*60)
print("FINAL PROCESSING STATUS")
print("="*60)

# (label, frame) pairs in reporting order; labels containing 'Processed'
# contribute to the sentence total.
datasets = [
    ("Clean Q1 2025", clean_jpm_q1_2025_df),
    ("Clean Q2 2025", clean_jpm_q2_2025_df),
    ("Clean Multi 2025", clean_jpm_multi_2025_df),
    ("Processed Q1 2025", processed_jpm_q1_2025_df),
    ("Processed Q2 2025", processed_jpm_q2_2025_df),
    ("Processed Multi 2025", processed_jpm_multi_2025_df)
]

successful_datasets = 0
total_sentences = 0

for name, df in datasets:
    # A dataset counts as failed when it is missing or has no rows
    if df is None or len(df) == 0:
        print(f"❌ {name}: Empty or None")
        continue

    print(f"✅ {name}: {df.shape}")
    successful_datasets += 1
    if 'Processed' in name:
        total_sentences += len(df)

print("\n📊 Summary:")
print(f"  Successful datasets: {successful_datasets}/6")
print(f"  Total sentences ready for sentiment analysis: {total_sentences}")

if total_sentences <= 0:
    print("\n⚠️  WARNING: No sentences generated for sentiment analysis")
    print("   Check the text content in your raw data files")
else:
    print("\n✅ SUCCESS: Ready for sentiment analysis!")
    print("   Use the processed datasets in notebook 04")


FINAL PROCESSING STATUS
✅ Clean Q1 2025: (109, 18)
✅ Clean Q2 2025: (145, 18)
✅ Clean Multi 2025: (252, 18)
✅ Processed Q1 2025: (578, 9)
✅ Processed Q2 2025: (532, 9)
✅ Processed Multi 2025: (1110, 9)

📊 Summary:
  Successful datasets: 6/6
  Total sentences ready for sentiment analysis: 2220

✅ SUCCESS: Ready for sentiment analysis!
   Use the processed datasets in notebook 04


In [None]:
## Export Variables Summary

# Frames grouped by granularity: Q&A-level (clean) and sentence-level (processed).
final_datasets = {
    "clean_qa_level": [
        clean_jpm_q1_2025_df,
        clean_jpm_q2_2025_df,
        clean_jpm_multi_2025_df
    ],
    "processed_sentence_level": [
        processed_jpm_q1_2025_df,
        processed_jpm_q2_2025_df,
        processed_jpm_multi_2025_df
    ]
}

# Count total records ready for sentiment analysis; None entries are ignored
total_sentences = sum(
    len(frame) for frame in final_datasets["processed_sentence_level"] if frame is not None
)
total_qa_pairs = sum(
    len(frame) for frame in final_datasets["clean_qa_level"] if frame is not None
)

# Deep memory footprint across both groups, in bytes
total_memory_bytes = sum(
    frame.memory_usage(deep=True).sum()
    for frame in final_datasets['clean_qa_level'] + final_datasets['processed_sentence_level']
    if frame is not None
)

print(f"\nReady for sentiment analysis:")
print(f"  Q&A pairs: {total_qa_pairs:,}")
print(f"  Sentences: {total_sentences:,}")
print(f"  Total memory: {total_memory_bytes / 1024**2:.2f} MB")


Ready for sentiment analysis:
  Q&A pairs: 506
  Sentences: 2,220
  Total memory: 2.16 MB
