# 🔄 Data Processing Pipeline - Generalized

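This notebook loads the raw dataset, cleans the text and labels, splits the data into train and validation sets, and saves the processed files for the later pipeline steps.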

**Configuration-driven approach:** All settings are loaded from `../config/pipeline_config.json`.

In [1]:
# Import configuration system and utilities
import sys
import os
sys.path.append("../")

from src.pipeline_utils import ConfigManager, StateManager, LoggingManager
import torch
import numpy as np
from pathlib import Path
from datetime import datetime

# Initialize managers
# Each manager is pointed at a JSON file under ../config; the paths are relative
# to the notebook's working directory, so the kernel must be started from the
# notebook's own folder.
# NOTE(review): presumably both files are read when the objects are constructed —
# confirm in src.pipeline_utils.
config = ConfigManager("../config/pipeline_config.json")
state = StateManager("../config/pipeline_state.json")
# Logger for this stage; the recorded log lines carry the name
# "pipeline.data_processing".
# NOTE(review): several records in the saved outputs appear twice — possibly a
# handler being attached more than once; verify in LoggingManager.
logger_manager = LoggingManager(config, 'data_processing')
logger = logger_manager.get_logger()

# `config`, `state` and `logger` are used by every cell below.
print("📋 Configuration loaded from ../config/pipeline_config.json")

📋 Configuration loaded from ../config/pipeline_config.json


In [2]:
# Verify setup completion and load data configuration
logger.info("🔍 Checking pipeline prerequisites...")

# Stop right away unless the setup notebook has recorded its completion flag.
setup_done = state.is_step_complete('setup_completed')
if not setup_done:
    logger.error("Setup step not completed. Please run 0_setup_generalized.ipynb first.")
    raise RuntimeError("Pipeline setup required. Run 0_setup_generalized.ipynb first.")

print("✅ Setup verification passed")

# The 'data' section of the pipeline configuration drives every later cell.
data_config = config.get('data', {})

# (caption, config key) pairs shown in the summary, in display order.
summary_fields = (
    ("📁 Raw data path", 'raw_data_path'),
    ("📁 Processed data dir", 'processed_data_dir'),
    ("📈 Validation split", 'validation_split'),
)

print("📊 Data Configuration:")
print("\n".join(
    f"   {caption}: {data_config.get(key, 'Not set')}"
    for caption, key in summary_fields
))

logger.info("Data configuration loaded successfully")

2025-08-08 16:29:01,928 - pipeline.data_processing - INFO - 🔍 Checking pipeline prerequisites...
2025-08-08 16:29:01,930 - pipeline.data_processing - INFO - Data configuration loaded successfully
2025-08-08 16:29:01,930 - pipeline.data_processing - INFO - Data configuration loaded successfully


✅ Setup verification passed
📊 Data Configuration:
   📁 Raw data path: data/FinancialPhraseBank/all-data.csv
   📁 Processed data dir: data/processed
   📈 Validation split: 0.1


In [3]:
# Load and validate raw dataset
#
# Reads the CSV named by data_config['raw_data_path'] into `df` with the two
# columns 'label' and 'text' (the file is read as header-less).
# Raises RuntimeError when no path is configured, FileNotFoundError when the
# file is missing and ValueError for any non-CSV file.
import pandas as pd

logger.info("📂 Loading raw dataset...")

print("📂 Loading Raw Dataset:")

# Get the main data path from configuration
raw_data_path = data_config.get('raw_data_path', '')
if not raw_data_path:
    logger.error("No raw data path configured")
    raise RuntimeError("No raw data path found in configuration")

# Configured paths are resolved one directory above the notebook.
full_path = Path(f"../{raw_data_path}")

if not full_path.exists():
    logger.error(f"Data file not found: {full_path}")
    raise FileNotFoundError(f"Data file not found: {full_path}")

# Only CSV input is supported; reject anything else before trying to parse it.
if full_path.suffix.lower() != '.csv':
    logger.error(f"Unsupported file format: {full_path.suffix}")
    raise ValueError(f"Unsupported file format: {full_path.suffix}")

# The encoding can be overridden through the optional 'raw_data_encoding' key;
# the default keeps the previous hard-coded behaviour. Note that
# 'unicode_escape' also interprets backslash sequences inside the text, so a
# plain codec such as 'latin-1' may be preferable for other datasets.
csv_encoding = data_config.get('raw_data_encoding', 'unicode_escape')

# Load the dataset
df = pd.read_csv(full_path, encoding=csv_encoding, names=['label', 'text'])
print(f"   ✅ Loaded CSV: {len(df)} samples")
print(f"   📋 Columns: {list(df.columns)}")

# Display sample
if len(df) > 0:
    first_row = df.iloc[0]
    # str() guards against a missing text value (NaN is a float and cannot be
    # sliced); for ordinary string values the preview is unchanged.
    sample_text = str(first_row['text'])[:50]
    print(f"   📝 Sample: label='{first_row['label']}', text='{sample_text}...'")

logger.info(f"Successfully loaded dataset with {len(df)} samples")

2025-08-08 16:29:02,386 - pipeline.data_processing - INFO - 📂 Loading raw dataset...
2025-08-08 16:29:02,406 - pipeline.data_processing - INFO - Successfully loaded dataset with 4846 samples
2025-08-08 16:29:02,406 - pipeline.data_processing - INFO - Successfully loaded dataset with 4846 samples


📂 Loading Raw Dataset:
   ✅ Loaded CSV: 4846 samples
   📋 Columns: ['label', 'text']
   📝 Sample: label='neutral', text='According to Gran , the company has no plans to mo...'


In [4]:
# Data preprocessing and standardization
#
# Builds `df_processed` from `df`: drops rows with a missing label or text,
# trims whitespace (and optionally surrounding quotes) from the text, removes
# rows whose text ends up empty, lower-cases and trims the labels, and stamps
# every row with the processing time.
# Raises ValueError when `df` has fewer than two columns.
logger.info("🔄 Processing and standardizing dataset...")

print("🔄 Data Preprocessing:")

# Get label columns configuration
# NOTE(review): `label_columns` is not used by any cell visible in this
# notebook; kept in case other code reads it.
label_columns = data_config.get('label_columns', ['label', 'sentence'])
text_preprocessing = data_config.get('text_preprocessing', {})

# Check if we have the expected columns
if len(df.columns) < 2:
    logger.error("Dataset does not have enough columns")
    raise ValueError("Dataset must have at least 2 columns (text and label)")

# Work on a copy so the raw frame from the loading cell stays untouched and
# this cell can be re-run safely.
df_processed = df.copy()

print(f"   📋 Data structure recognized: label column = '{df.columns[0]}', text column = '{df.columns[1]}'")

# Drop rows with a missing label or text BEFORE casting to str: astype(str)
# would otherwise turn NaN into the literal string 'nan', which survives the
# empty-text filter and shows up as a bogus 'nan' class.
before_count = len(df_processed)
df_processed = df_processed.dropna(subset=['label', 'text']).copy()
missing_count = before_count - len(df_processed)
if missing_count:
    print(f"   🗑️ Removed {missing_count} rows with missing label or text")

# Clean and preprocess text
print("   🧹 Cleaning text data...")
df_processed['text'] = df_processed['text'].astype(str).str.strip()

# Remove quotes if configured
if text_preprocessing.get('strip_quotes', True):
    df_processed['text'] = df_processed['text'].str.strip('"\'')
    print("   🔄 Stripped quotes from text")

# Remove empty texts. The explicit copy() makes the filtered frame independent,
# so the column assignments below cannot raise SettingWithCopyWarning.
before_count = len(df_processed)
df_processed = df_processed[df_processed['text'].str.len() > 0].copy()
after_count = len(df_processed)

if before_count != after_count:
    print(f"   🗑️ Removed {before_count - after_count} empty texts")

# Standardize labels
print("   🏷️ Standardizing labels...")
df_processed['label'] = df_processed['label'].astype(str).str.lower().str.strip()

# Display label distribution
label_counts = df_processed['label'].value_counts()
print("   📊 Label distribution:")
for label, count in label_counts.items():
    print(f"      {label}: {count} samples ({count/len(df_processed)*100:.1f}%)")

# Add metadata
df_processed['processed_timestamp'] = datetime.now().isoformat()

print(f"   ✅ Processed {len(df_processed)} samples")
logger.info("Successfully processed dataset")

2025-08-08 16:29:02,417 - pipeline.data_processing - INFO - 🔄 Processing and standardizing dataset...
2025-08-08 16:29:02,434 - pipeline.data_processing - INFO - Successfully processed dataset
2025-08-08 16:29:02,434 - pipeline.data_processing - INFO - Successfully processed dataset


🔄 Data Preprocessing:
   📋 Data structure recognized: label column = 'label', text column = 'text'
   🧹 Cleaning text data...
   🔄 Stripped quotes from text
   🏷️ Standardizing labels...
   📊 Label distribution:
      neutral: 2879 samples (59.4%)
      positive: 1363 samples (28.1%)
      negative: 604 samples (12.5%)
   ✅ Processed 4846 samples


In [5]:
# Data splitting and final preparation
#
# Produces `train_df` and `val_df` from `df_processed`. A stratified split is
# attempted when there is more than one label and every label has at least two
# rows; otherwise, or when sklearn rejects the stratified split, a plain random
# split is used.
logger.info("📊 Splitting data into train/validation sets...")

print("📊 Data Splitting:")

# Import sklearn only when needed
from sklearn.model_selection import train_test_split

# Get splitting configuration
validation_split = data_config.get('validation_split', 0.1)
random_seed = config.get('pipeline.random_seed', 42)

print("   📋 Splitting configuration:")
print(f"      📈 Validation split: {validation_split}")
print(f"      🎲 Random seed: {random_seed}")


def _random_split(frame):
    """Split `frame` without stratification, using the configured size and seed."""
    return train_test_split(
        frame,
        test_size=validation_split,
        random_state=random_seed
    )


def _print_label_distribution(frame):
    """Print each label's row count and share of `frame`, most frequent first."""
    total = len(frame)
    for label, count in frame['label'].value_counts().items():
        print(f"      {label}: {count} ({count/total*100:.1f}%)")


# Stratified split to maintain label distribution
# Check if stratified split is possible (each class needs at least 2 samples)
label_counts = df_processed['label'].value_counts()
min_class_count = label_counts.min()
distinct_label_count = len(df_processed['label'].unique())
can_stratify = distinct_label_count > 1 and min_class_count >= 2

if not can_stratify:
    train_df, val_df = _random_split(df_processed)
    if distinct_label_count == 1:
        print("   📈 Random split applied (single label)")
    else:
        print("   📈 Random split applied (insufficient samples for stratification)")
        print(f"       Minimum class count: {min_class_count}, need at least 2")
else:
    try:
        train_df, val_df = train_test_split(
            df_processed,
            test_size=validation_split,
            random_state=random_seed,
            stratify=df_processed['label']
        )
        print("   📈 Stratified split applied")
    except ValueError:
        # Fall back to random split if stratification fails
        train_df, val_df = _random_split(df_processed)
        print("   📈 Random split applied (stratification failed)")

print(f"   📊 Train set: {len(train_df)} samples")
print(f"   📊 Validation set: {len(val_df)} samples")

# Display label distribution in splits
print("   🏷️ Train label distribution:")
_print_label_distribution(train_df)

print("   🏷️ Validation label distribution:")
_print_label_distribution(val_df)

logger.info("Successfully split dataset")

2025-08-08 16:29:02,451 - pipeline.data_processing - INFO - 📊 Splitting data into train/validation sets...


📊 Data Splitting:


2025-08-08 16:29:03,151 - pipeline.data_processing - INFO - Successfully split dataset


   📋 Splitting configuration:
      📈 Validation split: 0.1
      🎲 Random seed: 42
   📈 Stratified split applied
   📊 Train set: 4361 samples
   📊 Validation set: 485 samples
   🏷️ Train label distribution:
      neutral: 2591 (59.4%)
      positive: 1227 (28.1%)
      negative: 543 (12.5%)
   🏷️ Validation label distribution:
      neutral: 288 (59.4%)
      positive: 136 (28.0%)
      negative: 61 (12.6%)


In [6]:
# Save processed datasets and complete data processing step
#
# Writes train/validation/full CSVs into the configured processed-data
# directory, writes a JSON report to ../results, and only then records the step
# as complete in the pipeline state.
import json

logger.info("💾 Saving processed datasets...")

print("💾 Saving Processed Data:")

# Get processed data directory from configuration
processed_data_dir = data_config.get('processed_data_dir', 'data/processed')
processed_dir = Path(f"../{processed_data_dir}")
processed_dir.mkdir(parents=True, exist_ok=True)

# Save train, validation, and full datasets.
# The full-dataset path gets its own name so it does not overwrite `full_path`,
# which the loading cell uses for the raw input file.
train_path = processed_dir / "train.csv"
val_path = processed_dir / "validation.csv"
full_processed_path = processed_dir / "full_processed.csv"

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
df_processed.to_csv(full_processed_path, index=False)

print("   ✅ Dataset saved:")
print(f"      📁 Full dataset: {full_processed_path}")
print(f"      📁 Train set: {train_path}")
print(f"      📁 Validation set: {val_path}")

logger.info("Saved processed datasets")

# Create data processing summary
processing_summary = {
    'processing_timestamp': datetime.now().isoformat(),
    'total_samples': len(df_processed),
    'train_samples': len(train_df),
    'validation_samples': len(val_df),
    'labels': df_processed['label'].value_counts().to_dict(),
    # Realised validation fraction, which can differ slightly from the
    # configured value because of rounding to whole rows.
    'validation_split': len(val_df) / len(df_processed),
    'data_paths': {
        'train': str(train_path),
        'validation': str(val_path),
        'full': str(full_processed_path)
    }
}

# Save processing report
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
report_path = results_dir / 'data_processing_report.json'

with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(processing_summary, f, indent=2)

# Update pipeline state last: if writing the report (or anything above) fails,
# the step must not be recorded as complete.
state.mark_step_complete('data_processing_completed', **processing_summary)

print(f"\n{'='*60}")
print("🎉 DATA PROCESSING COMPLETED SUCCESSFULLY!")
print(f"{'='*60}")
print("📝 Next Steps:")
print("1. Run 2_train_models_generalized.ipynb to train the models")
print("2. Continue with the sequential pipeline: 3 → 4 → 5 → 6")

print("\n📊 Processing Summary:")
print(f"   📋 Total samples: {processing_summary['total_samples']}")
print(f"   📊 Train samples: {processing_summary['train_samples']}")
print(f"   📊 Validation samples: {processing_summary['validation_samples']}")
print(f"   🏷️ Labels: {list(processing_summary['labels'].keys())}")

print(f"\n📄 Processing report saved to: {report_path}")

logger.info("✅ Data processing completed successfully")

2025-08-08 16:29:03,163 - pipeline.data_processing - INFO - 💾 Saving processed datasets...
2025-08-08 16:29:03,210 - pipeline.data_processing - INFO - Saved processed datasets
2025-08-08 16:29:03,213 - pipeline.data_processing - INFO - ✅ Data processing completed successfully
2025-08-08 16:29:03,210 - pipeline.data_processing - INFO - Saved processed datasets
2025-08-08 16:29:03,213 - pipeline.data_processing - INFO - ✅ Data processing completed successfully


💾 Saving Processed Data:
   ✅ Dataset saved:
      📁 Full dataset: ../data/processed/full_processed.csv
      📁 Train set: ../data/processed/train.csv
      📁 Validation set: ../data/processed/validation.csv

🎉 DATA PROCESSING COMPLETED SUCCESSFULLY!
📝 Next Steps:
1. Run 2_train_models_generalized.ipynb to train the models
2. Continue with the sequential pipeline: 3 → 4 → 5 → 6

📊 Processing Summary:
   📋 Total samples: 4846
   📊 Train samples: 4361
   📊 Validation samples: 485
   🏷️ Labels: ['neutral', 'positive', 'negative']

📄 Processing report saved to: ../results/data_processing_report.json
