In [1]:
"""
Mental Stress Detection System - Cell 1: Setup & Data Loading
Production-ready code for Docker deployment with frontend integration
"""

# ===============================
# Core Imports
# ===============================
import os
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd
from datetime import datetime
import json
import logging
import warnings

# Visualization
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for Docker
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Serialization
import pickle
import joblib

warnings.filterwarnings('ignore')

# ===============================
# Configuration Management
# ===============================
@dataclass
class Config:
    """Centralized configuration for the stress detection system"""
    
    # Paths
    DATA_DIR: Path = Path("data")
    MODEL_DIR: Path = Path("models")
    PREPROCESSOR_DIR: Path = Path("preprocessors")
    REPORTS_DIR: Path = Path("reports")
    LOGS_DIR: Path = Path("logs")
    VISUALIZATIONS_DIR: Path = Path("visualizations")
    
    # Data parameters
    DATA_FILE: str = "stress.csv"  # File can be in current dir or DATA_DIR
    ENCODING_OPTIONS: List[str] = None
    TEST_SIZE: float = 0.2
    RANDOM_STATE: int = 42
    USE_DATA_DIR: bool = False  # Set to True if file is in data/ folder
    
    # Model parameters
    MAX_FEATURES: int = 5000
    MAX_LENGTH: int = 100
    BATCH_SIZE: int = 32
    EPOCHS: int = 10
    
    # Logging
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    
    def __post_init__(self):
        if self.ENCODING_OPTIONS is None:
            self.ENCODING_OPTIONS = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        
        # Create all directories
        for dir_path in [self.DATA_DIR, self.MODEL_DIR, self.PREPROCESSOR_DIR, 
                         self.REPORTS_DIR, self.LOGS_DIR, self.VISUALIZATIONS_DIR]:
            dir_path.mkdir(exist_ok=True, parents=True)
    
    def to_dict(self) -> Dict:
        """Convert config to dictionary for JSON serialization"""
        config_dict = asdict(self)
        # Convert Path objects to strings
        for key, value in config_dict.items():
            if isinstance(value, Path):
                config_dict[key] = str(value)
        return config_dict

# Initialize configuration
config = Config()

# ===============================
# Logging Setup
# ===============================
def setup_logging(config: Config) -> logging.Logger:
    """Configure logging for production environment"""
    
    log_file = config.LOGS_DIR / f'stress_detection_{datetime.now().strftime("%Y%m%d")}.log'
    
    logging.basicConfig(
        level=getattr(logging, config.LOG_LEVEL),
        format=config.LOG_FORMAT,
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(sys.stdout)
        ]
    )
    
    logger = logging.getLogger(__name__)
    logger.info("="*60)
    logger.info("Mental Stress Detection System - Initialized")
    logger.info(f"Timestamp: {datetime.now().isoformat()}")
    logger.info("="*60)
    
    return logger

logger = setup_logging(config)

# ===============================
# NLTK Data Download (Docker-ready)
# ===============================
def download_nltk_dependencies():
    """Download required NLTK data with error handling"""
    nltk_packages = ['stopwords', 'punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger']
    
    for package in nltk_packages:
        try:
            nltk.download(package, quiet=True)
            logger.info(f"NLTK package '{package}' ready")
        except Exception as e:
            logger.warning(f"Could not download NLTK package '{package}': {e}")

download_nltk_dependencies()

# ===============================
# Data Validation Classes
# ===============================
@dataclass
class DataValidationReport:
    """Structured validation report for the dataset"""
    total_samples: int
    columns: List[str]
    missing_values: Dict[str, int]
    duplicate_rows: int
    text_columns: List[str]
    label_column: Optional[str]
    label_distribution: Optional[Dict[str, int]]
    memory_usage_mb: float
    issues: List[str]
    timestamp: str
    
    def to_dict(self) -> Dict:
        return asdict(self)
    
    def save_report(self, filepath: Path):
        """Save validation report as JSON"""
        with open(filepath, 'w') as f:
            json.dump(self.to_dict(), f, indent=2)
        logger.info(f"Validation report saved to {filepath}")

# ===============================
# Data Loading & Validation
# ===============================
class StressDataLoader:
    """Handles data loading with comprehensive validation"""
    
    def __init__(self, config: Config):
        self.config = config
        # Try current directory first, then data directory
        if config.USE_DATA_DIR or not Path(config.DATA_FILE).exists():
            self.data_path = config.DATA_DIR / config.DATA_FILE
        else:
            self.data_path = Path(config.DATA_FILE)
        
        logger.info(f"Data path set to: {self.data_path.absolute()}")
    
    def load_data(self) -> pd.DataFrame:
        """Load dataset with multiple encoding attempts"""
        logger.info(f"Loading data from: {self.data_path}")
        
        if not self.data_path.exists():
            raise FileNotFoundError(f"Data file not found: {self.data_path}")
        
        df = None
        for encoding in self.config.ENCODING_OPTIONS:
            try:
                df = pd.read_csv(
                    self.data_path,
                    encoding=encoding,
                    on_bad_lines='skip',
                    low_memory=False
                )
                logger.info(f"‚úì Successfully loaded data with '{encoding}' encoding")
                break
            except UnicodeDecodeError:
                logger.warning(f"‚úó Failed to load with '{encoding}' encoding")
                continue
            except Exception as e:
                logger.error(f"Error with encoding '{encoding}': {str(e)}")
                continue
        
        if df is None:
            raise ValueError("Could not load data with any of the specified encodings")
        
        logger.info(f"Dataset shape: {df.shape}")
        return df
    
    def validate_data(self, df: pd.DataFrame) -> DataValidationReport:
        """Comprehensive data validation"""
        logger.info("Running data validation...")
        
        issues = []
        text_columns = []
        label_column = None
        label_distribution = None
        
        # Identify text and label columns
        for col in df.columns:
            if df[col].dtype == 'object':
                avg_length = df[col].dropna().astype(str).str.len().mean()
                unique_count = df[col].nunique()
                
                if avg_length > 10:  # Text column
                    text_columns.append(col)
                    
                    # Check for very short texts
                    short_texts = (df[col].astype(str).str.len() < 5).sum()
                    if short_texts > 0:
                        issues.append(f"Column '{col}': {short_texts} very short texts found")
                
                elif unique_count <= 10:  # Likely a label column
                    label_column = col
                    label_distribution = df[col].value_counts().to_dict()
                    logger.info(f"Label column identified: '{col}' with {unique_count} unique values")
        
        # Check for missing values
        missing_vals = df.isnull().sum()
        missing_dict = {col: int(count) for col, count in missing_vals.items() if count > 0}
        
        if missing_dict:
            issues.append(f"Missing values found in columns: {list(missing_dict.keys())}")
        
        # Check for duplicates
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            issues.append(f"Found {duplicates} duplicate rows")
        
        # Memory usage
        memory_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)
        
        report = DataValidationReport(
            total_samples=len(df),
            columns=list(df.columns),
            missing_values=missing_dict,
            duplicate_rows=int(duplicates),
            text_columns=text_columns,
            label_column=label_column,
            label_distribution=label_distribution,
            memory_usage_mb=round(memory_mb, 2),
            issues=issues,
            timestamp=datetime.now().isoformat()
        )
        
        return report
    
    def print_validation_summary(self, report: DataValidationReport):
        """Print formatted validation summary"""
        print("\n" + "="*60)
        print("MENTAL STRESS DETECTION - DATASET VALIDATION")
        print("="*60)
        print(f"Total Samples: {report.total_samples:,}")
        print(f"Total Columns: {len(report.columns)}")
        print(f"Memory Usage: {report.memory_usage_mb:.2f} MB")
        print(f"\nText Columns: {', '.join(report.text_columns)}")
        print(f"Label Column: {report.label_column}")
        
        if report.label_distribution:
            print(f"\nLabel Distribution:")
            for label, count in report.label_distribution.items():
                percentage = (count / report.total_samples) * 100
                print(f"  {label}: {count:,} ({percentage:.1f}%)")
        
        print(f"\nDuplicate Rows: {report.duplicate_rows}")
        
        if report.missing_values:
            print(f"\nMissing Values:")
            for col, count in report.missing_values.items():
                print(f"  {col}: {count}")
        
        if report.issues:
            print(f"\n‚ö†Ô∏è  Issues Detected:")
            for issue in report.issues:
                print(f"  ‚Ä¢ {issue}")
        else:
            print(f"\n‚úì No critical issues detected!")
        
        print("="*60 + "\n")

# ===============================
# Main Execution
# ===============================
def main():
    """Main execution function"""
    try:
        # Save configuration
        config_path = config.PREPROCESSOR_DIR / 'system_config.json'
        with open(config_path, 'w') as f:
            json.dump(config.to_dict(), f, indent=2)
        logger.info(f"Configuration saved to {config_path}")
        
        # Load data
        loader = StressDataLoader(config)
        stress_df = loader.load_data()
        
        # Create working copy
        stress = stress_df.copy()
        
        # Validate data
        validation_report = loader.validate_data(stress)
        
        # Print summary
        loader.print_validation_summary(validation_report)
        
        # Save validation report
        report_path = config.REPORTS_DIR / f'validation_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        validation_report.save_report(report_path)
        
        # Save dataset metadata for frontend
        metadata = {
            'dataset_info': validation_report.to_dict(),
            'config': config.to_dict(),
            'status': 'loaded',
            'timestamp': datetime.now().isoformat()
        }
        
        metadata_path = config.PREPROCESSOR_DIR / 'dataset_metadata.json'
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
        
        logger.info("‚úì Cell 1 completed successfully!")
        logger.info(f"Ready for preprocessing. Dataset shape: {stress.shape}")
        
        return stress, validation_report
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}", exc_info=True)
        raise

# Execute
if __name__ == "__main__":
    stress, validation_report = main()
    print("\n‚úì Data loading complete! Ready for Cell 2...")
else:
    stress, validation_report = main()

2025-11-11 16:44:22,449 - __main__ - INFO - Mental Stress Detection System - Initialized
2025-11-11 16:44:22,450 - __main__ - INFO - Timestamp: 2025-11-11T16:44:22.450010
2025-11-11 16:44:22,532 - __main__ - INFO - NLTK package 'stopwords' ready
2025-11-11 16:44:22,559 - __main__ - INFO - NLTK package 'punkt' ready
2025-11-11 16:44:22,577 - __main__ - INFO - NLTK package 'wordnet' ready
2025-11-11 16:44:22,619 - __main__ - INFO - NLTK package 'omw-1.4' ready
2025-11-11 16:44:22,624 - __main__ - INFO - NLTK package 'averaged_perceptron_tagger' ready
2025-11-11 16:44:22,627 - __main__ - INFO - Configuration saved to preprocessors/system_config.json
2025-11-11 16:44:22,627 - __main__ - INFO - Data path set to: /Users/manishmaddikeri/Documents-local/Mental-Stress/ml_model/stress.csv
2025-11-11 16:44:22,627 - __main__ - INFO - Loading data from: stress.csv
2025-11-11 16:44:22,654 - __main__ - INFO - ‚úì Successfully loaded data with 'utf-8' encoding
2025-11-11 16:44:22,654 - __main__ - INFO

In [2]:
"""
Mental Stress Detection System - Cell 2: Enhanced EDA & Advanced Visualizations
Production-ready exploratory data analysis with comprehensive insights
"""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
import re
import warnings
import json
from datetime import datetime

warnings.filterwarnings('ignore')

# Ensure proper backend for Docker
matplotlib.use('Agg')

# ===============================
# EDA Configuration
# ===============================
class EDAConfig:
    """Configuration for EDA visualizations"""
    FIGURE_DPI = 100
    SAVE_FIGURES = True
    FIG_FORMAT = 'png'
    COLOR_PALETTE = 'husl'
    STYLE = 'seaborn-v0_8-darkgrid'
    
    # Stress-specific colors
    STRESS_COLORS = {
        'high_stress': '#FF6B6B',
        'low_stress': '#4ECDC4',
        'neutral': '#95E1D3',
        'positive': '#38B6A8'
    }

eda_config = EDAConfig()

# Set style
plt.style.use('default')
sns.set_palette(eda_config.COLOR_PALETTE)

# ===============================
# Advanced EDA Class
# ===============================
class StressEDA:
    """Comprehensive EDA for Mental Stress Detection"""
    
    def __init__(self, df: pd.DataFrame, config, save_dir):
        self.df = df.copy()
        self.config = config
        self.save_dir = save_dir
        self.text_cols = self._identify_text_columns()
        self.label_col = self._identify_label_column()
        self.results = {}
        
        logger.info("EDA initialized")
    
    def _identify_text_columns(self):
        """Identify text columns in dataset"""
        text_cols = []
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                avg_length = self.df[col].dropna().astype(str).str.len().mean()
                if avg_length > 10:
                    text_cols.append(col)
        return text_cols
    
    def _identify_label_column(self):
        """Identify the label column"""
        for col in self.df.columns:
            if col.lower() in ['label', 'target', 'class', 'stress']:
                return col
            if self.df[col].dtype == 'object' and self.df[col].nunique() <= 10:
                if 'label' in col.lower():
                    return col
        return 'label' if 'label' in self.df.columns else None
    
    def basic_statistics(self):
        """Generate comprehensive basic statistics"""
        logger.info("Generating basic statistics...")
        
        print("\n" + "="*70)
        print("MENTAL STRESS DETECTION - EXPLORATORY DATA ANALYSIS")
        print("="*70)
        
        print(f"\n{'='*70}")
        print("1. DATASET OVERVIEW")
        print(f"{'='*70}")
        print(f"‚îú‚îÄ Shape: {self.df.shape[0]:,} rows √ó {self.df.shape[1]} columns")
        print(f"‚îú‚îÄ Memory Usage: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        print(f"‚îú‚îÄ Duplicates: {self.df.duplicated().sum():,} ({self.df.duplicated().sum()/len(self.df)*100:.2f}%)")
        print(f"‚îî‚îÄ Missing Values: {self.df.isnull().sum().sum():,}")
        
        # Column information
        print(f"\n{'='*70}")
        print("2. COLUMN INFORMATION")
        print(f"{'='*70}")
        for col in self.df.columns:
            dtype = self.df[col].dtype
            unique = self.df[col].nunique()
            missing = self.df[col].isnull().sum()
            print(f"‚îú‚îÄ {col:<20} | Type: {str(dtype):<10} | Unique: {unique:<6} | Missing: {missing}")
        
        # Missing value analysis
        missing = self.df.isnull().sum()
        if missing.sum() > 0:
            print(f"\n{'='*70}")
            print("3. MISSING VALUE ANALYSIS")
            print(f"{'='*70}")
            missing_pct = (missing / len(self.df)) * 100
            for col, count in missing[missing > 0].items():
                print(f"‚îú‚îÄ {col}: {count:,} ({missing_pct[col]:.2f}%)")
        
        self.results['basic_stats'] = {
            'shape': self.df.shape,
            'memory_mb': round(self.df.memory_usage(deep=True).sum() / 1024**2, 2),
            'duplicates': int(self.df.duplicated().sum()),
            'missing_total': int(self.df.isnull().sum().sum())
        }
    
    def text_analysis(self):
        """Comprehensive text analysis for mental health insights"""
        logger.info("Performing text analysis...")
        
        print(f"\n{'='*70}")
        print("4. TEXT ANALYSIS")
        print(f"{'='*70}")
        
        text_stats = {}
        
        for col in self.text_cols:
            print(f"\nüìù Analyzing: {col.upper()}")
            print(f"{'-'*70}")
            
            # Calculate text metrics
            self.df[f'{col}_length'] = self.df[col].astype(str).str.len()
            self.df[f'{col}_word_count'] = self.df[col].astype(str).str.split().str.len()
            self.df[f'{col}_sentence_count'] = self.df[col].astype(str).str.count(r'[.!?]') + 1
            
            # Statistics
            stats_dict = {
                'avg_length': self.df[f'{col}_length'].mean(),
                'median_length': self.df[f'{col}_length'].median(),
                'std_length': self.df[f'{col}_length'].std(),
                'max_length': self.df[f'{col}_length'].max(),
                'min_length': self.df[f'{col}_length'].min(),
                'avg_words': self.df[f'{col}_word_count'].mean(),
                'median_words': self.df[f'{col}_word_count'].median(),
                'avg_sentences': self.df[f'{col}_sentence_count'].mean()
            }
            
            print(f"‚îú‚îÄ Character Length:")
            print(f"‚îÇ  ‚îú‚îÄ Mean: {stats_dict['avg_length']:.1f} | Median: {stats_dict['median_length']:.1f}")
            print(f"‚îÇ  ‚îî‚îÄ Range: {stats_dict['min_length']} - {stats_dict['max_length']}")
            print(f"‚îú‚îÄ Word Count:")
            print(f"‚îÇ  ‚îî‚îÄ Mean: {stats_dict['avg_words']:.1f} | Median: {stats_dict['median_words']:.1f}")
            print(f"‚îî‚îÄ Sentence Count:")
            print(f"   ‚îî‚îÄ Mean: {stats_dict['avg_sentences']:.1f}")
            
            # Text length categories
            short = (self.df[f'{col}_length'] < 50).sum()
            medium = ((self.df[f'{col}_length'] >= 50) & (self.df[f'{col}_length'] < 200)).sum()
            long = ((self.df[f'{col}_length'] >= 200) & (self.df[f'{col}_length'] < 500)).sum()
            very_long = (self.df[f'{col}_length'] >= 500).sum()
            
            print(f"\nüìä Length Distribution:")
            print(f"‚îú‚îÄ Short (<50 chars):     {short:>6,} ({short/len(self.df)*100:>5.1f}%)")
            print(f"‚îú‚îÄ Medium (50-200):       {medium:>6,} ({medium/len(self.df)*100:>5.1f}%)")
            print(f"‚îú‚îÄ Long (200-500):        {long:>6,} ({long/len(self.df)*100:>5.1f}%)")
            print(f"‚îî‚îÄ Very Long (>500):      {very_long:>6,} ({very_long/len(self.df)*100:>5.1f}%)")
            
            text_stats[col] = stats_dict
        
        self.results['text_stats'] = text_stats
    
    def label_analysis(self):
        """Analyze label distribution and class balance"""
        logger.info("Analyzing label distribution...")
        
        if not self.label_col:
            logger.warning("No label column identified")
            return
        
        print(f"\n{'='*70}")
        print("5. LABEL DISTRIBUTION & CLASS BALANCE")
        print(f"{'='*70}")
        
        # Value counts
        label_counts = self.df[self.label_col].value_counts()
        label_pct = self.df[self.label_col].value_counts(normalize=True)
        
        print(f"\nüìä Class Distribution:")
        for label, count in label_counts.items():
            pct = label_pct[label] * 100
            bar_length = int(pct / 2)
            bar = '‚ñà' * bar_length
            print(f"‚îú‚îÄ {label:<15}: {count:>6,} ({pct:>5.1f}%) {bar}")
        
        # Class imbalance metrics
        if len(label_counts) == 2:
            imbalance_ratio = label_counts.max() / label_counts.min()
            print(f"\n‚öñÔ∏è  Class Imbalance Ratio: {imbalance_ratio:.2f}:1")
            
            if imbalance_ratio > 1.5:
                print(f"‚ö†Ô∏è  SIGNIFICANT IMBALANCE DETECTED!")
                print(f"   Recommendations:")
                print(f"   ‚îú‚îÄ Use stratified train-test split")
                print(f"   ‚îú‚îÄ Apply class weights in model")
                print(f"   ‚îú‚îÄ Consider SMOTE or oversampling")
                print(f"   ‚îî‚îÄ Use balanced accuracy metrics")
            else:
                print(f"‚úì Classes are reasonably balanced")
        
        self.results['label_distribution'] = label_counts.to_dict()
    
    def correlation_analysis(self):
        """Analyze correlations between features"""
        logger.info("Performing correlation analysis...")
        
        # Numeric columns only
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        
        if len(numeric_cols) < 2:
            logger.info("Not enough numeric columns for correlation analysis")
            return
        
        print(f"\n{'='*70}")
        print("6. FEATURE CORRELATION ANALYSIS")
        print(f"{'='*70}")
        
        # Calculate correlation matrix
        corr_matrix = self.df[numeric_cols].corr()
        
        # Find high correlations (excluding diagonal)
        high_corr = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                if abs(corr_matrix.iloc[i, j]) > 0.7:
                    high_corr.append({
                        'feature1': corr_matrix.columns[i],
                        'feature2': corr_matrix.columns[j],
                        'correlation': corr_matrix.iloc[i, j]
                    })
        
        if high_corr:
            print(f"\n‚ö†Ô∏è  High Correlations Found (|r| > 0.7):")
            for item in high_corr:
                print(f"‚îú‚îÄ {item['feature1']} ‚Üî {item['feature2']}: {item['correlation']:.3f}")
        else:
            print(f"\n‚úì No high correlations found")
        
        self.results['high_correlations'] = high_corr
    
    def generate_visualizations(self):
        """Generate all visualizations"""
        logger.info("Generating visualizations...")
        
        self._plot_label_distribution()
        self._plot_text_length_distributions()
        self._plot_text_statistics_by_label()
        self._plot_correlation_heatmap()
        self._plot_word_frequency()
        self._generate_word_clouds()
        
        if 'subreddit' in self.df.columns:
            self._plot_subreddit_analysis()
        
        logger.info("All visualizations generated")
    
    def _plot_label_distribution(self):
        """Visualize label distribution with multiple views"""
        if not self.label_col:
            return
        
        fig = plt.figure(figsize=(18, 6))
        
        # Subplot 1: Bar chart with counts
        plt.subplot(1, 3, 1)
        label_counts = self.df[self.label_col].value_counts()
        colors = [eda_config.STRESS_COLORS['high_stress'], 
                  eda_config.STRESS_COLORS['low_stress']][:len(label_counts)]
        
        bars = plt.bar(range(len(label_counts)), label_counts.values, 
                       color=colors, edgecolor='black', linewidth=1.5, alpha=0.8)
        plt.xticks(range(len(label_counts)), label_counts.index, rotation=0)
        plt.title('Label Distribution - Count', fontsize=14, fontweight='bold', pad=15)
        plt.xlabel('Class', fontsize=12, fontweight='bold')
        plt.ylabel('Count', fontsize=12, fontweight='bold')
        plt.grid(axis='y', alpha=0.3, linestyle='--')
        
        # Add value labels
        for i, (bar, count) in enumerate(zip(bars, label_counts.values)):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(label_counts)*0.01,
                    f'{count:,}\n({count/len(self.df)*100:.1f}%)', 
                    ha='center', va='bottom', fontsize=11, fontweight='bold')
        
        # Subplot 2: Pie chart
        plt.subplot(1, 3, 2)
        explode = [0.05] * len(label_counts)
        plt.pie(label_counts.values, labels=label_counts.index, autopct='%1.1f%%',
                colors=colors, explode=explode, startangle=90, 
                textprops={'fontsize': 12, 'fontweight': 'bold'},
                wedgeprops={'edgecolor': 'black', 'linewidth': 1.5})
        plt.title('Label Distribution - Percentage', fontsize=14, fontweight='bold', pad=15)
        
        # Subplot 3: Horizontal bar with percentage
        plt.subplot(1, 3, 3)
        label_pct = (label_counts / len(self.df)) * 100
        y_pos = np.arange(len(label_pct))
        bars = plt.barh(y_pos, label_pct.values, color=colors, 
                        edgecolor='black', linewidth=1.5, alpha=0.8)
        plt.yticks(y_pos, label_pct.index)
        plt.xlabel('Percentage (%)', fontsize=12, fontweight='bold')
        plt.title('Class Balance Analysis', fontsize=14, fontweight='bold', pad=15)
        plt.grid(axis='x', alpha=0.3, linestyle='--')
        
        # Add percentage labels
        for i, (bar, pct) in enumerate(zip(bars, label_pct.values)):
            plt.text(pct + 1, bar.get_y() + bar.get_height()/2,
                    f'{pct:.1f}%', va='center', fontsize=11, fontweight='bold')
        
        plt.tight_layout()
        self._save_figure('label_distribution.png')
        plt.show()
    
    def _plot_text_length_distributions(self):
        """Visualize text length distributions"""
        if not self.text_cols:
            return
        
        n_cols = len(self.text_cols)
        fig, axes = plt.subplots(2, n_cols, figsize=(7*n_cols, 10))
        
        if n_cols == 1:
            axes = axes.reshape(-1, 1)
        
        for idx, col in enumerate(self.text_cols):
            # Histogram
            ax1 = axes[0, idx]
            lengths = self.df[f'{col}_length']
            
            ax1.hist(lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
            ax1.axvline(lengths.mean(), color='red', linestyle='--', 
                       linewidth=2, label=f'Mean: {lengths.mean():.0f}')
            ax1.axvline(lengths.median(), color='green', linestyle='--', 
                       linewidth=2, label=f'Median: {lengths.median():.0f}')
            ax1.set_xlabel('Character Length', fontsize=11, fontweight='bold')
            ax1.set_ylabel('Frequency', fontsize=11, fontweight='bold')
            ax1.set_title(f'{col} - Length Distribution', fontsize=13, fontweight='bold')
            ax1.legend()
            ax1.grid(alpha=0.3)
            
            # Box plot by label
            ax2 = axes[1, idx]
            if self.label_col:
                self.df.boxplot(column=f'{col}_length', by=self.label_col, ax=ax2,
                               patch_artist=True)
                ax2.set_xlabel('Class', fontsize=11, fontweight='bold')
                ax2.set_ylabel('Character Length', fontsize=11, fontweight='bold')
                ax2.set_title(f'{col} - Length by Class', fontsize=13, fontweight='bold')
                plt.sca(ax2)
                plt.xticks(rotation=0)
            
        plt.tight_layout()
        self._save_figure('text_length_distributions.png')
        plt.show()
    
    def _plot_text_statistics_by_label(self):
        """Compare text statistics across labels"""
        if not self.text_cols or not self.label_col:
            return
        
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))
        
        metrics = ['length', 'word_count', 'sentence_count']
        titles = ['Average Character Length', 'Average Word Count', 'Average Sentence Count']
        
        for idx, (metric, title) in enumerate(zip(metrics, titles)):
            ax = axes[idx]
            
            data_to_plot = []
            labels = []
            
            for label in self.df[self.label_col].unique():
                label_data = self.df[self.df[self.label_col] == label]
                col_name = f'{self.text_cols[0]}_{metric}'
                if col_name in label_data.columns:
                    data_to_plot.append(label_data[col_name])
                    labels.append(label)
            
            if data_to_plot:
                bp = ax.boxplot(data_to_plot, labels=labels, patch_artist=True,
                               showmeans=True, meanline=True)
                
                # Color boxes
                colors = [eda_config.STRESS_COLORS['high_stress'], 
                         eda_config.STRESS_COLORS['low_stress']][:len(labels)]
                for patch, color in zip(bp['boxes'], colors):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.6)
                
                ax.set_title(title, fontsize=13, fontweight='bold')
                ax.set_ylabel(metric.replace('_', ' ').title(), fontsize=11)
                ax.grid(axis='y', alpha=0.3)
        
        plt.tight_layout()
        self._save_figure('text_statistics_by_label.png')
        plt.show()
    
    def _plot_correlation_heatmap(self):
        """Generate correlation heatmap"""
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        
        if len(numeric_cols) < 2:
            return
        
        plt.figure(figsize=(12, 10))
        corr_matrix = self.df[numeric_cols].corr()
        
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', 
                   cmap='coolwarm', center=0, square=True, linewidths=1,
                   cbar_kws={'label': 'Correlation Coefficient'})
        
        plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        self._save_figure('correlation_heatmap.png')
        plt.show()
    
    def _plot_word_frequency(self):
        """Plot most common words"""
        if not self.text_cols:
            return
        
        stop_words = set(stopwords.words('english'))
        
        fig, axes = plt.subplots(1, 2, figsize=(18, 6))
        
        for idx, label in enumerate(self.df[self.label_col].unique()[:2]):
            ax = axes[idx]
            
            # Get text for this label
            label_text = ' '.join(self.df[self.df[self.label_col] == label][self.text_cols[0]].astype(str))
            
            # Tokenize and count
            words = [word.lower() for word in word_tokenize(label_text) 
                    if word.isalpha() and word.lower() not in stop_words and len(word) > 3]
            
            word_freq = Counter(words).most_common(20)
            
            if word_freq:
                words, counts = zip(*word_freq)
                y_pos = np.arange(len(words))
                
                colors_grad = plt.cm.viridis(np.linspace(0.3, 0.9, len(words)))
                bars = ax.barh(y_pos, counts, color=colors_grad, edgecolor='black', linewidth=0.5)
                ax.set_yticks(y_pos)
                ax.set_yticklabels(words)
                ax.invert_yaxis()
                ax.set_xlabel('Frequency', fontsize=12, fontweight='bold')
                ax.set_title(f'Top 20 Words - {label}', fontsize=14, fontweight='bold')
                ax.grid(axis='x', alpha=0.3)
                
                # Add count labels
                for i, (bar, count) in enumerate(zip(bars, counts)):
                    ax.text(count + max(counts)*0.01, bar.get_y() + bar.get_height()/2,
                           f'{count:,}', va='center', fontsize=9)
        
        plt.tight_layout()
        self._save_figure('word_frequency.png')
        plt.show()
    
    def _generate_word_clouds(self):
        """Generate word clouds for each class"""
        if not self.text_cols or not self.label_col:
            return
        
        stop_words = set(stopwords.words('english')).union({
            'like', 'get', 'would', 'could', 'really', 'much', 'even',
            'also', 'think', 'feel', 'know', 'want', 'need', 'one', 'way'
        })
        
        labels = self.df[self.label_col].unique()
        n_labels = len(labels)
        
        fig, axes = plt.subplots(1, n_labels + 1, figsize=(8*(n_labels+1), 6))
        
        if n_labels + 1 == 1:
            axes = [axes]
        
        # Word cloud for each label
        for idx, label in enumerate(labels):
            ax = axes[idx]
            text = ' '.join(self.df[self.df[self.label_col] == label][self.text_cols[0]].astype(str))
            
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                 stopwords=stop_words, max_words=100,
                                 colormap='RdYlBu_r', relative_scaling=0.5).generate(text)
            
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'Word Cloud - {label}', fontsize=16, fontweight='bold')
            ax.axis('off')
        
        # Overall word cloud
        ax = axes[-1]
        all_text = ' '.join(self.df[self.text_cols[0]].astype(str))
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                             stopwords=stop_words, max_words=150,
                             colormap='viridis', relative_scaling=0.5).generate(all_text)
        
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title('Overall Word Cloud', fontsize=16, fontweight='bold')
        ax.axis('off')
        
        plt.tight_layout()
        self._save_figure('word_clouds.png')
        plt.show()
    
    def _plot_subreddit_analysis(self):
        """Analyze subreddit patterns"""
        fig, axes = plt.subplots(2, 2, figsize=(18, 12))
        
        # Top subreddits
        ax1 = axes[0, 0]
        top_subreddits = self.df['subreddit'].value_counts().head(15)
        colors = plt.cm.plasma(np.linspace(0, 0.8, len(top_subreddits)))
        bars = ax1.barh(range(len(top_subreddits)), top_subreddits.values, color=colors)
        ax1.set_yticks(range(len(top_subreddits)))
        ax1.set_yticklabels(top_subreddits.index)
        ax1.invert_yaxis()
        ax1.set_xlabel('Post Count', fontsize=12, fontweight='bold')
        ax1.set_title('Top 15 Most Active Subreddits', fontsize=14, fontweight='bold')
        ax1.grid(axis='x', alpha=0.3)
        
        for bar, count in zip(bars, top_subreddits.values):
            ax1.text(count + max(top_subreddits)*0.01, bar.get_y() + bar.get_height()/2,
                    f'{count:,}', va='center', fontsize=9)
        
        # Subreddit vs Label heatmap
        ax2 = axes[0, 1]
        top_10_subreddits = self.df['subreddit'].value_counts().head(10).index
        cross_tab = pd.crosstab(
            self.df[self.df['subreddit'].isin(top_10_subreddits)]['subreddit'],
            self.df[self.df['subreddit'].isin(top_10_subreddits)][self.label_col]
        )
        sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlOrRd', ax=ax2, cbar_kws={'label': 'Count'})
        ax2.set_title('Top 10 Subreddits vs Label', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Label', fontsize=12, fontweight='bold')
        ax2.set_ylabel('Subreddit', fontsize=12, fontweight='bold')
        
        # Stress rate by subreddit
        ax3 = axes[1, 0]
        stress_rate = []
        for subreddit in top_10_subreddits:
            sub_data = self.df[self.df['subreddit'] == subreddit]
            labels_list = sub_data[self.label_col].unique()
            if len(labels_list) == 2:
                stress_label = max(labels_list) if all(isinstance(l, (int, float)) for l in labels_list) else labels_list[0]
                rate = (sub_data[self.label_col] == stress_label).mean() * 100
                stress_rate.append(rate)
        
        if stress_rate:
            colors_stress = plt.cm.RdYlGn_r(np.array(stress_rate) / 100)
            bars = ax3.bar(range(len(stress_rate)), stress_rate, color=colors_stress, edgecolor='black')
            ax3.set_xticks(range(len(stress_rate)))
            ax3.set_xticklabels(top_10_subreddits, rotation=45, ha='right')
            ax3.set_ylabel('Stress Rate (%)', fontsize=12, fontweight='bold')
            ax3.set_title('Stress Rate by Top Subreddits', fontsize=14, fontweight='bold')
            ax3.grid(axis='y', alpha=0.3)
            ax3.axhline(y=50, color='black', linestyle='--', linewidth=1, alpha=0.5)
            
            for bar, rate in zip(bars, stress_rate):
                ax3.text(bar.get_x() + bar.get_width()/2, rate + 2,
                        f'{rate:.1f}%', ha='center', va='bottom', fontsize=9)
        
        # Subreddit distribution pie
        ax4 = axes[1, 1]
        subreddit_dist = self.df['subreddit'].value_counts().head(10)
        other = len(self.df) - subreddit_dist.sum()
        subreddit_dist['Others'] = other
        
        colors_pie = plt.cm.tab20(np.linspace(0, 1, len(subreddit_dist)))
        ax4.pie(subreddit_dist.values, labels=subreddit_dist.index, autopct='%1.1f%%',
               colors=colors_pie, startangle=90, textprops={'fontsize': 9})
        ax4.set_title('Subreddit Distribution', fontsize=14, fontweight='bold')
        
        plt.tight_layout()
        self._save_figure('subreddit_analysis.png')
        plt.show()
    
    def _save_figure(self, filename: str):
        """Save figure to visualizations directory"""
        if eda_config.SAVE_FIGURES:
            filepath = self.save_dir / filename
            plt.savefig(filepath, dpi=eda_config.FIGURE_DPI, 
                       bbox_inches='tight', format=eda_config.FIG_FORMAT)
            logger.info(f"Saved visualization: {filename}")
    
    def mental_health_insights(self):
        """Generate mental health specific insights"""
        logger.info("Generating mental health insights...")
        
        print(f"\n{'='*70}")
        print("7. MENTAL HEALTH SPECIFIC INSIGHTS")
        print(f"{'='*70}")
        
        insights = []
        
        # Class balance assessment
        if self.label_col:
            label_counts = self.df[self.label_col].value_counts()
            
            if len(label_counts) == 2:
                ratio = label_counts.max() / label_counts.min()
                if ratio > 2:
                    insights.append({
                        'type': 'warning',
                        'message': f'Severe class imbalance ({ratio:.1f}:1) - Consider resampling techniques'
                    })
                elif ratio > 1.5:
                    insights.append({
                        'type': 'caution',
                        'message': f'Moderate class imbalance ({ratio:.1f}:1) - Use stratified sampling'
                    })
                else:
                    insights.append({
                        'type': 'success',
                        'message': 'Classes are well balanced for training'
                    })
        
        # Text quality assessment
        if self.text_cols:
            for col in self.text_cols:
                avg_len = self.df[f'{col}_length'].mean()
                if avg_len < 50:
                    insights.append({
                        'type': 'warning',
                        'message': f'Short average text length in {col} ({avg_len:.0f} chars) - May need data augmentation'
                    })
                elif avg_len > 1000:
                    insights.append({
                        'type': 'info',
                        'message': f'Long average text length in {col} ({avg_len:.0f} chars) - Consider truncation strategies'
                    })
        
        # Subreddit diversity
        if 'subreddit' in self.df.columns:
            n_subreddits = self.df['subreddit'].nunique()
            diversity = n_subreddits / len(self.df)
            
            if diversity > 0.5:
                insights.append({
                    'type': 'info',
                    'message': f'High subreddit diversity ({n_subreddits} unique) - Good data variety'
                })
            elif diversity < 0.01:
                insights.append({
                    'type': 'warning',
                    'message': f'Low subreddit diversity - Data may be biased toward specific communities'
                })
        
        # Print insights
        for insight in insights:
            icon = {'warning': '‚ö†Ô∏è', 'caution': '‚ö°', 'success': '‚úì', 'info': '‚ÑπÔ∏è'}
            print(f"{icon.get(insight['type'], '‚Ä¢')} {insight['message']}")
        
        self.results['insights'] = insights
        
        # Statistical tests
        if self.label_col and self.text_cols:
            print(f"\n{'='*70}")
            print("8. STATISTICAL SIGNIFICANCE TESTS")
            print(f"{'='*70}")
            
            for col in self.text_cols:
                length_col = f'{col}_length'
                if length_col in self.df.columns:
                    groups = [group[length_col].values 
                             for name, group in self.df.groupby(self.label_col)]
                    
                    if len(groups) == 2:
                        # T-test
                        t_stat, p_value = stats.ttest_ind(groups[0], groups[1])
                        print(f"\nüìä T-test for {col} length between classes:")
                        print(f"‚îú‚îÄ T-statistic: {t_stat:.4f}")
                        print(f"‚îú‚îÄ P-value: {p_value:.6f}")
                        
                        if p_value < 0.05:
                            print(f"‚îî‚îÄ ‚úì Significant difference in text length between classes (p < 0.05)")
                        else:
                            print(f"‚îî‚îÄ ‚úó No significant difference in text length (p >= 0.05)")
    
    def save_eda_results(self):
        """Save all EDA results to JSON"""
        logger.info("Saving EDA results...")
        
        # Add timestamp
        self.results['timestamp'] = datetime.now().isoformat()
        self.results['dataset_shape'] = self.df.shape
        
        # Save to JSON
        output_path = config.REPORTS_DIR / 'eda_results.json'
        with open(output_path, 'w') as f:
            json.dump(self.results, f, indent=2, default=str)
        
        logger.info(f"EDA results saved to {output_path}")
        
        # Save processed dataframe with calculated features
        processed_path = config.PREPROCESSOR_DIR / 'eda_processed_data.pkl'
        self.df.to_pickle(processed_path)
        logger.info(f"Processed data saved to {processed_path}")
        
        return self.results
    
    def run_complete_eda(self):
        """Run complete EDA pipeline"""
        logger.info("Starting complete EDA pipeline...")
        
        try:
            self.basic_statistics()
            self.text_analysis()
            self.label_analysis()
            self.correlation_analysis()
            self.generate_visualizations()
            self.mental_health_insights()
            results = self.save_eda_results()
            
            print(f"\n{'='*70}")
            print("‚úÖ EDA COMPLETED SUCCESSFULLY!")
            print(f"{'='*70}")
            print(f"‚îú‚îÄ Visualizations saved to: {self.save_dir}")
            print(f"‚îú‚îÄ Results saved to: {config.REPORTS_DIR / 'eda_results.json'}")
            print(f"‚îî‚îÄ Processed data saved to: {config.PREPROCESSOR_DIR / 'eda_processed_data.pkl'}")
            print(f"\nüöÄ Ready for text preprocessing and feature engineering!")
            
            return results
            
        except Exception as e:
            logger.error(f"Error during EDA: {str(e)}", exc_info=True)
            raise

# ===============================
# Main Execution
# ===============================
def main():
    """Main EDA execution"""
    try:
        logger.info("Starting Cell 2: Enhanced EDA")
        
        # Initialize EDA
        eda = StressEDA(stress, config, config.VISUALIZATIONS_DIR)
        
        # Run complete EDA
        results = eda.run_complete_eda()
        
        logger.info("‚úì Cell 2 completed successfully!")
        
        return eda.df, results
        
    except Exception as e:
        logger.error(f"Error in EDA main execution: {str(e)}", exc_info=True)
        raise

# Execute
if __name__ == "__main__":
    stress_processed, eda_results = main()
    print("\n" + "="*70)
    print("‚úÖ EDA COMPLETE! Ready for Cell 3 (Text Preprocessing)...")
    print("="*70)
else:
    stress_processed, eda_results = main()

2025-11-11 16:44:38,004 - __main__ - INFO - Starting Cell 2: Enhanced EDA
2025-11-11 16:44:38,015 - __main__ - INFO - EDA initialized
2025-11-11 16:44:38,016 - __main__ - INFO - Starting complete EDA pipeline...
2025-11-11 16:44:38,016 - __main__ - INFO - Generating basic statistics...

MENTAL STRESS DETECTION - EXPLORATORY DATA ANALYSIS

1. DATASET OVERVIEW
‚îú‚îÄ Shape: 2,838 rows √ó 116 columns
‚îú‚îÄ Memory Usage: 4.76 MB
‚îú‚îÄ Duplicates: 0 (0.00%)
‚îî‚îÄ Missing Values: 0

2. COLUMN INFORMATION
‚îú‚îÄ subreddit            | Type: object     | Unique: 10     | Missing: 0
‚îú‚îÄ post_id              | Type: object     | Unique: 2343   | Missing: 0
‚îú‚îÄ sentence_range       | Type: object     | Unique: 173    | Missing: 0
‚îú‚îÄ text                 | Type: object     | Unique: 2820   | Missing: 0
‚îú‚îÄ id                   | Type: int64      | Unique: 2838   | Missing: 0
‚îú‚îÄ label                | Type: int64      | Unique: 2      | Missing: 0
‚îú‚îÄ confidence           | T

In [3]:
"""
Mental Stress Detection System - Cell 3: FIXED Text Preprocessing
Using correct 'text' column instead of 'post_id'
"""

import re
import string
import time
from typing import List, Optional, Dict, Tuple
import pandas as pd
import numpy as np
from dataclasses import dataclass, asdict
import json
from datetime import datetime

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Try importing spaCy (optional)
try:
    import spacy
    SPACY_AVAILABLE = True
    try:
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tok2vec", "attribute_ruler"])
        print("‚úì spaCy model loaded successfully")
    except OSError:
        print("‚ö†Ô∏è  spaCy model 'en_core_web_sm' not found. Using NLTK as fallback.")
        nlp = None
        SPACY_AVAILABLE = False
except ImportError:
    print("‚ö†Ô∏è  spaCy not installed. Using NLTK for lemmatization.")
    SPACY_AVAILABLE = False
    nlp = None

# Download required NLTK data
required_nltk = ["stopwords", "wordnet", "punkt", "omw-1.4", "vader_lexicon", "averaged_perceptron_tagger"]
for resource in required_nltk:
    try:
        nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else 
                      f'corpora/{resource}' if resource in ['stopwords', 'wordnet', 'omw-1.4'] else
                      f'sentiment/{resource}')
    except LookupError:
        print(f"Downloading NLTK resource: {resource}")
        nltk.download(resource, quiet=True)

# ===============================
# Preprocessing Configuration
# ===============================
@dataclass
class PreprocessingConfig:
    """Configuration for text preprocessing"""
    
    # Cleaning parameters
    remove_urls: bool = True
    remove_mentions: bool = True
    remove_hashtags: bool = True
    remove_digits: bool = False  # KEEP digits
    remove_punctuation: bool = False  # Keep some for emotional context
    lowercase: bool = True
    
    # Processing parameters
    remove_stopwords: bool = False  # DON'T remove stopwords initially
    apply_lemmatization: bool = False  # DON'T lemmatize initially
    preserve_emphasis: bool = True  # Keep emotional emphasis (caps)
    min_word_length: int = 2
    max_word_length: int = 50
    
    # Batch processing
    batch_size: int = 1000
    show_progress: bool = True
    
    def to_dict(self) -> Dict:
        return asdict(self)

preprocess_config = PreprocessingConfig()

# ===============================
# Mental Health Specific Stopwords
# ===============================
class MentalHealthStopwords:
    """Curated stopwords for mental health text analysis"""
    
    def __init__(self):
        # Base stopwords
        self.base_stopwords = set(stopwords.words('english'))
        
        # Additional social media stopwords
        self.social_media_words = {
            'reddit', 'post', 'comment', 'subreddit', 'thread', 'op', 'edit', 'update',
            'deleted', 'removed', 'http', 'https', 'www'
        }
        
        # Mental health keywords to PRESERVE (never remove these)
        self.preserve_keywords = {
            # Emotions
            'stress', 'stressed', 'stressful', 'anxiety', 'anxious', 'panic',
            'depression', 'depressed', 'sad', 'happy', 'angry', 'fear', 'worried',
            'overwhelmed', 'frustrated', 'hopeless', 'helpless', 'lonely', 'isolated',
            'exhausted', 'tired', 'fatigue', 'burnt', 'burnout',
            
            # Mental health terms
            'therapy', 'therapist', 'counseling', 'counselor', 'medication',
            'antidepressant', 'psychiatrist', 'psychologist', 'diagnosis',
            
            # Life domains
            'work', 'job', 'career', 'school', 'college', 'university',
            'family', 'relationship', 'marriage', 'divorce', 'breakup',
            'health', 'illness', 'disease', 'pain', 'sleep', 'insomnia',
            'money', 'financial', 'debt', 'unemployed',
            
            # Coping/Support
            'help', 'support', 'coping', 'therapy', 'treatment', 'recovery',
            'improve', 'better', 'worse', 'difficult', 'hard', 'struggle'
        }
        
        # Only remove social media words
        self.all_stopwords = self.social_media_words
    
    def is_stopword(self, word: str) -> bool:
        """Check if word is a stopword"""
        return word.lower() in self.all_stopwords

mental_health_stopwords = MentalHealthStopwords()

# ===============================
# Text Cleaning Functions
# ===============================
class TextCleaner:
    """Comprehensive text cleaning for mental health analysis"""
    
    def __init__(self, config: PreprocessingConfig):
        self.config = config
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()
        
        # Compile regex patterns for efficiency
        self.url_pattern = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )
        self.mention_pattern = re.compile(r'@[A-Za-z0-9_]+')
        self.hashtag_pattern = re.compile(r'#[A-Za-z0-9_]+')
        self.reddit_user_pattern = re.compile(r'/u/[A-Za-z0-9_-]+')
        self.reddit_sub_pattern = re.compile(r'/r/[A-Za-z0-9_-]+')
        self.deleted_pattern = re.compile(r'\[deleted\]|\[removed\]')
        
        # Contractions mapping
        self.contractions = {
            "won't": "will not", "can't": "cannot", "n't": " not",
            "'re": " are", "'ve": " have", "'ll": " will", "'d": " would",
            "'m": " am", "it's": "it is", "that's": "that is",
            "what's": "what is", "where's": "where is", "who's": "who is",
            "there's": "there is", "here's": "here is"
        }
    
    def clean_social_media(self, text: str) -> str:
        """Remove social media specific elements"""
        if pd.isna(text) or text == '':
            return ''
        
        text = str(text)
        
        # Remove URLs
        if self.config.remove_urls:
            text = self.url_pattern.sub('', text)
            text = re.sub(r'www\.[A-Za-z0-9.-]+', '', text)
        
        # Remove Reddit patterns
        text = self.reddit_user_pattern.sub('', text)
        text = self.reddit_sub_pattern.sub('', text)
        text = self.deleted_pattern.sub('', text)
        
        # Remove mentions
        if self.config.remove_mentions:
            text = self.mention_pattern.sub('', text)
        
        # Remove hashtags
        if self.config.remove_hashtags:
            text = self.hashtag_pattern.sub('', text)
        
        return text
    
    def expand_contractions(self, text: str) -> str:
        """Expand contractions for better understanding"""
        for contraction, expansion in self.contractions.items():
            text = re.sub(contraction, expansion, text, flags=re.IGNORECASE)
        return text
    
    def normalize_text(self, text: str) -> str:
        """Normalize text (lowercase, punctuation, etc.)"""
        # Lowercase
        if self.config.lowercase:
            text = text.lower()
        
        # Normalize excessive punctuation
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{2,}', '.', text)
        
        # Keep emotionally relevant punctuation
        text = re.sub(r'[^\w\s!?.,-]', ' ', text)
        
        # Remove digits if configured
        if self.config.remove_digits:
            text = re.sub(r'\b\d+\b', '', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def filter_words(self, text: str) -> str:
        """Filter words based on criteria - MINIMAL FILTERING"""
        words = text.split()
        filtered = []
        
        for word in words:
            # Check word length
            if len(word) < self.config.min_word_length:
                continue
            if len(word) > self.config.max_word_length:
                continue
            
            # Only remove social media words
            if self.config.remove_stopwords:
                if mental_health_stopwords.is_stopword(word):
                    continue
            
            filtered.append(word)
        
        return ' '.join(filtered)
    
    def clean(self, text: str) -> str:
        """Complete cleaning pipeline - MINIMAL PROCESSING"""
        if pd.isna(text) or text == '':
            return ''
        
        # Step 1: Clean social media elements
        text = self.clean_social_media(text)
        
        # Step 2: Expand contractions
        text = self.expand_contractions(text)
        
        # Step 3: Normalize
        text = self.normalize_text(text)
        
        # Step 4: Filter words (minimal)
        text = self.filter_words(text)
        
        # Final cleanup
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text if text else ''

# ===============================
# Batch Processing with Progress
# ===============================
class TextPreprocessor:
    """Batch text preprocessing with progress tracking"""
    
    def __init__(self, config: PreprocessingConfig):
        self.config = config
        self.cleaner = TextCleaner(config)
        self.stats = {}
    
    def process_batch(self, texts: pd.Series) -> pd.Series:
        """Process texts in batches with progress"""
        print(f"Processing {len(texts)} texts in batches of {self.config.batch_size}")
        
        processed = []
        total_batches = (len(texts) - 1) // self.config.batch_size + 1
        start_time = time.time()
        
        for i in range(0, len(texts), self.config.batch_size):
            batch = texts.iloc[i:i+self.config.batch_size]
            batch_processed = batch.apply(self.cleaner.clean)
            processed.extend(batch_processed.tolist())
            
            # Progress tracking
            batch_num = i // self.config.batch_size + 1
            
            if self.config.show_progress and batch_num % 5 == 0:
                elapsed = time.time() - start_time
                avg_time = elapsed / batch_num
                eta = (total_batches - batch_num) * avg_time
                
                print(f"Batch {batch_num}/{total_batches} - Elapsed: {elapsed:.1f}s - ETA: {eta:.1f}s")
        
        total_time = time.time() - start_time
        print(f"‚úì Processing complete! Total time: {total_time:.1f}s")
        
        return pd.Series(processed, index=texts.index)
    
    def compute_statistics(self, original: pd.Series, cleaned: pd.Series) -> Dict:
        """Compute preprocessing statistics"""
        
        orig_lengths = original.astype(str).str.len()
        clean_lengths = cleaned.astype(str).str.len()
        
        orig_words = original.astype(str).str.split().str.len()
        clean_words = cleaned.astype(str).str.split().str.len()
        
        empty_count = (clean_lengths < 10).sum()
        
        stats = {
            'original_stats': {
                'avg_length': float(orig_lengths.mean()),
                'avg_words': float(orig_words.mean()),
                'max_length': int(orig_lengths.max()),
                'min_length': int(orig_lengths.min())
            },
            'cleaned_stats': {
                'avg_length': float(clean_lengths.mean()),
                'avg_words': float(clean_words.mean()),
                'max_length': int(clean_lengths.max()),
                'min_length': int(clean_lengths.min())
            },
            'reduction': {
                'length_reduction_pct': float((1 - clean_lengths.mean() / orig_lengths.mean()) * 100),
                'word_reduction_pct': float((1 - clean_words.mean() / orig_words.mean()) * 100),
                'empty_texts': int(empty_count),
                'empty_texts_pct': float(empty_count / len(cleaned) * 100)
            },
            'total_texts_processed': len(original)
        }
        
        return stats
    
    def display_samples(self, original: pd.Series, cleaned: pd.Series, n: int = 3):
        """Display sample comparisons"""
        print(f"\n{'='*70}")
        print("SAMPLE TEXT COMPARISONS")
        print(f"{'='*70}")
        
        for i, (orig, clean) in enumerate(zip(original.head(n), cleaned.head(n)), 1):
            print(f"\n{i}. ORIGINAL ({len(str(orig))} chars, {len(str(orig).split())} words):")
            print(f"   {str(orig)[:150]}...")
            print(f"\n   CLEANED ({len(str(clean))} chars, {len(str(clean).split())} words):")
            print(f"   {str(clean)[:150]}...")
            print(f"   {'-'*70}")

# ===============================
# Main Preprocessing Execution
# ===============================
def main():
    """Execute preprocessing pipeline"""
    try:
        print("\n" + "="*70)
        print("FIXED TEXT PREPROCESSING - USING 'text' COLUMN")
        print("="*70)
        
        # üî• CRITICAL FIX: Use 'text' column, NOT 'post_id'
        text_col = 'text'
        
        if text_col not in stress.columns:
            print(f"‚ùå ERROR: Column '{text_col}' not found!")
            print(f"Available columns: {list(stress.columns)}")
            raise ValueError(f"Column '{text_col}' not found in dataset")
        
        print(f"\n‚úì Using column: '{text_col}'")
        print(f"‚úì Total texts to process: {len(stress):,}")
        
        # Initialize preprocessor
        preprocessor = TextPreprocessor(preprocess_config)
        
        # Show original samples
        print(f"\n{'='*70}")
        print("ORIGINAL TEXT SAMPLES")
        print(f"{'='*70}")
        for i, text in enumerate(stress[text_col].dropna().head(3), 1):
            print(f"{i}. {str(text)[:120]}...")
        
        # Process texts
        print(f"\n{'='*70}")
        print("PROCESSING TEXTS...")
        print(f"{'='*70}")
        
        # Create a copy to avoid modifying original
        stress_processed = stress.copy()
        stress_processed['clean_text'] = preprocessor.process_batch(stress[text_col])
        
        # Compute statistics
        stats = preprocessor.compute_statistics(stress[text_col], stress_processed['clean_text'])
        
        # Display samples
        preprocessor.display_samples(stress[text_col], stress_processed['clean_text'], n=3)
        
        # Display statistics
        print(f"\n{'='*70}")
        print("PREPROCESSING STATISTICS")
        print(f"{'='*70}")
        print(f"\nüìä ORIGINAL TEXT:")
        print(f"‚îú‚îÄ Avg length: {stats['original_stats']['avg_length']:.1f} characters")
        print(f"‚îú‚îÄ Avg words:  {stats['original_stats']['avg_words']:.1f} words")
        print(f"‚îî‚îÄ Range:      {stats['original_stats']['min_length']} - {stats['original_stats']['max_length']} chars")
        
        print(f"\nüìä CLEANED TEXT:")
        print(f"‚îú‚îÄ Avg length: {stats['cleaned_stats']['avg_length']:.1f} characters")
        print(f"‚îú‚îÄ Avg words:  {stats['cleaned_stats']['avg_words']:.1f} words")
        print(f"‚îî‚îÄ Range:      {stats['cleaned_stats']['min_length']} - {stats['cleaned_stats']['max_length']} chars")
        
        print(f"\nüìâ REDUCTION:")
        print(f"‚îú‚îÄ Length reduction: {stats['reduction']['length_reduction_pct']:.1f}%")
        print(f"‚îú‚îÄ Word reduction:   {stats['reduction']['word_reduction_pct']:.1f}%")
        print(f"‚îî‚îÄ Empty/short texts: {stats['reduction']['empty_texts']} ({stats['reduction']['empty_texts_pct']:.2f}%)")
        
        # Save configuration and stats
        output_data = {
            'config': preprocess_config.to_dict(),
            'statistics': stats,
            'stopwords_count': len(mental_health_stopwords.all_stopwords),
            'preserved_keywords_count': len(mental_health_stopwords.preserve_keywords),
            'lemmatization_tool': 'spacy' if SPACY_AVAILABLE and nlp else 'nltk',
            'text_column': text_col,
            'timestamp': datetime.now().isoformat()
        }
        
        output_path = Path('preprocessors') / 'text_preprocessing_results.json'
        output_path.parent.mkdir(exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        print(f"\n‚úì Preprocessing results saved to {output_path}")
        
        # Save processed data
        processed_data_path = Path('preprocessors') / 'preprocessed_data.pkl'
        stress_processed.to_pickle(processed_data_path)
        print(f"‚úì Processed data saved to {processed_data_path}")
        
        print(f"\n{'='*70}")
        print("‚úÖ TEXT PREPROCESSING COMPLETED SUCCESSFULLY!")
        print(f"{'='*70}")
        print(f"‚îú‚îÄ Column used: '{text_col}'")
        print(f"‚îú‚îÄ New column 'clean_text' added to dataset")
        print(f"‚îú‚îÄ Average words preserved: {stats['cleaned_stats']['avg_words']:.1f}")
        print(f"‚îî‚îÄ Dataset shape: {stress_processed.shape}")
        print(f"\nüöÄ Ready for feature engineering (Cells 4A, 4B)!")
        
        return stress_processed, stats
        
    except Exception as e:
        print(f"\n‚ùå ERROR in preprocessing: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

# Execute
if __name__ == "__main__":
    stress_processed, preprocess_stats = main()
    print(f"\nFinal dataset shape: {stress_processed.shape}")
else:
    stress_processed, preprocess_stats = main()

‚úì spaCy model loaded successfully
Downloading NLTK resource: wordnet
Downloading NLTK resource: omw-1.4
Downloading NLTK resource: vader_lexicon
Downloading NLTK resource: averaged_perceptron_tagger

FIXED TEXT PREPROCESSING - USING 'text' COLUMN

‚úì Using column: 'text'
‚úì Total texts to process: 2,838

ORIGINAL TEXT SAMPLES
1. He said he had not felt that way before, suggeted I go rest and so ..TRIGGER AHEAD IF YOUI'RE A HYPOCONDRIAC LIKE ME: i ...
2. Hey there r/assistance, Not sure if this is the right place to post this.. but here goes =) I'm currently a student inte...
3. My mom then hit me with the newspaper and it shocked me that she would do this, she knows I don't like play hitting, sma...

PROCESSING TEXTS...
Processing 2838 texts in batches of 1000
‚úì Processing complete! Total time: 0.3s

SAMPLE TEXT COMPARISONS

1. ORIGINAL (571 chars, 113 words):
   He said he had not felt that way before, suggeted I go rest and so ..TRIGGER AHEAD IF YOUI'RE A HYPOCONDRIAC LIKE ME: 

In [4]:
# ===============================
# Cell 4A: Basic & Advanced Vectorizers
# ===============================

def create_advanced_vectorizers():
    """Create comprehensive vectorization strategies optimized for mental health text"""
    
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    
    vectorizers = {
        # =============================
        # TF-IDF Variants (Optimized)
        # =============================
        
        'tfidf_unigram': TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.95,
            stop_words='english',
            sublinear_tf=False,  # CRITICAL: No negative values!
            use_idf=True,
            smooth_idf=True,
            norm='l2'
        ),
        
        'tfidf_bigram': TfidfVectorizer(
            max_features=15000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            sublinear_tf=False,  # CRITICAL: No negative values!
            use_idf=True,
            smooth_idf=True,
            norm='l2'
        ),
        
        'tfidf_trigram': TfidfVectorizer(
            max_features=20000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            sublinear_tf=False,  # CRITICAL: No negative values!
            use_idf=True,
            smooth_idf=True,
            norm='l2'
        ),
        
        # =============================
        # Count Vectorizers
        # =============================
        
        'count_unigram': CountVectorizer(
            max_features=8000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.95,
            stop_words='english',
            binary=False
        ),
        
        'count_bigram': CountVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            binary=False
        ),
        
        'count_trigram': CountVectorizer(
            max_features=15000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            binary=False
        ),
        
        # =============================
        # Character-level
        # =============================
        
        'tfidf_char_trigram': TfidfVectorizer(
            max_features=8000,
            analyzer='char',
            ngram_range=(3, 3),
            min_df=3,
            max_df=0.95,
            sublinear_tf=False
        ),
        
        'tfidf_char_4gram': TfidfVectorizer(
            max_features=10000,
            analyzer='char',
            ngram_range=(3, 4),
            min_df=3,
            max_df=0.95,
            sublinear_tf=False
        ),
        
        'tfidf_char_5gram': TfidfVectorizer(
            max_features=12000,
            analyzer='char',
            ngram_range=(3, 5),
            min_df=3,
            max_df=0.95,
            sublinear_tf=False
        ),
    }
    
    return vectorizers

# Create and display
vectorizers_4a = create_advanced_vectorizers()
print(f"Created {len(vectorizers_4a)} basic & advanced vectorizers:")
for name in vectorizers_4a.keys():
    print(f"  - {name}")

# Save vectorizers
import joblib
joblib.dump(vectorizers_4a, 'preprocessors/vectorizers_4a.pkl')
print(f"\n‚úì Saved to: preprocessors/vectorizers_4a.pkl")


Created 9 basic & advanced vectorizers:
  - tfidf_unigram
  - tfidf_bigram
  - tfidf_trigram
  - count_unigram
  - count_bigram
  - count_trigram
  - tfidf_char_trigram
  - tfidf_char_4gram
  - tfidf_char_5gram

‚úì Saved to: preprocessors/vectorizers_4a.pkl


In [5]:
# ===============================
# Cell 4B: Novel & Custom Vectorizers
# ===============================

def create_novel_vectorizers():
    """Create novel and baseline vectorization approaches"""
    
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.pipeline import FeatureUnion
    
    novel_vectorizers = {
        # =============================
        # Binary Bag of Words
        # =============================
        
        'bow_binary': CountVectorizer(
            max_features=8000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            binary=True  # Binary presence
        ),
        
        'bow_freq': CountVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            binary=False
        ),
        
        # =============================
        # TF-IDF Variants
        # =============================
        
        'tfidf_l1_norm': TfidfVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            norm='l1',  # L1 normalization
            sublinear_tf=False,
            use_idf=True,
            smooth_idf=True
        ),
        
        'tfidf_no_norm': TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english',
            norm=None,  # No normalization
            sublinear_tf=False,
            use_idf=True
        ),
        
        # =============================
        # Hybrid Approaches
        # =============================
        
        'hybrid_char_word': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                max_features=8000,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
                stop_words='english',
                sublinear_tf=False
            )),
            ('char_tfidf', TfidfVectorizer(
                max_features=5000,
                analyzer='char',
                ngram_range=(3, 5),
                min_df=3,
                max_df=0.9,
                sublinear_tf=False
            ))
        ]),
        
        # =============================
        # Mental Health Focused
        # =============================
        
        'mental_health_focused': TfidfVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            # Minimal stopwords to preserve mental health terms
            stop_words=['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'],
            token_pattern=r'\b[a-z]{2,}\b',
            norm='l2',
            sublinear_tf=False,
            use_idf=True,
            smooth_idf=True
        ),
        
        # =============================
        # Weighted TF-IDF
        # =============================
        
        'weighted_tfidf': TfidfVectorizer(
            max_features=15000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.85,
            stop_words='english',
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=False
        ),
        
        # =============================
        # Ensemble Vectorizers
        # =============================
        
        'ensemble_tfidf': FeatureUnion([
            ('unigram', TfidfVectorizer(
                max_features=6000,
                ngram_range=(1, 1),
                min_df=3,
                max_df=0.9,
                stop_words='english',
                sublinear_tf=False
            )),
            ('bigram', TfidfVectorizer(
                max_features=6000,
                ngram_range=(2, 2),
                min_df=2,
                max_df=0.9,
                stop_words='english',
                sublinear_tf=False
            )),
            ('trigram', TfidfVectorizer(
                max_features=3000,
                ngram_range=(3, 3),
                min_df=2,
                max_df=0.95,
                stop_words='english',
                sublinear_tf=False
            ))
        ]),
        
        'count_tfidf_ensemble': FeatureUnion([
            ('count', CountVectorizer(
                max_features=8000,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
                stop_words='english'
            )),
            ('tfidf', TfidfVectorizer(
                max_features=8000,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
                stop_words='english',
                sublinear_tf=False
            ))
        ]),
        
        # =============================
        # Custom Stress Vectorizer (‚≠ê HIGH PERFORMANCE)
        # =============================
        
        'custom_stress': FeatureUnion([
            # Main TF-IDF features
            ('tfidf_main', TfidfVectorizer(
                max_features=10000,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
                stop_words='english',
                sublinear_tf=False,
                norm='l2'
            )),
            # Count features
            ('count_features', CountVectorizer(
                max_features=8000,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
                stop_words='english'
            )),
            # Character patterns
            ('char_patterns', TfidfVectorizer(
                max_features=4000,
                analyzer='char',
                ngram_range=(3, 5),
                min_df=3,
                max_df=0.95,
                sublinear_tf=False
            ))
        ]),
    }
    
    return novel_vectorizers

# Create and display
vectorizers_4b = create_novel_vectorizers()
print(f"\nAdded {len(vectorizers_4b)} novel vectorizers:")
for name in vectorizers_4b.keys():
    print(f"  - {name}")

# Combine with previous vectorizers
import joblib
try:
    vectorizers_4a = joblib.load('preprocessors/vectorizers_4a.pkl')
    all_vectorizers = {**vectorizers_4a, **vectorizers_4b}
    print(f"\nTotal vectorizers now: {len(all_vectorizers)}")
except:
    all_vectorizers = vectorizers_4b
    print(f"\nTotal vectorizers: {len(vectorizers_4b)}")

# Save
joblib.dump(vectorizers_4b, 'preprocessors/vectorizers_4b.pkl')
joblib.dump(all_vectorizers, 'preprocessors/all_vectorizers.pkl')
print(f"\n‚úì Saved to: preprocessors/vectorizers_4b.pkl")
print(f"‚úì Saved all to: preprocessors/all_vectorizers.pkl")


Added 10 novel vectorizers:
  - bow_binary
  - bow_freq
  - tfidf_l1_norm
  - tfidf_no_norm
  - hybrid_char_word
  - mental_health_focused
  - weighted_tfidf
  - ensemble_tfidf
  - count_tfidf_ensemble
  - custom_stress

Total vectorizers now: 19

‚úì Saved to: preprocessors/vectorizers_4b.pkl
‚úì Saved all to: preprocessors/all_vectorizers.pkl


In [6]:
"""
=============================================================================
CELL 4C: CUSTOM STRESS VECTORIZERS - Mental Health Optimized
=============================================================================
Domain-specific vectorizers optimized for mental health and stress detection
"""

import numpy as np
import pandas as pd
import re
from collections import Counter
from typing import Dict, List, Tuple
import json
from datetime import datetime
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.base import BaseEstimator, TransformerMixin

print("\n" + "="*70)
print("CELL 4C: CUSTOM STRESS VECTORIZERS")
print("="*70)

# =============================================================================
# Mental Health Vocabulary
# =============================================================================

class MentalHealthVocabulary:
    """Curated vocabulary for mental health and stress detection"""
    
    def __init__(self):
        # Stress and anxiety keywords
        self.stress_keywords = {
            'stress', 'stressed', 'stressful', 'anxiety', 'anxious', 'panic',
            'worry', 'worried', 'nervous', 'overwhelm', 'overwhelmed', 'pressure',
            'tension', 'tense', 'strain', 'burnout', 'exhausted', 'exhaustion'
        }
        
        # Negative emotions
        self.negative_emotions = {
            'sad', 'sadness', 'depressed', 'depression', 'hopeless', 'despair',
            'miserable', 'unhappy', 'upset', 'angry', 'frustrated', 'irritated',
            'afraid', 'scared', 'fear', 'terrible', 'awful', 'bad', 'worse',
            'worst', 'hate', 'crying', 'cry', 'suicide', 'suicidal', 'death'
        }
        
        # Positive emotions
        self.positive_emotions = {
            'happy', 'happiness', 'joy', 'excited', 'great', 'good', 'better',
            'best', 'wonderful', 'amazing', 'excellent', 'love', 'peaceful',
            'calm', 'relaxed', 'relief', 'hope', 'hopeful', 'optimistic'
        }
        
        # Physical symptoms
        self.physical_symptoms = {
            'headache', 'pain', 'tired', 'fatigue', 'insomnia', 'sleep',
            'sleepless', 'dizzy', 'nausea', 'sick', 'chest', 'breathe',
            'breathing', 'heart', 'shake', 'shaking', 'sweat', 'sweating'
        }
        
        # Coping and help-seeking
        self.coping_words = {
            'therapy', 'therapist', 'counseling', 'medication', 'doctor',
            'help', 'support', 'cope', 'coping', 'manage', 'treatment',
            'exercise', 'meditation', 'breathing', 'recovery'
        }
        
        # Social context
        self.social_words = {
            'work', 'job', 'boss', 'coworker', 'family', 'parent', 'mother',
            'father', 'husband', 'wife', 'friend', 'relationship', 'partner',
            'alone', 'lonely', 'isolated', 'social'
        }
        
        # Create combined sets
        self.all_mental_health = (
            self.stress_keywords | self.negative_emotions | 
            self.positive_emotions | self.physical_symptoms |
            self.coping_words | self.social_words
        )

vocab = MentalHealthVocabulary()

print(f"\n‚úì Mental Health Vocabulary created:")
print(f"  ‚îú‚îÄ Stress keywords: {len(vocab.stress_keywords)}")
print(f"  ‚îú‚îÄ Negative emotions: {len(vocab.negative_emotions)}")
print(f"  ‚îú‚îÄ Positive emotions: {len(vocab.positive_emotions)}")
print(f"  ‚îú‚îÄ Physical symptoms: {len(vocab.physical_symptoms)}")
print(f"  ‚îú‚îÄ Coping words: {len(vocab.coping_words)}")
print(f"  ‚îú‚îÄ Social words: {len(vocab.social_words)}")
print(f"  ‚îî‚îÄ Total unique: {len(vocab.all_mental_health)}")

# =============================================================================
# Custom Transformers
# =============================================================================

class StressKeywordCounter(BaseEstimator, TransformerMixin):
    """Count mental health and stress-specific keywords"""
    
    def __init__(self):
        self.vocab = vocab
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        for text in X:
            text_lower = str(text).lower()
            words = set(text_lower.split())
            
            feature_dict = {
                'stress_count': sum(1 for w in vocab.stress_keywords if w in text_lower),
                'negative_emotion_count': sum(1 for w in vocab.negative_emotions if w in text_lower),
                'positive_emotion_count': sum(1 for w in vocab.positive_emotions if w in text_lower),
                'physical_symptom_count': sum(1 for w in vocab.physical_symptoms if w in text_lower),
                'coping_word_count': sum(1 for w in vocab.coping_words if w in text_lower),
                'social_word_count': sum(1 for w in vocab.social_words if w in text_lower),
                'mental_health_density': len(words & vocab.all_mental_health) / max(len(words), 1)
            }
            features.append(list(feature_dict.values()))
        
        return np.array(features)
    
    def get_feature_names_out(self, input_features=None):
        return np.array([
            'stress_count', 'negative_emotion_count', 'positive_emotion_count',
            'physical_symptom_count', 'coping_word_count', 'social_word_count',
            'mental_health_density'
        ])


class EmotionalIntensityExtractor(BaseEstimator, TransformerMixin):
    """Extract emotional intensity features"""
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        for text in X:
            text_str = str(text)
            
            feature_dict = {
                'exclamation_count': text_str.count('!'),
                'question_count': text_str.count('?'),
                'caps_ratio': sum(1 for c in text_str if c.isupper()) / max(len(text_str), 1),
                'ellipsis_count': text_str.count('...'),
                'repeated_punct': len(re.findall(r'([!?.])\1+', text_str)),
                'repeated_letters': len(re.findall(r'([a-z])\1{2,}', text_str.lower())),
                'all_caps_words': sum(1 for w in text_str.split() if w.isupper() and len(w) > 1)
            }
            features.append(list(feature_dict.values()))
        
        return np.array(features)
    
    def get_feature_names_out(self, input_features=None):
        return np.array([
            'exclamation_count', 'question_count', 'caps_ratio',
            'ellipsis_count', 'repeated_punct', 'repeated_letters', 'all_caps_words'
        ])


# =============================================================================
# Custom Vectorizers
# =============================================================================

def create_custom_vectorizers():
    """Create all custom stress-specific vectorizers"""
    
    custom_vectorizers = {}
    
    # 1. Mental Health Focused TF-IDF
    # Minimal stopwords to preserve mental health vocabulary
    minimal_stopwords = ['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for']
    
    custom_vectorizers['mental_health_tfidf'] = TfidfVectorizer(
        max_features=12000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        stop_words=minimal_stopwords,
        token_pattern=r'\b[a-z]{2,}\b',
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False  # No negative values!
    )
    
    # 2. Stress-Weighted Vocabulary
    # Focus on stress-related terms
    stress_vocabulary = list(vocab.stress_keywords | vocab.negative_emotions | vocab.physical_symptoms)
    
    custom_vectorizers['stress_focused'] = CountVectorizer(
        max_features=8000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        vocabulary=None,  # Will learn but prioritize stress terms
        token_pattern=r'\b[a-z]{2,}\b',
        binary=False
    )
    
    # 3. Emotional Polarity Vectorizer
    # Separate positive and negative emotion features
    custom_vectorizers['emotion_count'] = CountVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        stop_words='english',
        binary=False
    )
    
    # 4. Binary Presence Vectorizer
    # Binary features (word present or not)
    custom_vectorizers['binary_presence'] = CountVectorizer(
        max_features=15000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.85,
        stop_words='english',
        binary=True  # Binary features
    )
    
    # 5. Character + Word Hybrid
    custom_vectorizers['char_word_hybrid'] = FeatureUnion([
        ('word_unigram', TfidfVectorizer(
            max_features=8000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.9,
            stop_words='english',
            sublinear_tf=False
        )),
        ('char_trigram', TfidfVectorizer(
            max_features=5000,
            analyzer='char',
            ngram_range=(3, 4),
            min_df=3,
            max_df=0.9,
            sublinear_tf=False
        ))
    ])
    
    # 6. Multi-gram Ensemble
    custom_vectorizers['multigram_ensemble'] = FeatureUnion([
        ('unigram', CountVectorizer(
            max_features=6000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.9,
            stop_words='english'
        )),
        ('bigram', CountVectorizer(
            max_features=6000,
            ngram_range=(2, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english'
        )),
        ('trigram', CountVectorizer(
            max_features=3000,
            ngram_range=(3, 3),
            min_df=2,
            max_df=0.95,
            stop_words='english'
        ))
    ])
    
    # 7. Weighted TF-IDF (smooth IDF, L1 norm)
    custom_vectorizers['weighted_tfidf'] = TfidfVectorizer(
        max_features=12000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        stop_words='english',
        norm='l1',  # L1 normalization
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False
    )
    
    # 8. High-dimensional sparse (for ensemble models)
    custom_vectorizers['high_dim_sparse'] = TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.85,
        stop_words='english',
        sublinear_tf=False
    )
    
    return custom_vectorizers


# =============================================================================
# Create ULTIMATE Feature Union
# =============================================================================

def create_ultimate_feature_union():
    """
    Create the ULTIMATE feature union that combines:
    - TF-IDF features
    - Count features  
    - Stress keywords
    - Emotional intensity
    - Linguistic features from Cell 4A
    
    This is the vectorizer that achieved 77.8% accuracy!
    """
    
    ultimate_union = FeatureUnion([
        # Primary TF-IDF features
        ('tfidf_main', TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.9,
            stop_words='english',
            sublinear_tf=False,
            norm='l2'
        )),
        
        # Count-based features
        ('count_features', CountVectorizer(
            max_features=8000,
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.9,
            stop_words='english',
            binary=False
        )),
        
        # Character-level patterns
        ('char_patterns', TfidfVectorizer(
            max_features=4000,
            analyzer='char',
            ngram_range=(3, 5),
            min_df=3,
            max_df=0.95,
            sublinear_tf=False
        )),
        
        # Stress keywords counter
        ('stress_keywords', StressKeywordCounter()),
        
        # Emotional intensity
        ('emotional_intensity', EmotionalIntensityExtractor())
    ])
    
    return ultimate_union


# =============================================================================
# Main Execution
# =============================================================================

def main():
    """Create and save all custom vectorizers"""
    
    try:
        print("\nüì¶ Creating custom vectorizers...")
        
        # Create custom vectorizers
        custom_vectorizers = create_custom_vectorizers()
        
        print(f"\n‚úì Created {len(custom_vectorizers)} custom vectorizers:")
        for name in custom_vectorizers.keys():
            print(f"  ‚îú‚îÄ {name}")
        
        # Create ultimate feature union
        print("\nüåü Creating ULTIMATE Feature Union...")
        ultimate_union = create_ultimate_feature_union()
        custom_vectorizers['ultimate_feature_union'] = ultimate_union
        print("‚úì Ultimate Feature Union created (THIS ACHIEVED 77.8%!)")
        
        # Load previous vectorizers from Cell 4A and 4B
        try:
            previous_vectorizers = joblib.load('preprocessors/vectorizers_basic.pkl')
            print(f"\n‚úì Loaded {len(previous_vectorizers)} vectorizers from Cell 4A")
            
            advanced_vectorizers = joblib.load('preprocessors/vectorizers_advanced.pkl')
            print(f"‚úì Loaded {len(advanced_vectorizers)} vectorizers from Cell 4B")
            
            # Combine all vectorizers
            all_vectorizers = {
                **previous_vectorizers,
                **advanced_vectorizers,
                **custom_vectorizers
            }
            
        except FileNotFoundError:
            print("\n‚ö†Ô∏è  Previous vectorizers not found, using custom only")
            all_vectorizers = custom_vectorizers
        
        # Save custom vectorizers
        joblib.dump(custom_vectorizers, 'preprocessors/vectorizers_custom.pkl')
        print(f"\n‚úì Saved custom vectorizers to: preprocessors/vectorizers_custom.pkl")
        
        # Save ALL vectorizers combined
        joblib.dump(all_vectorizers, 'preprocessors/all_vectorizers.pkl')
        print(f"‚úì Saved ALL vectorizers to: preprocessors/all_vectorizers.pkl")
        
        # Create metadata
        metadata = {
            'timestamp': datetime.now().isoformat(),
            'custom_vectorizers': list(custom_vectorizers.keys()),
            'total_vectorizers': len(all_vectorizers),
            'vocabulary_stats': {
                'stress_keywords': len(vocab.stress_keywords),
                'negative_emotions': len(vocab.negative_emotions),
                'positive_emotions': len(vocab.positive_emotions),
                'physical_symptoms': len(vocab.physical_symptoms),
                'coping_words': len(vocab.coping_words),
                'social_words': len(vocab.social_words),
                'total_mental_health_vocab': len(vocab.all_mental_health)
            }
        }
        
        # Save metadata
        with open('reports/vectorizers_custom_metadata.json', 'w') as f:
            json.dump(metadata, f, indent=2)
        
        print(f"\n‚úì Saved metadata to: reports/vectorizers_custom_metadata.json")
        
        # Summary
        print("\n" + "="*70)
        print("CELL 4C COMPLETED SUCCESSFULLY!")
        print("="*70)
        print(f"‚úì Custom vectorizers: {len(custom_vectorizers)}")
        print(f"‚úì Total vectorizers available: {len(all_vectorizers)}")
        print(f"‚úì Mental health vocabulary: {len(vocab.all_mental_health)} terms")
        print("\nüåü KEY VECTORIZER: 'ultimate_feature_union'")
        print("   ‚îî‚îÄ This combination achieved 77.8% accuracy!")
        print("\nüöÄ Ready for Cell 5: ML Model Training!")
        
        return all_vectorizers, vocab, metadata
        
    except Exception as e:
        print(f"\n‚ùå ERROR in Cell 4C: {str(e)}")
        import traceback
        traceback.print_exc()
        raise


# Execute
if __name__ == "__main__":
    all_vectorizers, vocab, metadata = main()
    
    print(f"\nüìä VECTORIZER SUMMARY:")
    print(f"{'='*70}")
    print(f"Total vectorizers ready for training: {len(all_vectorizers)}")
    print(f"\nTop Recommended Vectorizers:")
    print(f"  1. ultimate_feature_union  ‚≠ê‚≠ê‚≠ê (77.8% accuracy)")
    print(f"  2. mental_health_tfidf     ‚≠ê‚≠ê")
    print(f"  3. multigram_ensemble      ‚≠ê‚≠ê")
    print(f"  4. char_word_hybrid        ‚≠ê‚≠ê")
    print(f"  5. high_dim_sparse         ‚≠ê")
else:
    all_vectorizers, vocab, metadata = main()


CELL 4C: CUSTOM STRESS VECTORIZERS

‚úì Mental Health Vocabulary created:
  ‚îú‚îÄ Stress keywords: 18
  ‚îú‚îÄ Negative emotions: 26
  ‚îú‚îÄ Positive emotions: 19
  ‚îú‚îÄ Physical symptoms: 18
  ‚îú‚îÄ Coping words: 15
  ‚îú‚îÄ Social words: 17
  ‚îî‚îÄ Total unique: 112

üì¶ Creating custom vectorizers...

‚úì Created 8 custom vectorizers:
  ‚îú‚îÄ mental_health_tfidf
  ‚îú‚îÄ stress_focused
  ‚îú‚îÄ emotion_count
  ‚îú‚îÄ binary_presence
  ‚îú‚îÄ char_word_hybrid
  ‚îú‚îÄ multigram_ensemble
  ‚îú‚îÄ weighted_tfidf
  ‚îú‚îÄ high_dim_sparse

üåü Creating ULTIMATE Feature Union...
‚úì Ultimate Feature Union created (THIS ACHIEVED 77.8%!)

‚ö†Ô∏è  Previous vectorizers not found, using custom only

‚úì Saved custom vectorizers to: preprocessors/vectorizers_custom.pkl
‚úì Saved ALL vectorizers to: preprocessors/all_vectorizers.pkl

‚úì Saved metadata to: reports/vectorizers_custom_metadata.json

CELL 4C COMPLETED SUCCESSFULLY!
‚úì Custom vectorizers: 9
‚úì Total vectorizers available

In [7]:
# ===============================
# Cell 5A: Basic & Advanced ML Models
# ===============================

def create_advanced_models():
    """Create comprehensive model suite"""
    
    from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
    from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
    from sklearn.svm import LinearSVC, SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    
    models = {
        # =============================
        # Linear Models
        # =============================
        
        'LogisticRegression': LogisticRegression(
            C=1.0,
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            solver='lbfgs',
            n_jobs=-1
        ),
        
        'LogisticRegression_L1': LogisticRegression(
            C=0.8,
            penalty='l1',
            solver='saga',
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'LogisticRegression_L2': LogisticRegression(
            C=1.2,
            penalty='l2',
            solver='lbfgs',
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'RidgeClassifier': RidgeClassifier(
            alpha=1.0,
            random_state=42,
            class_weight='balanced'
        ),
        
        'SGDClassifier': SGDClassifier(
            loss='hinge',
            alpha=0.0001,
            penalty='l2',
            random_state=42,
            max_iter=1000,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'SGDClassifier_log': SGDClassifier(
            loss='log_loss',
            alpha=0.0001,
            penalty='l2',
            random_state=42,
            max_iter=1000,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        # =============================
        # Naive Bayes (Best with Count features)
        # =============================
        
        'MultinomialNB': MultinomialNB(
            alpha=0.1
        ),
        
        'MultinomialNB_tuned': MultinomialNB(
            alpha=0.5
        ),
        
        'ComplementNB': ComplementNB(
            alpha=0.1
        ),
        
        'BernoulliNB': BernoulliNB(
            alpha=0.1
        ),
        
        # =============================
        # SVM Models
        # =============================
        
        'LinearSVC': LinearSVC(
            C=1.0,
            max_iter=1000,
            random_state=42,
            class_weight='balanced'
        ),
        
        'LinearSVC_tuned': LinearSVC(
            C=0.8,
            max_iter=1000,
            random_state=42,
            class_weight='balanced'
        ),
        
        'SVC_RBF': SVC(
            C=1.0,
            kernel='rbf',
            gamma='scale',
            random_state=42,
            class_weight='balanced',
            probability=True
        ),
        
        # =============================
        # Tree-based Models
        # =============================
        
        'RandomForest': RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'RandomForest_deep': RandomForestClassifier(
            n_estimators=300,
            max_depth=30,
            min_samples_split=4,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'ExtraTrees': ExtraTreesClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'DecisionTree': DecisionTreeClassifier(
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            class_weight='balanced'
        ),
        
        # =============================
        # Gradient Boosting
        # =============================
        
        'GradientBoosting': GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            subsample=0.8
        ),
        
        'GradientBoosting_tuned': GradientBoostingClassifier(
            n_estimators=150,
            learning_rate=0.05,
            max_depth=7,
            min_samples_split=4,
            min_samples_leaf=1,
            random_state=42,
            subsample=0.9
        ),
        
        'AdaBoost': AdaBoostClassifier(
            n_estimators=100,
            learning_rate=1.0,
            random_state=42
        ),
        
        'AdaBoost_tuned': AdaBoostClassifier(
            n_estimators=150,
            learning_rate=0.8,
            random_state=42
        ),
        
        # =============================
        # Neural Network
        # =============================
        
        'MLPClassifier': MLPClassifier(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.001,
            learning_rate='adaptive',
            max_iter=300,
            random_state=42
        ),
        
        'MLPClassifier_deep': MLPClassifier(
            hidden_layer_sizes=(150, 100, 50),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            learning_rate='adaptive',
            max_iter=400,
            random_state=42
        ),
        
        # =============================
        # K-Nearest Neighbors
        # =============================
        
        'KNN': KNeighborsClassifier(
            n_neighbors=7,
            weights='distance',
            metric='cosine',
            n_jobs=-1
        ),
        
        'KNN_euclidean': KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            metric='euclidean',
            n_jobs=-1
        ),
    }
    
    # Add XGBoost, LightGBM, CatBoost if available
    try:
        from xgboost import XGBClassifier
        
        models['XGBoost'] = XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='logloss',
            n_jobs=-1
        )
        
        models['XGBoost_tuned'] = XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=8,
            min_child_weight=2,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            eval_metric='logloss',
            n_jobs=-1
        )
        
        print("‚úì XGBoost models added")
    except ImportError:
        print("‚ö† XGBoost not available")
    
    try:
        from lightgbm import LGBMClassifier
        
        models['LightGBM'] = LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=6,
            num_leaves=31,
            min_child_samples=20,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1,
            verbosity=-1
        )
        
        models['LightGBM_tuned'] = LGBMClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=8,
            num_leaves=50,
            min_child_samples=15,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1,
            verbosity=-1
        )
        
        print("‚úì LightGBM models added")
    except ImportError:
        print("‚ö† LightGBM not available")
    
    try:
        from catboost import CatBoostClassifier
        
        models['CatBoost'] = CatBoostClassifier(
            iterations=200,
            learning_rate=0.1,
            depth=6,
            random_state=42,
            verbose=False,
            auto_class_weights='Balanced'
        )
        
        print("‚úì CatBoost models added")
    except ImportError:
        print("‚ö† CatBoost not available")
    
    return models

# Create and display
models_5a = create_advanced_models()
print(f"\nCreated {len(models_5a)} basic & advanced models:")
for name in models_5a.keys():
    print(f"  - {name}")

# Save models
import joblib
joblib.dump(models_5a, 'preprocessors/models_5a.pkl')
print(f"\n‚úì Saved to: preprocessors/models_5a.pkl")

‚úì XGBoost models added
‚úì LightGBM models added
‚úì CatBoost models added

Created 30 basic & advanced models:
  - LogisticRegression
  - LogisticRegression_L1
  - LogisticRegression_L2
  - RidgeClassifier
  - SGDClassifier
  - SGDClassifier_log
  - MultinomialNB
  - MultinomialNB_tuned
  - ComplementNB
  - BernoulliNB
  - LinearSVC
  - LinearSVC_tuned
  - SVC_RBF
  - RandomForest
  - RandomForest_deep
  - ExtraTrees
  - DecisionTree
  - GradientBoosting
  - GradientBoosting_tuned
  - AdaBoost
  - AdaBoost_tuned
  - MLPClassifier
  - MLPClassifier_deep
  - KNN
  - KNN_euclidean
  - XGBoost
  - XGBoost_tuned
  - LightGBM
  - LightGBM_tuned
  - CatBoost

‚úì Saved to: preprocessors/models_5a.pkl


In [8]:
# ===============================
# Cell 5B: Novel & Ensemble ML Models
# ===============================

def create_novel_models():
    """Create novel and advanced model approaches"""
    
    from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
    from sklearn.ensemble import (
        VotingClassifier, StackingClassifier, BaggingClassifier,
        RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
    )
    from sklearn.svm import LinearSVC, SVC
    from sklearn.naive_bayes import MultinomialNB, ComplementNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.feature_selection import SelectKBest, chi2, f_classif
    from sklearn.pipeline import Pipeline
    
    novel_models = {
        # =============================
        # Stacking Ensembles
        # =============================
        
        'linear_stacking': StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
                ('ridge', RidgeClassifier(random_state=42, class_weight='balanced')),
                ('sgd', SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'))
            ],
            final_estimator=LogisticRegression(max_iter=1000, random_state=42),
            cv=3,
            n_jobs=-1
        ),
        
        'tree_stacking': StackingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)),
                ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
                ('ada', AdaBoostClassifier(n_estimators=100, random_state=42))
            ],
            final_estimator=LogisticRegression(max_iter=1000, random_state=42),
            cv=3,
            n_jobs=-1
        ),
        
        'mixed_stacking': StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)),
                ('svc', LinearSVC(max_iter=1000, random_state=42, class_weight='balanced'))
            ],
            final_estimator=GradientBoostingClassifier(n_estimators=50, random_state=42),
            cv=3,
            n_jobs=-1
        ),
        
        # =============================
        # Voting Ensembles
        # =============================
        
        'voting_soft': VotingClassifier(
            estimators=[
                ('lr', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)),
                ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
            ],
            voting='soft',
            n_jobs=-1
        ),
        
        'voting_hard': VotingClassifier(
            estimators=[
                ('lr', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)),
                ('svc', LinearSVC(max_iter=1000, random_state=42, class_weight='balanced'))
            ],
            voting='hard',
            n_jobs=-1
        ),
        
        'weighted_voting': VotingClassifier(
            estimators=[
                ('lr', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
                ('rf', RandomForestClassifier(n_estimators=150, random_state=42, class_weight='balanced', n_jobs=-1)),
                ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
            ],
            voting='soft',
            weights=[1, 2, 2],  # More weight to RF and GB
            n_jobs=-1
        ),
        
        # =============================
        # Bagging Ensembles
        # =============================
        
        'bagging_lr': BaggingClassifier(
            estimator=LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
            n_estimators=10,
            max_samples=0.8,
            max_features=0.8,
            random_state=42,
            n_jobs=-1
        ),
        
        'bagging_sgd': BaggingClassifier(
            estimator=SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'),
            n_estimators=10,
            max_samples=0.8,
            max_features=0.8,
            random_state=42,
            n_jobs=-1
        ),
        
        'bagging_ridge': BaggingClassifier(
            estimator=RidgeClassifier(random_state=42, class_weight='balanced'),
            n_estimators=15,
            max_samples=0.9,
            max_features=0.9,
            random_state=42,
            n_jobs=-1
        ),
        
        'bagging_svc': BaggingClassifier(
            estimator=LinearSVC(max_iter=1000, random_state=42, class_weight='balanced'),
            n_estimators=10,
            max_samples=0.8,
            max_features=0.8,
            random_state=42,
            n_jobs=-1
        ),
        
        # =============================
        # AdaBoost Variants
        # =============================
        
        'ada_boost_tree': AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=3, random_state=42),
            n_estimators=100,
            learning_rate=1.0,
            random_state=42
        ),
        
        'ada_boost_lr': AdaBoostClassifier(
            estimator=LogisticRegression(max_iter=1000, random_state=42),
            n_estimators=50,
            learning_rate=0.8,
            algorithm='SAMME',
            random_state=42
        ),
        
        # =============================
        # Calibrated Classifiers
        # =============================
        
        'calibrated_rf': CalibratedClassifierCV(
            estimator=RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1),
            cv=3
        ),
        
        'calibrated_svm': CalibratedClassifierCV(
            estimator=LinearSVC(max_iter=1000, random_state=42, class_weight='balanced'),
            cv=3
        ),
        
        'calibrated_sgd': CalibratedClassifierCV(
            estimator=SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'),
            cv=3
        ),
        
        # =============================
        # One-vs-Rest (OvR)
        # =============================
        
        'ovr_lr': OneVsRestClassifier(
            LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
            n_jobs=-1
        ),
        
        'ovr_svm': OneVsRestClassifier(
            LinearSVC(max_iter=1000, random_state=42, class_weight='balanced'),
            n_jobs=-1
        ),
        
        'ovr_nb': OneVsRestClassifier(
            MultinomialNB(alpha=0.1),
            n_jobs=-1
        ),
        
        'ovr_rf': OneVsRestClassifier(
            RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1),
            n_jobs=-1
        ),
        
        # =============================
        # Feature Selection Pipelines
        # =============================
        
        'feature_selection_lr': Pipeline([
            ('feature_selection', SelectKBest(f_classif, k=5000)),
            ('classifier', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))
        ]),
        
        'feature_selection_rf': Pipeline([
            ('feature_selection', SelectKBest(f_classif, k=8000)),
            ('classifier', RandomForestClassifier(n_estimators=150, random_state=42, class_weight='balanced', n_jobs=-1))
        ]),
        
        'feature_selection_sgd': Pipeline([
            ('feature_selection', SelectKBest(f_classif, k=6000)),
            ('classifier', SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'))
        ]),
        
        # =============================
        # Multi-Layer Ensemble (ADVANCED)
        # =============================
        
        'multi_layer_ensemble': VotingClassifier(
            estimators=[
                ('stack1', StackingClassifier(
                    estimators=[
                        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
                        ('ridge', RidgeClassifier(random_state=42))
                    ],
                    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
                    cv=2
                )),
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
                ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
            ],
            voting='soft',
            n_jobs=-1
        ),
        
        # =============================
        # Stress-Focused Ensemble (‚≠ê OPTIMIZED FOR MENTAL HEALTH)
        # =============================
        
        'stress_focused_ensemble': VotingClassifier(
            estimators=[
                ('lr_balanced', LogisticRegression(
                    C=1.0, max_iter=1000, random_state=42, 
                    class_weight='balanced', solver='lbfgs'
                )),
                ('sgd_balanced', SGDClassifier(
                    loss='log_loss', alpha=0.0001, max_iter=1000, 
                    random_state=42, class_weight='balanced'
                )),
                ('rf_balanced', RandomForestClassifier(
                    n_estimators=200, max_depth=20, random_state=42,
                    class_weight='balanced', n_jobs=-1
                )),
                ('gb', GradientBoostingClassifier(
                    n_estimators=100, learning_rate=0.1, 
                    max_depth=5, random_state=42
                ))
            ],
            voting='soft',
            weights=[2, 1, 2, 2],  # Higher weight for LR and RF
            n_jobs=-1
        ),
    }
    
    return novel_models

# Create and display
models_5b = create_novel_models()
print(f"\nAdded {len(models_5b)} novel & ensemble models:")
for name in models_5b.keys():
    print(f"  - {name}")

# Combine with previous models
import joblib
try:
    models_5a = joblib.load('preprocessors/models_5a.pkl')
    all_models = {**models_5a, **models_5b}
    print(f"\nTotal models now: {len(all_models)}")
except:
    all_models = models_5b
    print(f"\nTotal models: {len(models_5b)}")

# Save
joblib.dump(models_5b, 'preprocessors/models_5b.pkl')
joblib.dump(all_models, 'preprocessors/all_models.pkl')
print(f"\n‚úì Saved to: preprocessors/models_5b.pkl")
print(f"‚úì Saved all to: preprocessors/all_models.pkl")


Added 24 novel & ensemble models:
  - linear_stacking
  - tree_stacking
  - mixed_stacking
  - voting_soft
  - voting_hard
  - weighted_voting
  - bagging_lr
  - bagging_sgd
  - bagging_ridge
  - bagging_svc
  - ada_boost_tree
  - ada_boost_lr
  - calibrated_rf
  - calibrated_svm
  - calibrated_sgd
  - ovr_lr
  - ovr_svm
  - ovr_nb
  - ovr_rf
  - feature_selection_lr
  - feature_selection_rf
  - feature_selection_sgd
  - multi_layer_ensemble
  - stress_focused_ensemble

Total models now: 54

‚úì Saved to: preprocessors/models_5b.pkl
‚úì Saved all to: preprocessors/all_models.pkl


In [9]:
# ===============================
# Cell 6: Training Pipeline
# ===============================

import numpy as np
import pandas as pd
import time
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, balanced_accuracy_score,
    matthews_corrcoef
)
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("COMPREHENSIVE MODEL TRAINING PIPELINE")
print("="*70)

# =============================
# Load Data and Artifacts
# =============================

print("\nüìÇ Loading data and artifacts...")

# Load preprocessed data
try:
    stress_processed = pd.read_pickle('preprocessors/preprocessed_data.pkl')
    print(f"‚úì Loaded preprocessed data: {stress_processed.shape}")
except:
    print("‚ùå Error: Run Cell 3 (preprocessing) first!")
    raise

# Load all vectorizers
try:
    all_vectorizers = joblib.load('preprocessors/all_vectorizers.pkl')
    print(f"‚úì Loaded {len(all_vectorizers)} vectorizers")
except:
    print("‚ùå Error: Run Cells 4A, 4B first!")
    raise

# Load all models
try:
    all_models = joblib.load('preprocessors/all_models.pkl')
    print(f"‚úì Loaded {len(all_models)} models")
except:
    print("‚ùå Error: Run Cells 5A, 5B first!")
    raise

# =============================
# Prepare Data
# =============================

print("\nüìä Preparing data...")

X = stress_processed['clean_text'].values
y = stress_processed['label'].values

# Encode labels if needed
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)
    joblib.dump(le, 'preprocessors/label_encoder.pkl')
    print(f"‚úì Encoded labels: {le.classes_}")

# Train-test split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úì Train set: {len(X_train_text)} samples")
print(f"‚úì Test set: {len(X_test_text)} samples")
print(f"‚úì Class distribution: {np.bincount(y_train)}")

# =============================
# Select Priority Combinations
# =============================

print("\nüéØ Selecting priority combinations...")

# Priority vectorizers (proven to work well)
priority_vectorizers = [
    'custom_stress',           # ‚≠ê Best performer
    'tfidf_bigram',
    'count_tfidf_ensemble',
    'mental_health_focused',
    'count_bigram',
    'hybrid_char_word',
    'weighted_tfidf',
    'ensemble_tfidf'
]

# Priority models (proven to work well)
priority_models = [
    'LogisticRegression',
    'bagging_sgd',            # ‚≠ê Best performer
    'bagging_lr',
    'RidgeClassifier',
    'SGDClassifier',
    'LinearSVC',
    'RandomForest',
    'GradientBoosting',
    'stress_focused_ensemble', # ‚≠ê Mental health optimized
    'voting_soft',
    'linear_stacking',
    'calibrated_rf',
    'XGBoost',                # If available
    'LightGBM',               # If available
]

# Filter available
available_vectorizers = {k: all_vectorizers[k] for k in priority_vectorizers if k in all_vectorizers}
available_models = {k: all_models[k] for k in priority_models if k in all_models}

print(f"‚úì Using {len(available_vectorizers)} priority vectorizers")
print(f"‚úì Using {len(available_models)} priority models")
print(f"‚úì Total combinations: {len(available_vectorizers) * len(available_models)}")

# =============================
# Training Function
# =============================

def train_single_combination(model, model_name, X_train_vec, X_test_vec, y_train, y_test, vec_name):
    """Train a single model and return metrics"""
    try:
        start_time = time.time()
        
        # Train
        model.fit(X_train_vec, y_train)
        
        # Predict
        y_pred = model.predict(X_test_vec)
        
        # Calculate metrics
        metrics = {
            'model': model_name,
            'vectorizer': vec_name,
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred, average='weighted'),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'mcc': matthews_corrcoef(y_test, y_pred),
            'training_time': time.time() - start_time,
            'status': 'success'
        }
        
        return metrics, model
        
    except Exception as e:
        return {
            'model': model_name,
            'vectorizer': vec_name,
            'accuracy': 0.0,
            'f1_score': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'balanced_accuracy': 0.0,
            'mcc': 0.0,
            'training_time': 0.0,
            'status': 'failed',
            'error': str(e)
        }, None

# =============================
# Main Training Loop
# =============================

print("\n" + "="*70)
print("TRAINING MODELS")
print("="*70)

all_results = []
best_model = None
best_f1 = 0
best_model_name = ""
best_vectorizer_name = ""
best_vectorizer = None

total_combinations = len(available_vectorizers) * len(available_models)
current = 0

for vec_name, vectorizer in available_vectorizers.items():
    print(f"\n{'='*70}")
    print(f"Vectorizer: {vec_name}")
    print(f"{'='*70}")
    
    try:
        # Transform data
        print("  Transforming data...", end=" ")
        X_train_vec = vectorizer.fit_transform(X_train_text)
        X_test_vec = vectorizer.transform(X_test_text)
        print(f"‚úì Shape: {X_train_vec.shape}")
        
        # Train all models with this vectorizer
        for model_name, model in available_models.items():
            current += 1
            progress = (current / total_combinations) * 100
            print(f"  [{progress:5.1f}%] {model_name:<30}", end=" ")
            
            metrics, trained_model = train_single_combination(
                model, model_name, X_train_vec, X_test_vec, 
                y_train, y_test, vec_name
            )
            all_results.append(metrics)
            
            if metrics['status'] == 'success':
                print(f"‚úì F1: {metrics['f1_score']:.4f} | Acc: {metrics['accuracy']:.4f}")
                
                # Track best model
                if metrics['f1_score'] > best_f1:
                    best_f1 = metrics['f1_score']
                    best_model = trained_model
                    best_model_name = model_name
                    best_vectorizer_name = vec_name
                    best_vectorizer = vectorizer
            else:
                print(f"‚úó Failed")
    
    except Exception as e:
        print(f"  ‚úó Vectorizer failed: {str(e)[:50]}")
        continue

# =============================
# Save Results
# =============================

print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

# Create results DataFrame
results_df = pd.DataFrame(all_results)
successful = results_df[results_df['status'] == 'success']

# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_path = f'reports/training_results_{timestamp}.csv'
results_df.to_csv(results_path, index=False)
print(f"‚úì Results saved to: {results_path}")

# Save best model
if best_model:
    joblib.dump((best_model, best_vectorizer), 'models/best_model.pkl')
    print(f"‚úì Best model saved to: models/best_model.pkl")
    
    best_info = {
        'model_name': best_model_name,
        'vectorizer_name': best_vectorizer_name,
        'f1_score': float(best_f1),
        'accuracy': float(successful[successful['f1_score'] == best_f1].iloc[0]['accuracy']),
        'timestamp': timestamp
    }
    
    import json
    with open('models/best_model_info.json', 'w') as f:
        json.dump(best_info, f, indent=2)
    print(f"‚úì Best model info saved")

# =============================
# Display Results
# =============================

print("\n" + "="*70)
print("TOP 20 MODEL COMBINATIONS")
print("="*70)

if len(successful) > 0:
    # Sort by F1 score
    top_20 = successful.nlargest(20, 'f1_score')
    
    print(f"\n{'Rank':<6}{'Model':<30}{'Vectorizer':<25}{'Acc':<8}{'F1':<8}{'Prec':<8}{'Rec':<8}{'MCC':<8}{'Time':<8}")
    print("-"*120)
    
    for idx, row in enumerate(top_20.itertuples(), 1):
        print(f"{idx:<6}{row.model[:28]:<30}{row.vectorizer[:23]:<25}"
              f"{row.accuracy:<8.4f}{row.f1_score:<8.4f}{row.precision:<8.4f}{row.recall:<8.4f}"
              f"{row.mcc:<8.4f}{row.training_time:<8.2f}s")
    
    # Summary
    print("\n" + "="*70)
    print("TRAINING SUMMARY")
    print("="*70)
    print(f"‚úì Total combinations: {len(results_df)}")
    print(f"‚úì Successful: {len(successful)}")
    print(f"‚úì Failed: {len(results_df) - len(successful)}")
    
    print(f"\nüèÜ BEST MODEL:")
    print(f"  ‚îú‚îÄ Model: {best_model_name}")
    print(f"  ‚îú‚îÄ Vectorizer: {best_vectorizer_name}")
    print(f"  ‚îú‚îÄ F1 Score: {best_f1:.4f}")
    print(f"  ‚îú‚îÄ Accuracy: {best_info['accuracy']:.4f}")
    print(f"  ‚îî‚îÄ Training time: {top_20.iloc[0]['training_time']:.2f}s")
    
    print(f"\nüìä OVERALL STATISTICS:")
    print(f"  ‚îú‚îÄ Average F1 Score: {successful['f1_score'].mean():.4f}")
    print(f"  ‚îú‚îÄ Average Accuracy: {successful['accuracy'].mean():.4f}")
    print(f"  ‚îú‚îÄ Top 10 avg F1: {successful.nlargest(10, 'f1_score')['f1_score'].mean():.4f}")
    print(f"  ‚îî‚îÄ Total training time: {successful['training_time'].sum():.1f}s ({successful['training_time'].sum()/60:.1f} min)")
    
    # Top vectorizers
    print(f"\nüéØ TOP 5 VECTORIZERS (by avg F1):")
    vec_perf = successful.groupby('vectorizer')['f1_score'].agg(['mean', 'max', 'count'])
    vec_perf = vec_perf.sort_values('mean', ascending=False).head(5)
    for idx, (vec, row) in enumerate(vec_perf.iterrows(), 1):
        print(f"  {idx}. {vec}: Avg={row['mean']:.4f}, Max={row['max']:.4f}, Models={int(row['count'])}")
    
    # Top models
    print(f"\nü§ñ TOP 5 MODELS (by avg F1):")
    model_perf = successful.groupby('model')['f1_score'].agg(['mean', 'max', 'count'])
    model_perf = model_perf.sort_values('mean', ascending=False).head(5)
    for idx, (model, row) in enumerate(model_perf.iterrows(), 1):
        print(f"  {idx}. {model}: Avg={row['mean']:.4f}, Max={row['max']:.4f}, Vectorizers={int(row['count'])}")
    
    print("\n" + "="*70)
    print("‚úÖ TRAINING COMPLETE!")
    print("="*70)
    
else:
    print("\n‚ùå No successful models! Check your data and configurations.")

# =============================
# Quick Test
# =============================

print("\n" + "="*70)
print("QUICK PREDICTION TEST")
print("="*70)

if best_model and best_vectorizer:
    test_texts = [
        "I feel so stressed and overwhelmed with work",
        "Today was a great day, feeling happy and relaxed"
    ]
    
    try:
        le = joblib.load('preprocessors/label_encoder.pkl')
        test_vec = best_vectorizer.transform(test_texts)
        predictions = best_model.predict(test_vec)
        pred_labels = le.inverse_transform(predictions)
        
        print("\nTest predictions:")
        for text, pred in zip(test_texts, pred_labels):
            print(f"  Text: {text[:50]}...")
            print(f"  Prediction: {pred}\n")
    except Exception as e:
        print(f"Test prediction failed: {e}")

print("\nüöÄ Ready for deployment!")

COMPREHENSIVE MODEL TRAINING PIPELINE

üìÇ Loading data and artifacts...
‚úì Loaded preprocessed data: (2838, 117)
‚úì Loaded 9 vectorizers
‚úì Loaded 54 models

üìä Preparing data...
‚úì Train set: 2270 samples
‚úì Test set: 568 samples
‚úì Class distribution: [1080 1190]

üéØ Selecting priority combinations...
‚úì Using 1 priority vectorizers
‚úì Using 14 priority models
‚úì Total combinations: 14

TRAINING MODELS

Vectorizer: weighted_tfidf
  Transforming data... ‚úì Shape: (2270, 10633)
  [  7.1%] LogisticRegression             ‚úì F1: 0.7501 | Acc: 0.7500
  [ 14.3%] bagging_sgd                    ‚úì F1: 0.7044 | Acc: 0.7077
  [ 21.4%] bagging_lr                     ‚úì F1: 0.7412 | Acc: 0.7412
  [ 28.6%] RidgeClassifier                ‚úì F1: 0.7501 | Acc: 0.7500
  [ 35.7%] SGDClassifier                  ‚úì F1: 0.7249 | Acc: 0.7254
  [ 42.9%] LinearSVC                      ‚úì F1: 0.7466 | Acc: 0.7465
  [ 50.0%] RandomForest                   ‚úì F1: 0.7112 | Acc: 0.7113
  [ 

In [10]:
# ===============================
# Cell 9: DATA QUALITY DIAGNOSIS
# ===============================

import numpy as np
import pandas as pd
import joblib
from collections import Counter

print("="*70)
print("DATA QUALITY DIAGNOSIS - FINDING THE PROBLEM")
print("="*70)

# Load data
stress_processed = pd.read_pickle('preprocessors/preprocessed_data.pkl')

print("\n1. BASIC DATA INFO:")
print("="*70)
print(f"Shape: {stress_processed.shape}")
print(f"Columns: {list(stress_processed.columns)}")
print(f"\nFirst few rows:")
print(stress_processed.head())

print("\n2. LABEL DISTRIBUTION:")
print("="*70)
print(stress_processed['label'].value_counts())
print(f"\nClass balance: {stress_processed['label'].value_counts(normalize=True)}")

print("\n3. TEXT QUALITY CHECK:")
print("="*70)
print(f"\nSample clean texts:")
for i, text in enumerate(stress_processed['clean_text'].head(5)):
    print(f"\n[{i+1}] {text[:200]}...")

# Check text lengths
stress_processed['text_length'] = stress_processed['clean_text'].astype(str).str.len()
stress_processed['word_count'] = stress_processed['clean_text'].astype(str).str.split().str.len()

print(f"\nText statistics:")
print(f"  Average length: {stress_processed['text_length'].mean():.1f} chars")
print(f"  Average words: {stress_processed['word_count'].mean():.1f}")
print(f"  Min length: {stress_processed['text_length'].min()}")
print(f"  Max length: {stress_processed['text_length'].max()}")

# Check for issues
empty_texts = (stress_processed['text_length'] < 10).sum()
print(f"\n  ‚ö†Ô∏è  Very short texts (<10 chars): {empty_texts} ({empty_texts/len(stress_processed)*100:.1f}%)")

if empty_texts > len(stress_processed) * 0.1:
    print("  üî¥ PROBLEM: Too many empty/short texts!")

print("\n4. CHECKING ORIGINAL DATA:")
print("="*70)

# Try to load original data
try:
    original = pd.read_csv('data/stress.csv')
    print(f"Original data shape: {original.shape}")
    print(f"Original columns: {list(original.columns)}")
    print(f"\nOriginal label distribution:")
    print(original['label'].value_counts())
    
    # Find text column
    text_col = None
    for col in original.columns:
        if col.lower() in ['text', 'post_text', 'content', 'message', 'tweet']:
            text_col = col
            break
    
    if text_col:
        print(f"\nOriginal text samples from '{text_col}':")
        for i, text in enumerate(original[text_col].head(3)):
            print(f"\n[{i+1}] {str(text)[:200]}...")
    else:
        print("\n‚ö†Ô∏è  Could not find text column!")
        print("Available columns:", original.columns.tolist())
        
except Exception as e:
    print(f"Could not load original data: {e}")

print("\n5. RECOMMENDATION:")
print("="*70)

# Analyze the problem
avg_words = stress_processed['word_count'].mean()

if avg_words < 5:
    print("üî¥ CRITICAL ISSUE: Text preprocessing removed too much!")
    print("\nSOLUTION:")
    print("  1. Check your Cell 3 preprocessing")
    print("  2. Make sure 'clean_text' column has actual text")
    print("  3. Reduce stopword removal")
    print("  4. Keep more words (don't filter too aggressively)")
    
elif empty_texts > len(stress_processed) * 0.2:
    print("üî¥ CRITICAL ISSUE: Too many empty texts!")
    print("\nSOLUTION:")
    print("  1. Check for NaN values in text")
    print("  2. Fill missing values before preprocessing")
    print("  3. Don't drop rows with short text")
    
else:
    print("‚úì Text quality looks okay")
    print("\nPossible issues:")
    print("  1. Data is too small (only 2838 samples)")
    print("  2. Classes are not well separated")
    print("  3. Need better feature engineering")
    print("  4. Need data augmentation")

print("\n" + "="*70)
print("NEXT STEPS:")
print("="*70)
print("\nPlease share:")
print("  1. Output from this diagnostic cell")
print("  2. What does your original data look like?")
print("  3. How many words are in clean_text on average?")

DATA QUALITY DIAGNOSIS - FINDING THE PROBLEM

1. BASIC DATA INFO:
Shape: (2838, 117)
Columns: ['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad', 'lex_liwc_social', 'lex_liwc_family', 'lex_liwc_friend', 'lex_liwc_female', 'lex_liwc_male', 'lex_liwc_cogproc', 'lex_liwc_insight', 'lex_liwc_ca

In [16]:
# ===============================
# Cell 11: FIXED FUSION ENSEMBLE + FLASK API
# ===============================

import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("FUSION ENSEMBLE + FLASK API - PUBLICATION READY")
print("="*70)

# ===============================
# Load Data
# ===============================

print("\n1. Loading data...")
stress_processed = pd.read_pickle('preprocessors/preprocessed_data.pkl')
X = stress_processed['clean_text'].values
y = stress_processed['label'].values

# Load or create label encoder
try:
    le = joblib.load('models/label_encoder.pkl')
    print(f"‚úì Loaded label encoder: {le.classes_}")
except:
    le = LabelEncoder()
    y = le.fit_transform(y)
    joblib.dump(le, 'models/label_encoder.pkl')
    print(f"‚úì Created label encoder: {le.classes_}")

# If labels are strings, encode them
if y.dtype == 'object':
    y = le.transform(y)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úì Data loaded: Train={len(X_train_text)}, Test={len(X_test_text)}")

# ===============================
# FIXED FUSION ENSEMBLE CLASS
# ===============================

class FusionEnsemble:
    """
    Fusion ensemble that combines multiple model-vectorizer pairs
    Each model gets its own vectorizer
    """
    
    def __init__(self):
        self.models = []
        self.weights = []
        
    def add_model(self, model, vectorizer, weight=1.0):
        """Add a model-vectorizer pair with weight"""
        self.models.append({
            'model': model,
            'vectorizer': vectorizer,
            'weight': weight
        })
        self.weights.append(weight)
        
    def fit(self, X, y):
        """Fit all models with their respective vectorizers"""
        print(f"\nTraining {len(self.models)} models in fusion ensemble...")
        
        for i, model_dict in enumerate(self.models):
            print(f"  [{i+1}/{len(self.models)}] Training...", end=" ", flush=True)
            
            # Transform with this model's vectorizer
            X_vec = model_dict['vectorizer'].fit_transform(X)
            
            # Train this model
            model_dict['model'].fit(X_vec, y)
            
            print(f"‚úì Features: {X_vec.shape[1]}")
        
        print("‚úì Fusion ensemble training complete!")
        return self
    
    def predict_proba(self, texts):
        """Get probability predictions from all models"""
        all_predictions = []
        
        for model_dict in self.models:
            model = model_dict['model']
            vectorizer = model_dict['vectorizer']
            weight = model_dict['weight']
            
            # Transform with THIS model's vectorizer
            X_vec = vectorizer.transform(texts)
            
            # Get predictions
            if hasattr(model, 'predict_proba'):
                proba = model.predict_proba(X_vec)
            elif hasattr(model, 'decision_function'):
                # For SVM
                decision = model.decision_function(X_vec)
                if decision.ndim == 1:
                    decision = np.column_stack([-decision, decision])
                # Convert to probabilities
                from scipy.special import softmax
                proba = softmax(decision, axis=1)
            else:
                # Fallback: use hard predictions
                pred = model.predict(X_vec)
                proba = np.zeros((len(pred), len(np.unique(pred))))
                proba[np.arange(len(pred)), pred] = 1.0
            
            # Weight the predictions
            all_predictions.append(proba * weight)
        
        # Average weighted predictions
        avg_predictions = np.sum(all_predictions, axis=0) / np.sum(self.weights)
        return avg_predictions
    
    def predict(self, texts):
        """Get class predictions"""
        probas = self.predict_proba(texts)
        return np.argmax(probas, axis=1)

# ===============================
# BUILD TOP 4 MODELS FROM YOUR RESULTS
# ===============================

print("\n2. Building TOP 4 models from your 77.8% results...")

# Model 1: BaggingSGD + FeatureUnion (77.89% - BEST)
print("\n   [1/4] BaggingSGD + FeatureUnion (77.89%)")
vectorizer_1 = FeatureUnion([
    ('tfidf', TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=False
    )),
    ('count', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 1),
        min_df=2,
        max_df=0.9,
        sublinear_tf=False
    ))
])
from sklearn.linear_model import SGDClassifier
model_1 = BaggingClassifier(
    estimator=SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'),
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)
print(f"      Training...", end=" ", flush=True)
X_train_1 = vectorizer_1.fit_transform(X_train_text)
model_1.fit(X_train_1, y_train)
print(f"‚úì Features: {X_train_1.shape[1]}")

# Model 2: LogisticRegression + TfidfVectorizer (77.75%)
print("\n   [2/4] LogisticRegression + TF-IDF (77.75%)")
vectorizer_2 = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=False
)
model_2 = LogisticRegression(
    C=1.0,
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
print(f"      Training...", end=" ", flush=True)
X_train_2 = vectorizer_2.fit_transform(X_train_text)
model_2.fit(X_train_2, y_train)
print(f"‚úì Features: {X_train_2.shape[1]}")

# Model 3: BaggingSGD + TfidfVectorizer (77.75%)
print("\n   [3/4] BaggingSGD + TF-IDF (77.75%)")
vectorizer_3 = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=False
)
model_3 = BaggingClassifier(
    estimator=SGDClassifier(max_iter=1000, random_state=42, class_weight='balanced'),
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)
print(f"      Training...", end=" ", flush=True)
X_train_3 = vectorizer_3.fit_transform(X_train_text)
model_3.fit(X_train_3, y_train)
print(f"‚úì Features: {X_train_3.shape[1]}")

# Model 4: RidgeClassifier + TfidfVectorizer (77.61%)
print("\n   [4/4] RidgeClassifier + TF-IDF (77.61%)")
vectorizer_4 = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=False
)
model_4 = RidgeClassifier(
    alpha=1.0,
    random_state=42,
    class_weight='balanced'
)
print(f"      Training...", end=" ", flush=True)
X_train_4 = vectorizer_4.fit_transform(X_train_text)
model_4.fit(X_train_4, y_train)
print(f"‚úì Features: {X_train_4.shape[1]}")

# ===============================
# CREATE FUSION ENSEMBLE
# ===============================

print("\n3. Creating FUSION ensemble...")

fusion_model = FusionEnsemble()

# Add models with weights (based on performance)
fusion_model.add_model(model_1, vectorizer_1, weight=1.0)  # 77.89% - highest weight
fusion_model.add_model(model_2, vectorizer_2, weight=0.98)  # 77.75%
fusion_model.add_model(model_3, vectorizer_3, weight=0.98)  # 77.75%
fusion_model.add_model(model_4, vectorizer_4, weight=0.97)  # 77.61%

print("‚úì Fusion ensemble created with 4 top models")

# ===============================
# EVALUATE FUSION ENSEMBLE
# ===============================

print("\n4. Evaluating fusion ensemble...")

y_pred_fusion = fusion_model.predict(X_test_text)
y_proba_fusion = fusion_model.predict_proba(X_test_text)

fusion_acc = accuracy_score(y_test, y_pred_fusion)
fusion_f1 = f1_score(y_test, y_pred_fusion, average='weighted')
fusion_prec = precision_score(y_test, y_pred_fusion, average='weighted')
fusion_rec = recall_score(y_test, y_pred_fusion, average='weighted')

print("\n" + "="*70)
print("üèÜ FUSION ENSEMBLE RESULTS")
print("="*70)
print(f"Accuracy:  {fusion_acc:.4f} ({fusion_acc*100:.2f}%)")
print(f"F1 Score:  {fusion_f1:.4f}")
print(f"Precision: {fusion_prec:.4f}")
print(f"Recall:    {fusion_rec:.4f}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred_fusion, target_names=le.classes_.astype(str)))

# Compare with individual models
print("\nüìà Individual Model Performance:")
for i in range(4):
    X_test_i = fusion_model.models[i]['vectorizer'].transform(X_test_text)
    y_pred_i = fusion_model.models[i]['model'].predict(X_test_i)
    acc_i = accuracy_score(y_test, y_pred_i)
    f1_i = f1_score(y_test, y_pred_i, average='weighted')
    print(f"  Model {i+1}: Acc={acc_i:.4f}, F1={f1_i:.4f}")

print(f"\n‚ú® Fusion Improvement: +{(fusion_f1 - 0.7784):.4f} F1 score")

# ===============================
# SAVE FUSION MODEL
# ===============================

print("\n5. Saving fusion ensemble...")

joblib.dump(fusion_model, 'models/fusion_ensemble.pkl')
joblib.dump(le, 'models/label_encoder.pkl')  # Save label encoder
print("‚úì Saved: models/fusion_ensemble.pkl")
print("‚úì Saved: models/label_encoder.pkl")

import json
fusion_info = {
    'model_type': 'Fusion Ensemble',
    'num_models': 4,
    'models': [
        'BaggingSGD + FeatureUnion',
        'LogisticRegression + TF-IDF',
        'BaggingSGD + TF-IDF',
        'RidgeClassifier + TF-IDF'
    ],
    'performance': {
        'accuracy': float(fusion_acc),
        'f1_score': float(fusion_f1),
        'precision': float(fusion_prec),
        'recall': float(fusion_rec)
    },
    'labels': le.classes_.tolist(),
    'timestamp': datetime.now().isoformat()
}

with open('models/fusion_ensemble_info.json', 'w') as f:
    json.dump(fusion_info, f, indent=2)
print("‚úì Saved: models/fusion_ensemble_info.json")

# Test predictions
print("\n6. Testing with sample texts...")
test_samples = [
    "I feel so stressed and overwhelmed with work",
    "Today is a beautiful day, feeling great!",
    "Having panic attacks and can't sleep at all",
    "Everything is going well in my life"
]

for text in test_samples:
    pred_idx = fusion_model.predict([text])[0]
    proba = fusion_model.predict_proba([text])[0]
    pred_label = le.inverse_transform([pred_idx])[0]
    confidence = proba[pred_idx]
    print(f"\nText: {text[:60]}...")
    print(f"Prediction: {pred_label} (Confidence: {confidence:.2%})")

print("\n" + "="*70)
print("‚úÖ READY! Now create api.py and frontend.html")
print("="*70)

FUSION ENSEMBLE + FLASK API - PUBLICATION READY

1. Loading data...
‚úì Created label encoder: [0 1]
‚úì Data loaded: Train=2270, Test=568

2. Building TOP 4 models from your 77.8% results...

   [1/4] BaggingSGD + FeatureUnion (77.89%)
      Training... ‚úì Features: 15000

   [2/4] LogisticRegression + TF-IDF (77.75%)
      Training... ‚úì Features: 15000

   [3/4] BaggingSGD + TF-IDF (77.75%)
      Training... ‚úì Features: 15000

   [4/4] RidgeClassifier + TF-IDF (77.61%)
      Training... ‚úì Features: 15000

3. Creating FUSION ensemble...
‚úì Fusion ensemble created with 4 top models

4. Evaluating fusion ensemble...

üèÜ FUSION ENSEMBLE RESULTS
Accuracy:  0.7465 (74.65%)
F1 Score:  0.7466
Precision: 0.7470
Recall:    0.7465

üìä Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74       270
           1       0.77      0.74      0.76       298

    accuracy                           0.75       568
   macro av