In [1]:
# ==========================================
# üì¶ STANDARDIZED SETUP - MENTAL STRESS DETECTION PROJECT
# ==========================================

# --- Core Imports ---
import os
import json
import logging
from pathlib import Path
from datetime import datetime
import warnings
import pandas as pd
import numpy as np

# --- Visualization (optional for EDA) ---
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# --- Text & NLP Utilities ---
import re
import string
from textblob import TextBlob
from collections import Counter

# --- Machine Learning ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# --- System Utilities ---
import joblib
from typing import Dict, Any

# --- Warnings ---
warnings.filterwarnings("ignore")

# ==========================================
# üöÄ LOGGING CONFIGURATION
# ==========================================
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)

from logging.handlers import RotatingFileHandler

log_file = LOG_DIR / "stress_detection.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        RotatingFileHandler(log_file, maxBytes=1_000_000, backupCount=3),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# ==========================================
# üóÇÔ∏è DIRECTORY STRUCTURE (MODELS / REPORTS / FRONTEND)
# ==========================================
for folder in ["models", "preprocessors", "reports", "artifacts"]:
    Path(folder).mkdir(exist_ok=True)

# ==========================================
# üß† DATA LOADING FUNCTION
# ==========================================
def load_dataset(file_path: str, encoding: str = "utf-8") -> pd.DataFrame:
    """
    Load dataset with multiple fallback encodings and detailed validation.
    Returns: pandas DataFrame
    """
    encodings = [encoding, "utf-8", "latin-1", "cp1252"]
    dataset = None
    for enc in encodings:
        try:
            dataset = pd.read_csv(file_path, encoding=enc, on_bad_lines="skip", low_memory=False)
            logger.info(f"‚úÖ Loaded dataset successfully using encoding: {enc}")
            break
        except UnicodeDecodeError:
            continue
    if dataset is None:
        raise ValueError(f"‚ùå Failed to load dataset using all encodings: {encodings}")
    
    logger.info(f"üìä Dataset Shape: {dataset.shape}")
    logger.info(f"üìë Columns: {list(dataset.columns)}")
    logger.info(f"üîç Missing Values: {dataset.isnull().sum().sum()} | Duplicates: {dataset.duplicated().sum()}")
    return dataset

# ==========================================
# üîé VALIDATION FUNCTION
# ==========================================
def validate_stress_dataset(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Validate dataset structure for stress detection tasks.
    Identifies potential text and label columns, missing data, duplicates, etc.
    """
    validation = {
        "total_samples": len(df),
        "missing_values": df.isnull().sum().to_dict(),
        "duplicate_rows": int(df.duplicated().sum()),
        "text_columns": [],
        "label_column": None,
        "issues": []
    }

    for col in df.columns:
        if df[col].dtype == "object":
            avg_len = df[col].dropna().astype(str).str.len().mean()
            unique_vals = df[col].nunique(dropna=True)

            if avg_len > 15 or unique_vals > 30:
                validation["text_columns"].append(col)
            elif unique_vals <= 10:
                validation["label_column"] = col

    for text_col in validation["text_columns"]:
        short_count = df[text_col].astype(str).str.len().lt(5).sum()
        if short_count > 0:
            validation["issues"].append(f"Column '{text_col}' has {short_count} very short entries")

    return validation

# ==========================================
# üíæ SAVE DATASET PROFILE
# ==========================================
def save_dataset_profile(df: pd.DataFrame, validation: Dict[str, Any]):
    profile = {
        "dataset_info": {
            "shape": df.shape,
            "columns": list(df.columns),
            "text_columns": validation["text_columns"],
            "label_column": validation["label_column"],
            "duplicates": validation["duplicate_rows"],
            "missing": sum(validation["missing_values"].values())
        },
        "timestamp": datetime.now().isoformat()
    }
    config_path = Path("preprocessors/dataset_config.json")
    with open(config_path, "w") as f:
        json.dump(profile, f, indent=2)
    logger.info(f"üìÅ Dataset profile saved at: {config_path}")
    return profile

# ==========================================
# ‚öôÔ∏è EXECUTION - LOAD AND VALIDATE
# ==========================================
logger.info("üöÄ Loading dataset for stress detection...")
DATA_PATH = "stress.csv"   # modify if needed
try:
    df_raw = load_dataset(DATA_PATH)
    df = df_raw.copy()
    validation = validate_stress_dataset(df)
    profile = save_dataset_profile(df, validation)

    print("=" * 60)
    print(" MENTAL STRESS DETECTION - DATA SUMMARY ")
    print("=" * 60)
    print(f"Shape: {df.shape}")
    print(f"Possible Label Column: {validation['label_column']}")
    print(f"Text Columns: {validation['text_columns']}")
    print(f"Duplicate Rows: {validation['duplicate_rows']}")
    print(f"Missing Values: {sum(validation['missing_values'].values())}")
    if validation['issues']:
        print("\n‚ö†Ô∏è Issues Found:")
        for issue in validation['issues']:
            print(f"  - {issue}")
    print("\n‚úÖ Dataset successfully loaded and validated. Ready for preprocessing.")
except Exception as e:
    logger.error(f"‚ùå Data loading failed: {e}")
    raise


2025-11-07 17:30:27,905 | INFO | üöÄ Loading dataset for stress detection...
2025-11-07 17:30:27,932 | INFO | ‚úÖ Loaded dataset successfully using encoding: utf-8
2025-11-07 17:30:27,933 | INFO | üìä Dataset Shape: (2838, 116)
2025-11-07 17:30:27,933 | INFO | üìë Columns: ['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_an

 MENTAL STRESS DETECTION - DATA SUMMARY 
Shape: (2838, 116)
Possible Label Column: subreddit
Text Columns: ['post_id', 'sentence_range', 'text']
Duplicate Rows: 0
Missing Values: 0

‚úÖ Dataset successfully loaded and validated. Ready for preprocessing.


In [2]:
# ===============================
# üåø STANDARDIZED EDA - Mental Stress Detection
# ===============================

import os
from pathlib import Path
import json
import logging
import warnings
from typing import List, Dict, Any
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("Agg")  # Allows saving plots on headless servers
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# ===============================
# Setup
# ===============================
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
sns.set_palette("husl")

# Logging
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# Output directories
REPORT_DIR = Path("reports")
FIG_DIR = REPORT_DIR / "figures"
REPORT_DIR.mkdir(exist_ok=True)
FIG_DIR.mkdir(exist_ok=True)

# Ensure NLTK stopwords are available
try:
    nltk.data.find("tokenizers/punkt")
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("punkt", quiet=True)
    nltk.download("stopwords", quiet=True)

EN_STOPWORDS = set(stopwords.words("english"))
MENTAL_STOPWORDS = EN_STOPWORDS.union({
    'like','get','would','could','should','really','much','even','also','still',
    'think','feel','know','going','want','need','one','way','people','time',
    'good','bad','well','right','thing'
})

SAVE_FIGURES = True
TEXT_COL_BLACKLIST = {"label", "subreddit"}

# ===============================
# Utility Functions
# ===============================
def dataset_summary(df: pd.DataFrame) -> Dict[str, Any]:
    """Return overall dataset summary."""
    mem = df.memory_usage(deep=True).sum() / 1024**2
    return {
        "shape": df.shape,
        "memory_mb": round(mem, 2),
        "missing_values": df.isnull().sum().to_dict(),
        "duplicates": int(df.duplicated().sum()),
        "numeric_cols": df.select_dtypes(include=[np.number]).columns.tolist(),
        "object_cols": df.select_dtypes(include=["object", "category"]).columns.tolist()
    }

def text_column_stats(df: pd.DataFrame, text_cols: List[str]) -> Dict[str, Dict[str, Any]]:
    """Compute vectorized text statistics."""
    stats = {}
    for col in text_cols:
        s = df[col].astype(str).fillna("")
        lens = s.str.len()
        words = s.str.split().str.len().fillna(0).astype(int)
        stats[col] = {
            "avg_length": float(lens.mean()),
            "avg_word_count": float(words.mean()),
            "max_length": int(lens.max()),
            "min_length": int(lens.min()),
            "short_texts": int((lens < 50).sum()),
            "long_texts": int((lens > 1000).sum()),
            "unique_values": int(df[col].nunique())
        }
    logger.info(f"Text column stats computed: {list(stats.keys())}")
    return stats

def class_distribution(df: pd.DataFrame, label_col: str) -> Dict[str, Any]:
    """Return class distribution and imbalance ratio."""
    if label_col not in df.columns:
        return {}
    counts = df[label_col].value_counts()
    ratio = None
    if len(counts) == 2 and counts.min() > 0:
        ratio = round(float(counts.max() / counts.min()), 2)
    return {
        "counts": counts.to_dict(),
        "proportions": (counts / counts.sum()).round(4).to_dict(),
        "imbalance_ratio": ratio
    }

def save_or_show(fig, name: str):
    """Save figure to reports/figures."""
    if SAVE_FIGURES:
        path = FIG_DIR / f"{name}.png"
        fig.savefig(path, bbox_inches="tight", dpi=150)
        plt.close(fig)
        logger.info(f"Saved figure: {path}")
    else:
        plt.show()

# ===============================
# Visualization Functions
# ===============================
def plot_distribution(df, col):
    if col not in df.columns:
        return
    counts = df[col].value_counts()
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    sns.barplot(x=counts.index, y=counts.values, ax=axes[0])
    axes[0].set_title(f"{col.capitalize()} Distribution")
    axes[0].tick_params(axis="x", rotation=45)
    axes[1].pie(counts.values, labels=counts.index, autopct="%.1f%%", startangle=140)
    axes[1].set_title(f"{col.capitalize()} Percentage")
    save_or_show(fig, f"{col}_distribution")

def subreddit_vs_label(df, subreddit_col="subreddit", label_col="label"):
    if subreddit_col not in df.columns or label_col not in df.columns:
        return
    top_subs = df[subreddit_col].value_counts().head(10).index.tolist()
    sub_df = df[df[subreddit_col].isin(top_subs)]
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.countplot(data=sub_df, x=subreddit_col, hue=label_col, ax=ax, order=top_subs)
    ax.set_title("Subreddit vs Stress Level")
    ax.tick_params(axis="x", rotation=45)
    save_or_show(fig, "subreddit_vs_label")

def generate_wordcloud(df, text_col, label_col=None):
    """Generate per-label and overall word clouds."""
    def clean_text(t):
        t = str(t).lower()
        t = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", t)
        t = re.sub(r"[^a-z\s]", " ", t)
        return re.sub(r"\s+", " ", t).strip()
    
    if text_col not in df.columns:
        return
    
    labels = [None]
    if label_col and label_col in df.columns:
        labels = sorted(df[label_col].dropna().unique(), key=lambda x: str(x))
    
    for label in labels:
        subset = df if label is None else df[df[label_col] == label]
        text = " ".join(clean_text(t) for t in subset[text_col].dropna().astype(str))
        if not text.strip():
            continue
        wc = WordCloud(width=900, height=450, stopwords=MENTAL_STOPWORDS,
                       background_color="white", max_words=150, random_state=42).generate(text)
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wc, interpolation="bilinear")
        title = "Overall Word Cloud" if label is None else f"Word Cloud - {label}"
        ax.set_title(title, fontsize=14)
        ax.axis("off")
        fname = f"{text_col}_wordcloud_{label if label else 'overall'}"
        save_or_show(fig, fname)

# ===============================
# Main EDA Runner
# ===============================
def run_eda(df: pd.DataFrame, label_col="label", subreddit_col="subreddit") -> Dict[str, Any]:
    """Perform complete EDA and save results."""
    report = {}
    report["summary"] = dataset_summary(df)
    text_cols = [c for c in df.select_dtypes(include=["object", "category"]).columns if c not in TEXT_COL_BLACKLIST]
    report["text_stats"] = text_column_stats(df, text_cols)
    report["class_distribution"] = class_distribution(df, label_col)
    
    # Visualizations
    for col in [label_col, subreddit_col]:
        if col in df.columns:
            plot_distribution(df, col)
    if subreddit_col in df.columns and label_col in df.columns:
        subreddit_vs_label(df, subreddit_col, label_col)
    for text_col in text_cols:
        generate_wordcloud(df, text_col, label_col)
    
    # Save JSON
    with open(REPORT_DIR / "eda_results.json", "w") as f:
        json.dump(report, f, indent=2)
    logger.info("EDA report saved to reports/eda_results.json")
    return report

# ===============================
# ‚úÖ SAFE EXECUTION BLOCK
# ===============================
from pathlib import Path
import pandas as pd

try:
    stress
except NameError:
    if Path("stress.csv").exists():
        stress = pd.read_csv("stress.csv")
        print("üìÑ 'stress.csv' loaded successfully as 'stress'")
    elif Path("data.csv").exists():
        stress = pd.read_csv("data.csv")
        print("üìÑ 'data.csv' loaded successfully as 'stress'")
    else:
        raise FileNotFoundError("‚ùå No dataset found. Run the data loading cell first.")

try:
    eda_report = run_eda(stress, label_col="label", subreddit_col="subreddit")
    print("\n‚úÖ EDA COMPLETE")
    print("üìä Results saved: reports/eda_results.json")
    print("üñºÔ∏è Figures saved: reports/figures/")
except Exception as e:
    import traceback
    print("‚ùå EDA FAILED:", str(e))
    traceback.print_exc()


2025-11-07 17:30:32,195 | INFO | Text column stats computed: ['post_id', 'sentence_range', 'text']
2025-11-07 17:30:32,214 | INFO | Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-11-07 17:30:32,219 | INFO | Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-11-07 17:30:32,320 | INFO | Saved figure: reports/figures/label_distribution.png


üìÑ 'stress.csv' loaded successfully as 'stress'


2025-11-07 17:30:32,462 | INFO | Saved figure: reports/figures/subreddit_distribution.png
2025-11-07 17:30:32,574 | INFO | Saved figure: reports/figures/subreddit_vs_label.png
2025-11-07 17:30:32,971 | INFO | Saved figure: reports/figures/post_id_wordcloud_overall.png
2025-11-07 17:30:33,316 | INFO | Saved figure: reports/figures/post_id_wordcloud_1.png
2025-11-07 17:30:33,845 | INFO | Saved figure: reports/figures/text_wordcloud_overall.png
2025-11-07 17:30:34,518 | INFO | Saved figure: reports/figures/text_wordcloud_1.png
2025-11-07 17:30:34,520 | INFO | EDA report saved to reports/eda_results.json



‚úÖ EDA COMPLETE
üìä Results saved: reports/eda_results.json
üñºÔ∏è Figures saved: reports/figures/


In [None]:
# ==========================================
# üß† ENHANCED NLP PREPROCESSING - MENTAL STRESS DETECTION
# ==========================================
import re
import time
import json
import string
import warnings
from pathlib import Path
from typing import List
import pandas as pd
import numpy as np
from tqdm import tqdm

# NLP
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ==========================================
# SETUP
# ==========================================
tqdm.pandas()
Path("preprocessors").mkdir(exist_ok=True)

# --- Load required NLTK resources ---
required_nltk_downloads = ["stopwords", "wordnet", "punkt", "omw-1.4"]
for r in required_nltk_downloads:
    try:
        nltk.data.find(f"corpora/{r}")
    except LookupError:
        nltk.download(r, quiet=True)

# --- Initialize spaCy (if available) ---
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tok2vec", "tagger", "attribute_ruler"])
    LEMMATIZER = "spacy"
except OSError:
    print("‚ö†Ô∏è spaCy model not found. Falling back to NLTK lemmatizer.")
    nlp = None
    LEMMATIZER = "nltk"

# --- Stopwords setup ---
MENTAL_STOPWORDS = set(stopwords.words("english")).union({
    "like", "get", "would", "could", "should", "really", "much", "even", "also", "still",
    "think", "feel", "know", "going", "want", "need", "one", "way", "people", "time",
    "good", "bad", "well", "right", "thing", "reddit", "post", "comment", "subreddit", "thread",
    "op", "edit", "update", "really", "pretty", "quite", "very", "super", "totally", "completely",
    "absolutely", "definitely", "probably", "maybe", "perhaps", "might"
})

PRESERVE_KEYWORDS = {
    "stress", "stressed", "anxiety", "anxious", "depression", "depressed", "panic", "worry",
    "fear", "overwhelmed", "tired", "exhausted", "sad", "happy", "angry", "frustrated",
    "hopeless", "helpless", "therapy", "counseling", "medication", "sleep", "insomnia",
    "work", "job", "family", "relationship", "health", "money", "support", "help",
    "better", "worse", "difficult", "hard", "easy"
}

lemmatizer = WordNetLemmatizer()

# ==========================================
# CLEANING HELPERS
# ==========================================
def clean_social_text(text: str) -> str:
    """Remove social media noise and normalize text."""
    if not isinstance(text, str) or not text.strip():
        return ""
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"/[ur]/[A-Za-z0-9_-]+", "", text)
    text = re.sub(r"@[A-Za-z0-9_]+|#[A-Za-z0-9_]+", "", text)
    text = re.sub(r"\[deleted\]|\[removed\]", "", text)
    text = re.sub(r"[^a-zA-Z\s!?]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def advanced_preprocess(text: str) -> str:
    """Comprehensive text preprocessing for mental health text."""
    if not isinstance(text, str) or not text.strip():
        return ""
    text = clean_social_text(text.lower())

    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not", "'re": " are",
        "'ve": " have", "'ll": " will", "'d": " would", "'m": " am"
    }
    for c, e in contractions.items():
        text = text.replace(c, e)

    words = text.split()
    cleaned = [
        w for w in words
        if (w in PRESERVE_KEYWORDS or w not in MENTAL_STOPWORDS) and len(w) > 1
    ]
    return " ".join(cleaned)

def lemmatize_text(text: str) -> str:
    """Lemmatize using spaCy or NLTK."""
    if not isinstance(text, str) or not text.strip():
        return ""
    if nlp:
        doc = nlp(text)
        return " ".join(
            token.lemma_ if token.lemma_ != "-PRON-" else token.text
            for token in doc if not token.is_space
        )
    else:
        return " ".join(lemmatizer.lemmatize(w) for w in text.split())

def full_clean_pipeline(text: str) -> str:
    """Full text normalization pipeline."""
    cleaned = advanced_preprocess(text)
    lemmatized = lemmatize_text(cleaned)
    final = re.sub(r"\s+", " ", lemmatized).strip()
    return final if final else "empty_text"

# ==========================================
# BATCH PROCESSOR
# ==========================================
def batch_clean_texts(text_series: pd.Series, batch_size: int = 1000) -> pd.Series:
    """Efficient batch cleaning with progress tracking."""
    processed = []
    n = len(text_series)
    total_batches = (n // batch_size) + (1 if n % batch_size else 0)
    print(f"üßπ Cleaning {n:,} texts in {total_batches} batches of {batch_size}...")
    start = time.time()
    for i in range(0, n, batch_size):
        batch = text_series.iloc[i:i+batch_size]
        processed.extend(batch.progress_apply(full_clean_pipeline))
    print(f"‚úÖ Completed in {time.time()-start:.1f}s")
    return pd.Series(processed, index=text_series.index)

# ==========================================
# EXECUTION
# ==========================================
print("="*60)
print("ENHANCED TEXT PREPROCESSING FOR MENTAL STRESS DETECTION")
print("="*60)

# Detect text column
text_cols = [c for c in stress.columns if stress[c].dtype == "object" and c not in ["label", "subreddit"]]
if not text_cols:
    raise ValueError("‚ö†Ô∏è No text columns found. Please verify your dataset.")
TEXT_COL = text_cols[0]
print(f"üìò Text column detected: '{TEXT_COL}'")

# Display sample before cleaning
print("\nüìÑ Sample Original Texts:")
for i, t in enumerate(stress[TEXT_COL].dropna().head(3), 1):
    print(f"{i}. {t[:100]}...")

# Run cleaning
stress["clean_text"] = batch_clean_texts(stress[TEXT_COL])

# Display after cleaning
print("\nüßæ Sample Cleaned Texts:")
for i, (o, c) in enumerate(zip(stress[TEXT_COL].dropna().head(3),
                               stress["clean_text"].dropna().head(3)), 1):
    print(f"{i}. Original: {o[:80]}...")
    print(f"   Cleaned:  {c[:80]}...")

# ==========================================
# METRICS & SUMMARY
# ==========================================
orig_len = stress[TEXT_COL].astype(str).str.len().mean()
clean_len = stress["clean_text"].astype(str).str.len().mean()
orig_words = stress[TEXT_COL].astype(str).str.split().str.len().mean()
clean_words = stress["clean_text"].astype(str).str.split().str.len().mean()

print("\nüßÆ CLEANING STATISTICS:")
print(f"Average Original Length: {orig_len:.1f} chars")
print(f"Average Cleaned Length:  {clean_len:.1f} chars")
print(f"Length Reduction:        {(1 - clean_len/orig_len)*100:.1f}%")
print(f"Average Original Words:  {orig_words:.1f}")
print(f"Average Cleaned Words:   {clean_words:.1f}")
print(f"Word Reduction:          {(1 - clean_words/orig_words)*100:.1f}%")

empty_texts = (stress["clean_text"] == "empty_text").sum()
if empty_texts > 0:
    print(f"‚ö†Ô∏è {empty_texts} rows resulted in empty text after cleaning.")

# ==========================================
# SAVE CONFIGURATION
# ==========================================
config = {
    "stopwords_count": len(MENTAL_STOPWORDS),
    "preserved_keywords": sorted(list(PRESERVE_KEYWORDS)),
    "text_column": TEXT_COL,
    "lemmatization": LEMMATIZER,
    "settings": {
        "remove_digits": True,
        "preserve_emphasis": True,
        "min_word_length": 2
    }
}
with open("preprocessors/text_preprocessing_config.json", "w") as f:
    json.dump(config, f, indent=2)
print("\nüìÅ Configuration saved ‚Üí preprocessors/text_preprocessing_config.json")

print(f"\n‚úÖ Text preprocessing complete! New column 'clean_text' created.")
print(f"Final dataset shape: {stress.shape}")


ENHANCED TEXT PREPROCESSING FOR MENTAL STRESS DETECTION
üìò Text column detected: 'post_id'

üìÑ Sample Original Texts:
1. 8601tu...
2. 8lbrx9...
3. 9ch1zh...
üßπ Cleaning 2,838 texts in 3 batches of 1000...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 30044.51it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 44157.54it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 838/838 [00:00<00:00, 40760.59it/s]

‚úÖ Completed in 0.1s

üßæ Sample Cleaned Texts:
1. Original: 8601tu...
   Cleaned:  tu...
2. Original: 8lbrx9...
   Cleaned:  lbrx...
3. Original: 9ch1zh...
   Cleaned:  ch zh...

üßÆ CLEANING STATISTICS:
Average Original Length: 6.0 chars
Average Cleaned Length:  4.3 chars
Length Reduction:        27.6%
Average Original Words:  1.0
Average Cleaned Words:   1.1
Word Reduction:          -8.4%
‚ö†Ô∏è 336 rows resulted in empty text after cleaning.

üìÅ Configuration saved ‚Üí preprocessors/text_preprocessing_config.json

‚úÖ Text preprocessing complete! New column 'clean_text' created.
Final dataset shape: (2838, 117)





In [4]:
# ==========================================
# ‚öôÔ∏è ENVIRONMENT SETUP - NLP DEPENDENCIES
# ==========================================
import ssl
import nltk
import subprocess
import sys

# --- Fix SSL issues (for Mac / older Python builds) ---
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# --- Required NLTK resources ---
nltk_resources = {
    "tokenizers": ["punkt", "punkt_tab"],
    "corpora": ["stopwords", "wordnet", "words"],
    "taggers": ["averaged_perceptron_tagger"],
    "chunkers": ["maxent_ne_chunker"],
    "sentiment": ["vader_lexicon"]
}

print("üì¶ Checking & Downloading NLTK Resources...")
for category, resources in nltk_resources.items():
    for resource in resources:
        try:
            nltk.data.find(f"{category}/{resource}")
            print(f"‚úÖ {resource} already available")
        except LookupError:
            print(f"‚¨áÔ∏è Downloading {resource} ...")
            nltk.download(resource, quiet=True)
            print(f"‚úÖ {resource} downloaded successfully")

print("\n‚úÖ All NLTK resources ready!\n")

# ==========================================
# üß† OPTIONAL: INSTALL & LOAD spaCy MODEL
# ==========================================
USE_SPACY = True  # Set False if you want to skip spaCy setup for lightweight runs

if USE_SPACY:
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tok2vec", "tagger", "attribute_ruler"])
        print("‚úÖ spaCy model 'en_core_web_sm' loaded successfully")
    except OSError:
        print("‚ö†Ô∏è spaCy model not found. Attempting to download...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        import spacy
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tok2vec", "tagger", "attribute_ruler"])
        print("‚úÖ spaCy model 'en_core_web_sm' downloaded and loaded successfully")
    except Exception as e:
        print(f"‚ö†Ô∏è spaCy could not be initialized: {e}")
        nlp = None
else:
    nlp = None
    print("‚öôÔ∏è Skipping spaCy setup (USE_SPACY=False)")

# ==========================================
# ‚úÖ SUMMARY
# ==========================================
print("\nüîß Environment Initialization Complete")
print("NLTK + spaCy (optional) are ready for NLP tasks.\n")


üì¶ Checking & Downloading NLTK Resources...
‚úÖ punkt already available
‚úÖ punkt_tab already available
‚úÖ stopwords already available
‚¨áÔ∏è Downloading wordnet ...
‚úÖ wordnet downloaded successfully
‚úÖ words already available
‚úÖ averaged_perceptron_tagger already available
‚úÖ maxent_ne_chunker already available
‚¨áÔ∏è Downloading vader_lexicon ...
‚úÖ vader_lexicon downloaded successfully

‚úÖ All NLTK resources ready!

‚úÖ spaCy model 'en_core_web_sm' loaded successfully

üîß Environment Initialization Complete
NLTK + spaCy (optional) are ready for NLP tasks.



In [5]:
# ==========================================
# ‚öôÔ∏è MODEL ENVIRONMENT & IMPORTS
# ==========================================

import os
import sys
import re
import json
import time
import joblib
import warnings
from datetime import datetime
from collections import Counter

# Core Data Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

warnings.filterwarnings("ignore")

# ==========================================
# NLP & TEXT PROCESSING
# ==========================================
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag

# ==========================================
# FEATURE EXTRACTION & VECTORIZATION
# ==========================================
from sklearn.feature_extraction.text import (
    TfidfVectorizer, CountVectorizer, HashingVectorizer
)
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# ==========================================
# MODEL SELECTION & EVALUATION
# ==========================================
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, 
    cross_val_predict, GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score
)
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

# ==========================================
# MACHINE LEARNING MODELS
# ==========================================
from sklearn.linear_model import (
    LogisticRegression, RidgeClassifier, SGDClassifier
)
from sklearn.naive_bayes import (
    MultinomialNB, ComplementNB, BernoulliNB
)
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, AdaBoostClassifier, 
    ExtraTreesClassifier, GradientBoostingClassifier,
    StackingClassifier, VotingClassifier, BaggingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)

# ==========================================
# ADVANCED MODELS (XGBoost / LightGBM / CatBoost)
# ==========================================
def safe_import(package_name, import_name=None):
    """Safely import a package, installing if necessary."""
    try:
        module = __import__(package_name if import_name is None else import_name)
        print(f"‚úÖ {package_name} available")
        return module
    except ImportError:
        print(f"‚¨áÔ∏è Installing {package_name}...")
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        module = __import__(package_name if import_name is None else import_name)
        print(f"‚úÖ {package_name} installed successfully")
        return module

# ---- XGBoost ----
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
    print("‚úÖ XGBoost available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("‚ö†Ô∏è XGBoost not installed")

# ---- LightGBM ----
try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
    print("‚úÖ LightGBM available")
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("‚ö†Ô∏è LightGBM not installed")

# ---- CatBoost ----
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
    print("‚úÖ CatBoost available")
except ImportError:
    CATBOOST_AVAILABLE = False
    print("‚ö†Ô∏è CatBoost not installed")

# ==========================================
# TEXT ANALYSIS UTILITIES
# ==========================================
# ---- TextStat (Readability Scores) ----
try:
    from textstat import flesch_reading_ease, flesch_kincaid_grade
    TEXTSTAT_AVAILABLE = True
    print("‚úÖ TextStat available")
except ImportError:
    TEXTSTAT_AVAILABLE = False
    print("‚¨áÔ∏è Installing TextStat...")
    textstat = safe_import("textstat")
    from textstat import flesch_reading_ease, flesch_kincaid_grade
    TEXTSTAT_AVAILABLE = True

# ---- Vader Sentiment ----
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_AVAILABLE = True
    print("‚úÖ VaderSentiment available")
except ImportError:
    VADER_AVAILABLE = False
    print("‚¨áÔ∏è Installing VaderSentiment...")
    vaderSentiment = safe_import("vaderSentiment")
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_AVAILABLE = True

# ==========================================
# COMPLETION LOG
# ==========================================
print("\nüéØ All imports and dependencies successfully initialized!\n")

# ==========================================
# VERSION SUMMARY
# ==========================================
import sklearn
print("üì¶ Library Versions:")
print(f"  numpy:      {np.__version__}")
print(f"  pandas:     {pd.__version__}")
print(f"  scikit-learn: {sklearn.__version__}")
print(f"  nltk:       {nltk.__version__}")

try:
    import spacy
    print(f"  spacy:      {spacy.__version__}")
except ImportError:
    print("  spacy:      Not Installed")

if XGBOOST_AVAILABLE:
    import xgboost
    print(f"  xgboost:    {xgboost.__version__}")
if LIGHTGBM_AVAILABLE:
    import lightgbm
    print(f"  lightgbm:   {lightgbm.__version__}")
if CATBOOST_AVAILABLE:
    import catboost
    print(f"  catboost:   {catboost.__version__}")

print("\n‚úÖ Environment fully configured and ready for model training!\n")


‚úÖ XGBoost available
‚úÖ LightGBM available
‚úÖ CatBoost available
‚úÖ TextStat available
‚úÖ VaderSentiment available

üéØ All imports and dependencies successfully initialized!

üì¶ Library Versions:
  numpy:      1.26.4
  pandas:     2.3.1
  scikit-learn: 1.7.1
  nltk:       3.9.1
  spacy:      3.8.7
  xgboost:    3.1.1
  lightgbm:   4.6.0
  catboost:   1.2.8

‚úÖ Environment fully configured and ready for model training!



In [None]:
pip install XGBoost

In [None]:
pip install LightGBM

In [None]:
pip install CatBoost

In [6]:
# ==========================================
# ‚öôÔ∏è OPTIMIZED HIGH-PERFORMANCE TEXT PREPROCESSOR
# ==========================================
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

# Sentiment Analyzer check
if "VADER_AVAILABLE" in globals() and VADER_AVAILABLE:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    USE_VADER = True
else:
    USE_VADER = False

class HighPerformanceTextPreprocessor:
    def __init__(self):
        """Initialize with advanced stress and emotion lexicons."""
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words("english"))
        self.sentiment_analyzer = SentimentIntensityAnalyzer() if USE_VADER else None

        # ===== Mental Health Lexicons =====
        self.stress_keywords = {
            'stress','stressed','stressing','stressful','pressure','pressured','overwhelming',
            'overwhelm','overwhelmed','anxiety','anxious','worried','worry','panic','fear',
            'nervous','afraid','scared','terrified','depression','depressed','sad','hopeless',
            'helpless','miserable','angry','frustrated','tired','exhausted','fatigue','burnout',
            'sleepless','insomnia','pain','crying','breakdown','broken','hurt','lost','trapped',
            'suffocating','drowning','confused','hurt','worse','shattered'
        }

        self.positive_keywords = {
            'happy','joy','joyful','excited','amazing','wonderful','fantastic','great','excellent',
            'beautiful','calm','peaceful','relaxed','content','satisfied','cheerful','optimistic',
            'grateful','thankful','love','loving','supported','helped','better','healing','hopeful'
        }

        # Exclude mental health terms from stopwords
        self.stop_words = self.stop_words - self.stress_keywords - self.positive_keywords

        # Regex pre-compilation for faster processing
        self.url_pattern = re.compile(r"http[s]?://[^\s]+|www\.[^\s]+", re.I)
        self.non_alphanum = re.compile(r"[^a-zA-Z0-9\s!?.,;-]", re.I)
        self.multi_space = re.compile(r"\s+")
    
    # ==========================================
    # üß† TEXT CLEANING
    # ==========================================
    def clean_text_advanced(self, text: str) -> str:
        """Cleans and lemmatizes text efficiently while preserving context."""
        if not isinstance(text, str) or not text.strip():
            return ""

        text = text.lower()
        text = self.url_pattern.sub(" url ", text)
        text = re.sub(r"[!]{2,}", "!!", text)
        text = re.sub(r"[?]{2,}", "??", text)
        text = self.non_alphanum.sub(" ", text)
        text = self.multi_space.sub(" ", text).strip()

        # Tokenize
        try:
            tokens = word_tokenize(text)
        except:
            tokens = text.split()

        processed = []
        for token in tokens:
            if len(token) < 2 and token not in {"i", "!", "?"}:
                continue

            if token in self.stress_keywords or token in self.positive_keywords:
                processed.append(token)
            elif token not in self.stop_words:
                try:
                    processed.append(self.lemmatizer.lemmatize(token))
                except:
                    processed.append(token)
            elif token in {"i", "me", "my", "no", "not", "!", "?"}:
                processed.append(token)

        return " ".join(processed)

    # ==========================================
    # üß© FEATURE EXTRACTION
    # ==========================================
    def extract_advanced_features(self, text: str) -> dict:
        """Extracts robust linguistic, emotional, and contextual features."""
        if not isinstance(text, str) or not text.strip():
            return {f"feature_{i}": 0 for i in range(25)}

        text = text.lower()
        words = text.split()
        word_count = len(words)
        char_count = len(text)
        features = {}

        # Basic structural stats
        features["char_count"] = char_count
        features["word_count"] = word_count
        features["avg_word_length"] = np.mean([len(w) for w in words]) if words else 0

        sentences = [s.strip() for s in text.split('.') if s.strip()]
        features["sentence_count"] = len(sentences)
        features["avg_sentence_length"] = word_count / max(len(sentences), 1)

        # Keyword-based emotional density
        stress_count = sum(word in self.stress_keywords for word in words)
        positive_count = sum(word in self.positive_keywords for word in words)
        features["stress_density"] = stress_count / max(word_count, 1)
        features["positive_density"] = positive_count / max(word_count, 1)
        features["emotional_polarity"] = features["positive_density"] - features["stress_density"]

        # Intensity & negation
        intensity_words = ["very","extremely","really","so","too","completely","totally","absolutely"]
        negation_words = ["not","no","never","nothing","nobody","none","cant","cannot","wont","dont"]
        features["intensity_ratio"] = sum(text.count(w) for w in intensity_words) / max(word_count, 1)
        features["negation_ratio"] = sum(text.count(w) for w in negation_words) / max(word_count, 1)

        # Pronoun & punctuation
        pronouns = [" i ", " me ", " my ", " myself "]
        features["first_person_ratio"] = sum(text.count(p) for p in pronouns) / max(word_count, 1)
        features["exclamation_ratio"] = text.count("!") / max(char_count, 1)
        features["question_ratio"] = text.count("?") / max(char_count, 1)

        # Sentiment analysis
        if USE_VADER and self.sentiment_analyzer:
            try:
                s = self.sentiment_analyzer.polarity_scores(text)
                features.update({
                    "sentiment_positive": s["pos"],
                    "sentiment_negative": s["neg"],
                    "sentiment_neutral": s["neu"],
                    "sentiment_compound": s["compound"],
                })
            except:
                features.update({
                    "sentiment_positive": 0.0,
                    "sentiment_negative": 0.0,
                    "sentiment_neutral": 1.0,
                    "sentiment_compound": 0.0,
                })
        else:
            features["sentiment_positive"] = positive_count / max(word_count, 1)
            features["sentiment_negative"] = stress_count / max(word_count, 1)
            features["sentiment_neutral"] = 1 - features["sentiment_positive"] - features["sentiment_negative"]
            features["sentiment_compound"] = features["sentiment_positive"] - features["sentiment_negative"]

        # Contextual domains
        work_terms = ["work", "job", "office", "project", "deadline"]
        family_terms = ["family", "husband", "wife", "parents", "children", "home"]
        health_terms = ["health", "doctor", "pain", "sick", "hospital", "medicine"]
        features["work_context"] = sum(text.count(w) for w in work_terms) / max(word_count, 1)
        features["family_context"] = sum(text.count(w) for w in family_terms) / max(word_count, 1)
        features["health_context"] = sum(text.count(w) for w in health_terms) / max(word_count, 1)

        return features

print("‚úÖ High-Performance Text Preprocessor initialized successfully.")


‚úÖ High-Performance Text Preprocessor initialized successfully.


In [7]:
# ==========================================
# üß© ADVANCED VECTORIZER CREATOR
# ==========================================
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def create_advanced_vectorizers(custom_stopwords=None):
    """
    Create a suite of optimized vectorizers for mental health text analysis.
    
    Includes: TF-IDF (unigram‚Äìtrigram), character-level, and CountVectorizers.
    All settings tuned for emotional and linguistic context retention.
    """
    stop_words = custom_stopwords if custom_stopwords else 'english'
    
    vectorizers = {
        # TF-IDF - Unigrams (core semantics)
        'tfidf_unigram': TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.95,
            stop_words=stop_words,
            sublinear_tf=True,
            use_idf=True,
            lowercase=True,
            strip_accents='unicode'
        ),

        # TF-IDF - Bigrams (captures sentiment context like "feel bad")
        'tfidf_bigram': TfidfVectorizer(
            max_features=15000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words=stop_words,
            sublinear_tf=True,
            use_idf=True,
            lowercase=True,
            strip_accents='unicode'
        ),

        # TF-IDF - Trigrams (captures phrases like "hard to breathe")
        'tfidf_trigram': TfidfVectorizer(
            max_features=20000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.9,
            stop_words=stop_words,
            sublinear_tf=True,
            use_idf=True,
            lowercase=True,
            strip_accents='unicode'
        ),

        # Count Vectorizer - Unigram (raw frequency baseline)
        'count_unigram': CountVectorizer(
            max_features=8000,
            ngram_range=(1, 1),
            min_df=3,
            max_df=0.95,
            stop_words=stop_words,
            lowercase=True,
            strip_accents='unicode'
        ),

        # Count Vectorizer - Bigram (frequency with light context)
        'count_bigram': CountVectorizer(
            max_features=12000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words=stop_words,
            lowercase=True,
            strip_accents='unicode'
        ),

        # TF-IDF Character-level (captures spelling/emotional emphasis like "soooo tired")
        'tfidf_char': TfidfVectorizer(
            max_features=8000,
            analyzer='char',
            ngram_range=(3, 5),
            min_df=3,
            max_df=0.95,
            sublinear_tf=True
        ),
    }
    
    print(f"‚úÖ Created {len(vectorizers)} optimized vectorizers:")
    for name, vec in vectorizers.items():
        ngram_info = f"{vec.ngram_range}" if hasattr(vec, "ngram_range") else "N/A"
        print(f"   ‚Ä¢ {name:<15} | n-gram: {ngram_info} | max_features: {vec.max_features}")
    
    return vectorizers


# ==========================================
# ‚öôÔ∏è Instantiate Vectorizers
# ==========================================
vectorizers = create_advanced_vectorizers()


‚úÖ Created 6 optimized vectorizers:
   ‚Ä¢ tfidf_unigram   | n-gram: (1, 1) | max_features: 10000
   ‚Ä¢ tfidf_bigram    | n-gram: (1, 2) | max_features: 15000
   ‚Ä¢ tfidf_trigram   | n-gram: (1, 3) | max_features: 20000
   ‚Ä¢ count_unigram   | n-gram: (1, 1) | max_features: 8000
   ‚Ä¢ count_bigram    | n-gram: (1, 2) | max_features: 12000
   ‚Ä¢ tfidf_char      | n-gram: (3, 5) | max_features: 8000


In [9]:
# ==========================================
# üß© NOVEL, BoW & ENSEMBLE VECTORIZERS
# ==========================================
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix

# ==========================================
# üìö CREATE NOVEL VECTORIZERS
# ==========================================
def create_novel_vectorizers():
    """Create novel and baseline vectorization approaches (BoW, hybrid, domain-specific)."""
    
    novel_vectorizers = {
        # --- Baseline Bag of Words (Binary & Frequency) ---
        'bow_binary': CountVectorizer(
            max_features=8000, ngram_range=(1, 1),
            min_df=3, max_df=0.95, stop_words='english', binary=True
        ),
        'bow_freq': CountVectorizer(
            max_features=10000, ngram_range=(1, 1),
            min_df=2, max_df=0.95, stop_words='english', binary=False
        ),

        # --- Novel TF-IDF Variant ---
        'tfidf_sublinear': TfidfVectorizer(
            max_features=12000, ngram_range=(1, 2),
            min_df=2, max_df=0.85, stop_words='english',
            sublinear_tf=True, use_idf=True, norm='l1'
        ),

        # --- Hybrid Char + Word TF-IDF ---
        'hybrid_char_word': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                max_features=6000, ngram_range=(1, 2),
                min_df=3, stop_words='english')),
            ('char_tfidf', TfidfVectorizer(
                max_features=4000, analyzer='char_wb',
                ngram_range=(3, 5), min_df=3))
        ]),

        # --- Domain-Focused Mental Health Vocabulary ---
        'mental_health_focused': TfidfVectorizer(
            max_features=8000, ngram_range=(1, 2),
            min_df=2, max_df=0.9,
            stop_words=['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
        ),

        # --- Weighted TF-IDF (emotional emphasis) ---
        'weighted_tfidf': TfidfVectorizer(
            max_features=10000, ngram_range=(1, 2),
            min_df=2, max_df=0.9,
            stop_words='english', token_pattern=r'\b\w+\b',
            lowercase=True, use_idf=True, smooth_idf=True
        )
    }
    print(f"‚úÖ Created {len(novel_vectorizers)} novel vectorizers.")
    return novel_vectorizers


# ==========================================
# üß© CREATE ENSEMBLE VECTORIZERS
# ==========================================
def create_ensemble_vectorizers():
    """Create ensemble-based feature unions of multiple vectorizers."""
    ensemble_vectorizers = {
        # --- Multi-TFIDF Ensemble (uni+bi+tri) ---
        'ensemble_tfidf': FeatureUnion([
            ('tfidf_uni', TfidfVectorizer(max_features=4000, ngram_range=(1, 1), min_df=3, stop_words='english')),
            ('tfidf_bi', TfidfVectorizer(max_features=4000, ngram_range=(2, 2), min_df=2, stop_words='english')),
            ('tfidf_tri', TfidfVectorizer(max_features=2000, ngram_range=(3, 3), min_df=2, stop_words='english'))
        ]),

        # --- Count + TF-IDF Ensemble ---
        'count_tfidf_ensemble': FeatureUnion([
            ('count', CountVectorizer(max_features=5000, ngram_range=(1, 1), min_df=3, stop_words='english')),
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, stop_words='english'))
        ])
    }
    print(f"‚úÖ Created {len(ensemble_vectorizers)} ensemble vectorizers.")
    return ensemble_vectorizers


# ==========================================
# üß† CUSTOM STRESS-FOCUSED VECTORIZER
# ==========================================
class CustomStressVectorizer:
    """A novel vectorizer combining TF-IDF with stress-specific emotional features."""

    def __init__(self, max_features=8000, scaler=None):
        self.max_features = max_features
        self.scaler = scaler or StandardScaler()
        self.vectorizer = None

        # Domain-specific lexicons
        self.stress_keywords = [
            'stress','anxiety','panic','worry','fear','overwhelm','pressure',
            'depression','sad','angry','frustrated','tired','exhausted','burnout',
            'breakdown','crisis','help','therapy','medication','hopeless','helpless',
            'lonely','isolated','afraid','scared'
        ]
        self.positive_keywords = [
            'happy','joy','peaceful','calm','relaxed','wonderful','amazing',
            'fantastic','love','blessed','grateful','optimistic','confident',
            'energetic','motivated','successful'
        ]
        self.intensity_words = [
            'very','extremely','really','so','too','quite','rather','completely',
            'totally','absolutely','incredibly','tremendously'
        ]

    def _create_custom_features(self, texts):
        """Extract stress-related density, polarity, punctuation, and personal intensity features."""
        features = []
        for text in texts:
            if not isinstance(text, str):
                text = ''
            text_lower = text.lower()
            word_count = max(len(text.split()), 1)

            stress_count = sum(word in text_lower for word in self.stress_keywords)
            positive_count = sum(word in text_lower for word in self.positive_keywords)
            intensity_count = sum(word in text_lower for word in self.intensity_words)

            personal_pronouns = [' i ', ' me ', ' my ', ' myself ']
            personal_count = sum(text_lower.count(p) for p in personal_pronouns)

            features.append({
                'stress_keyword_density': stress_count / word_count,
                'positive_keyword_density': positive_count / word_count,
                'intensity_amplification': intensity_count / word_count,
                'emotional_polarity': (positive_count - stress_count) / word_count,
                'exclamation_ratio': text.count('!') / max(len(text), 1),
                'question_ratio': text.count('?') / max(len(text), 1),
                'personal_intensity': personal_count / word_count,
            })
        return pd.DataFrame(features)

    def fit_transform(self, texts):
        """Fit TF-IDF and combine with custom emotion features."""
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features - 100,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.9,
            stop_words='english'
        )
        tfidf_features = self.vectorizer.fit_transform(texts)
        custom_features = self._create_custom_features(texts)
        scaled = self.scaler.fit_transform(custom_features)
        return hstack([tfidf_features, csr_matrix(scaled)])

    def transform(self, texts):
        if self.vectorizer is None:
            raise ValueError("CustomStressVectorizer must be fitted before transform.")
        tfidf_features = self.vectorizer.transform(texts)
        custom_features = self._create_custom_features(texts)
        scaled = self.scaler.transform(custom_features)
        return hstack([tfidf_features, csr_matrix(scaled)])


# ==========================================
# üîó COMBINE ALL VECTORIZERS
# ==========================================
novel_vectorizers = create_novel_vectorizers()
ensemble_vectorizers = create_ensemble_vectorizers()
novel_vectorizers['custom_stress'] = CustomStressVectorizer(max_features=8000)

# Merge with earlier `vectorizers` (TF-IDF & Count from previous cell)
all_vectorizers = {**vectorizers, **novel_vectorizers, **ensemble_vectorizers}

# Summary logs
print(f"\nüì¶ Added {len(novel_vectorizers)} novel vectorizers:")
for n in novel_vectorizers.keys():
    print(f"   ‚Ä¢ {n}")

print(f"\nüîó Added {len(ensemble_vectorizers)} ensemble vectorizers:")
for n in ensemble_vectorizers.keys():
    print(f"   ‚Ä¢ {n}")

print(f"\n‚úÖ Total vectorizers now available: {len(all_vectorizers)}")

# Update main dictionary
vectorizers = all_vectorizers


‚úÖ Created 6 novel vectorizers.
‚úÖ Created 2 ensemble vectorizers.

üì¶ Added 7 novel vectorizers:
   ‚Ä¢ bow_binary
   ‚Ä¢ bow_freq
   ‚Ä¢ tfidf_sublinear
   ‚Ä¢ hybrid_char_word
   ‚Ä¢ mental_health_focused
   ‚Ä¢ weighted_tfidf
   ‚Ä¢ custom_stress

üîó Added 2 ensemble vectorizers:
   ‚Ä¢ ensemble_tfidf
   ‚Ä¢ count_tfidf_ensemble

‚úÖ Total vectorizers now available: 15


In [18]:
# ==========================================
# ü§ñ CELL 10 ‚Äî COMPLETE MODEL SUITE
# ==========================================
# Unified, self-contained registry of base, novel, and deep ensemble models
# Run this after all imports & vectorizers are defined 

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# --- Core sklearn imports ---
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier,
    BaggingClassifier, VotingClassifier
)
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin

# --- Optional gradient boosting libraries ---
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except Exception:
    LIGHTGBM_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except Exception:
    CATBOOST_AVAILABLE = False


# ------------------------------------------------------------
# 1Ô∏è‚É£ Base Model Factory
# ------------------------------------------------------------
def create_advanced_models():
    """Return well-configured traditional ML models."""
    models = {
        # Linear Models
        "LogisticRegression": LogisticRegression(C=1.0, max_iter=2000, random_state=42, class_weight="balanced"),
        "LogisticRegression_L1": LogisticRegression(
            C=0.5, penalty="l1", solver="liblinear", max_iter=2000, random_state=42, class_weight="balanced"
        ),
        "RidgeClassifier": RidgeClassifier(alpha=1.0),
        "SGDClassifier": SGDClassifier(loss="hinge", alpha=1e-4, max_iter=2000, random_state=42),

        # Naive Bayes
        "MultinomialNB": MultinomialNB(alpha=0.1),
        "ComplementNB": ComplementNB(alpha=0.1),
        "BernoulliNB": BernoulliNB(alpha=0.1),

        # Trees
        "RandomForest": RandomForestClassifier(
            n_estimators=200, max_depth=15, random_state=42, class_weight="balanced", n_jobs=-1
        ),
        "ExtraTrees": ExtraTreesClassifier(
            n_estimators=200, max_depth=15, random_state=42, class_weight="balanced", n_jobs=-1
        ),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42),

        # Others
        "LinearSVC": LinearSVC(C=1.0, max_iter=2000, random_state=42),
        "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=7, weights="distance", n_jobs=-1),
    }

    # Optional libraries
    if XGBOOST_AVAILABLE:
        models["XGBoost"] = XGBClassifier(
            n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42,
            eval_metric="logloss", use_label_encoder=False, n_jobs=-1
        )
    if LIGHTGBM_AVAILABLE:
        models["LightGBM"] = LGBMClassifier(
            n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42,
            class_weight="balanced", n_jobs=-1
        )
    if CATBOOST_AVAILABLE:
        models["CatBoost"] = CatBoostClassifier(
            iterations=200, learning_rate=0.1, depth=6, random_state=42,
            auto_class_weights="Balanced", verbose=False
        )

    return models


# ------------------------------------------------------------
# 2Ô∏è‚É£ Novel / Ensemble Model Factory
# ------------------------------------------------------------
def create_novel_models():
    novel = {
        "linear_stacking_voting": VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(C=1.0, random_state=42)),
                ("ridge", RidgeClassifier(alpha=0.5)),
                ("sgd", SGDClassifier(alpha=1e-4, random_state=42))
            ],
            voting="hard"
        ),
        "ada_boost_tree": AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=3), n_estimators=100, learning_rate=1.0, random_state=42
        ),
        "bagging_lr": BaggingClassifier(
            estimator=LogisticRegression(random_state=42), n_estimators=20, random_state=42, n_jobs=-1
        ),
        "feature_selection_lr": Pipeline([
            ("select", SelectKBest(chi2, k=5000)),
            ("clf", LogisticRegression(C=1.0, random_state=42))
        ]),
    }

    novel["calibrated_rf"] = CalibratedClassifierCV(RandomForestClassifier(n_estimators=100, random_state=42))
    novel["calibrated_svm"] = CalibratedClassifierCV(LinearSVC(C=1.0, max_iter=2000, random_state=42))

    if XGBOOST_AVAILABLE:
        novel["xgb_custom"] = XGBClassifier(
            n_estimators=150, learning_rate=0.15, max_depth=5, random_state=42,
            eval_metric="logloss", use_label_encoder=False, n_jobs=-1
        )
    return novel


# ------------------------------------------------------------
# 3Ô∏è‚É£ Deep & Custom Ensemble Classes
# ------------------------------------------------------------
class DeepFeatureEnsemble(BaseEstimator, ClassifierMixin):
    """Multi-representation (word/char/syntax/emotion) ensemble with meta-learner."""

    def __init__(self):
        self.learners = {
            "word": (TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words="english"),
                     LogisticRegression(max_iter=2000, random_state=42)),
            "char": (TfidfVectorizer(max_features=3000, analyzer="char_wb", ngram_range=(3, 5)),
                     MultinomialNB(alpha=0.1)),
            "syntax": (TfidfVectorizer(max_features=2000, ngram_range=(2, 3)),
                       RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)),
            "emotion": (TfidfVectorizer(max_features=1000), SGDClassifier(alpha=1e-4, random_state=42))
        }
        self.meta = XGBClassifier(
            n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42, eval_metric="logloss"
        ) if XGBOOST_AVAILABLE else LogisticRegression(C=0.5, random_state=42)
        self.fitted = False

    def fit(self, X, y):
        meta_feats = []
        for name, (vec, model) in self.learners.items():
            Xv = vec.fit_transform(X)
            model.fit(Xv, y)
            preds = model.predict_proba(Xv) if hasattr(model, "predict_proba") else model.predict(Xv)[:, None]
            meta_feats.append(preds)
        self.meta.fit(np.hstack(meta_feats), y)
        self.fitted = True
        return self

    def predict(self, X):
        if not self.fitted:
            raise ValueError("DeepFeatureEnsemble not fitted.")
        meta_feats = []
        for vec, model in self.learners.values():
            Xv = vec.transform(X)
            preds = model.predict_proba(Xv) if hasattr(model, "predict_proba") else model.predict(Xv)[:, None]
            meta_feats.append(preds)
        return self.meta.predict(np.hstack(meta_feats))


class AdaptiveBoosting(BaseEstimator, ClassifierMixin):
    """Lightweight adaptive boosting emphasizing hard stress samples."""

    def __init__(self, n_estimators=8):
        self.n_estimators = n_estimators
        self.models, self.weights = [], []
        self.fitted = False

    def fit(self, X, y):
        n = X.shape[0]
        weights = np.ones(n) / n
        for i in range(self.n_estimators):
            idx = np.random.choice(n, n, p=weights)
            Xs, ys = X[idx], y[idx]
            base = [LogisticRegression(random_state=42+i),
                    MultinomialNB(alpha=0.1),
                    RandomForestClassifier(n_estimators=20, max_depth=5, random_state=42+i)][i % 3]
            base.fit(Xs, ys)
            preds = base.predict(X)
            err = np.sum(weights * (preds != y))
            if err == 0 or err >= 0.5:
                continue
            alpha = 0.5 * np.log((1 - err) / err)
            weights *= np.exp(-alpha * (2 * y - 1) * (2 * preds - 1))
            weights /= np.sum(weights)
            self.models.append(base)
            self.weights.append(alpha)
        self.fitted = True
        return self

    def predict(self, X):
        if not self.fitted:
            raise ValueError("AdaptiveBoosting not fitted.")
        agg = np.zeros(X.shape[0])
        for m, w in zip(self.models, self.weights):
            agg += w * (2 * m.predict(X) - 1)
        return (agg > 0).astype(int)


class StressFocusedEnsemble(BaseEstimator, ClassifierMixin):
    """Stacking ensemble focused on stress classification."""

    def __init__(self):
        self.base = {
            "nb": MultinomialNB(alpha=0.05),
            "lr": LogisticRegression(C=0.5, penalty="l1", solver="liblinear", random_state=42),
            "rf": RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)
        }
        self.meta = LogisticRegression(C=0.8, random_state=42)
        self.fitted = False

    def fit(self, X, y):
        base_preds = np.column_stack([
            m.fit(X, y).predict_proba(X)[:, 1] if hasattr(m, "predict_proba") else m.fit(X, y).predict(X)
            for m in self.base.values()
        ])
        self.meta.fit(base_preds, y)
        self.fitted = True
        return self

    def predict(self, X):
        base_preds = np.column_stack([
            m.predict_proba(X)[:, 1] if hasattr(m, "predict_proba") else m.predict(X)
            for m in self.base.values()
        ])
        return self.meta.predict(base_preds)


# ------------------------------------------------------------
# 4Ô∏è‚É£ Register All Models
# ------------------------------------------------------------
if "models" not in globals():
    models = {}
models.update(create_advanced_models())
models.update(create_novel_models())
models.update({
    "deep_feature_ensemble": DeepFeatureEnsemble(),
    "adaptive_boosting": AdaptiveBoosting(),
    "stress_focused_ensemble": StressFocusedEnsemble()
})


# ------------------------------------------------------------
# 5Ô∏è‚É£ Summary
# ------------------------------------------------------------
print("\n‚úÖ Model Suite Assembled Successfully")
print(f"  ‚Ä¢ Total Models: {len(models)}")
print(f"  ‚Ä¢ Example: {list(models.keys())[:10]}")
print(f"  ‚Ä¢ XGBoost Available: {XGBOOST_AVAILABLE}")
print(f"  ‚Ä¢ LightGBM Available: {LIGHTGBM_AVAILABLE}")
print(f"  ‚Ä¢ CatBoost Available: {CATBOOST_AVAILABLE}")
print("  ‚Ä¢ Includes DeepFeatureEnsemble, AdaptiveBoosting, StressFocusedEnsemble")



‚úÖ Model Suite Assembled Successfully
  ‚Ä¢ Total Models: 30
  ‚Ä¢ Example: ['LogisticRegression', 'LogisticRegression_L1', 'RidgeClassifier', 'SGDClassifier', 'MultinomialNB', 'ComplementNB', 'BernoulliNB', 'RandomForest', 'ExtraTrees', 'GradientBoosting']
  ‚Ä¢ XGBoost Available: True
  ‚Ä¢ LightGBM Available: True
  ‚Ä¢ CatBoost Available: True
  ‚Ä¢ Includes DeepFeatureEnsemble, AdaptiveBoosting, StressFocusedEnsemble


In [24]:
# ==========================================
# üß© CELL 11 ‚Äî FEATURE ENGINEERING + DATA PREPARATION
# ==========================================
# This module builds cleaned text + linguistic features for ML & Deep ensembles.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Ensure the AdvancedTextPreprocessor is already defined before this cell
# It must implement:  clean_text_advanced()  and  extract_advanced_features()

# ------------------------------------------------------------
# üß† FEATURE ENGINEERING PIPELINE
# ------------------------------------------------------------
class FeatureEngineeringPipeline:
    """Pipeline for generating linguistic + numerical features."""

    def __init__(self):
        self.preprocessor = AdvancedTextPreprocessor()
        self.feature_names = None

    def create_features(self, texts):
        """Extract advanced linguistic and text-cleaning features."""
        linguistic_features = []
        cleaned_texts = []

        print("üîç Extracting linguistic features and cleaning texts...")
        for i, text in enumerate(texts):
            if i % 1000 == 0 and i != 0:
                print(f"  ‚Ä¢ Processed {i}/{len(texts)} samples...")

            # Extract advanced linguistic (numerical) features
            feats = self.preprocessor.extract_advanced_features(str(text))
            linguistic_features.append(feats)

            # Clean the text for model usage
            cleaned = self.preprocessor.clean_text_advanced(str(text))
            cleaned_texts.append(cleaned)

        # Create dataframe of linguistic/numerical features
        linguistic_df = pd.DataFrame(linguistic_features).fillna(0)
        self.feature_names = list(linguistic_df.columns)

        print(f"‚úÖ Extracted {linguistic_df.shape[1]} linguistic features for {len(texts)} samples.")
        return cleaned_texts, linguistic_df


# ------------------------------------------------------------
# üìò BASIC DATA PREPARATION (text-only)
# ------------------------------------------------------------
def prepare_basic_data(df, text_col='clean_text', label_col='label', test_size=0.25):
    """Prepare dataset for text-only training (no numeric features)."""
    print("\nüìä Preparing dataset with basic preprocessing...")

    # Remove missing rows
    df_clean = df.dropna(subset=[text_col, label_col]).copy()
    print(f"  ‚Ä¢ Removed {len(df) - len(df_clean)} rows with missing values.")

    # Clean text
    preprocessor = AdvancedTextPreprocessor()
    cleaned_texts = [preprocessor.clean_text_advanced(str(t)) for t in df_clean[text_col]]

    # Encode labels
    y = df_clean[label_col]
    if y.dtype == 'object':
        le = LabelEncoder()
        y = le.fit_transform(y)
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(f"  ‚Ä¢ Label mapping: {mapping}")
    else:
        le = None
        mapping = None

    # Split into train/test
    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_texts, y, test_size=test_size, stratify=y, random_state=42
    )

    print(f"‚úÖ Data ready ‚Üí Train: {len(X_train)}, Test: {len(X_test)}")
    return pd.Series(X_train), pd.Series(X_test), y_train, y_test, le, mapping


# ------------------------------------------------------------
# üß† ADVANCED DATA PREPARATION (text + linguistic)
# ------------------------------------------------------------
def prepare_advanced_data(df, text_col='clean_text', label_col='label', test_size=0.25):
    """Prepare data with linguistic + advanced feature extraction."""
    print("\nüìä Preparing dataset with advanced preprocessing and feature engineering...")

    # Remove missing rows
    df_clean = df.dropna(subset=[text_col, label_col]).copy()
    print(f"  ‚Ä¢ Removed {len(df) - len(df_clean)} rows with missing values.")

    # Initialize feature pipeline
    feature_pipe = FeatureEngineeringPipeline()
    cleaned_texts, linguistic_df = feature_pipe.create_features(df_clean[text_col])

    # Encode labels
    y = df_clean[label_col]
    if y.dtype == 'object':
        le = LabelEncoder()
        y = le.fit_transform(y)
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(f"  ‚Ä¢ Label mapping: {mapping}")
    else:
        le = None
        mapping = None

    # Split into train/test
    X_text_train, X_text_test, X_feat_train, X_feat_test, y_train, y_test = train_test_split(
        cleaned_texts, linguistic_df, y, test_size=test_size, stratify=y, random_state=42
    )

    print(f"‚úÖ Data preparation complete:")
    print(f"   ‚Ä¢ Training samples: {len(X_text_train)}")
    print(f"   ‚Ä¢ Testing samples : {len(X_text_test)}")
    print(f"   ‚Ä¢ Linguistic features: {linguistic_df.shape[1]} columns")

    return (
        pd.Series(X_text_train), pd.Series(X_text_test),
        X_feat_train, X_feat_test,
        y_train, y_test, le, mapping, feature_pipe
    )


print("‚úÖ Feature engineering and data preparation functions ready!")


‚úÖ Feature engineering and data preparation functions ready!


In [21]:
# ==========================================
# ‚öôÔ∏è ENHANCED ADVANCED TEXT PREPROCESSOR
# ==========================================
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

class AdvancedTextPreprocessor:
    """
    Unified high-performance text preprocessor:
    - Cleans and normalizes text
    - Extracts linguistic and emotional features
    - Supports both clean_text() and extract_advanced_features()
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()

        # Mental health keyword sets
        self.stress_keywords = {
            'stress', 'stressed', 'pressure', 'overwhelm', 'anxiety', 'panic',
            'fear', 'worry', 'tired', 'exhausted', 'sad', 'depression', 'hopeless'
        }
        self.positive_keywords = {
            'happy', 'joy', 'peaceful', 'calm', 'love', 'grateful', 'relaxed',
            'confident', 'optimistic', 'motivated', 'successful'
        }

    # --------------------------------------------------------------
    # CLEANING
    # --------------------------------------------------------------
    def clean_text_advanced(self, text):
        """Performs advanced cleaning (used in feature extraction)"""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", text)
        text = re.sub(r"[^a-z\s!?.,;']", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        words = text.split()
        cleaned = [self.lemmatizer.lemmatize(w) for w in words if w not in self.stop_words]
        return " ".join(cleaned)

    def clean_text(self, text):
        """Simplified cleaning (used in fallback/basic mode)"""
        return self.clean_text_advanced(text)

    def transform(self, texts):
        """Apply cleaning to a list/Series of texts"""
        return [self.clean_text_advanced(t) for t in texts]

    # --------------------------------------------------------------
    # FEATURE EXTRACTION
    # --------------------------------------------------------------
    def extract_advanced_features(self, text):
        """Extracts linguistic, emotional, and contextual features"""
        if not isinstance(text, str) or text.strip() == "":
            return {f"feature_{i}": 0 for i in range(20)}

        features = {}
        text_lower = text.lower()

        words = text_lower.split()
        features["char_count"] = len(text_lower)
        features["word_count"] = len(words)
        features["avg_word_length"] = np.mean([len(w) for w in words]) if words else 0

        # Stress/positive density
        stress_count = sum(1 for w in words if w in self.stress_keywords)
        pos_count = sum(1 for w in words if w in self.positive_keywords)
        features["stress_density"] = stress_count / max(len(words), 1)
        features["positive_density"] = pos_count / max(len(words), 1)
        features["emotional_balance"] = features["positive_density"] - features["stress_density"]

        # Sentiment
        sentiment = self.sentiment_analyzer.polarity_scores(text_lower)
        features.update({
            "sent_pos": sentiment["pos"],
            "sent_neg": sentiment["neg"],
            "sent_neu": sentiment["neu"],
            "sent_compound": sentiment["compound"]
        })

        # Punctuation intensity
        features["exclamation_count"] = text_lower.count("!")
        features["question_count"] = text_lower.count("?")

        # Personal references
        features["first_person"] = sum(text_lower.count(p) for p in [" i ", " me ", " my ", " mine "])
        features["first_person_ratio"] = features["first_person"] / max(len(words), 1)

        return features


In [22]:
# ==========================================
# üß† DATA PREPARATION EXECUTION CELL
# ==========================================
print("=" * 80)
print("üß† ENHANCED MENTAL STRESS DETECTION ‚Äî DATA PREPARATION")
print("=" * 80)

# Attempt advanced feature preparation first
try:
    print("üöÄ Attempting advanced data preparation...")
    (
        X_text_train, X_text_test,
        X_feat_train, X_feat_test,
        y_train, y_test,
        label_encoder, label_mapping,
        feature_pipeline
    ) = prepare_advanced_data(stress)

    print("\n‚úÖ Advanced data preparation completed successfully!")
    ADVANCED_FEATURES = True

# -----------------------------------------------------------------
# Fallback to basic preparation if advanced fails
# -----------------------------------------------------------------
except Exception as e:
    print(f"\n‚ùå Advanced preparation failed:\n   ‚Ü≥ {e}")
    print("üîÑ Falling back to basic preparation...")

    try:
        (
            X_text_train, X_text_test,
            y_train, y_test,
            label_encoder, label_mapping
        ) = prepare_basic_data(stress)

        X_feat_train = X_feat_test = None
        feature_pipeline = AdvancedTextPreprocessor()

        print("\n‚úÖ Basic data preparation completed successfully!")
        ADVANCED_FEATURES = False

    except Exception as e2:
        print(f"\n‚ùå Both advanced and basic preparation failed:\n   ‚Ü≥ {e2}")
        print("‚ö†Ô∏è  Please inspect your dataset and preprocessing pipeline.")
        raise e2

# -----------------------------------------------------------------
# Summary of the preparation results
# -----------------------------------------------------------------
print("\nüìä DATA PREPARATION SUMMARY")
print("-" * 60)
print(f"Training samples:       {len(X_text_train):,}")
print(f"Testing samples:        {len(X_text_test):,}")
print(f"Advanced features used: {ADVANCED_FEATURES}")

if ADVANCED_FEATURES and X_feat_train is not None:
    print(f"Linguistic feature dims: {X_feat_train.shape[1]}")

print(f"Label encoder applied:  {'Yes' if label_encoder else 'No'}")
print("-" * 60)
print("‚úÖ Data successfully prepared and ready for vectorization & model training!\n")


üß† ENHANCED MENTAL STRESS DETECTION ‚Äî DATA PREPARATION
üöÄ Attempting advanced data preparation...
üìä Preparing dataset with advanced preprocessing and feature engineering...
Removed 0 rows with missing text or labels.
üîç Extracting linguistic features and cleaning texts...
  Processed 1000/2838 samples
  Processed 2000/2838 samples
‚úÖ Created 14 linguistic features for 2838 samples.
Label mapping: {'0': 0, '1': 1}
‚úÖ Data prepared:
  ‚Ä¢ Training: 2128 samples
  ‚Ä¢ Testing: 710 samples
  ‚Ä¢ Linguistic features: 14

‚úÖ Advanced data preparation completed successfully!

üìä DATA PREPARATION SUMMARY
------------------------------------------------------------
Training samples:       2,128
Testing samples:        710
Advanced features used: True
Linguistic feature dims: 14
Label encoder applied:  Yes
------------------------------------------------------------
‚úÖ Data successfully prepared and ready for vectorization & model training!



In [23]:
# ==========================================
# üî• CELL 9 ‚Äî COMPREHENSIVE MODEL EVALUATION LOOP
# ==========================================
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import time

def run_comprehensive_evaluation():
    """Run full evaluation of all vectorizers and models (standard + custom)."""
    
    print("\nüöÄ Starting comprehensive model evaluation...")
    start_global = time.time()

    all_results = []
    standard_models = {}
    custom_models = {}

    # ------------------------------------------------------------
    # Separate standard and custom (novel/ensemble) models
    # ------------------------------------------------------------
    for name, model in models.items():
        if any(x in name.lower() for x in [
            'deep_feature_ensemble', 'stress_focused_ensemble', 'adaptive_boosting'
        ]):
            custom_models[name] = model
        else:
            standard_models[name] = model

    print(f"üì¶ Standard models: {len(standard_models)}")
    print(f"üß© Custom models:   {len(custom_models)}")

    total_combinations = len(vectorizers) * len(standard_models) + len(custom_models)
    current = 0

    # ------------------------------------------------------------
    # Evaluate Standard Models with Vectorizers
    # ------------------------------------------------------------
    for vec_name, vectorizer in vectorizers.items():
        print(f"\nüìù Using vectorizer: {vec_name}")

        skip_vectorizer = any(
            x in vec_name.lower() for x in ["custom_stress", "hybrid_char_word", "ensemble"]
        )

        for model_name, model in standard_models.items():
            current += 1
            print(f"\n[{current}/{total_combinations}] üîÑ Evaluating {model_name} + {vec_name} ...")

            # Skip heavy combinations for efficiency
            skip_conditions = [
                (skip_vectorizer and 'svm' in model_name.lower()),
                ('svm' in model_name.lower() and 'trigram' in vec_name),
                ('mlpclassifier' in model_name.lower() and 'trigram' in vec_name),
                ('ensemble' in vec_name.lower() and 'ensemble' in model_name.lower()),
            ]
            if any(skip_conditions):
                print(f"   ‚è≠Ô∏è Skipping heavy combination: {model_name} with {vec_name}")
                continue

            try:
                results = evaluate_model(
                    model=model,
                    model_name=model_name,
                    X_text_train=X_text_train,
                    X_text_test=X_text_test,
                    X_feat_train=X_feat_train,
                    X_feat_test=X_feat_test,
                    y_train=y_train,
                    y_test=y_test,
                    vectorizer=vectorizer,
                )

                if results:
                    all_results.append(results)
                    print(
                        f"   ‚úÖ {model_name} done ‚Äî "
                        f"Acc={results['test_accuracy']:.3f}, "
                        f"F1={results['test_f1']:.3f}, "
                        f"MCC={results['test_mcc']:.3f}, "
                        f"Time={results['training_time_sec']:.2f}s"
                    )
            except Exception as e:
                print(f"   ‚ùå Error evaluating {model_name}: {str(e)[:80]}")
                continue

    # ------------------------------------------------------------
    # Evaluate Custom Models (Deep Ensembles, Stress-Focused, etc.)
    # ------------------------------------------------------------
    print("\nüéØ Evaluating custom ensemble models...")
    default_vectorizer = TfidfVectorizer(
        max_features=10000, ngram_range=(1, 1), min_df=3, max_df=0.95, stop_words="english"
    )

    for model_name, model in custom_models.items():
        current += 1
        print(f"\n[{current}/{total_combinations}] üîÑ Evaluating {model_name} (custom)...")

        try:
            results = evaluate_model(
                model=model,
                model_name=model_name,
                X_text_train=X_text_train,
                X_text_test=X_text_test,
                X_feat_train=X_feat_train,
                X_feat_test=X_feat_test,
                y_train=y_train,
                y_test=y_test,
                vectorizer=default_vectorizer,
            )

            if results:
                all_results.append(results)
                print(
                    f"   ‚úÖ {model_name} done ‚Äî "
                    f"Acc={results['test_accuracy']:.3f}, "
                    f"F1={results['test_f1']:.3f}, "
                    f"MCC={results['test_mcc']:.3f}, "
                    f"Time={results['training_time_sec']:.2f}s"
                )

        except Exception as e:
            print(f"   ‚ùå Error evaluating {model_name}: {str(e)[:80]}")
            continue

    elapsed = time.time() - start_global
    print(f"\nüèÅ Comprehensive evaluation complete in {elapsed/60:.1f} minutes!")
    print(f"‚úÖ Successfully evaluated {len(all_results)} model‚Äìvectorizer combinations.\n")

    return all_results


# ------------------------------------------------------------
# Run the Comprehensive Evaluation
# ------------------------------------------------------------
print("\n" + "=" * 70)
print("üî• STARTING COMPREHENSIVE EVALUATION WITH NOVEL APPROACHES")
print("=" * 70)

evaluation_results = run_comprehensive_evaluation()

print("\n‚úÖ Evaluation finished!")
print(f"üßÆ Total combinations tested: {len(evaluation_results)}")
print("üß† Novel methods included: BoW, TF-IDF, Deep Ensemble, Stress-Focused Models, etc.")



üî• STARTING COMPREHENSIVE EVALUATION WITH NOVEL APPROACHES

üöÄ Starting comprehensive model evaluation...
üì¶ Standard models: 27
üß© Custom models:   3

üìù Using vectorizer: tfidf_unigram

[1/408] üîÑ Evaluating LogisticRegression + tfidf_unigram ...
   ‚ùå Error evaluating LogisticRegression: name 'evaluate_model' is not defined

[2/408] üîÑ Evaluating LogisticRegression_L1 + tfidf_unigram ...
   ‚ùå Error evaluating LogisticRegression_L1: name 'evaluate_model' is not defined

[3/408] üîÑ Evaluating RidgeClassifier + tfidf_unigram ...
   ‚ùå Error evaluating RidgeClassifier: name 'evaluate_model' is not defined

[4/408] üîÑ Evaluating SGDClassifier + tfidf_unigram ...
   ‚ùå Error evaluating SGDClassifier: name 'evaluate_model' is not defined

[5/408] üîÑ Evaluating MultinomialNB + tfidf_unigram ...
   ‚ùå Error evaluating MultinomialNB: name 'evaluate_model' is not defined

[6/408] üîÑ Evaluating ComplementNB + tfidf_unigram ...
   ‚ùå Error evaluating ComplementNB: n