In [1]:
# ==========================================
# üì¶ STANDARDIZED SETUP - MENTAL STRESS DETECTION PROJECT
# ==========================================

# --- Core Imports ---
import os
import json
import logging
from pathlib import Path
from datetime import datetime
import warnings
import pandas as pd
import numpy as np

# --- Visualization (optional for EDA) ---
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# --- Text & NLP Utilities ---
import re
import string
from textblob import TextBlob
from collections import Counter

# --- Machine Learning ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# --- System Utilities ---
import joblib
from typing import Dict, Any

# --- Warnings ---
warnings.filterwarnings("ignore")

# ==========================================
# üöÄ LOGGING CONFIGURATION
# ==========================================
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)

from logging.handlers import RotatingFileHandler

log_file = LOG_DIR / "stress_detection.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        RotatingFileHandler(log_file, maxBytes=1_000_000, backupCount=3),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# ==========================================
# üóÇÔ∏è DIRECTORY STRUCTURE (MODELS / REPORTS / FRONTEND)
# ==========================================
for folder in ["models", "preprocessors", "reports", "artifacts"]:
    Path(folder).mkdir(exist_ok=True)

# ==========================================
# üß† DATA LOADING FUNCTION
# ==========================================
def load_dataset(file_path: str, encoding: str = "utf-8") -> pd.DataFrame:
    """
    Load dataset with multiple fallback encodings and detailed validation.
    Returns: pandas DataFrame
    """
    encodings = [encoding, "utf-8", "latin-1", "cp1252"]
    dataset = None
    for enc in encodings:
        try:
            dataset = pd.read_csv(file_path, encoding=enc, on_bad_lines="skip", low_memory=False)
            logger.info(f"‚úÖ Loaded dataset successfully using encoding: {enc}")
            break
        except UnicodeDecodeError:
            continue
        except FileNotFoundError:
            logger.error(f"‚ùå File not found at {file_path}")
            return None
    if dataset is None:
        raise ValueError(f"‚ùå Failed to load dataset using all encodings: {encodings}")
    
    logger.info(f"üìä Dataset Shape: {dataset.shape}")
    logger.info(f"üìë Columns: {list(dataset.columns)}")
    logger.info(f"üîç Missing Values: {dataset.isnull().sum().sum()} | Duplicates: {dataset.duplicated().sum()}")
    return dataset

# ==========================================
# üîé VALIDATION FUNCTION
# ==========================================
def validate_stress_dataset(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Validate dataset structure for stress detection tasks.
    Identifies potential text and label columns, missing data, duplicates, etc.
    """
    validation = {
        "total_samples": len(df),
        "missing_values": df.isnull().sum().to_dict(),
        "duplicate_rows": int(df.duplicated().sum()),
        "text_columns": [],
        "label_column": None,
        "issues": []
    }

    for col in df.columns:
        if df[col].dtype == "object":
            avg_len = df[col].dropna().astype(str).str.len().mean()
            unique_vals = df[col].nunique(dropna=True)

            if avg_len > 15 or unique_vals > 30:
                validation["text_columns"].append(col)
            elif unique_vals <= 10:
                validation["label_column"] = col

    for text_col in validation["text_columns"]:
        short_count = df[text_col].astype(str).str.len().lt(5).sum()
        if short_count > 0:
            validation["issues"].append(f"Column '{text_col}' has {short_count} very short entries")

    return validation

# ==========================================
# üíæ SAVE DATASET PROFILE
# ==========================================
def save_dataset_profile(df: pd.DataFrame, validation: Dict[str, Any]):
    profile = {
        "dataset_info": {
            "shape": df.shape,
            "columns": list(df.columns),
            "text_columns": validation["text_columns"],
            "label_column": validation["label_column"],
            "duplicates": validation["duplicate_rows"],
            "missing": sum(validation["missing_values"].values())
        },
        "timestamp": datetime.now().isoformat()
    }
    config_path = Path("preprocessors/dataset_config.json")
    with open(config_path, "w") as f:
        json.dump(profile, f, indent=2)
    logger.info(f"üìÅ Dataset profile saved at: {config_path}")
    return profile

# ==========================================
# ‚öôÔ∏è EXECUTION - LOAD AND VALIDATE
# ==========================================
logger.info("üöÄ Loading dataset for stress detection...")
DATA_PATH = "stress.csv"   # modify if needed

try:
    df_raw = load_dataset(DATA_PATH)
    if df_raw is None:
        raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please ensure it's placed in the project root.")
        
    df = df_raw.copy()
    validation = validate_stress_dataset(df)
    profile = save_dataset_profile(df, validation)

    print("=" * 60)
    print(" MENTAL STRESS DETECTION - DATA SUMMARY ")
    print("=" * 60)
    print(f"Shape: {df.shape}")
    print(f"Possible Label Column: {validation['label_column']}")
    print(f"Text Columns: {validation['text_columns']}")
    print(f"Duplicate Rows: {validation['duplicate_rows']}")
    print(f"Missing Values: {sum(validation['missing_values'].values())}")
    if validation['issues']:
        print("\n‚ö†Ô∏è Issues Found:")
        for issue in validation['issues']:
            print(f"  - {issue}")
    print("\n‚úÖ Dataset successfully loaded and validated. Ready for preprocessing.")
except Exception as e:
    logger.error(f"‚ùå Data loading failed: {e}")
    raise


2025-11-10 20:24:06,819 | INFO | üöÄ Loading dataset for stress detection...
2025-11-10 20:24:06,849 | INFO | ‚úÖ Loaded dataset successfully using encoding: utf-8
2025-11-10 20:24:06,849 | INFO | üìä Dataset Shape: (2838, 116)
2025-11-10 20:24:06,849 | INFO | üìë Columns: ['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_an

 MENTAL STRESS DETECTION - DATA SUMMARY 
Shape: (2838, 116)
Possible Label Column: subreddit
Text Columns: ['post_id', 'sentence_range', 'text']
Duplicate Rows: 0
Missing Values: 0

‚úÖ Dataset successfully loaded and validated. Ready for preprocessing.


In [2]:
# ===============================
# üåø ADVANCED PROJECT-SPECIFIC EDA ‚Äì MENTAL STRESS DETECTION
# ===============================

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
from collections import Counter
from datetime import datetime
import json
import numpy as np

plt.use('Agg')  # ‚úÖ Required for Docker/frontend

# ------------------------------------------------
# üß≠ SMART COLUMN DETECTION
# ------------------------------------------------
def get_text_column(df: pd.DataFrame):
    candidates = ["clean_text", "text", "message", "content", "post", "body"]
    for c in candidates:
        if c in df.columns:
            return c
    text_like = [c for c in df.columns if df[c].dtype == "object"]
    return text_like[0] if text_like else None


# ------------------------------------------------
# üß† SENTIMENT & EMOTION ANALYSIS VISUALS
# ------------------------------------------------
def sentiment_summary(df, text_col):
    df["sentiment_polarity"] = df[text_col].astype(str).apply(lambda x: TextBlob(x).sentiment.polarity)
    df["sentiment_subjectivity"] = df[text_col].astype(str).apply(lambda x: TextBlob(x).sentiment.subjectivity)

    plt.figure(figsize=(6,4))
    sns.histplot(df["sentiment_polarity"], kde=True, bins=40, color="purple")
    plt.title("üí¨ Sentiment Polarity Distribution (-1 ‚Üí +1)")
    plt.xlabel("Polarity")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig("reports/sentiment_polarity.png", dpi=300)
    plt.close()

    plt.figure(figsize=(6,4))
    sns.histplot(df["sentiment_subjectivity"], kde=True, bins=40, color="orange")
    plt.title("üß≠ Sentiment Subjectivity (0 Objective ‚Üí 1 Subjective)")
    plt.xlabel("Subjectivity")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig("reports/sentiment_subjectivity.png", dpi=300)
    plt.close()

    return {
        "avg_polarity": round(df["sentiment_polarity"].mean(), 4),
        "avg_subjectivity": round(df["sentiment_subjectivity"].mean(), 4),
    }


# ------------------------------------------------
# üí¨ EMOTIONAL WORD INTENSITY VISUALIZATION
# ------------------------------------------------
def emotion_intensity_map(df, text_col):
    from textblob import Word
    emotion_words = ["happy", "sad", "angry", "tired", "worried", "relaxed", "peaceful", "anxious", "overwhelmed", "calm"]
    counts = {w: 0 for w in emotion_words}
    for text in df[text_col].astype(str):
        for w in emotion_words:
            if w in text.lower():
                counts[w] += 1
    emo_df = pd.DataFrame(list(counts.items()), columns=["emotion", "count"])
    plt.figure(figsize=(7, 4))
    sns.barplot(x="count", y="emotion", data=emo_df, palette="coolwarm")
    plt.title("üí° Emotion Word Frequency Map")
    plt.tight_layout()
    plt.savefig("reports/emotion_intensity.png", dpi=300)
    plt.close()
    return emo_df


# ------------------------------------------------
# üßÆ WORD COUNT & COMPLEXITY
# ------------------------------------------------
def text_complexity_analysis(df, text_col):
    df["word_count"] = df[text_col].astype(str).apply(lambda x: len(x.split()))
    df["avg_word_length"] = df[text_col].astype(str).apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split())>0 else 0)
    df["punctuation_density"] = df[text_col].astype(str).apply(lambda x: sum([1 for c in x if c in "!?."]) / max(len(x),1))

    fig, axs = plt.subplots(1, 3, figsize=(15, 4))
    sns.histplot(df["word_count"], bins=40, ax=axs[0], color="teal")
    axs[0].set_title("üìò Word Count Distribution")

    sns.histplot(df["avg_word_length"], bins=40, ax=axs[1], color="olive")
    axs[1].set_title("üßÆ Avg Word Length Distribution")

    sns.histplot(df["punctuation_density"], bins=40, ax=axs[2], color="crimson")
    axs[2].set_title("üî£ Punctuation Density")

    plt.tight_layout()
    plt.savefig("reports/text_complexity.png", dpi=300)
    plt.close()

    return df[["word_count","avg_word_length","punctuation_density"]].describe().to_dict()


# ------------------------------------------------
# üî° MOST COMMON WORDS BY CLASS
# ------------------------------------------------
def most_common_words_by_class(df, label_col, text_col, n=15):
    if label_col not in df.columns:
        return None
    df[label_col] = df[label_col].astype(str)
    class_labels = df[label_col].unique()
    all_freqs = {}
    for lbl in class_labels:
        subset = df[df[label_col]==lbl]
        words = " ".join(subset[text_col].astype(str)).lower().split()
        common = Counter(words).most_common(n)
        all_freqs[lbl] = common
        common_df = pd.DataFrame(common, columns=["word", "freq"])
        plt.figure(figsize=(8,5))
        sns.barplot(y="word", x="freq", data=common_df, palette="viridis")
        plt.title(f"üó£Ô∏è Top {n} Words for '{lbl}' Class")
        plt.tight_layout()
        plt.savefig(f"reports/top_words_{lbl}.png", dpi=300)
        plt.close()
    return all_freqs


# ------------------------------------------------
# üß© LABEL BALANCE & CORRELATIONS
# ------------------------------------------------
def plot_label_balance(df, label_col="label"):
    if label_col not in df.columns: return None
    plt.figure(figsize=(6,4))
    sns.countplot(x=label_col, data=df, palette="pastel")
    plt.title("‚öñÔ∏è Label Balance")
    plt.tight_layout()
    plt.savefig("reports/label_balance.png", dpi=300)
    plt.close()
    return "reports/label_balance.png"


# ------------------------------------------------
# üîÅ FINAL EXECUTION FOR EDA
# ------------------------------------------------
print("="*80)
print("üåø Running ADVANCED EDA for MENTAL STRESS DETECTION")
print("="*80)

text_col = get_text_column(df)
label_col = "label" if "label" in df.columns else df.columns[-1]

print(f"‚úÖ Using text column: {text_col}, label column: {label_col}")

sent_summary = sentiment_summary(df, text_col)
emo_map = emotion_intensity_map(df, text_col)
complexity_stats = text_complexity_analysis(df, text_col)
label_plot = plot_label_balance(df, label_col)
word_freqs = most_common_words_by_class(df, label_col, text_col)

# ------------------------------------------------
# üìä EXPORT JSON FOR FRONTEND DASHBOARD
# ------------------------------------------------
eda_summary = {
    "timestamp": datetime.now().isoformat(),
    "sentiment_summary": sent_summary,
    "complexity_stats": complexity_stats,
    "emotion_counts": emo_map.to_dict(),
    "top_words_by_class": word_freqs,
}
with open("reports/advanced_eda_summary.json", "w") as f:
    json.dump(eda_summary, f, indent=2)

print("\n‚úÖ Advanced EDA Completed")
print("üìÅ Visual reports saved in 'reports/'")
print("üß† Ready for frontend integration or deployment container.")


AttributeError: module 'matplotlib.pyplot' has no attribute 'use'

In [3]:
# ==========================================
# üß† ENHANCED NLP PREPROCESSING - MENTAL STRESS DETECTION
# ==========================================

import re
import string
import emoji
import unicodedata
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Ensure NLTK assets are available (for Docker environments)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

# ------------------------------------------------------------
# ‚öôÔ∏è HELPER FUNCTIONS
# ------------------------------------------------------------
def get_wordnet_pos(tag):
    """Map POS tag to WordNet format."""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


# ------------------------------------------------------------
# üß© ADVANCED TEXT PREPROCESSOR CLASS
# ------------------------------------------------------------
class AdvancedTextPreprocessor:
    """
    Handles advanced text preprocessing for stress detection.
    Includes normalization, lemmatization, emoji decoding,
    and stress-aware keyword augmentation.
    """

    def __init__(self, extra_stopwords=None):
        self.stop_words = set(stopwords.words('english'))
        if extra_stopwords:
            self.stop_words.update(extra_stopwords)

        self.lemmatizer = WordNetLemmatizer()
        self.stress_keywords = [
            "stress", "anxiety", "depression", "panic", "fear",
            "pressure", "worry", "burnout", "tired", "sad", "angry",
            "hopeless", "calm", "peaceful", "happy", "relaxed"
        ]

        # Contraction map
        self.contraction_map = {
            "can't": "cannot", "won't": "will not", "n't": " not",
            "'re": " are", "'s": " is", "'d": " would",
            "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"
        }

    # --------------------------------------------------------
    def clean_contractions(self, text):
        for c, expanded in self.contraction_map.items():
            text = text.replace(c, expanded)
        return text

    # --------------------------------------------------------
    def handle_negations(self, text):
        """
        Keep negations close to context words
        (e.g., 'not happy' ‚Üí 'not_happy')
        """
        tokens = nltk.word_tokenize(text)
        negation_words = {'not', 'no', 'never', "n't"}
        new_tokens = []
        skip_next = False
        for i in range(len(tokens) - 1):
            if tokens[i].lower() in negation_words and tokens[i + 1].isalpha():
                new_tokens.append(tokens[i] + "_" + tokens[i + 1])
                skip_next = True
            elif not skip_next:
                new_tokens.append(tokens[i])
            else:
                skip_next = False
        if not skip_next:
            new_tokens.append(tokens[-1])
        return " ".join(new_tokens)

    # --------------------------------------------------------
    def replace_emojis(self, text):
        """Convert emojis to text descriptions (üôÇ ‚Üí smiley_face)."""
        return emoji.demojize(text, delimiters=(" ", " "))

    # --------------------------------------------------------
    def remove_noise(self, text):
        """Remove URLs, mentions, hashtags, numbers, and symbols."""
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"@\w+|#\w+", "", text)
        text = re.sub(r"\d+", "", text)
        text = re.sub(r"[%s]" % re.escape(string.punctuation), " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    # --------------------------------------------------------
    def lemmatize_text(self, text):
        """POS-based lemmatization."""
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        lemmatized = [
            self.lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
            for word, pos in pos_tags
        ]
        return " ".join(lemmatized)

    # --------------------------------------------------------
    def add_stress_boost(self, text):
        """
        Enrich text by amplifying known stress indicators.
        Adds a small frequency boost to stress-related words.
        """
        for kw in self.stress_keywords:
            if kw in text:
                text += f" {kw}_focus"
        return text

    # --------------------------------------------------------
    def clean_text_advanced(self, text):
        """Apply full pipeline: normalize ‚Üí clean ‚Üí lemmatize."""
        text = str(text).lower()
        text = unicodedata.normalize("NFKD", text)
        text = self.clean_contractions(text)
        text = self.replace_emojis(text)
        text = self.handle_negations(text)
        text = self.remove_noise(text)
        text = self.lemmatize_text(text)
        text = self.add_stress_boost(text)
        tokens = [w for w in text.split() if w not in self.stop_words and len(w) > 2]
        return " ".join(tokens)

    # --------------------------------------------------------
    def extract_advanced_features(self, text):
        """Extract linguistic & sentiment features for ML models."""
        blob = TextBlob(str(text))
        return {
            "char_len": len(text),
            "word_count": len(text.split()),
            "avg_word_len": np.mean([len(w) for w in text.split()]) if text else 0,
            "sentiment_polarity": blob.sentiment.polarity,
            "sentiment_subjectivity": blob.sentiment.subjectivity,
            "stress_word_count": sum(1 for w in text.split() if w in self.stress_keywords)
        }

# ------------------------------------------------------------
# ‚úÖ TEST THE PREPROCESSOR PIPELINE
# ------------------------------------------------------------
sample_text = "I'm feeling soooo stressed üò© about my exams and deadlines!!! Can't sleep at all."
pre = AdvancedTextPreprocessor()
print("\nüß† ORIGINAL:", sample_text)
print("üîß CLEANED:", pre.clean_text_advanced(sample_text))
print("üìä FEATURES:", pre.extract_advanced_features(sample_text))



üß† ORIGINAL: I'm feeling soooo stressed üò© about my exams and deadlines!!! Can't sleep at all.
üîß CLEANED: feel soooo stress weary face exam deadline sleep stress_focus
üìä FEATURES: {'char_len': 80, 'word_count': 14, 'avg_word_len': 4.785714285714286, 'sentiment_polarity': 0.0, 'sentiment_subjectivity': 0.0, 'stress_word_count': 0}


In [None]:
pip install emoji

In [4]:
# ==========================================
# ‚öôÔ∏è ENVIRONMENT SETUP - NLP DEPENDENCIES
# ==========================================

import os
import sys
import subprocess
import importlib
import platform
import torch
import warnings
warnings.filterwarnings("ignore")

# ------------------------------------------------
# üß© UTILITY: Safe Import with Auto-Install
# ------------------------------------------------
def install_and_import(package_name, import_name=None):
    """Safely import a package, auto-install if missing."""
    try:
        return importlib.import_module(import_name or package_name)
    except ImportError:
        print(f"üì¶ Installing missing package: {package_name} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        return importlib.import_module(import_name or package_name)


# ------------------------------------------------
# üß† CORE NLP LIBRARIES (Auto-managed)
# ------------------------------------------------
nltk = install_and_import("nltk")
spacy = install_and_import("spacy")
emoji = install_and_import("emoji")
textblob = install_and_import("textblob")

# ------------------------------------------------
# üî§ SPACY MODEL SETUP
# ------------------------------------------------
def load_spacy_model(model_name="en_core_web_sm"):
    """Ensure spaCy English model is available."""
    try:
        nlp = spacy.load(model_name)
        print(f"‚úÖ spaCy model '{model_name}' loaded successfully.")
    except OSError:
        print(f"‚öôÔ∏è Downloading spaCy model: {model_name} ...")
        subprocess.run([sys.executable, "-m", "spacy", "download", model_name], check=True)
        nlp = spacy.load(model_name)
    return nlp

nlp = load_spacy_model("en_core_web_sm")

# ------------------------------------------------
# ‚ö° GPU / CPU CHECK
# ------------------------------------------------
def check_gpu_status():
    gpu_available = torch.cuda.is_available()
    gpu_name = torch.cuda.get_device_name(0) if gpu_available else "CPU Mode"
    print(f"üñ•Ô∏è  Compute Device: {'GPU - ' + gpu_name if gpu_available else 'CPU only'}")
    return {"gpu_available": gpu_available, "device_name": gpu_name}

device_info = check_gpu_status()

# ------------------------------------------------
# üìÅ PROJECT DIRECTORY VALIDATION
# ------------------------------------------------
for folder in ["models", "preprocessors", "reports", "artifacts"]:
    os.makedirs(folder, exist_ok=True)

print("\nüìÇ Verified project structure:")
for f in ["models", "preprocessors", "reports", "artifacts"]:
    print(f"   ‚îî‚îÄ‚îÄ {f}/ ‚úÖ")

# ------------------------------------------------
# üß† TEXTBLOB / NLTK SANITY CHECK
# ------------------------------------------------
try:
    from textblob import TextBlob
    from nltk.corpus import stopwords
    nltk.download("punkt", quiet=True)
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))
    sample_blob = TextBlob("I am feeling a bit anxious about tomorrow.")
    print(f"üí¨ TextBlob polarity test: {sample_blob.sentiment.polarity:.2f}")
    print(f"üóÇÔ∏è Stopwords loaded: {len(stop_words)} words")
except Exception as e:
    print(f"‚ö†Ô∏è Error verifying NLP dependencies: {e}")

# ------------------------------------------------
# üîç ENVIRONMENT SUMMARY (Safe Version Fetch)
# ------------------------------------------------
from importlib.metadata import version, PackageNotFoundError

def get_pkg_version(pkg_name, fallback="Unknown"):
    """Safely fetch package version even if __version__ missing."""
    try:
        return version(pkg_name)
    except PackageNotFoundError:
        try:
            pkg = importlib.import_module(pkg_name)
            return getattr(pkg, "__version__", fallback)
        except Exception:
            return fallback

env_summary = {
    "python_version": platform.python_version(),
    "system": platform.system(),
    "device_info": device_info,
    "nltk_version": get_pkg_version("nltk"),
    "spacy_version": get_pkg_version("spacy"),
    "textblob_version": get_pkg_version("textblob"),
    "emoji_version": get_pkg_version("emoji"),
}

print("\nüì¶ ENVIRONMENT SUMMARY")
print("=" * 40)
for key, val in env_summary.items():
    print(f"{key:20s}: {val}")
print("=" * 40)
print("‚úÖ NLP environment fully configured and deployment-ready!\n")


‚úÖ spaCy model 'en_core_web_sm' loaded successfully.
üñ•Ô∏è  Compute Device: CPU only

üìÇ Verified project structure:
   ‚îî‚îÄ‚îÄ models/ ‚úÖ
   ‚îî‚îÄ‚îÄ preprocessors/ ‚úÖ
   ‚îî‚îÄ‚îÄ reports/ ‚úÖ
   ‚îî‚îÄ‚îÄ artifacts/ ‚úÖ
üí¨ TextBlob polarity test: -0.25
üóÇÔ∏è Stopwords loaded: 198 words

üì¶ ENVIRONMENT SUMMARY
python_version      : 3.12.2
system              : Darwin
device_info         : {'gpu_available': False, 'device_name': 'CPU Mode'}
nltk_version        : 3.9.1
spacy_version       : 3.8.7
textblob_version    : 0.19.0
emoji_version       : 2.15.0
‚úÖ NLP environment fully configured and deployment-ready!



In [None]:
pip install torch

In [5]:
# ==========================================
# ‚öôÔ∏è OPTIMIZED HIGH-PERFORMANCE TEXT PREPROCESSOR
# ==========================================

import re
import spacy
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from textblob import TextBlob
import emoji
import nltk

tqdm.pandas()

# ---------------------------------------------------------
# ‚úÖ Ensure spacy model loaded
# ---------------------------------------------------------
try:
    nlp
except NameError:
    nlp = spacy.load("en_core_web_sm")

# ---------------------------------------------------------
# üîß Optimized Preprocessor Class
# ---------------------------------------------------------
class HighPerformancePreprocessor:
    """spaCy + cache accelerated text preprocessor for large-scale inference."""

    def __init__(self):
        self.cache = {}
        self.stress_lexicon = {
            "stress", "anxiety", "panic", "pressure", "fear", "depression",
            "tension", "worry", "sad", "angry", "hopeless", "relaxed",
            "peaceful", "calm", "happy", "grateful"
        }
        self.stop_words = nlp.Defaults.stop_words

    # -----------------------------------------------------
    def normalize_text(self, text):
        text = emoji.demojize(str(text).lower(), delimiters=(" ", " "))
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"@\w+|#\w+", "", text)
        text = re.sub(r"[^a-z\s]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # -----------------------------------------------------
    def process_doc(self, text):
        if text in self.cache:
            return self.cache[text]
        doc = nlp(text)
        tokens = [
            tok.lemma_.lower()
            for tok in doc
            if not tok.is_stop and tok.is_alpha and len(tok) > 2
        ]
        cleaned = " ".join(tokens)
        self.cache[text] = cleaned
        return cleaned

    # -----------------------------------------------------
    def extract_features(self, text):
        blob = TextBlob(text)
        return {
            "char_len": len(text),
            "word_count": len(text.split()),
            "sentiment": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity,
            "stress_words": sum(w in self.stress_lexicon for w in text.split())
        }

    # -----------------------------------------------------
    def batch_process(self, texts, max_workers=4):
        cleaned, features = [], []
        print(f"üöÄ Preprocessing {len(texts)} texts using {max_workers} threads...")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_text = {executor.submit(self.pipeline, t): t for t in texts}
            for future in tqdm(as_completed(future_to_text), total=len(future_to_text)):
                clean_text, feats = future.result()
                cleaned.append(clean_text)
                features.append(feats)

        return cleaned, features

    # -----------------------------------------------------
    def pipeline(self, text):
        text = self.normalize_text(text)
        clean_text = self.process_doc(text)
        feats = self.extract_features(clean_text)
        return clean_text, feats


# ---------------------------------------------------------
# ‚úÖ Test Run (example)
# ---------------------------------------------------------
sample_batch = [
    "I‚Äôm completely stressed about my project deadlines üò©",
    "Feeling super calm and relaxed after yoga üßò",
    "Anxiety levels are going up every day with these exams!",
    "Work pressure is unbearable, but I‚Äôll manage somehow.",
]

hp = HighPerformancePreprocessor()
cleaned, feats = hp.batch_process(sample_batch, max_workers=4)

print("\nüîç Sample Output")
for i in range(len(sample_batch)):
    print(f"\nüß† Original: {sample_batch[i]}")
    print(f"üîß Cleaned:  {cleaned[i]}")
    print(f"üìä Features: {feats[i]}")


üöÄ Preprocessing 4 texts using 4 threads...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 132.38it/s]


üîç Sample Output

üß† Original: I‚Äôm completely stressed about my project deadlines üò©
üîß Cleaned:  work pressure unbearable manage
üìä Features: {'char_len': 31, 'word_count': 4, 'sentiment': 0.0, 'subjectivity': 0.0, 'stress_words': 1}

üß† Original: Feeling super calm and relaxed after yoga üßò
üîß Cleaned:  anxiety level go day exam
üìä Features: {'char_len': 25, 'word_count': 5, 'sentiment': 0.0, 'subjectivity': 0.0, 'stress_words': 1}

üß† Original: Anxiety levels are going up every day with these exams!
üîß Cleaned:  completely stress project deadline weary face
üìä Features: {'char_len': 45, 'word_count': 6, 'sentiment': 0.1, 'subjectivity': 0.4, 'stress_words': 1}

üß† Original: Work pressure is unbearable, but I‚Äôll manage somehow.
üîß Cleaned:  feel super calm relax yoga person lotus position
üìä Features: {'char_len': 48, 'word_count': 8, 'sentiment': 0.31666666666666665, 'subjectivity': 0.7083333333333333, 'stress_words': 1}





In [6]:
# ==========================================
# üß© CELL 6 ‚Äî EXPANDED ADVANCED VECTORIZER CREATOR
# ==========================================

import os, json, numpy as np
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler

# optional transformer embeddings
try:
    from sentence_transformers import SentenceTransformer
    SBT_AVAILABLE = True
except Exception:
    SBT_AVAILABLE = False

# spaCy embeddings fallback
try:
    _ = nlp
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False


# ---------------------------------------------------------
# üî† Mental-health vocabulary builder
# ---------------------------------------------------------
def build_mental_health_vocab(extra_terms=None):
    vocab = {
        "stress","stressed","anxiety","anxious","panic","pressure","tension",
        "depression","depressed","hopeless","helpless","sad","cry","worry",
        "worried","fear","afraid","angry","frustrated","burnout","tired",
        "exhausted","insomnia","sleep","therapy","counseling","help","relief",
        "relaxed","calm","peace","happy","joy","gratitude"
    }
    if extra_terms: vocab.update(set(extra_terms))
    return sorted(vocab)


# ---------------------------------------------------------
# üß† Embedding wrapper (SentenceTransformer / spaCy)
# ---------------------------------------------------------
class SentenceEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model_name, self.model, self.backend = model_name, None, None

    def fit(self, X, y=None):
        if SBT_AVAILABLE:
            try:
                self.model = SentenceTransformer(self.model_name)
                self.backend = "sentence-transformers"
            except Exception:
                self.model = None
        if self.model is None and SPACY_AVAILABLE:
            self.model, self.backend = nlp, "spacy"
        if self.model is None:
            raise RuntimeError("No embedding backend available.")
        return self

    def transform(self, X):
        if self.backend == "sentence-transformers":
            return self.model.encode(list(X), show_progress_bar=False)
        elif self.backend == "spacy":
            return np.vstack([doc.vector for doc in self.model.pipe(list(X), disable=["parser","ner"])])
        else:
            raise RuntimeError("No backend initialized.")


# ---------------------------------------------------------
# ‚öôÔ∏è Build feature unions / specialized vectorizers
# ---------------------------------------------------------
def build_hybrid_char_word():
    return FeatureUnion([
        ('word_tfidf', TfidfVectorizer(max_features=8000, ngram_range=(1,2),
                                       min_df=2, stop_words="english")),
        ('char_tfidf', TfidfVectorizer(max_features=4000, analyzer='char_wb',
                                       ngram_range=(3,5), min_df=2))
    ])

def mental_health_tfidf(vocab=None):
    if vocab is None: vocab = build_mental_health_vocab()
    return TfidfVectorizer(vocabulary=vocab, ngram_range=(1,2), sublinear_tf=True)


# ---------------------------------------------------------
# ‚öôÔ∏è Ensemble wrapper (combine sparse + dense)
# ---------------------------------------------------------
class EnsembleVectorizer(BaseEstimator, TransformerMixin):
    """Combine a sparse TF-IDF and dense embedding space."""
    def __init__(self, word_tfidf=None, embed_vec=None):
        self.word_tfidf = word_tfidf or TfidfVectorizer(max_features=10000)
        self.embed_vec = embed_vec or SentenceEmbeddingVectorizer()
        self.scaler = StandardScaler(with_mean=False)
    def fit(self, X, y=None):
        self.word_tfidf.fit(X)
        self.embed_vec.fit(X)
        return self
    def transform(self, X):
        tfidf_matrix = self.word_tfidf.transform(X)
        dense_matrix = self.embed_vec.transform(X)
        dense_scaled = self.scaler.fit_transform(dense_matrix)
        # horizontally stack sparse + dense
        from scipy.sparse import hstack, csr_matrix
        return hstack([tfidf_matrix, csr_matrix(dense_scaled)])


# ---------------------------------------------------------
# üìö Create complete vectorizer suite
# ---------------------------------------------------------
def create_advanced_vectorizers(extra_vocab_terms=None):
    vocab = build_mental_health_vocab(extra_vocab_terms)
    vects = {
        # Bag-of-Words
        "bow_binary": CountVectorizer(max_features=10000, binary=True, ngram_range=(1,2)),
        "bow_freq": CountVectorizer(max_features=12000, binary=False, ngram_range=(1,2)),

        # TF-IDF variants
        "tfidf_unigram": TfidfVectorizer(max_features=10000, ngram_range=(1,1), sublinear_tf=True),
        "tfidf_bigram":  TfidfVectorizer(max_features=15000, ngram_range=(1,2), sublinear_tf=True),
        "tfidf_char":    TfidfVectorizer(max_features=8000, analyzer='char_wb', ngram_range=(3,5)),

        # Hybrid & vocab
        "hybrid_char_word": build_hybrid_char_word(),
        "mental_health_vocab": mental_health_tfidf(vocab),

        # Hashing
        "hashing": HashingVectorizer(n_features=2**15, alternate_sign=False, norm=None)
    }

    # Add embeddings if available
    if SBT_AVAILABLE or SPACY_AVAILABLE:
        vects["sentence_embeddings"] = SentenceEmbeddingVectorizer()

    # Add ensemble combination
    vects["ensemble_vectorizer"] = EnsembleVectorizer()

    return vects


# ---------------------------------------------------------
# üöÄ Build & summarize
# ---------------------------------------------------------
vectorizers = create_advanced_vectorizers()
print(f"‚úÖ Created {len(vectorizers)} vectorizers:")
for k in vectorizers: print(f"  - {k}")

meta = {
    "vectorizers": list(vectorizers.keys()),
    "sentence_transformers": SBT_AVAILABLE,
    "spacy_available": SPACY_AVAILABLE,
    "mental_health_vocab_size": len(build_mental_health_vocab())
}
os.makedirs("preprocessors", exist_ok=True)
with open("preprocessors/vectorizer_index.json", "w") as f:
    json.dump(meta, f, indent=2)
print("üìÅ Saved vectorizer metadata ‚Üí preprocessors/vectorizer_index.json")


‚úÖ Created 10 vectorizers:
  - bow_binary
  - bow_freq
  - tfidf_unigram
  - tfidf_bigram
  - tfidf_char
  - hybrid_char_word
  - mental_health_vocab
  - hashing
  - sentence_embeddings
  - ensemble_vectorizer
üìÅ Saved vectorizer metadata ‚Üí preprocessors/vectorizer_index.json


In [1]:
# ==========================================
# ü§ñ MEMORY-SAFE MODEL SUITE
# ==========================================
# Load models on-demand to prevent kernel crash

import warnings, os, numpy as np
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, 
    GradientBoostingClassifier, AdaBoostClassifier
)
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

# ------------------------------
# 1Ô∏è‚É£ Classical ML models (LAZY LOADING)
# ------------------------------
def get_model(model_name):
    """Get a single model on-demand to save memory"""
    models_config = {
        "LogisticRegression": lambda: LogisticRegression(
            max_iter=2000, C=1.0, class_weight='balanced', random_state=42
        ),
        "SVM_Linear": lambda: LinearSVC(C=1.0, max_iter=2000, random_state=42),
        "RandomForest": lambda: RandomForestClassifier(
            n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
        ),
        "ExtraTrees": lambda: ExtraTreesClassifier(
            n_estimators=300, random_state=42, n_jobs=-1
        ),
        "GradientBoosting": lambda: GradientBoostingClassifier(
            n_estimators=150, learning_rate=0.1, random_state=42
        ),
        "AdaBoost": lambda: AdaBoostClassifier(
            n_estimators=100, learning_rate=1.0, random_state=42
        ),
        "NaiveBayes": lambda: MultinomialNB(alpha=0.1),
        "MLPClassifier": lambda: MLPClassifier(
            hidden_layer_sizes=(128, 64), activation='relu', 
            max_iter=500, random_state=42
        )
    }
    
    if model_name not in models_config:
        raise ValueError(f"Unknown model: {model_name}")
    
    return models_config[model_name]()


# ------------------------------
# 2Ô∏è‚É£ Deep Neural Models (ON-DEMAND)
# ------------------------------
def build_ann(input_dim):
    """Build ANN only when needed"""
    try:
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense, Dropout
        from tensorflow.keras.optimizers import Adam
        
        # Clear previous sessions
        tf.keras.backend.clear_session()
        
        model = Sequential([
            Dense(512, activation='relu', input_dim=input_dim),
            Dropout(0.4),
            Dense(256, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])
        model.compile(
            optimizer=Adam(1e-3), 
            loss='binary_crossentropy', 
            metrics=['accuracy']
        )
        return model
    except ImportError:
        print("‚ö†Ô∏è TensorFlow not installed")
        return None


def build_cnn(vocab_size=10000, max_len=100, embed_dim=128):
    """Build CNN only when needed"""
    try:
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import (
            Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
        )
        from tensorflow.keras.optimizers import Adam
        
        tf.keras.backend.clear_session()
        
        model = Sequential([
            Embedding(vocab_size, embed_dim, input_length=max_len),
            Conv1D(128, 5, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])
        model.compile(
            optimizer=Adam(1e-3), 
            loss='binary_crossentropy', 
            metrics=['accuracy']
        )
        return model
    except ImportError:
        print("‚ö†Ô∏è TensorFlow not installed")
        return None


def build_bilstm(vocab_size=10000, max_len=100, embed_dim=128):
    """Build BiLSTM only when needed"""
    try:
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import (
            Embedding, Bidirectional, LSTM, Dense, Dropout
        )
        from tensorflow.keras.optimizers import Adam
        
        tf.keras.backend.clear_session()
        
        model = Sequential([
            Embedding(vocab_size, embed_dim, input_length=max_len),
            Bidirectional(LSTM(128)),
            Dropout(0.4),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])
        model.compile(
            optimizer=Adam(1e-3), 
            loss='binary_crossentropy', 
            metrics=['accuracy']
        )
        return model
    except ImportError:
        print("‚ö†Ô∏è TensorFlow not installed")
        return None


# ------------------------------
# 3Ô∏è‚É£ Model Registry (NO PRE-LOADING)
# ------------------------------
def list_available_models():
    """List all available models without loading them"""
    classical = [
        "LogisticRegression", "SVM_Linear", "RandomForest", 
        "ExtraTrees", "GradientBoosting", "AdaBoost", 
        "NaiveBayes", "MLPClassifier"
    ]
    deep = ["ANN", "CNN", "BiLSTM"]
    return {
        "classical": classical,
        "deep": deep,
        "all": classical + deep
    }


# ------------------------------
# üöÄ Safe Execution
# ------------------------------
print("‚úÖ Model suite initialized (lazy loading)")
available = list_available_models()
print(f"üìã Classical models: {len(available['classical'])}")
print(f"üß† Deep models: {len(available['deep'])}")
print("\nüí° Usage:")
print("   model = get_model('LogisticRegression')")
print("   model = build_ann(input_dim=512)")
print("\n‚ö†Ô∏è  Models are created on-demand to prevent memory issues")

‚úÖ Model suite initialized (lazy loading)
üìã Classical models: 8
üß† Deep models: 3

üí° Usage:
   model = get_model('LogisticRegression')
   model = build_ann(input_dim=512)

‚ö†Ô∏è  Models are created on-demand to prevent memory issues


In [None]:
pip install tensorflow