Cell 1 — Upload / prepare data (run once)

In [None]:
# ============================================
# Cell 1 — Upload / Prepare Data (Run once)
# ============================================
# Purpose:
# - Manage dataset uploads and preparation using an object-oriented approach.
# - Robust handling with try/except for errors during file operations.
#
# Expected files:
#   1) training.1600000.processed.noemoticon.csv
#   2) depressive_tweets_processed.csv
#   3) contractions.json
#
# Notes:
# - Safe to re-run; existing files are preserved unless replaced intentionally.
# ============================================

import os
import shutil
from google.colab import files

class DataManager:
    """
    Manage dataset uploads and preparation in Google Colab.

    All file operations are best-effort: failures are reported with ``print``
    and never propagate, so the cell can always be re-run.
    """

    def __init__(self, data_dir="/content/project_data"):
        """Remember ``data_dir`` and create it if it does not exist yet."""
        self.data_dir = data_dir
        try:
            os.makedirs(self.data_dir, exist_ok=True)
        except Exception as e:
            print(f"Error creating data directory {self.data_dir}: {e}")

    def list_files(self):
        """Return the entries of the data directory, or ``[]`` if it cannot be read."""
        try:
            return os.listdir(self.data_dir)
        except Exception as e:
            print(f"Error listing files in {self.data_dir}: {e}")
            return []

    def display_status(self):
        """Print the data directory path and the files it currently contains."""
        # Named `names`, not `files`, so the google.colab `files` module is not shadowed.
        names = self.list_files()
        print(f"Data directory: {self.data_dir}")
        if not names:
            print("No files currently in the data directory.")
            return
        print("Existing files:")
        for name in names:
            print(f" - {name}")

    def _move_into_data_dir(self, src_path):
        """
        Move one file into the data directory, replacing a same-named file.

        The destination is only removed once the source is known to exist and
        to be a different file; otherwise a failed move would destroy the copy
        that is already in place.

        Returns True when the file ends up in the data directory, else False.
        """
        fname = os.path.basename(src_path)
        dest_path = os.path.join(self.data_dir, fname)
        try:
            if not os.path.exists(src_path):
                raise FileNotFoundError(f"source file not found: {src_path}")
            if os.path.exists(dest_path):
                if os.path.samefile(src_path, dest_path):
                    # Upload landed directly in data_dir; nothing to move.
                    print(f"File already in data directory: {fname}")
                    return True
                print(f"Replacing existing file: {fname}")
                os.remove(dest_path)
            shutil.move(src_path, dest_path)
            return True
        except Exception as file_error:
            print(f"Error moving file {fname}: {file_error}")
            return False

    def upload_files(self):
        """Launch file upload dialog and move uploaded files to the data directory."""
        print("Please select the required files to upload.")
        try:
            # Broad except: the exception types raised by the Colab upload
            # widget are not visible from this file.
            uploaded = files.upload()
            for fname in uploaded:
                # Keys are used as paths relative to the working directory.
                self._move_into_data_dir(fname)
            print("Upload complete.")
        except Exception as upload_error:
            print(f"Error during file upload: {upload_error}")

    def prepare(self):
        """
        Main workflow:
        1. Display current files.
        2. Ask user whether to upload/replace files.
        3. Show final directory contents.
        """
        self.display_status()
        try:
            user_choice = input("\nDo you want to upload or replace files? (y/n): ").strip().lower()
        except Exception as e:
            # e.g. no interactive stdin available; fall back to "no upload".
            print(f"Error reading user input: {e}")
            user_choice = 'n'

        if user_choice == 'y':
            self.upload_files()

        print("\nFinal files in data directory:")
        for f in self.list_files():
            print(f" - {f}")
        print("\nData preparation complete.")


# --- Execute the workflow ---
# Runs the interactive prepare() flow when this cell is executed as the main
# module; `data_manager` stays bound at module level afterwards.
if __name__ == "__main__":
    data_manager = DataManager()
    data_manager.prepare()


Cell 2 — Install & Imports (environment + GloVe helper)

In [None]:
# ============================
# Cell 2 — Install & Imports (environment + GloVe helper)
# ============================
# Purpose:
# - Install small helper packages if missing (kept conservative to avoid binary mismatch).
# - Import all libraries used across the project.
# - Provide helper to download GloVe 100d (optional). If not available, later cells will fall back to random embeddings.
# - Use try/except import pattern so the runtime is robust

# Helper packages are installed on demand by EnvironmentSetup.install_packages(); remove that call from setup() if you do not want anything installed.


import os
import logging
import random
import numpy as np

class EnvironmentSetup:
    """
    Handles package installation, imports, reproducibility, and GloVe embedding checks.

    Installation and download use the standard library (``subprocess``,
    ``urllib``, ``zipfile``) instead of IPython shell escapes, so failures
    surface as exceptions/return codes and the configured ``glove_dir`` and
    ``glove_zip`` paths are honoured.
    """

    GLOVE_URL = "http://nlp.stanford.edu/data/glove.6B.zip"

    def __init__(self, seed: int = 42, glove_dir: str = "/content/glove"):
        """
        Args:
            seed: Seed applied by :meth:`set_seed`.
            glove_dir: Directory that holds (or will hold) the extracted GloVe file.
        """
        self.seed = seed
        self.glove_dir = glove_dir
        self.glove_filename = "glove.6B.100d.txt"
        self.glove_zip = "/content/glove.6B.zip"
        self.glove_path = os.path.join(self.glove_dir, self.glove_filename)

        # Logging setup
        logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
        self.logger = logging.getLogger("DepressionProject")
        self.logger.setLevel(logging.INFO)

    def set_seed(self):
        """Seed ``random`` and NumPy and export PYTHONHASHSEED."""
        random.seed(self.seed)
        np.random.seed(self.seed)
        os.environ['PYTHONHASHSEED'] = str(self.seed)
        self.logger.info("Random seed set to %d", self.seed)

    def _pip_install(self, *packages: str) -> bool:
        """Install ``packages`` quietly with the running interpreter's pip; return success."""
        import importlib
        import subprocess
        import sys

        # `sys.executable -m pip` targets the interpreter that is running this code.
        cmd = [sys.executable, "-m", "pip", "install", "-q", *packages]
        result = subprocess.run(cmd, check=False)
        if result.returncode != 0:
            self.logger.warning("pip install failed for: %s", " ".join(packages))
            return False
        # Make freshly installed packages visible to the import system.
        importlib.invalidate_caches()
        return True

    def install_packages(self):
        """Install light helper packages if missing."""
        # Broad excepts are deliberate: a broken binary install can fail with
        # errors other than ImportError, and reinstalling is the remedy either way.
        try:
            import preprocessor as _p_test  # noqa: F401
            from wordcloud import WordCloud as _wc_test  # noqa: F401
        except Exception:
            self._pip_install("tweet-preprocessor", "wordcloud", "tqdm")

        try:
            import nltk  # noqa: F401
        except Exception:
            self._pip_install("nltk")

        try:
            import tensorflow as tf  # noqa: F401
        except Exception:
            self._pip_install("tensorflow")

    def import_packages(self):
        """
        Import every package the project uses, as a smoke test.

        The imports are local to this method, so they verify that the packages
        resolve but do not bind any name in the notebook namespace; cells that
        need a package must import it themselves.
        """
        import re
        import time
        import json
        import pickle
        import matplotlib.pyplot as plt
        import seaborn as sns
        from tqdm import tqdm
        import nltk
        import preprocessor as p
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize
        from wordcloud import WordCloud
        import tensorflow as tf
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dense, Dropout, Layer
        from tensorflow.keras.models import Model, load_model
        from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
        from sklearn.model_selection import train_test_split, StratifiedKFold
        from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
        from sklearn.pipeline import Pipeline
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import LinearSVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
        import joblib
        self.logger.info("All packages imported successfully.")

    def download_nltk_resources(self):
        """Download required NLTK resources."""
        import nltk
        try:
            for resource in ('punkt', 'punkt_tab', 'stopwords'):
                nltk.download(resource, quiet=True)
            self.logger.info("NLTK resources downloaded successfully.")
        except Exception as e:
            self.logger.warning("NLTK download issue: %s", e)

    @staticmethod
    def _download(url: str, dest: str) -> None:
        """Download ``url`` to ``dest`` via a temporary ``.part`` file."""
        import urllib.request

        os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
        partial = dest + ".part"
        try:
            urllib.request.urlretrieve(url, partial)
            # Only a complete download is ever visible under the final name.
            os.replace(partial, dest)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    def ensure_glove_100d(self, download_if_missing: bool = False) -> bool:
        """
        Check if GloVe embeddings exist, optionally download them.

        Args:
            download_if_missing: When True and the 100d file is absent, fetch
                the archive to ``self.glove_zip`` (skipped if that file already
                exists) and extract the 100d file into ``self.glove_dir``.

        Returns:
            True if ``self.glove_path`` exists when the method returns.
        """
        if os.path.exists(self.glove_path):
            self.logger.info("GloVe found at %s", self.glove_path)
            return True
        if not download_if_missing:
            self.logger.info("GloVe not found at %s. To download, call ensure_glove_100d(download_if_missing=True)", self.glove_path)
            return False
        try:
            import zipfile

            self.logger.info("Downloading GloVe (large file, patience required)...")
            if not os.path.exists(self.glove_zip):
                self._download(self.GLOVE_URL, self.glove_zip)
            os.makedirs(self.glove_dir, exist_ok=True)
            with zipfile.ZipFile(self.glove_zip) as archive:
                # Only the 100d vectors are needed; skip the other dimensions.
                archive.extract(self.glove_filename, self.glove_dir)
            found = os.path.exists(self.glove_path)
            if found:
                self.logger.info("GloVe extracted to %s", self.glove_path)
            else:
                self.logger.warning("GloVe 100d not found after extraction.")
            return found
        except Exception as e:
            # Best-effort: callers fall back to random embeddings.
            self.logger.warning("Failed to download/extract GloVe: %s", e)
            return False

    def setup(self):
        """Run full environment setup."""
        self.set_seed()
        self.install_packages()
        self.import_packages()
        self.download_nltk_resources()
        glove_available = self.ensure_glove_100d(download_if_missing=False)
        self.logger.info("GloVe 100d available: %s", glove_available)
        self.logger.info("Environment setup complete.")


# --- Execute environment setup ---
# Runs seeding, install, import check, NLTK downloads and the GloVe presence
# check; GloVe itself is not downloaded here (download_if_missing=False in setup()).
if __name__ == "__main__":
    env_setup = EnvironmentSetup()
    env_setup.setup()


Cell 3 — Configuration Manager — hyperparameters editable at runtime

In [None]:
# ============================
# Cell 3 — Configuration Manager
# ============================
# Purpose:
# - Provide a small wrapper for experiment configuration so you can adjust hyperparameters
#   interactively from one place during testing.
# - Keeps the code clean and consistent.
# - Exposes the configuration as a plain dict (Config.to_dict) and writes experiment metadata to a text file for tracking.

import os
import json
from dataclasses import dataclass, asdict
from datetime import datetime
import logging
import time

# --- Configuration Dataclass ---
@dataclass
class Config:
    """Single source of experiment settings: paths, sampling sizes and hyperparameters."""

    # --- File paths and directories ---
    data_dir: str = "/content/project_data"
    # Same location as data_dir; kept as a separate field for existing callers.
    data_path: str = "/content/project_data"
    # Matches the location both GloVe download helpers in this notebook extract to
    # (<glove_dir>/glove.6B.100d.txt with glove_dir="/content/glove").
    glove_path: str = "/content/glove/glove.6B.100d.txt"

    # File names, resolved relative to data_dir by the data loader.
    sentiment_file: str = "training.1600000.processed.noemoticon.csv"
    depressive_file: str = "depressive_tweets_processed.csv"
    contractions_file: str = "contractions.json"

    cache_dir: str = "/content/cache"
    save_dir: str = "/content/drive/MyDrive/project_results"

    # --- Data sampling ---
    sentiment_sample: int = 18000   # rows sampled from the Sentiment140 file
    depressive_rows: int = 9000     # rows read from the depressive tweets file

    # --- Deep Model Parameters ---
    max_num_words: int = 30000
    max_seq_length: int = 200
    embedding_dim: int = 100
    lstm_units: int = 128
    lstm_layers: int = 2
    dropout_rate: float = 0.4
    recurrent_dropout: float = 0.3
    dense_units: int = 128
    learning_rate: float = 2e-4
    deep_batch_size: int = 64
    deep_epochs: int = 10
    patience: int = 3

    # --- Classical Model ---
    classical_cv_folds: int = 3
    grid_search_n_jobs: int = -1

    # --- Experiment Control ---
    seed: int = 42
    fast_mode: bool = False
    log_to_drive: bool = True

    # --- Utility: Convert to dict for saving/logging ---
    def to_dict(self):
        """Return configuration parameters as a dictionary for logging or saving."""
        return asdict(self)

# --- Create directories safely ---
# Uses the *default* Config values; a Config created later with different
# cache_dir/save_dir is not covered by these two calls.
# NOTE(review): save_dir defaults to a path under /content/drive/MyDrive —
# presumably Google Drive must be mounted first; confirm.
os.makedirs(Config().cache_dir, exist_ok=True)
os.makedirs(Config().save_dir, exist_ok=True)

# --- Logging utility ---
def setup_logger():
    """
    Return the shared "DepressionProject" logger at INFO level.

    A stream handler is attached only once, so re-running the cell does not
    stack handlers. Propagation to the root logger is switched off because this
    logger has its own handler; otherwise a configured root logger would print
    every record a second time.
    """
    logger = logging.getLogger("DepressionProject")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
        logger.addHandler(ch)
    logger.propagate = False
    return logger

logger = setup_logger()

# --- Logging experiment metadata ---
def log_experiment(cfg: "Config") -> str:
    """
    Write the deep-model hyperparameters of ``cfg`` to a timestamped text file.

    The file is created in ``cfg.save_dir`` (created if missing) as
    ``experiment_meta_<timestamp>.txt`` with one ``key: value`` line per entry.

    Args:
        cfg: Configuration whose hyperparameters are recorded.

    Returns:
        Path of the written metadata file.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    meta = {
        "timestamp": time.strftime("%Y-%m-%d_%H-%M-%S"),
        "model": "BiLSTM + Attention",
        "max_words": cfg.max_num_words,
        "seq_len": cfg.max_seq_length,
        "embedding_dim": cfg.embedding_dim,
        "lstm_units": cfg.lstm_units,
        "lstm_layers": cfg.lstm_layers,
        "dropout": cfg.dropout_rate,
        "recurrent_dropout": cfg.recurrent_dropout,
        "learning_rate": cfg.learning_rate,
        "batch_size": cfg.deep_batch_size,
        "epochs": cfg.deep_epochs
    }
    # Only the default Config's save_dir is created at cell level.
    os.makedirs(cfg.save_dir, exist_ok=True)
    log_path = os.path.join(cfg.save_dir, f"experiment_meta_{meta['timestamp']}.txt")
    with open(log_path, "w", encoding="utf-8") as f:
        f.writelines(f"{k}: {v}\n" for k, v in meta.items())
    # Looked up by name so the function does not depend on a module-level `logger`.
    logging.getLogger("DepressionProject").info("✅ Experiment metadata logged to: %s", log_path)
    return log_path

# --- Instantiate and log ---
# `config` is the module-level configuration object that later cells read
# (e.g. config.cache_dir, config.seed, config.max_num_words).
config = Config()
# Writes a metadata file into config.save_dir each time this cell runs.
log_experiment(config)

logger.info("✅ Configuration ready. Deep model optimized for improved accuracy and generalization.")

# --- Utility Functions ---
def now_str() -> str:
    """Format the current local time as ``YYYYMMDD_HHMMSS`` for naming saved results."""
    stamp_format = "%Y%m%d_%H%M%S"
    current = datetime.now()
    return current.strftime(stamp_format)


Cell 4 — Text Cleaning & Preprocessing (class with caching)

In [None]:
# ============================
# Cell 4 — Text Cleaning & Preprocessing (with caching)
# ============================
# Purpose:
# - Encapsulate tweet cleaning steps in a reusable class.
# - Provide caching to skip repeated cleaning across runs (speeds iterative development).
# - Steps implemented:
#     * Lowercasing
#     * tweet-preprocessor cleaning (removes URLs, mentions, emoji and similar tweet artifacts)
#     * Contraction expansion (if contractions.json provided)
#     * Remove non-alphanumeric symbols (keeps # + _ as in earlier code)
#     * Tokenize and remove NLTK stopwords
# - Use tqdm progress bar for long cleaning loops.

import os
import re
import pandas as pd
from typing import List, Dict, Optional
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
import logging

# Re-bind the shared project logger by name so this cell works on its own.
logger = logging.getLogger("DepressionProject")
logger.setLevel(logging.INFO)

class DataLoaderLocal:
    """
    Helper to load the project's data files from the configured data_dir.
    Uses the Config object for file names, sample sizes and the random seed.
    """

    def __init__(self, cfg: "Config"):
        self.cfg = cfg

    def _path(self, fname: str) -> str:
        """Return full path for a filename in data_dir."""
        return os.path.join(self.cfg.data_dir, fname)

    def load_sentiment140(self, sample_n: Optional[int] = None, encoding: str = "ISO-8859-1") -> pd.DataFrame:
        """
        Load a random sample of the Sentiment140 dataset, assign label=0.

        Args:
            sample_n: Number of rows to sample; falls back to
                ``cfg.sentiment_sample`` when None (or 0). Clamped to the number
                of rows in the file.
            encoding: Text encoding of the CSV file.

        Returns:
            DataFrame with columns ``text`` and ``label`` (all 0).

        Raises:
            FileNotFoundError: If the CSV is not in data_dir.
        """
        sample_n = sample_n or self.cfg.sentiment_sample
        path = self._path(self.cfg.sentiment_file)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Sentiment140 not found at {path}")
        col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
        # Only the text column is used, so the other five are not loaded.
        df = pd.read_csv(path, names=col_names, usecols=['text'], encoding=encoding)
        available = len(df)
        if sample_n > available:
            # DataFrame.sample raises when asked for more rows than exist.
            logging.getLogger("DepressionProject").warning(
                "Requested %d Sentiment140 rows but only %d available; using all.", sample_n, available)
            sample_n = available
        df_sample = df[['text']].sample(sample_n, random_state=self.cfg.seed).reset_index(drop=True)
        df_sample['label'] = 0
        return df_sample

    def load_depressive(self, nrows: Optional[int] = None) -> pd.DataFrame:
        """
        Load depressive tweets dataset, assign label=1.

        The file is read as '|'-separated without a header; columns 0 and 5
        are read as ``id`` and ``text``, and only ``text`` is returned.

        Args:
            nrows: Maximum number of rows to read; falls back to
                ``cfg.depressive_rows`` when None (or 0).

        Returns:
            DataFrame with columns ``text`` and ``label`` (all 1).

        Raises:
            FileNotFoundError: If the file is not in data_dir.
        """
        nrows = nrows or self.cfg.depressive_rows
        path = self._path(self.cfg.depressive_file)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Depressive tweets file not found at {path}")
        df = pd.read_csv(path, sep='|', header=None, usecols=[0, 5], nrows=nrows, names=['id', 'text'])
        df = df[['text']].reset_index(drop=True)
        df['label'] = 1
        return df

    def load_contractions(self) -> Dict[str, str]:
        """
        Load contraction mapping from JSON file; return lowercase-normalized keys.

        A missing file is not an error: a warning is logged and ``{}`` returned.
        """
        path = self._path(self.cfg.contractions_file)
        if not os.path.exists(path):
            logging.getLogger("DepressionProject").warning(
                "Contractions file not found at %s. Continuing without contractions.", path)
            return {}
        s = pd.read_json(path, typ='series')
        return {str(k).lower(): v for k, v in s.to_dict().items()}


class PreprocessorWithCache:
    """
    Text preprocessor with optional caching.
    Cleans text with lowercasing, tweet-preprocessor, contraction expansion,
    symbol removal, and stopword removal.
    """

    def __init__(self, contractions: Optional[Dict[str, str]] = None, cache_dir: Optional[str] = None):
        """
        Args:
            contractions: Mapping contraction -> expansion; keys are matched
                literally against the already-lowercased text.
            cache_dir: Directory for cached results; defaults to ``config.cache_dir``.
        """
        # tweet-preprocessor is imported here rather than relied on as a
        # notebook-level name `p`: no cell in this notebook binds `p` globally.
        import preprocessor as tweet_preprocessor
        self._tweet_clean = tweet_preprocessor.clean

        self.contractions = contractions or {}
        # Compile contraction regex if available
        if self.contractions:
            # Longest keys first so the alternation prefers the longest match.
            keys = sorted(self.contractions.keys(), key=lambda x: -len(x))
            pattern = '|'.join(map(re.escape, keys))
            self.c_re = re.compile(r'(%s)' % pattern)
        else:
            self.c_re = None

        self.BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
        self.stop_words = set(stopwords.words('english'))
        self.cache_dir = cache_dir or config.cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

    def expand_contractions(self, text: str) -> str:
        """Replace contractions in text using the provided mapping."""
        if not self.c_re:
            return text
        return self.c_re.sub(lambda m: self.contractions.get(m.group(0), m.group(0)), text)

    def clean_text(self, text: str) -> str:
        """
        Clean a single text/tweet:
        - Lowercase
        - Remove URLs/mentions/emoji via tweet-preprocessor
        - Expand contractions
        - Remove unwanted symbols (keep letters, digits, # + _)
        - Tokenize and remove stopwords
        Returns cleaned string (tokens joined by spaces).
        """
        t = str(text).lower()
        t = self._tweet_clean(t)  # tweet-preprocessor
        t = self.expand_contractions(t)
        t = self.BAD_SYMBOLS_RE.sub(' ', t)
        toks = word_tokenize(t)
        toks = [tok for tok in toks if tok not in self.stop_words]
        return ' '.join(toks)

    def _cache_file_for_key(self, key: str) -> str:
        """Generate a safe cache filename from a key."""
        safe = re.sub(r'[^0-9a-zA-Z_\-]', '_', key)
        return os.path.join(self.cache_dir, f"{safe}.pkl")

    def clean_texts(self, texts: List[str], cache_key: Optional[str] = None) -> List[str]:
        """
        Clean a list of texts, optionally backed by an on-disk cache.

        When ``cache_key`` is given and a cache file exists, its content is
        returned if it has one entry per input text; a cache of a different
        length is treated as stale and recomputed. Cache read/write failures
        are logged and never abort the cleaning.

        Returns a list of cleaned strings.
        """
        cache_file = self._cache_file_for_key(cache_key) if cache_key else None

        if cache_file and os.path.exists(cache_file):
            logger.info("Loading cleaned texts from cache: %s", cache_file)
            try:
                # NOTE: pickle executes code on load; only use a trusted cache_dir.
                cached = pd.read_pickle(cache_file).tolist()
            except Exception as e:
                logger.warning("Failed to load cache, recomputing: %s", e)
            else:
                if len(cached) == len(texts):
                    return cached
                logger.warning("Cache %s holds %d texts but %d were requested; recomputing.",
                               cache_file, len(cached), len(texts))

        cleaned = [self.clean_text(t) for t in tqdm(texts, desc="Cleaning texts")]

        if cache_file:
            try:
                pd.Series(cleaned).to_pickle(cache_file)
                logger.info("Saved cleaned texts cache to %s", cache_file)
            except Exception as e:
                logger.warning("Failed to save cache: %s", e)

        return cleaned

# Example short sanity check:
# data_loader = DataLoaderLocal(config)
# small_sent = data_loader.load_sentiment140(sample_n=100)
# print("Loaded sample shape:", small_sent.shape)


Cell 5 — Exploratory Data Analysis (EDA) & Visualizer

In [None]:
# ============================
# Cell 5 — Exploratory Data Analysis (EDA) & Visualizer
# ============================
# Purpose:
# - Provide small EDA utilities for quick dataset checks and visualizations:
#   class distribution, sample tweets, wordclouds, top TF-IDF terms (if TF-IDF built later).
# - These functions use the cleaned 'clean' column once you run the cleaning step in later cells.


from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple, Optional

class EDAVisualizer:
    """
    Small visualizer class to inspect dataset and cleaned text.
    It expects a DataFrame with columns: ['text', 'label'] or ['text', 'label', 'clean'].
    """
    def __init__(self, df: pd.DataFrame, seed: int = 42):
        """
        Args:
            df: Dataset to inspect; a copy is stored so the caller's frame is untouched.
            seed: Random state used when sampling rows (no ``SEED`` global is
                defined in this notebook, so the seed is held on the instance).
        """
        self.df = df.copy()
        self.seed = seed

    def sample_rows(self, n: int = 5, label: Optional[int] = None):
        """Display up to ``n`` randomly sampled rows, optionally restricted to one label."""
        pool = self.df if label is None else self.df[self.df['label'] == label]
        sample = pool.sample(min(n, len(pool)), random_state=self.seed)
        # `display` is expected to be supplied by the notebook environment.
        display(sample.head(n))

    def class_distribution(self, savepath: Optional[str] = None):
        """
        Plot class distribution bar chart and return counts.

        Args:
            savepath: If given, the figure is also saved to this path.

        Returns:
            Dict mapping label -> row count, ordered by label.
        """
        counts = self.df['label'].value_counts().sort_index()
        labels = [f"{i}" for i in counts.index.tolist()]
        plt.figure(figsize=(6, 4))
        sns.barplot(x=labels, y=counts.values, palette="Blues_d")
        plt.title("Class distribution")
        plt.xlabel("Label")
        plt.ylabel("Count")
        if savepath:
            plt.savefig(savepath, bbox_inches='tight')
        plt.show()
        return counts.to_dict()

    def wordcloud_for_label(self, label: int = 1, max_words: int = 200, savepath: Optional[str] = None):
        """
        Generate a wordcloud for the cleaned text of a given label.

        Raises:
            RuntimeError: If the DataFrame has no 'clean' column.
        """
        if 'clean' not in self.df.columns:
            raise RuntimeError("DataFrame must contain 'clean' column (run cleaning step first).")
        texts = self.df[self.df['label'] == label]['clean'].dropna().astype(str).tolist()
        if not texts:
            print("No texts for label", label)
            return
        # Imported here because no cell binds WordCloud at notebook level.
        from wordcloud import WordCloud

        words = " ".join(texts)
        wc = WordCloud(width=800, height=400, collocations=False, background_color='white', max_words=max_words).generate(words)
        plt.figure(figsize=(12, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"WordCloud for label={label}")
        if savepath:
            plt.savefig(savepath, bbox_inches='tight')
        plt.show()

    def top_n_tokens(self, label: Optional[int] = None, n: int = 30) -> List[Tuple[str, int]]:
        """
        Return top-n token counts from cleaned text (optionally for a specific label).

        Tokens are the whitespace-separated pieces of the 'clean' column.

        Raises:
            RuntimeError: If the DataFrame has no 'clean' column.
        """
        if 'clean' not in self.df.columns:
            raise RuntimeError("DataFrame must contain 'clean' column (run cleaning step first).")
        rows = self.df if label is None else self.df[self.df['label'] == label]
        texts = rows['clean'].dropna().astype(str).tolist()
        counter = Counter()
        for t in texts:
            counter.update(t.split())
        return counter.most_common(n)

# Example usage template (run after you have df and cleaned column):
# eda = EDAVisualizer(df)
# eda.class_distribution()
# eda.sample_rows(n=5, label=1)
# eda.wordcloud_for_label(label=1)
# print(eda.top_n_tokens(label=1, n=20))


Cell 6 — Feature extraction & EmbeddingManager (GloVe optional, random fallback)

In [None]:
# ============================================
# Cell 6 — FeatureExtractor & EmbeddingManager (GloVe auto-download)
# ============================================
# ============================
# Purpose:
# - Build TF-IDF vectorizer for classical models.
# - Build Keras Tokenizer + padded sequences for deep models.
# - Build embedding matrix aligned to tokenizer.word_index:
#     * Use GloVe if available
#     * Otherwise produce small random Gaussian embeddings (default)
from typing import List, Dict, Optional, Any
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

from typing import Any
import requests, zipfile

# Utility to ensure GloVe embeddings exist
def ensure_glove_100d(download_if_missing=True, glove_dir: str = "/content/glove") -> str:
    """
    Ensure GloVe 100d embeddings exist locally, downloading them if requested.

    Download or extraction failures are logged and swallowed so that callers
    can fall back to random embeddings; check ``os.path.exists`` on the result.

    Args:
        download_if_missing: Download and extract the archive when the file is absent.
        glove_dir: Directory that holds the archive and the extracted file.

    Returns:
        Path to glove.6B.100d.txt inside ``glove_dir`` (the file may not exist
        if the download was skipped or failed).
    """
    import logging

    # Looked up by name so this helper does not depend on a module-level `logger`.
    log = logging.getLogger("DepressionProject")
    glove_file = "glove.6B.100d.txt"
    os.makedirs(glove_dir, exist_ok=True)
    glove_path = os.path.join(glove_dir, glove_file)

    if os.path.exists(glove_path):
        log.info("GloVe 100d already exists at %s", glove_path)
        return glove_path

    if not download_if_missing:
        log.warning("GloVe not found and download_if_missing=False. Will use random embeddings.")
        return glove_path

    log.info("Downloading GloVe embeddings...")
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    zip_path = os.path.join(glove_dir, "glove.6B.zip")

    try:
        # Timeout guards against a hung connection; raise_for_status keeps an
        # HTTP error page from being saved as if it were the archive.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(zip_path, "wb") as f:
                # 1 MiB chunks: the archive is large, tiny chunks only add overhead.
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extract(glove_file, glove_dir)
        log.info("GloVe extracted to %s", glove_path)
    except (requests.RequestException, OSError, zipfile.BadZipFile, KeyError) as err:
        log.warning("Failed to download/extract GloVe (%s). Will use random embeddings.", err)
        # Do not leave a truncated vectors file that would later pass the exists() check.
        try:
            os.remove(glove_path)
        except OSError:
            pass

    return glove_path

# Define global GLOVE_PATH for later cells
# Called with the default download_if_missing=True, so running this cell
# downloads the GloVe archive when the 100d file is not already present.
GLOVE_PATH = ensure_glove_100d()

# ------------------------------------
class FeatureExtractor:
    """
    Builds TF-IDF matrix for classical models and tokenizer+sequences for deep models.
    """
    def __init__(self, max_num_words: Optional[int] = None, max_seq_len: Optional[int] = None):
        """
        Args:
            max_num_words: Vocabulary cap for both TF-IDF and the tokenizer;
                defaults to ``config.max_num_words``.
            max_seq_len: Padded sequence length; defaults to ``config.max_seq_length``.

        The config defaults are read when the instance is created, so changes
        made to ``config`` after this class was defined are picked up.
        """
        self.max_num_words = config.max_num_words if max_num_words is None else max_num_words
        self.max_seq_len = config.max_seq_length if max_seq_len is None else max_seq_len
        self.tfidf_vectorizer: Optional[TfidfVectorizer] = None
        self.tokenizer: Optional[Tokenizer] = None

    def build_tfidf(self, texts: List[str]) -> Any:
        """Fit a TF-IDF vectorizer on ``texts`` and return the transformed matrix.

        The fitted vectorizer is kept on ``self.tfidf_vectorizer``.
        """
        self.tfidf_vectorizer = TfidfVectorizer(max_features=self.max_num_words)
        X_tfidf = self.tfidf_vectorizer.fit_transform(texts)
        logger.info("Built TF-IDF: shape=%s", X_tfidf.shape)
        return X_tfidf

    def build_tokenizer_and_sequences(self, texts: List[str]) -> np.ndarray:
        """Fit a Keras tokenizer on ``texts`` and return the padded sequences.

        The fitted tokenizer is kept on ``self.tokenizer``; sequences are
        padded/truncated to ``self.max_seq_len``.
        """
        self.tokenizer = Tokenizer(num_words=self.max_num_words, oov_token=None)
        self.tokenizer.fit_on_texts(texts)
        seqs = self.tokenizer.texts_to_sequences(texts)
        X_seq = pad_sequences(seqs, maxlen=self.max_seq_len)
        logger.info("Built tokenizer: vocab_size=%d sequences shape=%s", len(self.tokenizer.word_index), X_seq.shape)
        return X_seq

# ------------------------------------
class EmbeddingManagerLocal:
    """
    Build embedding matrix. Prefer GloVe if available; else fallback to random.
    """
    def __init__(self, glove_path: Optional[str] = None, max_num_words: Optional[int] = None,
                 emb_dim: Optional[int] = None, seed: Optional[int] = None):
        """
        Args:
            glove_path: Path to the GloVe text file; defaults to ``GLOVE_PATH``.
            max_num_words: Number of rows of the matrix; defaults to ``config.max_num_words``.
            emb_dim: Vector dimension; defaults to ``config.embedding_dim``.
            seed: Seed for the random initialisation; defaults to ``config.seed``.

        Defaults are resolved when the instance is created, so runtime changes
        to ``config`` take effect without re-defining the class.
        """
        self.glove_path = GLOVE_PATH if glove_path is None else glove_path
        self.max_num_words = config.max_num_words if max_num_words is None else max_num_words
        self.emb_dim = config.embedding_dim if emb_dim is None else emb_dim
        self.seed = config.seed if seed is None else seed
        self.glove_index: Dict[str, np.ndarray] = {}

    @staticmethod
    def _log():
        """Return the shared project logger (looked up by name)."""
        import logging
        return logging.getLogger("DepressionProject")

    def _load_glove_index(self):
        """
        Parse the GloVe file into ``{word: vector}`` and cache it on the instance.

        Lines whose vector does not have ``emb_dim`` components, or whose
        components are not numeric, are skipped. Returns ``{}`` when the file
        is missing.
        """
        if self.glove_index:
            return self.glove_index
        log = self._log()
        if not os.path.exists(self.glove_path):
            log.warning("GloVe file not found at %s. Will use random embeddings.", self.glove_path)
            self.glove_index = {}
            return self.glove_index

        log.info("Loading GloVe from %s (this may take a while)...", self.glove_path)
        idx = {}
        with open(self.glove_path, 'r', encoding='utf8', errors='ignore') as fh:
            for line in fh:
                parts = line.rstrip().split(" ")
                # Cheap length check first; avoids converting wrong-sized rows.
                if len(parts) - 1 != self.emb_dim:
                    continue
                try:
                    idx[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
                except ValueError:
                    # One malformed line must not abort the whole load.
                    continue
        self.glove_index = idx
        log.info("Loaded %d GloVe vectors.", len(idx))
        return idx

    def build_embedding_matrix(self, word_index: Dict[str, int]) -> np.ndarray:
        """
        Build a ``(max_num_words, emb_dim)`` float32 embedding matrix.

        Every row starts as small Gaussian noise (scale 0.01, seeded); rows
        whose word has a GloVe vector are overwritten with it. Words with an
        index >= ``max_num_words`` are ignored.

        Args:
            word_index: Mapping word -> integer row index (e.g. a tokenizer's word_index).
        """
        log = self._log()
        rng = np.random.RandomState(self.seed)
        embedding_matrix = rng.normal(scale=0.01, size=(self.max_num_words, self.emb_dim)).astype(np.float32)

        if os.path.exists(self.glove_path):
            glove_index = self._load_glove_index()
            found = 0
            for word, idx in word_index.items():
                if idx >= self.max_num_words:
                    continue
                vec = glove_index.get(word)
                if vec is not None:
                    embedding_matrix[idx] = vec
                    found += 1
            log.info("Embedding matrix built: %d tokens found in GloVe (out of %d)", found, min(len(word_index), self.max_num_words))
        else:
            log.info("GloVe not used. Returning random-initialized embedding matrix of shape %s", embedding_matrix.shape)
        return embedding_matrix



Cell 7 — Classical models manager (GridSearchCV, fit, evaluate, feature importance)

In [None]:
# ============================
# Cell 7 — Classical models manager
# ============================
# Purpose:
# - Provide GridSearchCV tuning for logistic and linear SVM
# - Evaluate models with CV and final holdout
# - Extract feature importance for logistic regression pipeline (top positive / negative tokens)

from typing import List, Dict, Optional
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

class ClassicalPipelineManager:
    """
    Manage classical ML pipelines (CountVectorizer -> TfidfTransformer -> classifier).

    Supports cross-validated grid search for hyperparameter tuning and returns a
    summary DataFrame. Fitted pipelines are stored in ``self.trained`` keyed by
    model name ('naive_bayes', 'logistic', 'linear_svc').
    """
    def __init__(self, cv_folds: int = config.classical_cv_folds, n_jobs: int = config.grid_search_n_jobs):
        """
        cv_folds: number of stratified folds used for cross-validation.
        n_jobs: parallelism passed to cross_val_score / GridSearchCV.
        """
        self.cv_folds = cv_folds
        self.n_jobs = n_jobs
        # Base pipelines (we use TfidfTransformer after CountVectorizer)
        self.base_pipelines = {
            'naive_bayes': Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]),
            'logistic': Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(max_iter=2000))]),
            'linear_svc': Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LinearSVC(max_iter=5000))])
        }
        self.trained: Dict[str, Pipeline] = {}

    def _grid_search(self, name: str, param_grid: Dict, X: List[str], y: np.ndarray, cv) -> Dict:
        """Grid-search one base pipeline, store the refitted best estimator and return its summary row."""
        grid = GridSearchCV(self.base_pipelines[name], param_grid, cv=cv, scoring='f1', n_jobs=self.n_jobs)
        grid.fit(X, y)
        self.trained[name] = grid.best_estimator_
        return {
            'model': name,
            'f1_mean': float(grid.best_score_),
            # Fold-to-fold std of the *winning* candidate; the std of
            # mean_test_score would instead measure spread across candidates.
            'f1_std': float(grid.cv_results_['std_test_score'][grid.best_index_]),
            'best_params': grid.best_params_,
        }

    def grid_search_and_cv(self, X: List[str], y: np.ndarray) -> pd.DataFrame:
        """
        Perform cross-validation and grid search where relevant.

        X: raw (cleaned) texts. y: binary labels.
        Returns a DataFrame with one row per model (mean and std of the F1 score
        across CV folds, plus best_params for grid-searched models), sorted by
        mean F1 descending. Trained estimators are stored in self.trained.
        """
        rows = []
        skf = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=config.seed)

        # Naive Bayes: no grid, but CV for baseline
        logger.info("Running cross-val for Naive Bayes...")
        nb_pipe = self.base_pipelines['naive_bayes']
        nb_scores = cross_val_score(nb_pipe, X, y, cv=skf, scoring='f1', n_jobs=self.n_jobs)
        rows.append({'model': 'naive_bayes', 'f1_mean': float(nb_scores.mean()), 'f1_std': float(nb_scores.std())})
        # Train NB on the supplied data so it is available for inference later
        nb_pipe.fit(X, y)
        self.trained['naive_bayes'] = nb_pipe

        # Logistic Regression: grid search over C
        logger.info("Grid searching Logistic Regression...")
        rows.append(self._grid_search('logistic', {'clf__C': [0.01, 0.1, 1, 10]}, X, y, skf))

        # LinearSVC: grid search over C
        logger.info("Grid searching LinearSVC...")
        rows.append(self._grid_search('linear_svc', {'clf__C': [0.01, 0.1, 1]}, X, y, skf))

        df_results = pd.DataFrame(rows).sort_values('f1_mean', ascending=False).reset_index(drop=True)
        logger.info("Classical model tuning complete. Summary:\n%s", df_results.to_string(index=False))
        return df_results

    def evaluate(self, model_name: str, X_test: List[str], y_test: np.ndarray) -> Dict:
        """
        Evaluate a trained pipeline on test data.

        Returns a dict with 'accuracy', 'f1', 'report' (classification report as
        dict), 'preds', 'probs' (positive-class probabilities, or None when the
        classifier has no predict_proba, e.g. LinearSVC) and 'pipeline'.
        Raises RuntimeError if model_name has not been trained.
        """
        if model_name not in self.trained:
            raise RuntimeError(f"Model '{model_name}' is not trained.")
        pipe = self.trained[model_name]
        preds = pipe.predict(X_test)
        # Pipeline only exposes predict_proba when its final estimator does,
        # so an attribute check replaces the previous catch-all except.
        probs = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, 'predict_proba') else None
        report = classification_report(y_test, preds, digits=4, output_dict=True)
        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        logger.info("Evaluation complete for %s: accuracy=%.4f f1=%.4f", model_name, acc, f1)
        return {'accuracy': acc, 'f1': f1, 'report': report, 'preds': preds, 'probs': probs, 'pipeline': pipe}

    def feature_importance_logistic(self, pipeline_obj: Pipeline, top_k: int = 20) -> Dict[str, List[tuple]]:
        """
        Extract top positive/negative features from a trained logistic regression pipeline.

        Expects pipeline to include 'vect' and 'clf' steps.
        Returns dict: {'top_positive': [(token,coef),...], 'top_negative': [...]},
        each list ordered from most to least extreme. A non-positive top_k yields
        empty lists.
        Raises RuntimeError if the classifier has no coef_ attribute.
        """
        vect = pipeline_obj.named_steps['vect']
        clf = pipeline_obj.named_steps['clf']
        if not hasattr(clf, 'coef_'):
            raise RuntimeError("Classifier does not expose coefficients.")
        feature_names = vect.get_feature_names_out()
        coefs = clf.coef_.flatten()
        # Clamp so that top_k == 0 does not turn into the "[-0:]" whole-array slice.
        k = max(0, min(int(top_k), coefs.shape[0]))
        order = np.argsort(coefs)  # ascending: most negative first
        top_pos_idx = order[coefs.shape[0] - k:][::-1]
        top_neg_idx = order[:k]
        top_pos = [(feature_names[i], float(coefs[i])) for i in top_pos_idx]
        top_neg = [(feature_names[i], float(coefs[i])) for i in top_neg_idx]
        logger.info("Extracted top %d positive and negative logistic features.", top_k)
        return {'top_positive': top_pos, 'top_negative': top_neg}


Cell 8 — Deep model: BiLSTM + Attention (build, train, save, predict)

In [None]:
# ============================
# Cell 8 — Deep model: BiLSTM + Attention
# ============================
# Purpose:
# - Build a BiLSTM model with attention mechanism
# - Provide training utilities with early stopping, checkpointing and LR reduction
# - Prediction helpers and saving/loading

from tensorflow.keras import backend as K
import timeit
import numpy as np
from typing import Optional, Dict
import tensorflow as tf
from tensorflow.keras.layers import Layer, Input, Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

class Attention(Layer):
    """
    Lightweight attention layer that computes a learned weighted sum over time steps.

    Input:  (batch_size, time, features), optionally with a Keras mask of shape
            (batch_size, time) as produced by an Embedding with mask_zero=True.
    Output: context vector of shape (batch_size, features).

    Masked (padding) time steps are excluded from the softmax so they do not
    receive attention weight.
    """
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)
        # Declare that this layer consumes an incoming mask.
        self.supports_masking = True

    def build(self, input_shape):
        # one trainable vector per feature dimension
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1],), initializer='glorot_uniform', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs, mask=None):
        # inputs: (batch, time, features)
        scores = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1))  # (batch, time)
        if mask is not None:
            # Replace scores at padded steps with the most negative finite value
            # so softmax assigns them ~0 weight. A finite value (rather than
            # -inf) keeps an all-padding row from producing NaNs.
            floor = tf.constant(scores.dtype.min, dtype=scores.dtype)
            scores = tf.where(tf.cast(mask, tf.bool), scores, floor)
        weights = tf.nn.softmax(scores, axis=1)                   # (batch, time)
        weights = tf.expand_dims(weights, axis=-1)                # (batch, time, 1)
        return tf.reduce_sum(inputs * weights, axis=1)            # (batch, features)

    def compute_mask(self, inputs, mask=None):
        # The time axis is reduced away, so there is no mask to propagate.
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


class BiLSTMAttn:
    """
    BiLSTM followed by an attention layer and dense layers.
    Methods:
      - build(embedding_matrix=None, trainable_emb=False)
      - train(X_train, y_train, X_val, y_val, ...)
      - predict_proba(X)
      - predict(X)
      - save(path)
      - load(path)
    """
    def __init__(self, max_seq_len: int = config.max_seq_length, max_words: int = config.max_num_words, emb_dim: int = config.embedding_dim):
        """
        max_seq_len: length of the (padded) input id sequences.
        max_words: vocabulary size, i.e. number of rows of the embedding.
        emb_dim: embedding vector size.
        """
        self.max_seq_len = max_seq_len
        self.max_words = max_words
        self.emb_dim = emb_dim
        self.model: Optional[Model] = None
        self.checkpoint_path: Optional[str] = None

    def _require_model(self, message: str) -> Model:
        """Return the Keras model or raise RuntimeError(message) if none is built/loaded."""
        if self.model is None:
            raise RuntimeError(message)
        return self.model

    def build(self, embedding_matrix: Optional[np.ndarray] = None, trainable_emb: bool = False, dropout_rates=(0.25, 0.3, 0.5)):
        """
        Build and compile the Keras model.

        embedding_matrix: if provided, used to initialize the Embedding layer; otherwise random init.
            Must have shape (max_words, emb_dim).
        trainable_emb: whether the embedding layer is trainable (fine-tune). Only
            applied when embedding_matrix is given; a randomly initialised
            embedding keeps the Keras default.
        dropout_rates: (lstm_input_dropout, dropout_after_first_dense, dropout_after_second_dense).
            The first value is passed as the LSTM ``dropout`` argument (not
            ``recurrent_dropout``).
        Raises ValueError if embedding_matrix has an unexpected shape.
        """
        emb_kwargs = dict(input_dim=self.max_words, output_dim=self.emb_dim,
                          input_length=self.max_seq_len, mask_zero=True, name='embedding')
        if embedding_matrix is not None:
            expected_shape = (self.max_words, self.emb_dim)
            if tuple(np.shape(embedding_matrix)) != expected_shape:
                raise ValueError(
                    f"embedding_matrix has shape {tuple(np.shape(embedding_matrix))}, expected {expected_shape}."
                )
            emb_kwargs.update(weights=[embedding_matrix], trainable=trainable_emb)

        inp = Input(shape=(self.max_seq_len,), name='input_ids')
        emb = Embedding(**emb_kwargs)(inp)

        # BiLSTM block
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=dropout_rates[0]))(emb)

        # Attention: compress sequence to context vector
        context = Attention()(x)

        # Dense head with dropout regularization
        x = Dense(128, activation='relu')(context)
        x = Dropout(dropout_rates[1])(x)
        x = Dense(64, activation='relu')(x)
        x = Dropout(dropout_rates[2])(x)

        out = Dense(1, activation='sigmoid', name='output')(x)

        self.model = Model(inputs=inp, outputs=out, name='bilstm_attention')
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        logger.info("Built BiLSTM+Attention model with input shape %s and output shape %s", self.model.input_shape, self.model.output_shape)
        self.model.summary()

    def train(self, X_train, y_train, X_val, y_val,
          epochs: int = config.epochs, batch_size: int = config.batch_size, class_weights: Optional[Dict]=None):

        """
        Train the model with early stopping, checkpointing and learning rate reduction.

        The best weights (lowest val_loss) are checkpointed to a timestamped .h5
        file under config.save_dir; its path is kept in self.checkpoint_path.
        Returns (history, duration_seconds).
        Raises RuntimeError if build() has not been called.
        """
        # Imported locally: this cell's own import block does not bring in os/time.
        import os
        import time

        model = self._require_model("Model not built: call build() first.")
        # ModelCheckpoint needs the target directory to exist.
        os.makedirs(config.save_dir, exist_ok=True)
        ckpt_path = os.path.join(config.save_dir, f"bilstm_attn_best_{int(time.time())}.h5")
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=config.patience, restore_best_weights=True, verbose=1),
            ModelCheckpoint(ckpt_path, monitor='val_loss', save_best_only=True, verbose=1),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
        ]
        start = timeit.default_timer()
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size,
                            callbacks=callbacks, class_weight=class_weights, verbose=2)
        duration = timeit.default_timer() - start
        self.checkpoint_path = ckpt_path
        logger.info("Training finished in %.2f seconds. Best weights saved to %s", duration, ckpt_path)
        return history, duration

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the positive-class probability for each row of X as a 1-D array."""
        model = self._require_model("Model not built or loaded.")
        return model.predict(X).flatten()

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return hard 0/1 labels using a 0.5 probability threshold."""
        return (self.predict_proba(X) > 0.5).astype(int)

    def save(self, path: str):
        """Save the Keras model to path. Raises RuntimeError if there is no model."""
        model = self._require_model("No model to save.")
        model.save(path)
        logger.info("Saved Keras model to %s", path)

    def load(self, path: str):
        """Load a Keras model from path into this wrapper."""
        # load_model must be provided with custom_objects mapping for Attention
        self.model = load_model(path, custom_objects={'Attention': Attention})
        logger.info("Loaded Keras model from %s", path)


Cell 9 — Runner: orchestrate the full experiment (training and evaluation in memory; saving happens in Cell 10).

In [None]:
# ============================
# Cell 9 — Runner (Run Only)
# ============================
# Purpose:
# - Execute all training, preprocessing, evaluation.
# - Keep everything in memory (results, models, vectors).
# - Saving/logging is moved to Cell 10.

import hashlib
import preprocessor as p
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report
import time, joblib, pickle, os, logging, warnings, timeit
# Quieten the notebook output: hide Python UserWarning / FutureWarning messages.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Reduce TensorFlow log noise, both from its native (C++) layer via the
# environment variable and from its Python logger.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
logging.getLogger("tensorflow").setLevel(logging.ERROR)

class ExperimentRunner:
    """High-level orchestrator for experiments (training only)."""
    def __init__(self, cfg: Config):
        """Wire up the data loader, preprocessor, feature extractor, embedding manager and model managers from cfg."""
        self.cfg = cfg
        self.data_loader = DataLoaderLocal(cfg)
        self.preproc = PreprocessorWithCache(
            contractions=self.data_loader.load_contractions(),
            cache_dir=cfg.cache_dir
        )
        self.feat = FeatureExtractor(
            max_num_words=cfg.max_num_words,
            max_seq_len=cfg.max_seq_length
        )
        self.emb_mgr = EmbeddingManagerLocal(
            glove_path=GLOVE_PATH,
            max_num_words=cfg.max_num_words,
            emb_dim=cfg.embedding_dim
        )
        self.classical_mgr = ClassicalPipelineManager(
            cv_folds=cfg.classical_cv_folds,
            n_jobs=cfg.grid_search_n_jobs
        )
        self.deep_model_wrapper = BiLSTMAttn(
            max_seq_len=cfg.max_seq_length,
            max_words=cfg.max_num_words,
            emb_dim=cfg.embedding_dim
        )

    def run(self, sentiment_sample=None, depressive_rows=None):
        """
        Execute full pipeline and return results in memory only.

        sentiment_sample / depressive_rows: row counts forwarded to the data
        loader; a falsy value falls back to the corresponding cfg setting.

        Returns a dict with keys 'cv' (CV summary DataFrame), 'classical_eval',
        'deep_eval', 'feature_importance', 'history' and 'durations'.
        The shuffled, cleaned DataFrame is kept on self.df.
        """
        sentiment_sample = sentiment_sample or self.cfg.sentiment_sample
        depressive_rows = depressive_rows or self.cfg.depressive_rows

        # 1. Load data
        df_pos = self.data_loader.load_sentiment140(sample_n=sentiment_sample)
        df_neg = self.data_loader.load_depressive(nrows=depressive_rows)
        df = pd.concat([df_pos, df_neg], ignore_index=True).sample(
            frac=1, random_state=self.cfg.seed
        ).reset_index(drop=True)

        # 2. Clean text with cache (cache key derived from the raw text content)
        hash_key = hashlib.sha1("".join(df['text'].fillna('').astype(str)).encode('utf8')).hexdigest()[:8]
        df['clean'] = self.preproc.clean_texts(df['text'].fillna('').tolist(), cache_key=f"cleaned_{hash_key}")

        # 3. Features
        clean_texts = df['clean'].tolist()
        # Result is not used below; the call is kept in case build_tfidf stores
        # fitted state on the feature extractor (not visible from this cell).
        X_tfidf = self.feat.build_tfidf(clean_texts)
        X_seq = self.feat.build_tokenizer_and_sequences(clean_texts)
        y = df['label'].values

        # One stratified 70/30 split shared by the classical and deep models, so
        # both are tested on exactly the same held-out rows.
        texts_tr, texts_te, Xtr, Xte, ytr, yte = train_test_split(
            clean_texts, X_seq, y, test_size=0.3, stratify=y, random_state=self.cfg.seed
        )

        # 4. Classical
        # Tune and fit on the training split only. Fitting on the full dataset
        # and then scoring on a subset of it would leak the test rows into
        # training and inflate the hold-out metrics.
        t0 = timeit.default_timer()
        cv_df = self.classical_mgr.grid_search_and_cv(texts_tr, ytr)
        classical_seconds = timeit.default_timer() - t0
        best_classical = cv_df.iloc[0]['model']
        classical_eval = self.classical_mgr.evaluate(best_classical, texts_te, yte)

        # 5. Embedding matrix
        embedding_matrix = self.emb_mgr.build_embedding_matrix(self.feat.tokenizer.word_index)

        # 6. Deep model
        self.deep_model_wrapper.build(embedding_matrix=embedding_matrix, trainable_emb=False)
        classes = np.unique(ytr)
        cw_vals = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=ytr)
        class_weights = {int(c): float(w) for c, w in zip(classes, cw_vals)}
        # NOTE(review): the held-out split doubles as the validation set for
        # early stopping, so the deep metrics below are mildly optimistic.
        history, deep_seconds = self.deep_model_wrapper.train(
            Xtr, ytr, Xte, yte,
            epochs=self.cfg.deep_epochs,
            batch_size=self.cfg.deep_batch_size,
            class_weights=class_weights
        )

        # Deep eval
        preds = (self.deep_model_wrapper.predict_proba(Xte) > 0.5).astype(int)
        deep_eval = {
            "accuracy": float(accuracy_score(yte, preds)),
            "f1": float(f1_score(yte, preds)),
            "report": classification_report(yte, preds, digits=4, output_dict=True)
        }

        # Feature importance (optional)
        feature_importance = None
        if "logistic" in self.classical_mgr.trained:
            try:
                feature_importance = self.classical_mgr.feature_importance_logistic(
                    self.classical_mgr.trained["logistic"], top_k=30
                )
            except Exception as e:
                logger.warning("Feature importance error: %s", e)

        # Store everything in memory only
        self.df = df
        results = {
            "cv": cv_df,
            "classical_eval": classical_eval,
            "deep_eval": deep_eval,
            "feature_importance": feature_importance,
            "history": history,
            "durations": {"classical_seconds": classical_seconds, "deep_seconds": deep_seconds},
        }
        logger.info("✅ Experiment complete (no saving yet).")
        return results


# --- Launch the pipeline: trains and evaluates, keeps everything in memory ---
runner = ExperimentRunner(config)
results = runner.run(
    sentiment_sample=config.sentiment_sample,
    depressive_rows=config.depressive_rows,
)
print("✅ Training complete. Proceed to Cell 10 to save artifacts & logs.")


Cell 10 — Save & Log Results

In [None]:
# ============================
# Cell 10 — Save & Log Results
# ============================
# Purpose:
# - Save all experiment artifacts (models, metrics, logs) after training.
# - Avoid re-running heavy computations.
# - Store files under config.save_dir (a local folder, or Google Drive if save_dir points to a mounted Drive path).

import os, json, joblib, pickle
from datetime import datetime

def save_experiment_artifacts(results, runner, cfg, save_to_drive=True):
    """
    Save trained models, results, and logs under cfg.save_dir.

    results: dict returned by the runner (must contain 'deep_eval',
        'classical_eval' and 'durations' for the summary).
    runner: object exposing classical_mgr.trained, deep_model_wrapper.model
        and feat.tokenizer.
    cfg: configuration object providing save_dir.
    save_to_drive: accepted for interface compatibility; currently unused,
        files always go to cfg.save_dir.

    Returns a dict mapping artifact name -> file path.
    Raises RuntimeError if the deep model has not been built (checked before
    anything is written).
    """
    deep_model = runner.deep_model_wrapper.model
    if deep_model is None:
        raise RuntimeError("Deep model is not built or trained; nothing to save.")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_dir = cfg.save_dir
    os.makedirs(save_dir, exist_ok=True)

    # --- Save Classical Models ---
    artifacts = {}
    for model_name, model_obj in runner.classical_mgr.trained.items():
        path = os.path.join(save_dir, f"{model_name}_model_{timestamp}.joblib")
        joblib.dump(model_obj, path)
        artifacts[f"{model_name}_model"] = path

    # --- Save Deep Model ---
    deep_model_path = os.path.join(save_dir, f"deep_model_{timestamp}.h5")
    deep_model.save(deep_model_path)
    artifacts["deep_model"] = deep_model_path

    # --- Save Feature Extractor & Tokenizer ---
    tok_path = os.path.join(save_dir, f"tokenizer_{timestamp}.pkl")
    with open(tok_path, "wb") as f:
        pickle.dump(runner.feat.tokenizer, f)
    artifacts["tokenizer"] = tok_path

    # --- Save Metrics and History ---
    results_path = os.path.join(save_dir, f"results_{timestamp}.json")

    def to_jsonable(o):
        """json `default` hook: called only for objects json cannot encode natively."""
        # Training history object: keep the per-epoch metrics dict.
        history = getattr(o, "history", None)
        if isinstance(history, dict):
            return history
        # pandas DataFrame -> list of row dicts; Series -> plain dict.
        if hasattr(o, "to_dict"):
            try:
                return o.to_dict(orient="records")
            except TypeError:
                return o.to_dict()
        # numpy arrays and scalars -> native Python lists / numbers, instead
        # of their (possibly truncated) string representation.
        if hasattr(o, "tolist"):
            return o.tolist()
        # Anything else (e.g. a fitted pipeline) is recorded by its repr.
        return str(o)

    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, default=to_jsonable, indent=2)
    artifacts["results_json"] = results_path

    # --- Save Experiment Summary ---
    summary = {
        "timestamp": timestamp,
        "deep_accuracy": results["deep_eval"]["accuracy"],
        "deep_f1": results["deep_eval"]["f1"],
        "best_classical_acc": results["classical_eval"]["accuracy"],
        "classical_time": results["durations"]["classical_seconds"],
        "deep_time": results["durations"]["deep_seconds"]
    }
    summary_path = os.path.join(save_dir, f"summary_{timestamp}.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.writelines(f"{k}: {v}\n" for k, v in summary.items())
    artifacts["summary_txt"] = summary_path

    print(f"✅ All artifacts saved in {save_dir}")
    return artifacts


# --- Persist everything the runner produced and keep the path map ---
artifacts = save_experiment_artifacts(
    results=results,
    runner=runner,
    cfg=config,
    save_to_drive=True,
)
print("✅ Artifacts successfully saved and ready in Google Drive or local folder.")


Cell 11 — Visualization, Inference API, latency, requirements.txt

In [None]:
# ============================
# Cell 11 — Visualization, Inference API, latency & requirements.txt
# ============================
# Purpose:
# - Provide plotting utilities for results (confusion matrices, ROC/PR curves, training plots)
# - Provide an inference service for serving predictions on new texts (both classical & deep)
# - Provide simple requirements.txt writing for reproducibility

# --- Visualization helpers ---
def plot_confusion_normalized(y_true, y_pred, title="Confusion Matrix (normalized)"):
    """
    Plot a row-normalized 2x2 confusion matrix as a heatmap.

    y_true / y_pred: binary labels (0 = 'Non', 1 = 'Dep').
    title: figure title.

    Each row is divided by the number of true samples of that class; a class
    with no true samples is shown as a row of zeros.
    """
    # Fix the label set so the matrix is always 2x2 and matches the tick
    # labels, even when one class is absent from the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    row_totals = cm.sum(axis=1, keepdims=True)
    # Safe division: rows with zero support stay at 0 instead of becoming NaN.
    cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    plt.figure(figsize=(5,4))
    sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap='Blues', xticklabels=['Non','Dep'], yticklabels=['Non','Dep'])
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

def plot_roc_pr(y_true, probs, label="Model"):
    """
    Show two figures for a binary classifier: its ROC curve (with AUC in the
    legend and a dashed chance diagonal) followed by its precision-recall curve.

    y_true: true binary labels.
    probs: predicted scores/probabilities for the positive class.
    label: model name used as the prefix of both figure titles.
    """
    fig_size = (6, 4)

    # Figure 1: ROC
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, probs)
    area = auc(false_pos_rate, true_pos_rate)
    plt.figure(figsize=fig_size)
    plt.plot(false_pos_rate, true_pos_rate, label=f"AUC={area:.3f}")
    plt.plot([0,1],[0,1],'k--')
    plt.title(f"{label} ROC")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()

    # Figure 2: Precision-Recall
    precision, recall, _ = precision_recall_curve(y_true, probs)
    plt.figure(figsize=fig_size)
    plt.plot(recall, precision)
    plt.title(f"{label} Precision-Recall")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.show()

# --- Basic reporting using results from runner ---
# Fitted pipeline of the best classical model (also used for inference below)
best_classical = results['classical_eval']['pipeline']
# deep predictions already computed during runner.run -> deep_eval
deep_eval = results['deep_eval']
deep_report = deep_eval['report']

print("\n=== Final Summary ===")
print("Classical (best) evaluation:")
print(" - Accuracy:", results['classical_eval']['accuracy'])
print(" - F1:", results['classical_eval']['f1'])
print("\nDeep model evaluation:")
print(" - Accuracy:", deep_eval['accuracy'])
print(" - F1:", deep_eval['f1'])

# The deep-model plots need held-out labels plus probabilities / predictions.
# Look them up explicitly (deep_eval first for probabilities, then notebook
# globals) so a missing array results in a log line rather than a NameError.
_nb_globals = globals()
_deep_y_true = _nb_globals.get('yte_seq')
_deep_probs = deep_eval.get('probs')
if _deep_probs is None:
    _deep_probs = _nb_globals.get('deep_probs')
_deep_preds = _nb_globals.get('deep_preds')

# ROC / PR curves for the deep model
if _deep_y_true is not None and _deep_probs is not None:
    try:
        plot_roc_pr(_deep_y_true, _deep_probs, label="Deep BiLSTM+Attention")
    except Exception as exc:
        # Plotting is best-effort; report the reason instead of hiding it.
        logger.warning("Deep ROC/PR plotting failed: %s", exc)
else:
    logger.info("Deep probability array not available for ROC plotting.")

# Confusion matrices
if _deep_y_true is not None and _deep_preds is not None:
    try:
        plot_confusion_normalized(_deep_y_true, _deep_preds, title="Deep BiLSTM+Attn Confusion (normalized)")
    except Exception as exc:
        logger.warning("Deep confusion matrix plotting failed: %s", exc)
else:
    logger.info("Deep confusion matrix plotting skipped (missing arrays).")

# If logistic feature importance exists, print small lists
if results.get('feature_importance'):
    print("\nTop positive logistic features (indicative of depressive):")
    print(results['feature_importance']['top_positive'][:20])
    print("\nTop negative logistic features (indicative of non-depressive):")
    print(results['feature_importance']['top_negative'][:20])

# --- Inference service for predictions on new texts ---
class InferenceService:
    """
    Lightweight inference wrapper combining preprocessor, tokenizer, classical pipeline, and deep Keras model.
    Example:
      inf = InferenceService(preprocessor, tokenizer, classical_pipeline, deep_model_wrapper)
      inf.predict(["some text"])
    """
    def __init__(self, preprocessor: PreprocessorWithCache, tokenizer: Tokenizer, classical_pipeline: Pipeline, deep_wrapper: BiLSTMAttn):
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.classical_pipeline = classical_pipeline
        self.deep_wrapper = deep_wrapper

    def prepare(self, texts: List[str]):
        """
        Clean the raw texts and convert them to padded id sequences.

        Returns (cleaned_texts, padded_sequences).
        """
        cleaned = [self.preprocessor.clean_text(t) for t in texts]
        seqs = self.tokenizer.texts_to_sequences(cleaned)
        # Pad to the length the wrapped deep model was built with; fall back to
        # the global config only if the wrapper does not expose it.
        maxlen = getattr(self.deep_wrapper, 'max_seq_len', None) or config.max_seq_length
        padded = pad_sequences(seqs, maxlen=maxlen)
        return cleaned, padded

    def predict(self, texts: List[str]):
        """
        Predict with both models.

        Returns {'results': [...], 'avg_latency_s': float} where each result has
        'text', 'cleaned', 'deep_prob', 'deep_label', 'classical_prob' (None when
        the classical pipeline has no predict_proba) and 'classical_label'.
        'avg_latency_s' is the deep model's prediction time divided by the
        number of texts. An empty input returns an empty result list.
        """
        texts = list(texts)
        if not texts:
            # Nothing to score; avoid calling the models on an empty batch.
            return {'results': [], 'avg_latency_s': 0.0}

        cleaned, padded = self.prepare(texts)
        # deep timings
        t0 = timeit.default_timer()
        deep_probs = self.deep_wrapper.predict_proba(padded)
        t1 = timeit.default_timer()
        avg_latency = (t1 - t0) / len(texts)
        deep_labels = (deep_probs > 0.5).astype(int).tolist()

        # classical predictions; probabilities only when the pipeline offers them
        classical_labels = self.classical_pipeline.predict(cleaned).tolist()
        classical_probs = None
        if hasattr(self.classical_pipeline, 'predict_proba'):
            classical_probs = self.classical_pipeline.predict_proba(cleaned)[:, 1]

        results_list = [
            {
                'text': txt,
                'cleaned': cleaned[i],
                'deep_prob': float(deep_probs[i]),
                'deep_label': int(deep_labels[i]),
                'classical_prob': float(classical_probs[i]) if classical_probs is not None else None,
                'classical_label': int(classical_labels[i]),
            }
            for i, txt in enumerate(texts)
        ]
        return {'results': results_list, 'avg_latency_s': avg_latency}

# Build inference service using in-memory artifacts
inference_service = InferenceService(runner.preproc, runner.feat.tokenizer, best_classical, runner.deep_model_wrapper)

# Demo inference (replace with your own sentences)
examples = [
    "I feel hopeless and tired of everything.",
    "I had a really wonderful day and I'm grateful."
]
inf_out = inference_service.predict(examples)
print("Example predictions (deep_label 1 indicates depressive):")
for r in inf_out['results']:
    print(r)

# --- Create a small requirements.txt useful for reproducing the environment ---
# matplotlib and seaborn are listed because the plotting helpers in this cell
# call plt.* and sns.heatmap.
reqs = [
    "numpy", "pandas", "scikit-learn", "tensorflow", "nltk", "tqdm",
    "tweet-preprocessor", "wordcloud", "joblib", "matplotlib", "seaborn"
]
# Make sure the target folder exists even if the save cell was not run first.
os.makedirs(config.save_dir, exist_ok=True)
req_path = os.path.join(config.save_dir, f"requirements_{now_str()}.txt")
with open(req_path, "w", encoding="utf-8") as fh:
    fh.write("\n".join(reqs) + "\n")
logger.info("Wrote minimal requirements to %s", req_path)

logger.info("All done. Experiment finished successfully.")

