In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Dwnlading the dataset

In [None]:
import kagglehub
import shutil
import os



drive.mount('/content/drive')

drive_base_path = "/content/drive/MyDrive/Depression_Paper_DL"

# Ensure the directory exists
os.makedirs(drive_base_path, exist_ok=True)

def download_and_move_to_drive(dataset_handle, folder_name):
    print(f"Downloading {dataset_handle}...")

    # Download using kagglehub (saves to local runtime cache first)
    cache_path = kagglehub.dataset_download(dataset_handle)

    # Define the final destination for this specific dataset
    destination_path = os.path.join(drive_base_path, folder_name)

    # Move the files from cache to Google Drive
    # dirs_exist_ok=True allows overwriting if you run this multiple times
    shutil.copytree(cache_path, destination_path, dirs_exist_ok=True)

    print(f"✅ Dataset moved to: {destination_path}")

# --- Dataset 1: Suicide Watch ---
download_and_move_to_drive("nikhileswarkomati/suicide-watch", "suicide_watch_data")

# --- Dataset 2: Sentiment140 ---
download_and_move_to_drive("kazanova/sentiment140", "sentiment140_data")

print("\nAll downloads complete. Check your Google Drive folder.")

KeyboardInterrupt: 

# Phase 1: Data Acquisition, Sampling & Relabeling

In [None]:

import pandas as pd
import numpy as np
import torch
import os
from google.colab import drive

# Define your file paths here (YOU MUST EDIT THESE PATHS)
# Upload your raw CSV files to a folder in Drive (e.g., 'Colab Notebooks/Depression_Project')
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
s140_path = os.path.join(base_path, 'training.1600000.processed.noemoticon.csv') # Std Sentiment140 filename
sw_path = os.path.join(base_path, 'Suicide_Detection.csv') # Common Suicide-Watch filename

# Output paths for the 30% sampled files
s140_output_path = os.path.join(base_path, 'sentiment140_sampled_30.csv')
sw_output_path = os.path.join(base_path, 'suicidewatch_sampled_30.csv')


In [None]:

# ==========================================
# 2. Process Sentiment140 Dataset
# ==========================================
print("\n--- Processing Sentiment140 Dataset ---")

try:
    # manually adding headers
    cols = ['target', 'ids', 'date', 'flag', 'user', 'text']
    df_s140 = pd.read_csv(s140_path, encoding='latin-1', names=cols)
    print(f"Original S140 shape: {df_s140.shape}")

    # Sampling (Reduce to 30%)
    df_s140_sample = df_s140.sample(frac=0.3, random_state=42)
    print(f"Sampled S140 shape: {df_s140_sample.shape}")

    label_mapping = {0: 1, 4: 0}
    df_s140_sample['target'] = df_s140_sample['target'].map(label_mapping)

    df_s140_sample = df_s140_sample.dropna(subset=['target'])

    # Save to Drive
    df_s140_sample.to_csv(s140_output_path, index=False)
    print(f" Saved sampled Sentiment140 to: {s140_output_path}")

except FileNotFoundError:
    print(f" Error: Sentiment140 file not found at {s140_path}")





--- Processing Sentiment140 Dataset ---
Original S140 shape: (1600000, 6)
Sampled S140 shape: (480000, 6)
 Saved sampled Sentiment140 to: /content/drive/MyDrive/Depression_Paper_DL/sentiment140_sampled_30.csv


In [None]:
# ==========================================
# 3. Process Suicide-Watch Dataset
# ==========================================
print("\n--- Processing Suicide-Watch Dataset ---")

try:
    # Load Suicide-Watch dataset
    df_sw = pd.read_csv(sw_path)
    print(f"Original Suicide-Watch shape: {df_sw.shape}")

    # Step: Sampling (Reduce to 30%)
    df_sw_sample = df_sw.sample(frac=0.3, random_state=42)
    print(f"Sampled Suicide-Watch shape: {df_sw_sample.shape}")

    if 'class' in df_sw_sample.columns:
        # Standardize labels to 0 and 1
        def map_sw_labels(label):
            label = str(label).lower()
            if 'suicide' in label or 'depression' in label:
                return 1 # Depressed/Suicidal
            elif 'teenager' in label or 'non-suicide' in label:
                return 0 # Non-Depressed
            return None

        df_sw_sample['target'] = df_sw_sample['class'].apply(map_sw_labels)

        # Drop rows where label mapping failed
        df_sw_sample = df_sw_sample.dropna(subset=['target'])

        # Save to Drive
        df_sw_sample.to_csv(sw_output_path, index=False)
        print(f" Saved sampled Suicide-Watch to: {sw_output_path}")
    else:
        print(" 'class' column not found. Check your CSV headers.")

except FileNotFoundError:
    print(f" Error: Suicide-Watch file not found at {sw_path}")

print("\nPhase 1 Complete. Datasets are ready for Preprocessing (Phase 2).")


--- Processing Suicide-Watch Dataset ---
Original Suicide-Watch shape: (232074, 3)
Sampled Suicide-Watch shape: (69622, 3)
 Saved sampled Suicide-Watch to: /content/drive/MyDrive/Depression_Paper_DL/suicidewatch_sampled_30.csv

Phase 1 Complete. Datasets are ready for Preprocessing (Phase 2).


#Phase 2: Data Preprocessing (Cleaning & Lemmatization)

In [None]:

import pandas as pd
import numpy as np
import re
import nltk
import pickle
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive

# Paths (Must match Phase 1 paths)
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
s140_input_path = os.path.join(base_path, 'sentiment140_sampled_30.csv')
sw_input_path = os.path.join(base_path, 'suicidewatch_sampled_30.csv')

# Output paths
s140_clean_path = os.path.join(base_path, 'sentiment140_cleaned.csv')
sw_clean_path = os.path.join(base_path, 'suicidewatch_cleaned.csv')
tfidf_path = os.path.join(base_path, 'tfidf_vectorizer.pkl')


In [None]:

# ==========================================
# 1. Setup NLTK Resources
# ==========================================
print("Downloading NLTK resources...")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') # Required if using nltk tokenizer, though we use split() for speed often
nltk.download('punkt_tab')

# Initialize Lemmatizer and Stopwords [cite: 233, 235]
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


Downloading NLTK resources...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:

# ==========================================
# 2. Define Preprocessing Function
# ==========================================
def preprocess_text(text):
    """
    Implements the cleaning pipeline described in the paper:
    1. Remove special characters/URLs/Emoticons [cite: 227-228]
    2. Lowercase [cite: 229]
    3. Remove Stop words [cite: 233]
    4. Lemmatization [cite: 235]
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove user mentions (e.g., @user)
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special chars & numbers (keep only letters/spaces)
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    tokens = text.split()
    filtered_tokens = [
        lemmatizer.lemmatize(word)  # Lemmatize
        for word in tokens
        if word not in stop_words   # Remove Stop words
    ]

    return " ".join(filtered_tokens)


In [None]:

# ==========================================
# 3. Apply to Datasets
# ==========================================
print("\n--- Preprocessing Sentiment140 ---")
try:
    df_s140 = pd.read_csv(s140_input_path)
    # Apply preprocessing (This takes time on large datasets)
    print("Cleaning text... (this may take a few minutes)")
    df_s140['clean_text'] = df_s140['text'].apply(preprocess_text)

    # Drop rows that became empty after cleaning
    df_s140 = df_s140[df_s140['clean_text'].str.strip().astype(bool)]

    # Save cleaned text version (Needed for Transformers later)
    df_s140.to_csv(s140_clean_path, index=False)
    print(f"✅ Saved clean Sentiment140 to: {s140_clean_path}")

except FileNotFoundError:
    print("❌ Sentiment140 input file not found. Run Phase 1 first.")

print("\n--- Preprocessing Suicide-Watch ---")
try:
    df_sw = pd.read_csv(sw_input_path)
    print("Cleaning text...")
    df_sw['clean_text'] = df_sw['text'].apply(preprocess_text)

    # Drop rows that became empty after cleaning
    df_sw = df_sw[df_sw['clean_text'].str.strip().astype(bool)]

    # Save cleaned text version
    df_sw.to_csv(sw_clean_path, index=False)
    print(f"✅ Saved clean Suicide-Watch to: {sw_clean_path}")

except FileNotFoundError:
    print("❌ Suicide-Watch input file not found. Run Phase 1 first.")



--- Preprocessing Sentiment140 ---
Cleaning text... (this may take a few minutes)
✅ Saved clean Sentiment140 to: /content/drive/MyDrive/Depression_Paper_DL/sentiment140_cleaned.csv

--- Preprocessing Suicide-Watch ---
Cleaning text...
✅ Saved clean Suicide-Watch to: /content/drive/MyDrive/Depression_Paper_DL/suicidewatch_cleaned.csv


In [None]:
# Feature enginering
import pickle
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive

# We need the text to learn the vocabulary (the "fit" step)
if 'df_s140' not in locals():
    print("Loading cleaned Sentiment140 dataset...")
    try:
        df_s140 = pd.read_csv(s140_clean_path)
        # Ensure no NaNs exist after reloading
        df_s140 = df_s140.dropna(subset=['clean_text'])
    except FileNotFoundError:
        print(f"❌ Error: File not found at {s140_clean_path}. Please run the Preprocessing step first.")
        df_s140 = None

if df_s140 is not None:
    # 3. Initialize TF-IDF Vectorizer
    print("\n--- Initializing & Fitting TF-IDF Vectorizer ---")
    # max_features=5000 is selected to keep the vector size manageable for Colab RAM
    # while retaining the most frequent/important words.
    tfidf = TfidfVectorizer(max_features=5000)

    # 4. Fit the Vectorizer
    # This learns the vocabulary (IDF) from the dataset.
    # We fit on Sentiment140 as it is the larger, more general dataset.
    print(f"Fitting on {len(df_s140)} tweets... this might take a moment.")
    tfidf.fit(df_s140['clean_text'])

    # 5. Save the Vectorizer
    # We pickle the object so we can reload it in Phase 3 without refitting.
    with open(tfidf_path, 'wb') as f:
        pickle.dump(tfidf, f)

    print(f" TF-IDF Vectorizer fitted and saved to: {tfidf_path}")
    print(f"   - Vocabulary size: {len(tfidf.vocabulary_)} words")

else:
    print(" Skipping TF-IDF fitting due to missing data.")


--- Initializing & Fitting TF-IDF Vectorizer ---
Fitting on 477587 tweets... this might take a moment.
 TF-IDF Vectorizer fitted and saved to: /content/drive/MyDrive/Depression_Paper_DL/tfidf_vectorizer.pkl
   - Vocabulary size: 5000 words


#Phase 3: Training Traditional ML Models (10-Fold CV)

In [None]:

import pandas as pd
import numpy as np
import os

# Scikit-Learn Imports (CPU-based workflow)
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer # <--- THIS WAS MISSING

# RAPIDS cuML Imports (GPU-based models)
from cuml.linear_model import LogisticRegression
from cuml.naive_bayes import BernoulliNB
from cuml.ensemble import RandomForestClassifier

# Paths
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
s140_path = os.path.join(base_path, 'sentiment140_cleaned.csv')
sw_path = os.path.join(base_path, 'suicidewatch_cleaned.csv')

# Helper function to convert Sparse Matrix -> Dense Array
# (Required for GPU Random Forest)
def to_dense(x):
    return x.toarray()

# Shared Loading Function
def load_data(path):
    if not os.path.exists(path):
        print(f"❌ Error: File not found at {path}")
        return None, None
    df = pd.read_csv(path).dropna(subset=['clean_text', 'target'])
    X = df['clean_text']
    y = df['target'].astype('float32') # cuML expects float32
    return X, y

print("✅ Setup Complete. Ready to train individual models.")

✅ Setup Complete. Ready to train individual models.


In [None]:
def build_pipeline(use_sklearn=True):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from cuml.linear_model import LogisticRegression as cuLogReg

    vectorizer = TfidfVectorizer(
        max_features=50000,
        ngram_range=(1,2),
        stop_words='english'
    )

    if use_sklearn:
        clf = LogisticRegression(
            C=1.0,               # SAFE VALUE
            max_iter=2000,
            solver="lbfgs",
            n_jobs=-1
        )
    else:
        clf = cuLogReg(
            C=1.0,
            max_iter=500,
            tol=1e-4
        )

    from sklearn.pipeline import Pipeline
    return Pipeline([("tfidf", vectorizer), ("clf", clf)])

def run_logistic_regression(dataset_path, dataset_name, force_sklearn=False):
    print(f"\n🔹 Training Logistic Regression on {dataset_name}...")

    df = pd.read_csv(dataset_path)

    # --- FIX: auto-detect label column ---
    possible_label_cols = ["label", "target", "sentiment", "class", "Label"]
    label_col = None

    for col in possible_label_cols:
        if col in df.columns:
            label_col = col
            break

    if label_col is None:
        raise ValueError(f"❌ No label column found in: {df.columns.tolist()}")

    # --- FIX: use correct text column name ---
    possible_text_cols = ["text", "tweet", "content", "message", "clean_text"]
    text_col = None

    for col in possible_text_cols:
        if col in df.columns:
            text_col = col
            break

    if text_col is None:
        raise ValueError(f"❌ No text column found in: {df.columns.tolist()}")

    X = df[text_col]
    y = df[label_col]

    # Clean Sentiment140 labels (0 and 4 → 0 and 1)
    # This part is likely redundant if loading 'cleaned.csv' files but kept for robustness
    if dataset_name == "Sentiment140":
        # Ensure 'target' column is present before trying to modify it
        if 'target' in df.columns:
             y = y.replace({4: 1})
        else:
             print("Warning: 'target' column not found for Sentiment140, skipping label remapping.")

    # Convert y to float32 early for cuML compatibility if used, and for consistency
    y = y.astype('float32')

    # --- NEW: Check for single class before cross-validation ---
    if len(np.unique(y)) < 2:
        print(f"❌ Error: Only one class found in {dataset_name} for target column '{label_col}'. Cannot perform binary classification.")
        print(f"   Unique classes found: {np.unique(y)}")
        return # Exit the function if only one class is found

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    scoring = ['accuracy', 'precision', 'recall', 'f1']

    # Use sklearn for Suicide Watch (or if force_sklearn is True)
    use_sklearn = force_sklearn or (dataset_name == "Suicide-Watch")

    pipeline = build_pipeline(use_sklearn=use_sklearn)

    results = cross_validate(
        pipeline, X, y,
        cv=kfold, scoring=scoring,
        error_score="raise"
    )

    print(f"   Accuracy:  {results['test_accuracy'].mean():.4f}")
    print(f"   Precision: {results['test_precision'].mean():.4f}")
    print(f"   Recall:    {results['test_recall'].mean():.4f}")
    print(f"   F1-Score:  {results['test_f1'].mean():.4f}")

In [None]:

run_logistic_regression(s140_path, "Sentiment140")



🔹 Training Logistic Regression on Sentiment140...
   Accuracy:  0.7813
   Precision: 0.7927
   Recall:    0.7604
   F1-Score:  0.7762


In [None]:
run_logistic_regression(sw_path, "Suicide-Watch")


🔹 Training Logistic Regression on Suicide-Watch...
❌ Error: Only one class found in Suicide-Watch for target column 'target'. Cannot perform binary classification.
   Unique classes found: [1.]


In [None]:
# @title  Train Naive Bayes
def run_naive_bayes(dataset_path, dataset_name):
    print(f"\n🔹 Training Bernoulli Naive Bayes on {dataset_name}...")

    X, y = load_data(dataset_path)
    if X is None: return

    pipeline = make_pipeline(
        TfidfVectorizer(max_features=5000),
        BernoulliNB()
    )

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    try:
        results = cross_validate(pipeline, X, y, cv=kfold, scoring=scoring)
        print(f"   Accuracy:  {results['test_accuracy'].mean():.4f}")
        print(f"   Precision: {results['test_precision'].mean():.4f}")
        print(f"   Recall:    {results['test_recall'].mean():.4f}")
        print(f"   F1-Score:  {results['test_f1'].mean():.4f}")
    except Exception as e:
        print(f"⚠️ Error: {e}")

# Run on both datasets
run_naive_bayes(s140_path, "Sentiment140")
run_naive_bayes(sw_path, "Suicide-Watch")


🔹 Training Bernoulli Naive Bayes on Sentiment140...
   Accuracy:  0.7639
   Precision: 0.7724
   Recall:    0.7468
   F1-Score:  0.7594

🔹 Training Bernoulli Naive Bayes on Suicide-Watch...
   Accuracy:  1.0000
   Precision: 1.0000
   Recall:    1.0000
   F1-Score:  1.0000


In [None]:
# @title Cell 4: Train Random Forest
def run_random_forest(dataset_path, dataset_name):
    print(f"\n🔹 Training Random Forest on {dataset_name}...")
    print("   (Note: Reducing features to 1000 and densifying data for GPU compatibility)")

    X, y = load_data(dataset_path)
    if X is None: return

    pipeline = make_pipeline(
        TfidfVectorizer(max_features=1000),
        FunctionTransformer(to_dense, accept_sparse=True),
        RandomForestClassifier(n_estimators=100)
    )

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    try:
        results = cross_validate(pipeline, X, y, cv=kfold, scoring=scoring)
        print(f"   Accuracy:  {results['test_accuracy'].mean():.4f}")
        print(f"   Precision: {results['test_precision'].mean():.4f}")
        print(f"   Recall:    {results['test_recall'].mean():.4f}")
        print(f"   F1-Score:  {results['test_f1'].mean():.4f}")
    except Exception as e:
        print(f"⚠️ Error: {e}")

# Run on both datasets
run_random_forest(s140_path, "Sentiment140")
run_random_forest(sw_path, "Suicide-Watch")


🔹 Training Random Forest on Sentiment140...
   (Note: Reducing features to 1000 and densifying data for GPU compatibility)
   Accuracy:  0.6916
   Precision: 0.7762
   Recall:    0.5364
   F1-Score:  0.6343

🔹 Training Random Forest on Suicide-Watch...
   (Note: Reducing features to 1000 and densifying data for GPU compatibility)
   Accuracy:  1.0000
   Precision: 1.0000
   Recall:    1.0000
   F1-Score:  1.0000


#Phase 4: Training Transformer Models

In [None]:
!pip install -q transformers accelerate evaluate datasets


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Disable Weights & Biases logging
%env WANDB_DISABLED=true
import torch
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate


env: WANDB_DISABLED=true


In [None]:

# Paths
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
s140_path = os.path.join(base_path, 'sentiment140_cleaned.csv')
sw_path = os.path.join(base_path, 'suicidewatch_cleaned.csv')


In [None]:
# ==========================================
# 3. Training Function
# ==========================================
def train_transformer(dataset_path, dataset_name, model_checkpoint):
    print(f"\n{'='*10} Fine-Tuning {model_checkpoint} on {dataset_name} {'='*10}")

    # 1. Load Data
    if not os.path.exists(dataset_path):
        print(f"❌ Error: File not found at {dataset_path}")
        return

    df = pd.read_csv(dataset_path).dropna(subset=['clean_text', 'target'])

    # METHODOLOGY CHECK: The paper limits Transformers to 50,000 entries per fold
    # We sample 50k rows to match the paper's constraint and ensure Colab stability.
    if len(df) > 50000:
        print(f"ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...")
        df = df.sample(n=50000, random_state=42)

    # Ensure targets are integers for CrossEntropyLoss
    df['label'] = df['target'].astype(int)
    # Ensure text is strictly string format to avoid tokenization errors
    df['clean_text'] = df['clean_text'].astype(str)
    df = df[['clean_text', 'label']] # Keep only relevant columns

    # [cite_start]2. Split Data (80% Train, 20% Eval) [cite: 290]
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

    # Convert to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    # 3. Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def preprocess_function(examples):
        return tokenizer(examples["clean_text"], truncation=True, padding=True, max_length=128)

    print("🔹 Tokenizing data...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 4. Initialize Model
    # num_labels=2 because we have binary classes (Depressed vs Non-Depressed)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    # [cite_start]5. Define Metrics (Accuracy, F1, Precision, Recall) [cite: 301]
    accuracy_metric = evaluate.load("accuracy")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        acc = accuracy_metric.compute(predictions=predictions, references=labels)
        prec = precision_metric.compute(predictions=predictions, references=labels)
        rec = recall_metric.compute(predictions=predictions, references=labels)
        f1 = f1_metric.compute(predictions=predictions, references=labels)

        return {
            "accuracy": acc["accuracy"],
            "precision": prec["precision"],
            "recall": rec["recall"],
            "f1": f1["f1"],
        }

    # 6. Training Arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{dataset_name}_{model_checkpoint.replace('/', '-')}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        fp16=torch.cuda.is_available(),
    )

    # 7. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 8. Train & Evaluate
    print("🔹 Starting Training...")
    trainer.train()

    print("🔹 Final Evaluation:")
    metrics = trainer.evaluate()
    print(metrics)

#training with different transformers

distilbert-base-uncased

In [None]:

MODEL_CHECKPOINT = "distilbert-base-uncased"

# ==========================================
# 4. Execution
# ==========================================
# Run DistilBERT on Suicide-Watch (Simple Patterns)
train_transformer(sw_path, "Suicide-Watch", MODEL_CHECKPOINT)



ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...
🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,1e-06,1.0,1.0,1.0,1.0
2,0.0,0.0,1.0,1.0,1.0,1.0


🔹 Final Evaluation:


{'eval_loss': 2.4358033101634646e-07, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 11.3357, 'eval_samples_per_second': 882.168, 'eval_steps_per_second': 55.136, 'epoch': 2.0}


In [None]:
train_transformer(s140_path, "Sentiment140", MODEL_CHECKPOINT)


ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...
🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4725,0.489963,0.7746,0.812758,0.711245,0.758621
2,0.3935,0.476619,0.7841,0.782609,0.784337,0.783472


🔹 Final Evaluation:


{'eval_loss': 0.47661879658699036, 'eval_accuracy': 0.7841, 'eval_precision': 0.782608695652174, 'eval_recall': 0.7843373493975904, 'eval_f1': 0.7834720690001002, 'eval_runtime': 6.8199, 'eval_samples_per_second': 1466.296, 'eval_steps_per_second': 91.644, 'epoch': 2.0}


roberta-base

In [None]:

MODEL_CHECKPOINT = "roberta-base"

# ==========================================
# 4. Execution
# ==========================================
# Run DistilBERT on Suicide-Watch (Simple Patterns)
train_transformer(sw_path, "Suicide-Watch", MODEL_CHECKPOINT)



ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,2e-06,1.0,1.0,1.0,1.0
2,0.0,1e-06,1.0,1.0,1.0,1.0


🔹 Final Evaluation:


{'eval_loss': 1.0733604085544357e-06, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 19.809, 'eval_samples_per_second': 504.821, 'eval_steps_per_second': 31.551, 'epoch': 2.0}


In [None]:
train_transformer(s140_path, "Sentiment140", MODEL_CHECKPOINT)


ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...
🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4695,0.507239,0.7793,0.834661,0.694378,0.758084
2,0.4064,0.465947,0.7969,0.792734,0.801807,0.797245


🔹 Final Evaluation:


{'eval_loss': 0.4659474194049835, 'eval_accuracy': 0.7969, 'eval_precision': 0.7927337701012508, 'eval_recall': 0.8018072289156627, 'eval_f1': 0.7972446840371369, 'eval_runtime': 11.0862, 'eval_samples_per_second': 902.026, 'eval_steps_per_second': 56.377, 'epoch': 2.0}


In [None]:
import torch
import gc

# 1. Delete variables that might be holding GPU memory
# We wrap in try/except in case they are not defined yet
try:
    del model
    del trainer
    del optimizer
except NameError:
    pass

# 2. Force Python's Garbage Collector to release unreferenced memory
gc.collect()

# 3. Clear PyTorch's internal cache
torch.cuda.empty_cache()

# 4. Verify memory status
print(f"Current GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Current GPU Memory Reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Current GPU Memory Allocated: 0.60 GB
Current GPU Memory Reserved:  0.66 GB


squeezebert/squeezebert-uncased

In [None]:
MODEL_CHECKPOINT = "squeezebert/squeezebert-uncased"

# ==========================================
# 4. Execution
# ==========================================
# Run DistilBERT on Suicide-Watch (Simple Patterns)
train_transformer(sw_path, "Suicide-Watch", MODEL_CHECKPOINT)



ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...


config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForSequenceClassification were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/103M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,2.1e-05,1.0,1.0,1.0,1.0
2,0.0,8e-06,1.0,1.0,1.0,1.0


🔹 Final Evaluation:


{'eval_loss': 8.17704221844906e-06, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 294.549, 'eval_samples_per_second': 33.95, 'eval_steps_per_second': 2.122, 'epoch': 2.0}


In [None]:
MODEL_CHECKPOINT = "squeezebert/squeezebert-uncased"
train_transformer(s140_path, "Sentiment140", MODEL_CHECKPOINT)


ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForSequenceClassification were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔹 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4945,0.511261,0.7539,0.803567,0.669478,0.73042
2,0.4557,0.487372,0.7731,0.764076,0.78755,0.775635


🔹 Final Evaluation:


{'eval_loss': 0.4873715937137604, 'eval_accuracy': 0.7731, 'eval_precision': 0.7640755893239821, 'eval_recall': 0.7875502008032128, 'eval_f1': 0.7756353208741225, 'eval_runtime': 18.0894, 'eval_samples_per_second': 552.81, 'eval_steps_per_second': 34.551, 'epoch': 2.0}


microsoft/deberta-base

In [None]:
# ==========================================
# 3. Training Function (FP16 Disabled)
# ==========================================
def train_transformer_microsoft(dataset_path, dataset_name, model_checkpoint):
    print(f"\n{'='*10} Fine-Tuning {model_checkpoint} on {dataset_name} {'='*10}")

    # 1. Load Data
    if not os.path.exists(dataset_path):
        print(f"❌ Error: File not found at {dataset_path}")
        return

    df = pd.read_csv(dataset_path).dropna(subset=['clean_text', 'target'])

    # Paper Methodology: 50,000 entries per fold
    if len(df) > 50000:
        print(f"ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...")
        df = df.sample(n=50000, random_state=42)

    df['label'] = df['target'].astype(int)
    df['clean_text'] = df['clean_text'].astype(str)
    df = df[['clean_text', 'label']]

    # [cite_start]2. Split Data (80% Train, 20% Eval) [cite: 290]
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    # 3. Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def preprocess_function(examples):
        return tokenizer(examples["clean_text"], truncation=True, padding=True, max_length=128)

    print("🔹 Tokenizing data...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 4. Initialize Model
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    # 5. Define Metrics
    accuracy_metric = evaluate.load("accuracy")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        acc = accuracy_metric.compute(predictions=predictions, references=labels)
        prec = precision_metric.compute(predictions=predictions, references=labels)
        rec = recall_metric.compute(predictions=predictions, references=labels)
        f1 = f1_metric.compute(predictions=predictions, references=labels)

        return {
            "accuracy": acc["accuracy"],
            "precision": prec["precision"],
            "recall": rec["recall"],
            "f1": f1["f1"],
        }

    # 6. Training Arguments
    # ⚠️ KEY CHANGE: fp16=False to prevent DeBERTa overflow error
    training_args = TrainingArguments(
        output_dir=f"./results_{dataset_name}_{model_checkpoint.replace('/', '-')}",
        learning_rate=2e-5,
        per_device_train_batch_size=8,   # Reduced batch size to 8 to compensate for higher memory usage of FP32
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        fp16=False,                     # <--- DISABLED FP16 HERE
    )

    # 7. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 8. Train & Evaluate
    print("🔹 Starting Training...")
    trainer.train()

    print("🔹 Final Evaluation:")
    metrics = trainer.evaluate()
    print(metrics)

    # Cleanup to save RAM
    del model
    del trainer
    torch.cuda.empty_cache()
    gc.collect()

In [None]:

MODEL_CHECKPOINT = "microsoft/deberta-base"

# ==========================================
# 4. Execution
# ==========================================
# Run DistilBERT on Suicide-Watch (Simple Patterns)
train_transformer_microsoft(sw_path, "Suicide-Watch", MODEL_CHECKPOINT)



ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...
🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


🔹 Starting Training...


RuntimeError: value cannot be converted to type at::Half without overflow

In [None]:
train_transformer_microsoft(s140_path, "Sentiment140", MODEL_CHECKPOINT)


ℹ️ Sampling 50,000 rows (Paper Methodology Constraint)...
🔹 Tokenizing data...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


🔹 Starting Training...


RuntimeError: value cannot be converted to type at::Half without overflow

In [None]:
import torch
import gc

# Delete potential leftovers
try:
    del model
    del trainer
except NameError:
    pass

gc.collect()
torch.cuda.empty_cache()
print("✅ GPU Memory Cleared")

✅ GPU Memory Cleared


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

model_checkpoint = "microsoft/deberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

print("CUDA available:", torch.cuda.is_available())
# Print first 10 parameter dtypes
dtypes = list({p.dtype for p in list(model.parameters())[:10]})
print("Some parameter dtypes (sample):", dtypes)

# Do a tiny dummy forward (CPU) to check dtype of computed tensors
model.eval()
with torch.no_grad():
    ids = torch.tensor([[0,1,2,3,4,5]])
    mask = torch.ones_like(ids)
    out = model(input_ids=ids, attention_mask=mask)
    # find dtype of an internal output if present
    if hasattr(out, "logits"):
        print("logits dtype:", out.logits.dtype)
    else:
        print("output type:", type(out))


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CUDA available: True
Some parameter dtypes (sample): [torch.float32]
logits dtype: torch.float32


In [None]:
# @title  Training DeBERTa V3 (Stable)
# 1. Install Dependencies (V3 requires sentencepiece)
!pip install -q transformers accelerate evaluate datasets sentencepiece

import os
import torch
import gc
import numpy as np
import pandas as pd
from transformers import (
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate

# 2. Setup Paths & Config
# Update this path if needed
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
csv_path = os.path.join(base_path, 'suicidewatch_cleaned.csv')

# SWITCHING TO V3 FOR STABILITY
MODEL_CHECKPOINT = "microsoft/deberta-v3-base"

# 3. Load & Clean Data
if not os.path.exists(csv_path):
    print(f"⚠️ File not found at {csv_path}. Checking local directory...")
    if os.path.exists("suicidewatch_cleaned.csv"):
        csv_path = "suicidewatch_cleaned.csv"
    else:
        # Fallback to creating dummy data if you are just testing the code structure
        print("❌ Dataset not found. Please upload 'suicidewatch_full_cleaned.csv'")

try:
    df = pd.read_csv(csv_path)

    # Strict Cleaning
    df['target'] = pd.to_numeric(df['target'], errors='coerce')
    df = df.dropna(subset=['clean_text', 'target'])
    df = df[df['target'].isin([0, 1])]

    # Sampling 50k
    if len(df) > 50000:
        df = df.sample(n=50000, random_state=42)

    df['label'] = df['target'].astype(int)
    df['clean_text'] = df['clean_text'].astype(str)

    print(f"✅ Data Loaded: {len(df)} rows")

    # 4. Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    # 5. Tokenizer (V3)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    def preprocess_function(examples):
        return tokenizer(examples["clean_text"], truncation=True, padding=True, max_length=128)

    print("🔹 Tokenizing...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 6. Model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

    # 7. Metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {
            "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
            "f1": f1.compute(predictions=predictions, references=labels)["f1"],
        }

    # 8. Training Arguments
    # V3 is usually stable with FP16, but we keep False just to be 100% safe given your errors.
    training_args = TrainingArguments(
        output_dir="./results_deberta_v3",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        fp16=False, # Keeping FP32 for maximum safety
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("🔹 Starting Training (DeBERTa V3)...")
    trainer.train()

    print("\n✅ Final Evaluation:")
    metrics = trainer.evaluate()
    print(metrics)

except Exception as e:
    print(f"\n❌ An error occurred: {e}")

✅ Data Loaded: 50000 rows


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



🔹 Tokenizing...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


🔹 Starting Training (DeBERTa V3)...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0



✅ Final Evaluation:


{'eval_loss': 9.536742923144104e-11, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 40.2657, 'eval_samples_per_second': 248.35, 'eval_steps_per_second': 31.044, 'epoch': 2.0}


In [None]:
# @title P Training DeBERTa V3 (Stable)
# 1. Install Dependencies (V3 requires sentencepiece)
!pip install -q transformers accelerate evaluate datasets sentencepiece

import os
import torch
import gc
import numpy as np
import pandas as pd
from transformers import (
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate

# 2. Setup Paths & Config
# Update this path if needed
base_path = '/content/drive/MyDrive/Depression_Paper_DL'
csv_path = os.path.join(base_path, 'sentiment140_cleaned.csv')

# SWITCHING TO V3 FOR STABILITY
MODEL_CHECKPOINT = "microsoft/deberta-v3-base"

# 3. Load & Clean Data
if not os.path.exists(csv_path):
    print(f"⚠️ File not found at {csv_path}. Checking local directory...")
    if os.path.exists("suicidewatch_cleaned.csv"):
        csv_path = "suicidewatch_cleaned.csv"
    else:
        # Fallback to creating dummy data if you are just testing the code structure
        print("❌ Dataset not found. Please upload 'suicidewatch_full_cleaned.csv'")

try:
    df = pd.read_csv(csv_path)

    # Strict Cleaning
    df['target'] = pd.to_numeric(df['target'], errors='coerce')
    df = df.dropna(subset=['clean_text', 'target'])
    df = df[df['target'].isin([0, 1])]

    # Sampling 50k
    if len(df) > 50000:
        df = df.sample(n=50000, random_state=42)

    df['label'] = df['target'].astype(int)
    df['clean_text'] = df['clean_text'].astype(str)

    print(f"✅ Data Loaded: {len(df)} rows")

    # 4. Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    # 5. Tokenizer (V3)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    def preprocess_function(examples):
        return tokenizer(examples["clean_text"], truncation=True, padding=True, max_length=128)

    print("🔹 Tokenizing...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 6. Model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

    # 7. Metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {
            "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
            "f1": f1.compute(predictions=predictions, references=labels)["f1"],
        }

    # 8. Training Arguments
    # V3 is usually stable with FP16, but we keep False just to be 100% safe given your errors.
    training_args = TrainingArguments(
        output_dir="./results_deberta_v3",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        fp16=False, # Keeping FP32 for maximum safety
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("🔹 Starting Training (DeBERTa V3)...")
    trainer.train()

    print("\n✅ Final Evaluation:")
    metrics = trainer.evaluate()
    print(metrics)

except Exception as e:
    print(f"\n❌ An error occurred: {e}")

✅ Data Loaded: 50000 rows




🔹 Tokenizing...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


🔹 Starting Training (DeBERTa V3)...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.457,0.48588,0.7878,0.768138
2,0.3791,0.495064,0.8,0.79992



✅ Final Evaluation:


{'eval_loss': 0.4858803153038025, 'eval_accuracy': 0.7878, 'eval_f1': 0.7681381118881119, 'eval_runtime': 36.3848, 'eval_samples_per_second': 274.84, 'eval_steps_per_second': 34.355, 'epoch': 2.0}


#Phase6

In [None]:
# @title Inference with Logistic Regression (CPU/GPU)
import joblib
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 1. Setup Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    filtered_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(filtered_tokens)

# 2. Prediction Function
def predict_depression_ml(text, vectorizer, model):
    # Clean
    clean_text = preprocess_text(text)
    # Vectorize
    vectorized_text = vectorizer.transform([clean_text])
    # Predict
    try:
        prediction = model.predict(vectorized_text)[0]
        label = "Depressed" if prediction == 1 else "Non-Depressed"
        return label
    except Exception as e:
        return f"Error: {e}"

# 3. Usage Example

print("\n--- Testing Custom Inputs (Traditional ML) ---")
custom_tweets = [
    "I feel absolutely hopeless and I don't see a way out of this darkness.",
    "Had a great day at the park with friends! heavy sunshine.",
    "I'm so tired of trying, nothing ever gets better."
]

if 'pipeline' in locals():
    # Extract parts from the pipeline for clarity
    vect = pipeline.named_steps['tfidfvectorizer']
    clf = pipeline.named_steps['logisticregression']

    for tweet in custom_tweets:
        result = predict_depression_ml(tweet, vect, clf)
        print(f"Input: '{tweet}'\nPrediction: {result}\n")
else:
    print("⚠️ Pipeline not found in memory. Run Phase 3 first.")


--- Testing Custom Inputs (Traditional ML) ---
⚠️ Pipeline not found in memory. Run Phase 3 first.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# @title Inference with Transformer (RoBERTa/DistilBERT)
from transformers import pipeline as hf_pipeline

import glob
import os

result_dir = "./results_Suicide-Watch_distilbert-base-uncased"

try:
    # Find the latest checkpoint folder
    checkpoints = glob.glob(f"{result_dir}/checkpoint-*")
    latest_checkpoint = max(checkpoints, key=os.path.getctime)
    print(f"Loading model from: {latest_checkpoint}")

    # 2. Create Inference Pipeline' tool
    classifier = hf_pipeline("text-classification", model=latest_checkpoint, tokenizer="roberta-base")

    # 3. Prediction Function
    def predict_depression_transformer(text):
        clean_text = preprocess_text(text)
        result = classifier(clean_text)

        label_map = {'LABEL_1': 'Depressed', 'LABEL_0': 'Non-Depressed'}
        label_str = label_map.get(result[0]['label'], "Unknown")
        score = result[0]['score']

        return label_str, score

    print("\n--- Testing Custom Inputs (Transformer) ---")
    custom_tweets = [
        "I feel absolutely hopeless and I don't see a way out of this darkness.",
        "Had a great day at the park with friends! heavy sunshine.",
        "I'm so tired of trying, nothing ever gets better."
    ]

    for tweet in custom_tweets:
        label, conf = predict_depression_transformer(tweet)
        print(f"Input: '{tweet}'\nPrediction: {label} (Confidence: {conf:.4f})\n")

except ValueError:
    print(f"⚠️ Checkpoint not found in {result_dir}. Did Phase 4 finish successfully?")
except Exception as e:
    print(f"⚠️ Error: {e}")

Loading model from: ./results_Suicide-Watch_distilbert-base-uncased/checkpoint-5000


Device set to use cuda:0



--- Testing Custom Inputs (Transformer) ---
⚠️ Error: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



MODIFICATION (XG BOOST, XLM-RoBERTa)

In [None]:
# Import necessary libraries
import xgboost as xgb
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Function for XLM-RoBERTa model implementation
def train_xlm_roberta_model(X_train, y_train, X_val, y_val, num_epochs=3, batch_size=16):
    """
    Train XLM-RoBERTa model for text classification
    """
    print("Initializing XLM-RoBERTa model...")
    
    # Load pre-trained XLM-RoBERTa model and tokenizer
    model_name = 'xlm-roberta-base'
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=len(np.unique(y_train))
    )
    
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Tokenize the training and validation data
    print("Tokenizing data...")
    
    train_encodings = tokenizer(
        X_train.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )
    
    val_encodings = tokenizer(
        X_val.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )
    
    # Create PyTorch datasets
    train_dataset = TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train)
    )
    
    val_dataset = TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val)
    )
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    # Set up optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    
    # Training loop
    print("Training XLM-RoBERTa model...")
    model.train()
    
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")
    
    # Validation
    print("Evaluating XLM-RoBERTa model...")
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    print(f"\nXLM-RoBERTa Validation Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))
    
    return model, tokenizer, accuracy

# Function for XGBoost model implementation
def train_xgboost_model(X_train, y_train, X_val, y_val, use_bert_embeddings=True, bert_model=None, tokenizer=None):
    """
    Train XGBoost model for text classification
    Option to use BERT embeddings or TF-IDF features
    """
    print("Training XGBoost model...")
    
    # Convert text to features
    if use_bert_embeddings and bert_model is not None and tokenizer is not None:
        print("Extracting BERT embeddings for XGBoost...")
        X_train_features = extract_bert_embeddings(bert_model, tokenizer, X_train)
        X_val_features = extract_bert_embeddings(bert_model, tokenizer, X_val)
    else:
        # Use TF-IDF as fallback
        print("Using TF-IDF features for XGBoost...")
        from sklearn.feature_extraction.text import TfidfVectorizer
        
        vectorizer = TfidfVectorizer(max_features=5000)
        X_train_features = vectorizer.fit_transform(X_train).toarray()
        X_val_features = vectorizer.transform(X_val).toarray()
    
    # Train XGBoost model
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        use_label_encoder=False,
        eval_metric='mlogloss'
    )
    
    xgb_model.fit(X_train_features, y_train)
    
    # Make predictions
    y_pred = xgb_model.predict(X_val_features)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    print(f"\nXGBoost Validation Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    
    return xgb_model, accuracy

def extract_bert_embeddings(model, tokenizer, texts, batch_size=16):
    """
    Extract BERT embeddings from text
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    
    all_embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        # Tokenize
        encodings = tokenizer(
            batch_texts.tolist(),
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )
        
        # Move to device
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        
        # Get embeddings
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )
            
            # Use last hidden state [CLS] token
            last_hidden_state = outputs.hidden_states[-1]
            cls_embeddings = last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())
    
    return np.vstack(all_embeddings)

# Main training function combining both models
def train_hybrid_model(X_train, y_train, X_val, y_val):
    """
    Train XLM-RoBERTa and XGBoost models
    """
    print("="*60)
    print("Training XLM-RoBERTa Model")
    print("="*60)
    
    # Train XLM-RoBERTa
    xlm_model, tokenizer, xlm_accuracy = train_xlm_roberta_model(
        X_train, y_train, X_val, y_val,
        num_epochs=3,
        batch_size=16
    )
    
    print("\n" + "="*60)
    print("Training XGBoost with BERT Embeddings")
    print("="*60)
    
    # Train XGBoost using XLM-RoBERTa embeddings
    xgb_model, xgb_accuracy = train_xgboost_model(
        X_train, y_train, X_val, y_val,
        use_bert_embeddings=True,
        bert_model=xlm_model,
        tokenizer=tokenizer
    )
    
    # Create ensemble predictions (simple voting)
    print("\n" + "="*60)
    print("Creating Ensemble Predictions")
    print("="*60)
    
    # Get XLM-RoBERTa predictions
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    xlm_model.eval()
    
    val_encodings = tokenizer(
        X_val.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )
    
    val_dataset = TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val)
    )
    
    val_loader = DataLoader(val_dataset, batch_size=16)
    
    xlm_predictions = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, _ = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            
            outputs = xlm_model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            preds = torch.argmax(outputs.logits, dim=1)
            xlm_predictions.extend(preds.cpu().numpy())
    
    # Get XGBoost predictions
    val_features = extract_bert_embeddings(xlm_model, tokenizer, X_val)
    xgb_predictions = xgb_model.predict(val_features)
    
    # Ensemble voting
    ensemble_predictions = []
    for xlm_pred, xgb_pred in zip(xlm_predictions, xgb_predictions):
        # Simple voting (you can modify this logic)
        if xlm_pred == xgb_pred:
            ensemble_predictions.append(xlm_pred)
        else:
            # In case of disagreement, use XLM-RoBERTa (usually more accurate for text)
            ensemble_predictions.append(xlm_pred)
    
    # Calculate ensemble accuracy
    ensemble_accuracy = accuracy_score(y_val, ensemble_predictions)
    print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")
    print("\nEnsemble Classification Report:")
    print(classification_report(y_val, ensemble_predictions))
    
    return {
        'xlm_roberta': {'model': xlm_model, 'tokenizer': tokenizer, 'accuracy': xlm_accuracy},
        'xgboost': {'model': xgb_model, 'accuracy': xgb_accuracy},
        'ensemble_accuracy': ensemble_accuracy
    }

# Example usage in your notebook
"""
# Assuming you have your data loaded as:
# X_train, X_val, y_train, y_val

# Train the hybrid model
results = train_hybrid_model(X_train, y_train, X_val, y_val)

print(f"\n{'='*60}")
print("FINAL RESULTS")
print('='*60)
print(f"XLM-RoBERTa Accuracy: {results['xlm_roberta']['accuracy']:.4f}")
print(f"XGBoost Accuracy: {results['xgboost']['accuracy']:.4f}")
print(f"Ensemble Accuracy: {results['ensemble_accuracy']:.4f}")
print('='*60)
"""