In [None]:
import os
import re
import json
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score, roc_curve
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet", quiet=True)

In [None]:
# CONFIG
# --- Filesystem locations -------------------------------------------------
DATA_PATH = "data/Suicide_Detection.csv"
MODELS_DIR = "models"
TOKENIZER_PATH = os.path.join(MODELS_DIR, "tokenizer.json")
MODEL_PATH = os.path.join(MODELS_DIR, "suicide_model.keras")
# Checkpoints are written here during training and moved to MODEL_PATH afterwards.
TEMP_MODEL_PATH = os.path.join(MODELS_DIR, "suicide_model_tmp.keras")

# --- Text vectorisation ---------------------------------------------------
MAX_NUM_WORDS = 20000        # vocabulary cap passed to the Tokenizer
MAX_SEQUENCE_LENGTH = 200    # every sequence is padded/truncated to this length
EMBEDDING_DIM = 100          # width of the Embedding layer output

# --- Training -------------------------------------------------------------
BATCH_SIZE = 32
EPOCHS = 5
TEST_SIZE = 0.2              # fraction of rows held out by train_test_split
RANDOM_STATE = 42

# Make sure the output folders for models and figures exist before anything writes to them.
for _output_dir in (MODELS_DIR, "static"):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Preprocessing
# Lookup tables consumed by expand_contractions_and_slang().
# Keys are lower-case; an empty replacement removes the term entirely.
contraction_map = {
    "i'm": "i am",
    "ain't": "is not",
    "can't": "can not",
    "won't": "will not",
    "i've": "i have",
    "i'll": "i will",
    "you're": "you are",
    "they're": "they are",
    "we're": "we are",
    "it's": "it is",
    "that's": "that is",
}

slang_map = {
    "gonna": "going to",
    "wanna": "want to",
    "idk": "i do not know",
    "lol": "",
    "lmao": "",
    "pls": "please",
}

# Single shared lemmatizer instance; clean_text() calls it once per token.
lemmatizer = WordNetLemmatizer()

# Typographic apostrophe look-alikes mapped to the ASCII apostrophe used by
# the keys of ``contraction_map``.
_APOSTROPHE_TABLE = str.maketrans({
    "\u2019": "'",  # right single quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
    "\u00b4": "'",  # acute accent
    "`": "'",       # backtick
})


def expand_contractions_and_slang(text):
    """Lower-case ``text`` and expand known contractions and slang terms.

    Every key of ``contraction_map`` and then of ``slang_map`` is replaced,
    as a whole word, by its mapped value. Terms mapped to an empty string
    are removed; surrounding whitespace is left as-is.

    Args:
        text: Input string.

    Returns:
        The lower-cased string with all known terms replaced.
    """
    # Normalise curly/typographic apostrophes first: without this, "I’m"
    # never matches the "i'm" key and is left unexpanded.
    t = text.lower().translate(_APOSTROPHE_TABLE)
    for mapping in (contraction_map, slang_map):
        for k, v in mapping.items():
            # \b anchors keep the match to whole words (e.g. "lol" but not "lollipop").
            t = re.sub(r'\b' + re.escape(k) + r'\b', v, t)
    return t

def clean_text(text: str) -> str:
    """Normalise one raw text into a space-separated string of lemmatized words.

    Steps, in order: coerce non-strings with ``str()``, trim, drop URLs,
    expand contractions/slang (which also lower-cases), discard non-ASCII
    characters, replace anything that is not a letter or whitespace with a
    space, collapse whitespace, and lemmatize each remaining token.

    Args:
        text: Raw input; non-string values are converted with ``str()``.

    Returns:
        The cleaned text; an empty string when no letters survive.
    """
    raw = text if isinstance(text, str) else str(text)

    without_urls = re.sub(r"http\S+|www\.\S+", " ", raw.strip())
    expanded = expand_contractions_and_slang(without_urls)
    ascii_only = expanded.encode("ascii", errors="ignore").decode("ascii")
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", ascii_only)
    tokens = re.sub(r"\s+", " ", letters_only).strip().split()

    return " ".join(map(lemmatizer.lemmatize, tokens))

In [None]:
# LOAD DATA
print("Loading dataset:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

# auto-detect columns: fall back to the first column whose name contains the keyword
if "text" not in df.columns:
    candidates = [c for c in df.columns if "text" in c.lower()]
    if not candidates:
        raise ValueError("No 'text' column found.")
    df = df.rename(columns={candidates[0]: "text"})

if "class" not in df.columns:
    candidates = [c for c in df.columns if "class" in c.lower() or "label" in c.lower()]
    if not candidates:
        raise ValueError("No label column.")
    df = df.rename(columns={candidates[0]: "class"})

# Reassign instead of mutating in place so re-running the cell is predictable.
df = df.dropna(subset=["text", "class"])

df["clean_text"] = df["text"].apply(clean_text)
# .copy() so the column assignments below write to an independent frame,
# not to a filtered view (avoids SettingWithCopyWarning).
df = df[df["clean_text"].str.strip() != ""].copy()

label_map = {"suicide": 1, "non-suicide": 0}
df["label"] = df["class"].astype(str).str.strip().str.lower().map(label_map)

# Class values missing from label_map map to NaN; left in place they would
# reach the model as NaN targets. Drop them and report what was removed.
unknown_label = df["label"].isna()
if unknown_label.any():
    print(
        f"Dropping {int(unknown_label.sum())} rows with unrecognised labels:",
        sorted(df.loc[unknown_label, "class"].astype(str).unique())[:10],
    )
    df = df[~unknown_label].copy()

if df.empty:
    raise ValueError("No usable rows left after cleaning and label mapping.")

df["label"] = df["label"].astype(int)

In [None]:
# Tokenizer
# Reuse the tokenizer saved at TOKENIZER_PATH when there is one; otherwise fit
# a new one on every cleaned text in df (this happens before the train/test
# split) and save it as JSON.
if not os.path.exists(TOKENIZER_PATH):
    print("Training tokenizer...")
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["clean_text"])
    with open(TOKENIZER_PATH, "w") as fh:
        fh.write(tokenizer.to_json())
else:
    print("Loading tokenizer...")
    with open(TOKENIZER_PATH, "r") as fh:
        tokenizer = tokenizer_from_json(fh.read())

# Turn each cleaned text into an integer sequence, then pad/truncate all of
# them to MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(df["clean_text"])
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = df["label"].values

In [None]:
# Split
# Stratified so both splits keep the class ratio of y; seeded from the config
# constant rather than a literal so there is one place to change it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# class weight
# weight(c) = total / (n_classes * count(c)), so rarer classes get larger weights.
counts = Counter(y_train)
total = sum(counts.values())
class_weight = {c: total / (len(counts) * n) for c, n in counts.items()}

# Model
# The Embedding needs one row per index the tokenizer can emit. Use the
# tokenizer's own word cap: a tokenizer loaded from disk may have been fitted
# with a different num_words than MAX_NUM_WORDS (or with no cap at all).
_tokenizer_cap = tokenizer.num_words
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
if _tokenizer_cap is not None:
    vocab_size = min(_tokenizer_cap, vocab_size)

def build_model():
    """Create and compile the Embedding -> Conv1D -> BiLSTM -> Dense classifier.

    The embedding is sized from the module-level ``vocab_size``,
    ``EMBEDDING_DIM`` and ``MAX_SEQUENCE_LENGTH``. The final layer is a single
    sigmoid unit with an L2 kernel regulariser.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss, the
        Adam optimiser and accuracy as its metric.
    """
    embedding = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)

    # Convolutional feature extractor over the embedded sequence.
    conv_block = [
        Conv1D(128, 5, activation="relu"),
        MaxPooling1D(4),
        Dropout(0.4),
    ]

    recurrent = Bidirectional(LSTM(64, dropout=0.3))

    # Classification head ending in one sigmoid output.
    head = [
        Dense(32, activation="relu"),
        Dropout(0.4),
        Dense(1, activation="sigmoid", kernel_regularizer=l2(0.01)),
    ]

    network = Sequential([embedding, *conv_block, recurrent, *head])
    network.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return network

# Load or train
if os.path.exists(MODEL_PATH):
    print("Loading saved model...")
    model = load_model(MODEL_PATH)
    history = None  # no training ran, so there are no curves to plot later
else:
    print("Training new model...")
    # Seed Python, NumPy and TensorFlow so weight init and shuffling are repeatable.
    tf.keras.utils.set_random_seed(RANDOM_STATE)
    model = build_model()

    # Carve a validation set out of the training data. Early stopping, LR
    # scheduling and checkpoint selection all monitor val_loss; pointing them
    # at the test split would leak test data into model selection and make
    # the test metrics optimistic.
    VALIDATION_FRACTION = 0.1
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train,
        test_size=VALIDATION_FRACTION,
        stratify=y_train,
        random_state=RANDOM_STATE,
    )

    early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2)
    checkpoint = ModelCheckpoint(TEMP_MODEL_PATH, monitor="val_loss", save_best_only=True)

    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stop, reduce_lr, checkpoint],
        class_weight=class_weight,
        verbose=2
    )

    if os.path.exists(TEMP_MODEL_PATH):
        # Promote the best checkpoint to the final path, then reload it so the
        # model evaluated below is exactly the one that was persisted.
        os.replace(TEMP_MODEL_PATH, MODEL_PATH)
        model = load_model(MODEL_PATH)
    else:
        model.save(MODEL_PATH)

In [None]:
# Evaluation
y_prob = model.predict(X_test).ravel()
auc = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Pick the threshold that maximises Youden's J (tpr - fpr).
# Index 0 of roc_curve's output is a sentinel threshold above every score
# (tpr = fpr = 0, i.e. "predict nothing positive"); skip it so the chosen
# cut-off is always an actual score and at least one sample is predicted positive.
# NOTE: the threshold is tuned on the same test split it is evaluated on, so the
# thresholded metrics below are optimistic; AUC does not depend on the threshold.
youden_j = tpr - fpr
best_idx = int(np.argmax(youden_j[1:])) + 1
best_thresh = float(thresholds[best_idx])

y_pred = (y_prob >= best_thresh).astype(int)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n----- MODEL PERFORMANCE -----")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("AUC:", auc)


# Bar chart of the headline metrics, saved to static/ and shown inline.
metric_values = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "AUC": auc}

fig_metrics, ax_metrics = plt.subplots()
ax_metrics.bar(list(metric_values.keys()), list(metric_values.values()))
ax_metrics.set_title("Metrics")
ax_metrics.set_ylim(0, 1)
fig_metrics.savefig("static/metrics.png")
plt.show()

# Training curves only exist when the model was trained in this session;
# history is None when a saved model was loaded instead.
if history is not None:
    fig_curves, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

    # (axis, training key, validation key, panel title)
    panels = (
        (ax_acc, "accuracy", "val_accuracy", "Accuracy"),
        (ax_loss, "loss", "val_loss", "Loss"),
    )
    for ax, train_key, val_key, panel_title in panels:
        ax.plot(history.history[train_key], label="Train")
        ax.plot(history.history[val_key], label="Val")
        ax.set_title(panel_title)
        ax.legend()

    fig_curves.savefig("static/training_curves.png")
    plt.show()

# Confusion matrix at the chosen threshold. Rows are true labels, columns are
# predictions; the label order is pinned so the tick names below always line up
# (0 = non-suicide, 1 = suicide, matching label_map).
class_names = ["non-suicide", "suicide"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig_cm, ax_cm = plt.subplots()
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=class_names, yticklabels=class_names, ax=ax_cm,
)
ax_cm.set_xlabel("Predicted label")
ax_cm.set_ylabel("True label")
ax_cm.set_title("Confusion Matrix")
fig_cm.savefig("static/confusion_matrix.png")
plt.show()

In [None]:
# Text
def predict_texts(texts):
    """Classify raw texts with the trained model and the tuned threshold.

    Args:
        texts: An iterable of raw strings, or a single string (treated as a
            one-element batch).

    Returns:
        A list with one ``(cleaned_text, label, probability)`` tuple per
        input, in input order. ``cleaned_text`` is the output of
        ``clean_text`` (not the raw input), ``label`` is 1 when the
        probability is at least ``best_thresh`` and 0 otherwise, and
        ``probability`` is the model output as a Python float. An empty
        input yields an empty list.
    """
    # A bare string would otherwise be iterated character by character.
    batch = [texts] if isinstance(texts, str) else list(texts)
    if not batch:
        # Nothing to score; avoid running the model on an empty batch.
        return []

    cleaned = [clean_text(t) for t in batch]
    seqs = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    probs = model.predict(padded).ravel()
    return [(t, int(p >= best_thresh), float(p)) for t, p in zip(cleaned, probs)]

# quick test
examples = ["I want to die", "Life is beautiful", "I feel like killing myself"]
print("\n--- TEST PREDICTIONS ---")
for txt, lbl, prob in predict_texts(examples):
    # \u2192 is the right arrow; the escape avoids the mis-encoded "â†’" that
    # appears when the literal character is saved with the wrong encoding.
    print(f"{txt} \u2192 label={lbl}, prob={prob}")