In [3]:
# BiLSTM MODEL

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Bidirectional, LSTM,
    Dense, Dropout
)

data_file = "suicide_detection_rare_word_removed.csv"
df = pd.read_csv(data_file)

print("Dataset loaded successfully")

df["text"] = df["text"].astype(str)

X = df["text"]
y = df["class"]


y = y.map({"non-suicide": 0, "suicide": 1})


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train/Test split completed")

MAX_WORDS = 20000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

print("Tokenization & padding completed")


model = Sequential([
    Embedding(
        input_dim=MAX_WORDS,
        output_dim=128,
        input_length=MAX_LEN
    ),

    Bidirectional(
        LSTM(64, return_sequences=False)
    ),

    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.5),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


history = model.fit(
    X_train_pad,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.1,
    verbose=1
)


y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=["non-suicide", "suicide"]
))


model.save("bilstm_model.h5")
joblib.dump(tokenizer, "bilstm_tokenizer.pkl")

print("\nBiLSTM model and tokenizer saved successfully")



Dataset loaded successfully
Train/Test split completed
Tokenization & padding completed




Epoch 1/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 25ms/step - accuracy: 0.8834 - loss: 0.2799 - val_accuracy: 0.9435 - val_loss: 0.1479
Epoch 2/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 25ms/step - accuracy: 0.9535 - loss: 0.1303 - val_accuracy: 0.9486 - val_loss: 0.1411
Epoch 3/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - accuracy: 0.9600 - loss: 0.1126 - val_accuracy: 0.9445 - val_loss: 0.1594
Epoch 4/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - accuracy: 0.9635 - loss: 0.0998 - val_accuracy: 0.9445 - val_loss: 0.1493
Epoch 5/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 25ms/step - accuracy: 0.9720 - loss: 0.0779 - val_accuracy: 0.9447 - val_loss: 0.1694
Epoch 6/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - accuracy: 0.9766 - loss: 0.0635 - val_accuracy: 0.9413 - val_loss: 0.1914
Epoc




Accuracy: 0.9400250075453801

Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.94      0.94      0.94     23191
     suicide       0.94      0.94      0.94     23195

    accuracy                           0.94     46386
   macro avg       0.94      0.94      0.94     46386
weighted avg       0.94      0.94      0.94     46386


BiLSTM model and tokenizer saved successfully
