In [3]:
# GRU MODEL

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, GRU,
    Dense, Dropout
)

data_file = "suicide_detection_rare_word_removed.csv"
df = pd.read_csv(data_file)

print("Dataset loaded successfully")

df["text"] = df["text"].astype(str)

X = df["text"]
y = df["class"]


y = y.map({"non-suicide": 0, "suicide": 1})

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train/Test split completed")


MAX_WORDS = 20000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

print("Tokenization & padding completed")


model = Sequential([
    Embedding(
        input_dim=MAX_WORDS,
        output_dim=128,
        input_length=MAX_LEN
    ),

    GRU(64, return_sequences=False),

    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.5),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


history = model.fit(
    X_train_pad,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.1,
    verbose=1
)


y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=["non-suicide", "suicide"]
))


model.save("gru_model.h5")
joblib.dump(tokenizer, "gru_tokenizer.pkl")

print("\nGRU model and tokenizer saved successfully")


Dataset loaded successfully
Train/Test split completed
Tokenization & padding completed




Epoch 1/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 271ms/step - accuracy: 0.7189 - loss: 0.4799 - val_accuracy: 0.9417 - val_loss: 0.1575
Epoch 2/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 295ms/step - accuracy: 0.9464 - loss: 0.1501 - val_accuracy: 0.9471 - val_loss: 0.1442
Epoch 3/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 298ms/step - accuracy: 0.9583 - loss: 0.1158 - val_accuracy: 0.9445 - val_loss: 0.1567
Epoch 4/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 300ms/step - accuracy: 0.9673 - loss: 0.0893 - val_accuracy: 0.9412 - val_loss: 0.1878
Epoch 5/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 302ms/step - accuracy: 0.9733 - loss: 0.0723 - val_accuracy: 0.9387 - val_loss: 0.2403
Epoch 6/10
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 298ms/step - accuracy: 0.9788 - loss: 0.0554 - val_accuracy: 0.9378 - val_loss:




Accuracy: 0.9351959642995732

Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.94      0.93      0.93     23191
     suicide       0.93      0.94      0.94     23195

    accuracy                           0.94     46386
   macro avg       0.94      0.94      0.94     46386
weighted avg       0.94      0.94      0.94     46386


GRU model and tokenizer saved successfully
