In [23]:
import pandas as pd

In [24]:
df = pd.read_csv('Suicide_Detection.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  232074 non-null  int64 
 1   text        232074 non-null  object
 2   class       232074 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


In [26]:
df.duplicated().sum()

0

In [27]:
df['class'].value_counts()

class
suicide        116037
non-suicide    116037
Name: count, dtype: int64

In [28]:
from sklearn.model_selection import train_test_split

X = df['text']       # features
y = df['class']      # target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Data Cleaning

In [29]:
from tqdm import tqdm
import neattext.functions as nfx

def clean_text(text_series):
    text_length = []
    cleaned_text = []

    for sent in tqdm(text_series):
        sent = sent.lower()
        sent = nfx.remove_special_characters(sent)
        sent = nfx.remove_stopwords(sent)
        text_length.append(len(sent.split()))
        cleaned_text.append(sent)
    
    return cleaned_text, text_length

In [30]:
# Clean the training text
X_train_cleaned, X_train_length = clean_text(X_train)

# Clean the test text
X_test_cleaned, X_test_length = clean_text(X_test)

  0%|          | 0/185659 [00:00<?, ?it/s]

100%|██████████| 185659/185659 [00:20<00:00, 8906.08it/s]
100%|██████████| 46415/46415 [00:05<00:00, 8757.67it/s]


In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_cleaned)

In [32]:
# convert text to squences of integers

X_train_seq = tokenizer.texts_to_sequences(X_train_cleaned)
X_test_seq = tokenizer.texts_to_sequences(X_test_cleaned)
    
# pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')

### Glove Embeddings

In [33]:
from sklearn.preprocessing import LabelEncoder

lbl_target = LabelEncoder()
train_output = lbl_target.fit_transform(y_train)
test_output = lbl_target.transform(y_test)

In [34]:
print(lbl_target.classes_)

['non-suicide' 'suicide']


In [35]:
import pickle
import numpy as np

# Load GloVe pickle
with open('glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

# Build embedding matrix
vocab_size = len(tokenizer.word_index)
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim), dtype=float)

found = 0
for word, idx in tokenizer.word_index.items():
    embedding_vector = glove_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
        found += 1

print(f"Found embeddings for {found}/{vocab_size} words ({found/vocab_size:.2%})")

Found embeddings for 85471/271463 words (31.49%)


In [36]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stop = EarlyStopping(patience = 5)
reducelr = ReduceLROnPlateau(patience = 3)

### Keras Sequential Model Construction

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import SGD

# Model
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=100,       
    trainable=False
))
model.add(LSTM(20, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile
optimizer = SGD(learning_rate=0.1, momentum=0.09)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 300)          81439200  
                                                                 
 lstm_2 (LSTM)               (None, 100, 20)           25680     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 20)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 256)               5376      
                                                                 
 dense_5 (Dense)             (None, 1)                 257       
                                                                 
Total params: 81,470,513
Trainable params: 31,313
Non-trainable params: 81,439,200
_____________________________________

In [38]:
# Train model
history = model.fit(
    X_train_pad,
    train_output,
    validation_data=(X_test_pad, test_output),
    epochs=10,
    batch_size=256,
    callbacks=[early_stop, reducelr],
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluate Model

In [39]:
loss, acc = model.evaluate(X_test_pad, test_output)
print(f"Test Accuracy: {acc*100:.2f}%")

Test Accuracy: 92.86%


In [40]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = (model.predict(X_test_pad) > 0.5).astype(int)

print(classification_report(test_output, y_pred, target_names=lbl_target.classes_))


              precision    recall  f1-score   support

 non-suicide       0.90      0.96      0.93     23208
     suicide       0.96      0.90      0.93     23207

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415



### Functions to make Predictions on New Text

In [42]:
def predict_suicide(text):
    cleaned = nfx.remove_special_characters(text.lower())
    cleaned = nfx.remove_stopwords(cleaned)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=100, padding='post')
    pred = model.predict(pad)[0][0]
    label = lbl_target.inverse_transform([int(pred > 0.5)])[0]
    print(f"Prediction: {label} ({pred:.2f} confidence)")

In [46]:
predict_suicide("I don't want to live anymore.")
predict_suicide("I feel hopeless and tired of everything.")
predict_suicide("Life is going great, I’m excited for tomorrow!")

Prediction: suicide (0.93 confidence)
Prediction: suicide (0.86 confidence)
Prediction: non-suicide (0.23 confidence)


In [45]:
import pickle
model.save("suicide_detection_lstm_glove.h5")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(lbl_target, f)