Implementation of the model proposed by Ghosh et al. in the paper
*"An attention-based hybrid architecture with explainability for depressive social media text detection in Bangla"*, published in 2023.


In [None]:
! pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import tensorflow as tf
# from tensorflow.keras.layers import *
# from tensorflow.keras.models import *
from nltk.tokenize import word_tokenize
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import keras.backend as K
import gensim
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings(action = 'ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Preprocessing function
def preprocessing(data_frame, remove_stopwords=False):
    """Clean the ``Text`` column of ``data_frame`` and add token columns.

    The frame is modified in place and the same object is returned.

    Cleaning steps, applied in order to ``Text`` (each match is replaced by a
    single space and the result is stripped):
      1. URLs (``http://`` / ``https://``),
      2. whitespace-delimited chunks containing ``@`` (e-mails, mentions),
      3. hashtags,
      4. runs of characters that are neither word characters nor whitespace.

    Afterwards each text is tokenized with NLTK's ``word_tokenize``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``Text`` column.
    remove_stopwords : bool, default False
        When True, English stop words (NLTK list, compared case-insensitively)
        are removed from ``Tokens``. The default keeps every token.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` itself, with ``Text`` cleaned and two added columns:
        ``Tokens`` (list of tokens) and ``Length`` (number of tokens).
    """
    cleanup_patterns = (
        r'\s*https?://\S+(\s+|$)',  # URLs
        r'\S*@\S*\s?',              # e-mail addresses / @mentions
        r'#\S*\s?',                 # hashtags
        r'[^\w\s]+',                # punctuation and other symbols
    )
    text = data_frame['Text']
    for pattern in cleanup_patterns:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently skip the cleaning.
        text = text.str.replace(pattern, ' ', regex=True).str.strip()
    data_frame['Text'] = text

    # str() keeps missing values tokenizable (NaN becomes the token 'nan').
    tokens = data_frame['Text'].map(lambda post: word_tokenize(str(post)))

    if remove_stopwords:
        stop = set(stopwords.words('english'))
        tokens = tokens.map(
            lambda words: [word for word in words if word.lower() not in stop])

    data_frame['Tokens'] = tokens
    data_frame['Length'] = tokens.map(len)

    return data_frame

# Data

In [None]:
# Root directory of all dataset CSV files; every dataset path below is built by
# appending a relative path to this value.
DS_Path = "/Datasets"

### Twitter 1

In [None]:
# Twitter dataset (10000-tweet file): read the CSV, map its columns onto the
# Text / Label names expected by preprocessing(), then clean and tokenize.
Twitter_path = DS_Path + "/Twitter/twitter-suicidal_data_10000.csv"
df = preprocessing(
    pd.read_csv(Twitter_path, encoding='latin-1').rename(
        columns={'tweet': 'Text', 'intention': 'Label'}
    )
)
df


Unnamed: 0,Text,Label,Tokens,Length
0,my life is meaningless i just want to end my l...,1,"[my, life, is, meaningless, i, just, want, to,...",79
1,muttering i wanna die to myself daily for a fe...,1,"[muttering, i, wan, na, die, to, myself, daily...",46
2,work slave i really feel like my only purpose ...,1,"[work, slave, i, really, feel, like, my, only,...",69
3,i did something on the 2 of october i overdose...,1,"[i, did, something, on, the, 2, of, october, i...",77
4,i feel like no one cares i just want to die ma...,1,"[i, feel, like, no, one, cares, i, just, want,...",18
...,...,...,...,...
9114,have you ever laid on your bed at night and cr...,1,"[have, you, ever, laid, on, your, bed, at, nig...",33
9115,the fault the blame the pain s still there i m...,1,"[the, fault, the, blame, the, pain, s, still, ...",20
9116,stop asking me to trust you when i m still cou...,1,"[stop, asking, me, to, trust, you, when, i, m,...",22
9117,i never know how to handle sadness crying make...,1,"[i, never, know, how, to, handle, sadness, cry...",12


### Twitter 2

In [None]:
# Second Twitter dataset: only the tweet/intention columns of the first
# 17142 rows are read, then renamed to Text / Label and preprocessed.
twitter_path = DS_Path + "/suicidal-tendency-tweets.csv"
df = preprocessing(
    pd.read_csv(
        twitter_path,
        encoding='latin-1',
        usecols=['tweet', 'intention'],
        nrows=17142,
    ).rename(columns={'tweet': 'Text', 'intention': 'Label'})
)
df

Unnamed: 0,Text,Label,Tokens,Length
0,to go to treatment if they are actively suicid...,1,"[to, go, to, treatment, if, they, are, activel...",41
1,Thy should be putting these kids in stabiliza...,1,"[Thy, should, be, putting, these, kids, in, st...",49
2,Might want to talk to her work again,0,"[Might, want, to, talk, to, her, work, again]",8
3,Just a heads head up she think she going goofy...,0,"[Just, a, heads, head, up, she, think, she, go...",35
4,I donâ t believe in religious dogma at all Fo...,0,"[I, donâ, t, believe, in, religious, dogma, at...",47
...,...,...,...,...
17137,Make sure you are happy in real life Not just...,0,"[Make, sure, you, are, happy, in, real, life, ...",40
17138,With one of our keyrings you will never be alo...,0,"[With, one, of, our, keyrings, you, will, neve...",20
17139,Always here if you ever need a chat ð,0,"[Always, here, if, you, ever, need, a, chat, ð]",9
17140,Itâ s not selfish Suicide is usually caused b...,1,"[Itâ, s, not, selfish, Suicide, is, usually, c...",47


In [None]:
# Class balance: number of rows labelled 1, then number of rows labelled 0.
print((df['Label'] == 1).sum(), (df['Label'] == 0).sum())

3754 13388


## Reddit SNS

In [None]:
# Reddit suicide / non-suicide dataset: only the text/class columns of the
# first 20000 rows are read.
Reddit_path = DS_Path + "/Reddit_non suicide  suicide/Suicide_Detection.csv"

df_r = pd.read_csv(
    Reddit_path,
    encoding='latin-1',
    usecols=['text', 'class'],
    nrows=20000,
).rename(columns={'text': 'Text', 'class': 'Label'})

# preprocessing() mutates and returns its argument, so df and df_r are the
# same frame from here on.
df = preprocessing(df_r)

# Encode the string classes as integers; an unexpected class raises KeyError.
label_dict = {'suicide': 1, 'non-suicide': 0}
df['Label'] = df['Label'].apply(lambda class_name: label_dict[class_name])

df

Unnamed: 0,Text,Label,Tokens,Length
0,Ex Wife Threatening SuicideRecently I left my ...,1,"[Ex, Wife, Threatening, SuicideRecently, I, le...",147
1,Am I weird I don t get affected by compliments...,0,"[Am, I, weird, I, don, t, get, affected, by, c...",29
2,Finally 2020 is almost over So I can never he...,0,"[Finally, 2020, is, almost, over, So, I, can, ...",27
3,i need helpjust help me im crying so hard,1,"[i, need, helpjust, help, me, im, crying, so, ...",9
4,Iâ m so lostHello my name is Adam 16 and Iâ...,1,"[Iâ, m, so, lostHello, my, name, is, Adam, 16,...",459
...,...,...,...,...
19995,I just wish I wasn t like thisI read a lot of ...,1,"[I, just, wish, I, wasn, t, like, thisI, read,...",523
19996,Future druggie Why do i see the only reason to...,1,"[Future, druggie, Why, do, i, see, the, only, ...",62
19997,I just tried to kill myself Help I just tried...,1,"[I, just, tried, to, kill, myself, Help, I, ju...",44
19998,Someone downvoted my post about me being a pie...,1,"[Someone, downvoted, my, post, about, me, bein...",47


In [None]:
# Class balance: number of rows labelled 1, then number of rows labelled 0.
print((df['Label'] == 1).sum(), (df['Label'] == 0).sum())

9865 10135


## Reddit _ SD

In [None]:
# Reddit depression / SuicideWatch dataset: 'depression' rows become class 0
# and 'SuicideWatch' rows class 1 before the text is preprocessed.
reddit_path = DS_Path + "/Reddit_depressionSuicide/reddit_depression_suicidewatch.csv"
label_dict = {'depression': 0, 'SuicideWatch': 1}

df = pd.read_csv(reddit_path, encoding='latin-1').rename(
    columns={'text':'Text', 'label':'Label'}
)
# An unexpected label value raises KeyError instead of being silently kept.
df['Label'] = df['Label'].apply(lambda source: label_dict[source])
df = preprocessing(df)
df

Unnamed: 0,Text,Label,Tokens,Length
0,I recently went through a breakup and she said...,0,"[I, recently, went, through, a, breakup, and, ...",121
1,I do not know how to navigate these feelings ...,0,"[I, do, not, know, how, to, navigate, these, f...",308
2,So I have been with my bf for 5 months and h...,0,"[So, I, have, been, with, my, bf, for, 5, mont...",131
3,I am so exhausted of this Just when I think I...,1,"[I, am, so, exhausted, of, this, Just, when, I...",349
4,I have been severly bullied since i was 5 till...,0,"[I, have, been, severly, bullied, since, i, wa...",198
...,...,...,...,...
20358,I took 50mg of seroquel a few hours after I dr...,1,"[I, took, 50mg, of, seroquel, a, few, hours, a...",35
20359,that is what has happened to me last week And...,0,"[that, is, what, has, happened, to, me, last, ...",119
20360,Ever just feel alone in a house full of people...,0,"[Ever, just, feel, alone, in, a, house, full, ...",20
20361,Politicians Neighbors Corporations Society ...,0,"[Politicians, Neighbors, Corporations, Society...",31


## Reddit_SD (Positive Suicide Samples) + Facebook (Negative samples)

In [None]:
## Preparing the data
# Positive samples: the SuicideWatch rows of the Reddit dataset.
reddit_path = DS_Path + "/Reddit_depressionSuicide/reddit_depression_suicidewatch.csv"

df = pd.read_csv(reddit_path, encoding='latin-1')
df = df.rename(columns={'text':'Text', 'label':'Label'})
label_dict = {'depression': 0, 'SuicideWatch': 1}
df['Label'] = df['Label'].apply(lambda row: label_dict[row])

# Keep only the positive class. The filtered frame is re-indexed 0..n-1 and
# assigned back; reset_index returns a new frame, so calling it without
# assignment has no effect.
df = df[df['Label'] >= 1].reset_index(drop=True)

## Adding negative samples
# As many Facebook posts as there are positive rows, all labelled 0.
zero_sampels = DS_Path + "/Facebook/facebook_posts.csv"

df_zero = pd.read_csv(zero_sampels, encoding='latin-1', nrows=df.shape[0])
df_zero = df_zero.rename(columns={'message':'Text'})
df_zero = df_zero.assign(Label=0)

In [None]:
df

Unnamed: 0,Text,Label
3,I am so exhausted of this. Just when I think I...,1
5,I am 20 year old with some good friends but I ...,1
8,it is looming around the corner again. It alwa...,1
9,there is.....foodAnd other things I will be ju...,1
13,I am on zoloft and focalin and it is changed m...,1
...,...,...
20353,I am sorry I am sorry i made studying as Joke ...,1
20354,Being told to end my life by the only person I...,1
20356,Last week I did the most disgusting thing by c...,1
20357,I do not see a way out of my head and a way to...,1


In [None]:
df_zero

Unnamed: 0,Text,Label
0,This was a waste of taxpayer dollars. Save you...,0
1,About time. We should have never been spending...,0
2,There's a lot more important things that our m...,0
3,Those transgendered people do not deserve tax ...,0
4,Republicans live to bully citizens and limit o...,0
...,...,...
9987,Glad to see that the United States can protect...,0
9988,They know the imbecile that's in the White Hou...,0
9989,US has the right to do whatever they want & if...,0
9990,And North Korea condemns this. Lol. Sanctions.,0


In [None]:
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the index labels of the two inputs are carried over and can overlap.
df = pd.concat([df, df_zero], ignore_index=True)

In [None]:
df.reset_index(drop=True)

Unnamed: 0,Text,Label
0,I am so exhausted of this. Just when I think I...,1
1,I am 20 year old with some good friends but I ...,1
2,it is looming around the corner again. It alwa...,1
3,there is.....foodAnd other things I will be ju...,1
4,I am on zoloft and focalin and it is changed m...,1
...,...,...
19979,Glad to see that the United States can protect...,0
19980,They know the imbecile that's in the White Hou...,0
19981,US has the right to do whatever they want & if...,0
19982,And North Korea condemns this. Lol. Sanctions.,0


In [None]:
df = preprocessing(df)

# Word Embeddings

## Downloading Pre-trained Models

In [None]:
import gensim.downloader as api

import json

# Catalogue of everything the gensim downloader can fetch.
info = api.info()
print(info.keys())

# One line per model, in alphabetical order: name, record count (-1 when the
# catalogue entry has none) and the first 40 characters of the description.
available_models = info['models']
for model_name in sorted(available_models):
    model_data = available_models[model_name]
    short_description = model_data['description'][:40] + '...'
    print('%s (%d records): %s' % (
        model_name, model_data.get('num_records', -1), short_description))

dict_keys(['corpora', 'models'])
__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-new

In [None]:
# Pre-trained embedding candidates; embedding_format picks the one to load.
embeddings = {
    0: "word2vec-google-news-300",
    1: "fasttext-wiki-news-subwords-300",
    2: "glove-twitter-100",
}
embedding_format = 1

selected_embedding = embeddings[embedding_format]
embedding_model = api.load(selected_embedding)



In [None]:
# Sanity check of the loaded embeddings: words most similar to "kill",
# printed together with their similarity scores.
similar_words = embedding_model.most_similar('kill')
print(similar_words)

[('kills', 0.7860026955604553), ('killing', 0.7399880290031433), ('destroy', 0.7350039482116699), ('maim', 0.7278950810432434), ('decapitate', 0.7270759344100952), ('slay', 0.7220808267593384), ('killy', 0.721636176109314), ('kill-', 0.7112785577774048), ('kill.', 0.705057680606842), ('killed', 0.7015096545219421)]


In [None]:
vocab = embedding_model.key_to_index

In [None]:
## Creating the embedding matrix

# vocab = wd2vc_model.wv.vocab ## gensim 3
vocab = embedding_model.key_to_index
print("The total number of words are : ", len(vocab))
vocab = list(vocab.keys())

# word -> pre-trained vector, for every word known to the embedding model
word_vec_dict = {word: embedding_model.get_vector(word) for word in vocab}
print("The no of key-value pairs : ", len(word_vec_dict))  # should come equal to vocab size

# Every post is padded to the length of the longest one.
# NOTE(review): in the recorded run this is 9360 tokens, which makes pad_rev
# very large; consider capping max_length if memory becomes a problem.
max_length = max(df['Length'])
print("maximum length = ", max_length)

tok = Tokenizer()
tok.fit_on_texts(df['Tokens'])
# +1 because the tokenizer's word indices start at 1 (0 is the padding value)
vocab_size = len(tok.word_index) + 1

encd_rev = tok.texts_to_sequences(df['Tokens'])

# Take the dimensionality from the loaded model instead of hard-coding 300,
# so that selecting e.g. "glove-twitter-100" produces a matching matrix.
embed_dim = embedding_model.vector_size
pad_rev = pad_sequences(encd_rev, maxlen=max_length, padding='post')
print("pad_shape = ", pad_rev.shape)

# the embedding matrix: row i holds the pre-trained vector of the word with
# tokenizer index i; words unknown to the embedding model keep a zero row
embed_matrix = np.zeros(shape=(vocab_size, embed_dim))
print(embed_matrix.shape)
for word, i in tok.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector

The total number of words are :  999999
The no of key-value pairs :  999999
maximum length =  9360
pad_shape =  (20363, 9360)
(35627, 300)


## Train_Validation_Test data

In [None]:
## Splitting the data
# First hold out 20% of all samples as the test set ...
X, X_test, y, y_test = train_test_split(
    pad_rev,
    df['Label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# ... then take 20% of the remainder as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

print("Train = ", X_train.shape, y_train.shape)
print("Test = ", X_test.shape, y_test.shape)
print("Validation = ", X_val.shape, y_val.shape)

Train =  (13032, 9360) (13032,)
Test =  (4073, 9360) (4073,)
Validation =  (3258, 9360) (3258,)


# Attention-based hybrid BiLSTM-CNN model

In [None]:
class LuongAttention(tf.keras.Model):
    """Attention block with a selectable score function.

    ``attention_func`` chooses how the alignment scores are computed:

    * ``'dot'``     -- decoder_output . encoder_output (no trainable weights)
    * ``'general'`` -- decoder_output . Wa(encoder_output)
    * ``'concat'``  -- va(tanh(Wa([decoder_output ; encoder_output])))

    Expected shapes: ``decoder_output`` is (batch_size, 1, rnn_size) and
    ``encoder_output`` is (batch_size, max_len, rnn_size).
    """

    def __init__(self, rnn_size, attention_func):
        super(LuongAttention, self).__init__()

        if attention_func not in ['dot', 'general', 'concat']:
            raise ValueError(
                'Unknown attention score function! Must be either dot, general or concat.')
        self.attention_func = attention_func

        # Only the 'general' and 'concat' variants own projection layers.
        if attention_func == 'general':
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def call(self, decoder_output, encoder_output):
        """Return ``(context, alignment)`` for the given decoder/encoder outputs.

        ``alignment`` is the softmax of the scores over the last axis and
        ``context`` is the alignment-weighted sum of ``encoder_output``.
        """
        mode = self.attention_func

        if mode == 'dot':
            # (batch_size, 1, rnn_size) x (batch_size, rnn_size, max_len)
            # => (batch_size, 1, max_len)
            scores = tf.matmul(decoder_output, encoder_output, transpose_b=True)
        elif mode == 'general':
            # Project the encoder output first, then score as in 'dot'.
            projected = self.wa(encoder_output)
            scores = tf.matmul(decoder_output, projected, transpose_b=True)
        elif mode == 'concat':
            # Repeat the decoder output along the time axis so it can be
            # concatenated with every encoder step.
            steps = encoder_output.shape[1]
            tiled_decoder = tf.tile(decoder_output, [1, steps, 1])
            joined = tf.concat((tiled_decoder, encoder_output), axis=-1)

            # (batch_size, max_len, 2 * rnn_size) => (batch_size, max_len, rnn_size)
            # => (batch_size, max_len, 1)
            per_step = self.va(self.wa(joined))

            # Bring the scores to the same layout as the other two variants:
            # (batch_size, max_len, 1) => (batch_size, 1, max_len)
            scores = tf.transpose(per_step, [0, 2, 1])

        # Normalise the scores into attention weights over the encoder steps.
        alignment = tf.nn.softmax(scores, axis=2)

        # Weighted sum of the encoder outputs.
        context = tf.matmul(alignment, encoder_output)
        return context, alignment

In [None]:
# from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Input, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.layers import Bidirectional, Dense, Input, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import regularizers
from tensorflow.keras.models import *

Attention = LuongAttention(20, 'dot')

def hybrid_model(kernel_size = 3, input_dim = vocab_size, output_dim=100, max_length = None, emb_matrix = None):
    """Build and compile the attention-based hybrid BiLSTM-CNN classifier.

    Layer stack: Embedding -> Bidirectional CuDNNLSTM(100) -> CuDNNLSTM(100)
    -> Conv1D(50) -> self-attention (the module-level ``Attention`` layer,
    called with the same tensor as both arguments) -> Dense(50)
    -> GlobalMaxPooling1D -> Dense(250) -> Dropout(0.5) -> Dense(50)
    -> Dropout(0.5) -> 2-unit softmax.

    Parameters
    ----------
    kernel_size : int
        Width of the Conv1D filters.
    input_dim : int
        Vocabulary size of the embedding layer. The default is the value the
        global ``vocab_size`` had when this function was defined.
    output_dim : int
        Embedding dimensionality; has to match the second dimension of
        ``emb_matrix`` when one is given.
    max_length : int or None
        Length of the (padded) input sequences.
    emb_matrix : array-like or None
        Pre-trained embedding weights of shape (input_dim, output_dim). When
        given, the embedding layer is initialised with it and frozen; when
        None, the embedding layer is trained from its default initialisation.

    Returns
    -------
    A compiled Keras ``Model`` (Adam optimizer, sparse categorical
    cross-entropy loss, accuracy metric) that expects integer class labels.
    """
    inp = Input(shape=(max_length,))

    embedding_kwargs = dict(input_dim=input_dim,
                            output_dim=output_dim,
                            input_length=max_length)
    if emb_matrix is not None:
        # Pre-trained vectors are used as-is and not fine-tuned.
        embedding_kwargs.update(weights=[emb_matrix], trainable=False)
    x = Embedding(**embedding_kwargs)(inp)

    # NOTE(review): CuDNNLSTM is the GPU-only LSTM implementation -- confirm a
    # GPU runtime is available before building the model.
    x = Bidirectional(CuDNNLSTM(100, stateful=False, return_sequences = True))(x)
    x = CuDNNLSTM(100, stateful=False, return_sequences = True)(x)

    x = Conv1D(50, kernel_size, activation='relu')(x)

    print("conv shape", x.shape)
    # Attention returns (context, alignment); only the context is used.
    x = Attention(x,x)[0]
    x = Dense(50, activation="relu", kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4))(x)

    x = GlobalMaxPooling1D()(x)
    x = Dense(250, activation="relu")(x)
    x = Dropout(0.5)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.5)(x)
    print("before output shape", x.shape)
    # Softmax makes the two outputs a probability distribution over the
    # classes, which is what sparse_categorical_crossentropy expects; the
    # arg-max class is the same as with per-unit sigmoids.
    outp = Dense(2, activation="softmax")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile( loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

model= hybrid_model(kernel_size = 3, input_dim = vocab_size, output_dim=embed_dim, max_length = max_length, emb_matrix = embed_matrix)

# model.summary()

conv shape (None, 9358, 50)
before output shape (None, 50)


In [None]:
# Training budget for this run.
epochs, batch_size = 30, 32

# r keeps the object returned by fit().
r = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
)

# Prediction

In [None]:
#Prediction on the test data
# One row of class scores per test sample (the model's output layer has two
# units, so each row has two values).
pred = model.predict(X_test)
print(pred)

[[9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 ...
 [9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 [1.7490360e-05 9.9888724e-01]]


In [None]:
# Turn each row of class scores into a class index: the position of the
# largest score (the first one if several are equal), stored as a float.
y_pred = np.zeros(len(pred))
for row_idx, class_scores in enumerate(pred):
    y_pred[row_idx] = max(range(len(class_scores)), key=class_scores.__getitem__)

print(y_pred)

[0. 0. 0. ... 0. 0. 1.]


In [None]:
import sklearn

# Per-class precision / recall / F1 table.
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

# Overall scalar scores, computed with each scorer's default settings.
accuracy, precision, recall, f1score = (
    score_fn(y_test, y_pred)
    for score_fn in (
        sklearn.metrics.accuracy_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)
print("-----------------------")
print(accuracy, precision, recall, f1score, sep="\n")

              precision    recall  f1-score   support

           0       0.99      0.92      0.96      2002
           1       0.93      0.99      0.96      1995

    accuracy                           0.96      3997
   macro avg       0.96      0.96      0.96      3997
weighted avg       0.96      0.96      0.96      3997

-----------------------
0.9582186639979985
0.9279026217228464
0.993483709273183
0.9595739530380054


## Reddit+FB

In [None]:
# Training budget for the Reddit + Facebook run.
# NOTE(review): this calls fit() on the existing `model` object; if the
# earlier training cell was executed first, training continues from those
# weights -- rebuild the model beforehand if a fresh start is intended.
epochs, batch_size = 15, 16

r = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
model.save('/trained_model_Reddit_Facebook.h5')

### Prediction

In [None]:
#Prediction on the test data
# One row of class scores per test sample (the model's output layer has two
# units, so each row has two values).
pred = model.predict(X_test)
print(pred)

[[9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 ...
 [9.8512417e-01 3.6647186e-02]
 [9.8512417e-01 3.6647186e-02]
 [1.7490360e-05 9.9888724e-01]]


In [None]:
# Turn each row of class scores into a class index: the position of the
# largest score (the first one if several are equal), stored as a float.
y_pred = np.zeros(len(pred))
for row_idx, class_scores in enumerate(pred):
    y_pred[row_idx] = max(range(len(class_scores)), key=class_scores.__getitem__)

print(y_pred)

[0. 0. 0. ... 0. 0. 1.]


In [None]:
import sklearn

# Per-class precision / recall / F1 table.
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

# Overall scalar scores, computed with each scorer's default settings.
accuracy, precision, recall, f1score = (
    score_fn(y_test, y_pred)
    for score_fn in (
        sklearn.metrics.accuracy_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)
print("-----------------------")
print(accuracy, precision, recall, f1score, sep="\n")

              precision    recall  f1-score   support

           0       0.99      0.92      0.96      2002
           1       0.93      0.99      0.96      1995

    accuracy                           0.96      3997
   macro avg       0.96      0.96      0.96      3997
weighted avg       0.96      0.96      0.96      3997

-----------------------
0.9582186639979985
0.9279026217228464
0.993483709273183
0.9595739530380054
