# Suicide prediction with natural language processing 

In [1]:
import numpy as np
import pandas as pd

# Load the labelled posts from the CSV file sitting next to the notebook.
Suicide = pd.read_csv("Suicide_Detection.csv")

# Missing-value count per column (all zeros in the recorded run).
print(Suicide.isna().sum())
# First ten rows, to see what the raw text and labels look like.
print(Suicide.head(10))

# Label frequencies: the two classes occur equally often in the recorded run.
print(Suicide["class"].value_counts())

Unnamed: 0    0
text          0
class         0
dtype: int64
   Unnamed: 0                                               text        class
0           2  Ex Wife Threatening SuicideRecently I left my ...      suicide
1           3  Am I weird I don't get affected by compliments...  non-suicide
2           4  Finally 2020 is almost over... So I can never ...  non-suicide
3           8          i need helpjust help me im crying so hard      suicide
4           9  I’m so lostHello, my name is Adam (16) and I’v...      suicide
5          11  Honetly idkI dont know what im even doing here...      suicide
7          13   It ends tonight.I can’t do it anymore. \nI quit.      suicide
8          16  Everyone wants to be "edgy" and it's making me...  non-suicide
9          18  My life is over at 20 years oldHello all. I am...      suicide
suicide        116037
non-suicide    116037
Name: class, dtype: int64


## Label encoding the target 

In [2]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integers. The recorded output shows the mapping
# non-suicide = 0 and suicide = 1.
target = Suicide['class']
lab = LabelEncoder()
targ_tran = lab.fit_transform(target)
print(targ_tran[:10])

# Show the original labels next to their encoded counterparts.
print("Original letter encoding ",  target[:15])
print("Label encoding transformed non-suicide = 0 and suicide = 1", targ_tran[:10])

# Store the encoded label in the frame so it travels with the text.
Suicide['targ_tran'] = targ_tran
# Last expression of the cell: renders the frame including the new column.
Suicide.head(10)



[1 0 0 1 1 1 1 1 0 1]
Original letter encoding  0         suicide
1     non-suicide
2     non-suicide
3         suicide
4         suicide
5         suicide
6         suicide
7         suicide
8     non-suicide
9         suicide
10        suicide
11        suicide
12        suicide
13        suicide
14        suicide
Name: class, dtype: object
Label encoding transformed non-suicide = 0 and suicide = 1 [1 0 0 1 1 1 1 1 0 1]


Unnamed: 0.1,Unnamed: 0,text,class,targ_tran
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide,1
1,3,Am I weird I don't get affected by compliments...,non-suicide,0
2,4,Finally 2020 is almost over... So I can never ...,non-suicide,0
3,8,i need helpjust help me im crying so hard,suicide,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,1
5,11,Honetly idkI dont know what im even doing here...,suicide,1
6,12,[Trigger warning] Excuse for self inflicted bu...,suicide,1
7,13,It ends tonight.I can’t do it anymore. \nI quit.,suicide,1
8,16,"Everyone wants to be ""edgy"" and it's making me...",non-suicide,0
9,18,My life is over at 20 years oldHello all. I am...,suicide,1


## Tokenization 

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on every post.
# num_words limits which (most frequent) words are kept when texts are later
# converted to sequences; anything outside that limit maps to the "<OOV>" token.
toke = Tokenizer(
    num_words=60000,
    lower=True,
    oov_token="<OOV>",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
toke.fit_on_texts(Suicide['text'])
# word_index maps each word seen during fitting to its integer id; it is not
# truncated to num_words.
word_index = toke.word_index


In [4]:
# Size of the fitted vocabulary (203,027 entries in the recorded run).
len(word_index)

203027

## Sequencing 

In [5]:
# Convert every post into a list of integers; each integer is the id the
# tokenizer fitted in the previous step assigned to a word.
Seq = toke.texts_to_sequences(Suicide['text'])
# Token count of the sample at index 4 (436 in the recorded run), not of the
# first sample.
print(len(Seq[4]))
# The sequences come back as a plain Python list (one inner list per post).
print(type(Seq))

436
<class 'list'>


## Padding 

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

# Append zeros to the end of every sequence so that all rows share one length.
# No maxlen is given, so the target length is that of the longest sequence.
# NOTE(review): the zeros are not masked by the Embedding layers used below, so
# they take part in the average pooling; consider a maxlen cap and/or masking.
pad = pad_sequences(sequences=Seq, padding='post')

In [7]:
# Sanity check: the tokenised list, the source text column and the padded
# matrix must all hold the same number of samples.
print(len(Seq), len(Suicide['text']), len(pad), sep="\n")

232074
232074
232074


## Creating feature matrix and target 

In [8]:
from sklearn.model_selection import train_test_split

# Features: the padded id matrix. Target: the integer-encoded class column.
X = pad
y = Suicide['targ_tran']

# Shuffled split with a fixed seed; test_size is not set, so the library
# default applies (58,019 of 232,074 rows end up in the test set).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=1
)

# First number: half of the training rows, i.e. the share that is later held
# out as validation data when fitting the networks. Then the split sizes.
print(len(X_train) / 2, len(X_test), len(y_train), len(y_test), sep="\n")

87027.5
58019
174055
58019


# Building the neural network

In [14]:
from tensorflow.keras.layers import Activation, BatchNormalization, Dropout 
from tensorflow import keras

# Size of the embedding table; equal to the tokenizer's num_words (60000).
voc_size = 60000

# Bag-of-embeddings classifier, assembled layer by layer.
model = keras.Sequential()

# Token ids -> 30-dimensional vectors, normalised and averaged over the sequence.
# NOTE(review): the Embedding is created without mask_zero, so zero padding is
# included in the average; confirm this is intended.
model.add(keras.layers.Embedding(input_dim=voc_size, output_dim=30))
model.add(BatchNormalization())
model.add(keras.layers.GlobalAveragePooling1D())

# First hidden block: 30 ReLU units, batch norm, 50% dropout.
model.add(keras.layers.Dense(30, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Second hidden block: 15 ReLU units, batch norm, 30% dropout.
model.add(keras.layers.Dense(15, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Single sigmoid unit for the binary target.
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 30)          1800000   
_________________________________________________________________
batch_normalization_3 (Batch (None, None, 30)          120       
_________________________________________________________________
global_average_pooling1d_1 ( (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                930       
_________________________________________________________________
batch_normalization_4 (Batch (None, 30)                120       
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 15)               

## Compiling the model

In [15]:
# Import Adam from the same package that provides `keras` above; mixing the
# standalone `keras` package with `tensorflow.keras` objects is error-prone.
from tensorflow.keras.optimizers import Adam

# Callbacks used while training:
#  - learn_reduce multiplies the learning rate by 0.9 when val_loss has not
#    improved for 3 epochs;
#  - early_stop ends training when val_accuracy has not improved for 10 epochs
#    and then rolls the model back to the weights of its best epoch.
learn_reduce = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=3, verbose = 1)
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0, patience= 10, mode='auto',
                                           restore_best_weights = True, verbose = 1)

model.compile(optimizer = Adam(learning_rate = 0.0001),
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

# Hold out the first half of the training rows as validation data. The cut-off
# is derived from the data instead of being hard-coded, so it stays correct if
# the split above changes.
n_val = len(X_train) // 2

x_val = X_train[:n_val]
partial_Xtrain = X_train[n_val:]

# y_train is a pandas Series whose index was shuffled by the split; .iloc makes
# the positional slicing explicit.
y_val = y_train.iloc[:n_val]
partial_ytrain = y_train.iloc[n_val:]

model.fit(partial_Xtrain,
         partial_ytrain,
         epochs = 50,
         batch_size = 32,
         validation_data = (x_val, y_val),
         verbose = 2,
         callbacks = [learn_reduce, early_stop])


Epoch 1/50
2720/2720 - 1089s - loss: 0.4015 - accuracy: 0.8289 - val_loss: 1.3487 - val_accuracy: 0.5507
Epoch 2/50
2720/2720 - 1279s - loss: 0.2623 - accuracy: 0.9014 - val_loss: 1.2310 - val_accuracy: 0.6296
Epoch 3/50
2720/2720 - 1021s - loss: 0.2329 - accuracy: 0.9150 - val_loss: 0.6440 - val_accuracy: 0.6813
Epoch 4/50
2720/2720 - 926s - loss: 0.2179 - accuracy: 0.9204 - val_loss: 2.8032 - val_accuracy: 0.5016
Epoch 5/50
2720/2720 - 911s - loss: 0.2079 - accuracy: 0.9234 - val_loss: 17.4440 - val_accuracy: 0.5002
Epoch 6/50
2720/2720 - 919s - loss: 0.1934 - accuracy: 0.9300 - val_loss: 0.4488 - val_accuracy: 0.7126
Epoch 7/50
2720/2720 - 906s - loss: 0.1863 - accuracy: 0.9324 - val_loss: 2.8293 - val_accuracy: 0.5043
Epoch 8/50
2720/2720 - 918s - loss: 0.1809 - accuracy: 0.9357 - val_loss: 2.2244 - val_accuracy: 0.5027
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 8.999999772640876e-05.
2720/2720 - 916s - loss: 0.1758 - accuracy: 0.9374 - val_loss: 21.0932 -

KeyboardInterrupt: 

## Evaluating model 1 on the test set 

In [16]:
# Score model 1 on the held-out test split; the result is [loss, accuracy].
# In the recorded run, training had been interrupted by hand before this call.
model.evaluate(x=X_test, y=y_test)



[1.3531148433685303, 0.7612580060958862]

# Second neural network model

In [9]:
from tensorflow.keras.layers import Activation, BatchNormalization, Dropout 
from tensorflow import keras
# Adam comes from tensorflow.keras as well, so the optimizer and the layers
# belong to the same Keras implementation.
from tensorflow.keras.optimizers import Adam

# Size of the embedding table; equal to the tokenizer's num_words (60000).
voc_size = 60000 

# Second network: same layer stack as the first model (embedding -> average
# pooling -> two dense blocks -> sigmoid).
# NOTE(review): the Embedding is created without mask_zero, so zero padding is
# included in the average; confirm this is intended.
model2 = keras.Sequential([
    # Token ids -> 30-dimensional vectors, normalised and averaged over the sequence.
    keras.layers.Embedding(input_dim = voc_size, output_dim = 30),
    BatchNormalization(),
    keras.layers.GlobalAveragePooling1D(),
    
    # First hidden block: 30 ReLU units, batch norm, 50% dropout.
    keras.layers.Dense(30, activation = 'relu'),
    BatchNormalization(),
    Dropout(0.5), 
    
    # Second hidden block: 15 ReLU units, batch norm, 30% dropout.
    keras.layers.Dense(15, activation = 'relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    # Single sigmoid unit for the binary target.
    keras.layers.Dense(1, activation = 'sigmoid'),

])

model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 30)          1800000   
_________________________________________________________________
batch_normalization (BatchNo (None, None, 30)          120       
_________________________________________________________________
global_average_pooling1d (Gl (None, 30)                0         
_________________________________________________________________
dense (Dense)                (None, 30)                930       
_________________________________________________________________
batch_normalization_1 (Batch (None, 30)                120       
_________________________________________________________________
dropout (Dropout)            (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 15)                4

## Compiling and fitting the second model

In [10]:
# Same optimizer, loss and metric as the first model.
model2.compile(optimizer = Adam(learning_rate = 0.0001),
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

# Hold out the first half of the training rows as validation data. The cut-off
# is derived from the data instead of being hard-coded.
n_val = len(X_train) // 2

x_val = X_train[:n_val]
partial_Xtrain = X_train[n_val:]

# y_train is a pandas Series with a shuffled index; .iloc makes the positional
# slicing explicit.
y_val = y_train.iloc[:n_val]
partial_ytrain = y_train.iloc[n_val:]

# Short run: three epochs, no callbacks.
model2.fit(partial_Xtrain,
         partial_ytrain,
         epochs = 3,
         batch_size = 32,
         validation_data = (x_val, y_val),
         verbose = 2)

Epoch 1/3
2720/2720 - 973s - loss: 0.4386 - accuracy: 0.8103 - val_loss: 0.5848 - val_accuracy: 0.6071
Epoch 2/3
2720/2720 - 899s - loss: 0.2665 - accuracy: 0.9004 - val_loss: 9.9949 - val_accuracy: 0.5106
Epoch 3/3
2720/2720 - 902s - loss: 0.2347 - accuracy: 0.9138 - val_loss: 0.4825 - val_accuracy: 0.6641


<tensorflow.python.keras.callbacks.History at 0x1381322c370>

## Evaluating model 2 on test set 

In [11]:
# Score model 2 on the held-out test split; the result is [loss, accuracy].
model2.evaluate(x=X_test, y=y_test)



[0.48672303557395935, 0.6602836847305298]