# Word Embedding
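This notebook trains Word2Vec word embeddings on the suicide-detection text dataset, then builds a simple classifier on top of pretrained GloVe vectors.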

In [None]:
# Mount Google Drive at /content/drive so the project folder used below is
# reachable from this Colab runtime (a remount is requested on every run).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from time import time
from collections import Counter

In [None]:
# Notebook-wide constants

# Random seed passed to train_test_split and to Word2Vec
SEED = 4222
# Number of training epochs, used for both Word2Vec and the Keras classifier
EPOCHS = 5

In [None]:
# Switch into the project folder on the mounted Drive so the relative file
# names used in later cells resolve there. Change the path to your own directory.
try:
    os.chdir("/content/drive/MyDrive/MyProject_SIDetection")
except OSError:
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


## Load dataset

In [None]:
# Load the cleaned dataset. The first CSV column has no header (pandas shows
# it as "Unnamed: 0" when it is read as data); it appears to be the row index
# written out when the file was saved, so it is used as the DataFrame index
# rather than kept as a spurious column next to `text` and `label`.
suicide_detection_df = pd.read_csv('data_heavyclean_emoji.csv', index_col=0)
suicide_detection_df

Unnamed: 0,text,label
0,ex wife threatening suiciderecently left wife ...,1
1,weird get affected compliment coming someone k...,0
2,finally 2020 almost never hear 2020 bad year e...,0
3,need helpjust help im cry hard,1
4,end tonight anymore quit,1
...,...,...
174290,something today went sledding friend may seem ...,0
174291,like rock going get anything go,0
174292,tell many friend lonely everything deprived pr...,0
174293,pee probably taste like salty tea someone dran...,0


In [None]:
# Hold out 20% of the rows as a test set (no separate validation set is made
# here). The split is stratified on the label and seeded for repeatability.
train_text, test_text, train_labels, test_labels = train_test_split(
    suicide_detection_df['text'],
    suicide_detection_df['label'],
    test_size=0.2,
    stratify=suicide_detection_df['label'],
    random_state=SEED,
)

### Building a vocab

In [None]:
# Tokenise every training sentence on whitespace
tokens_list = [sentence.split() for sentence in train_text]
# Count how often each word occurs across all training sentences
vocab = Counter(word for sentence_tokens in tokens_list for word in sentence_tokens)
# Keep only the words that occur at least `min_occurance` times
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))

32104


In [None]:
def save_list(lines, filename):
    """Write the strings in `lines` to `filename`, one per line.

    Args:
        lines: iterable of strings. They are joined with newline characters,
            so no trailing newline is written after the last one.
        filename: path of the file to create or overwrite.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # the context manager closes the file even if the write raises
    with open(filename, 'w') as file:
        file.write(data)

# Save only the filtered tokens (words that met the minimum-occurrence
# threshold) to the vocabulary file. Passing the full `vocab` counter here
# would write every word seen in training and bypass that filter.
save_list(tokens, 'vocab.txt')

In [None]:
def load_doc(filename):
    """Return the entire contents of the text file `filename` as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's text, unmodified.
    """
    # the context manager closes the file even if the read raises
    with open(filename, 'r') as file:
        return file.read()

# Read the vocabulary file back and turn it into a set of words for fast
# membership tests. This rebinds `vocab`, which held the Counter until now.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

### Removing out-of-vocab words

In [None]:
def clean_line(line, vocab):
    """Drop out-of-vocabulary words from one whitespace-separated line.

    Args:
        line: the text to filter.
        vocab: container of allowed words (tested with `in`).

    Returns:
        A one-element list holding the list of kept tokens, so a caller can
        append the result to a list of lines with `+=`.
    """
    kept = []
    for word in line.split():
        if word in vocab:
            kept.append(word)
    return [kept]

def process_lines(data, vocab):
    """Apply `clean_line` to every line in `data`.

    Args:
        data: iterable of text lines.
        vocab: container of allowed words, passed through to `clean_line`.

    Returns:
        A list with one token list per input line, in input order.
    """
    # clean_line returns a one-element list, so flatten one level
    return [tokens for raw_line in data for tokens in clean_line(raw_line, vocab)]

In [None]:
# Filter both splits against the vocabulary loaded above
train_clean, test_clean = (
    process_lines(split, vocab) for split in (train_text, test_text)
)

### Training the model

In [None]:
# Configure Word2Vec: 300-dimensional vectors, a context window of 10, and
# min_count=1 so the model itself does not discard any remaining word.
model = Word2Vec(vector_size=300, window=10, min_count=1, epochs=EPOCHS, seed=SEED)

# Build the model vocabulary from the cleaned training sentences (this
# initialises the model) and report how long it took.
vocab_start = time()
model.build_vocab(train_clean, progress_per=1000)
vocab_minutes = round((time() - vocab_start) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))

# Train on the same sentences and report how long it took.
train_start = time()
model.train(train_clean, total_examples=model.corpus_count, epochs=EPOCHS, report_delay=1)
train_minutes = round((time() - train_start) / 60, 2)
print('Time to train the model: {} mins'.format(train_minutes))

Time to build vocab: 0.03 mins
Time to train the model: 0.38 mins


In [None]:
# Save the learned word vectors to embedding_word2vec.txt in the word2vec
# text (ASCII) format; binary=False selects the text variant
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [None]:
# Sanity check: print the words closest to 'suicide' in the trained embedding
# space, each with its similarity score shown to 4 decimal places
similar_words = model.wv.most_similar('suicide')
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

suicidei: 0.8106
readeveryone: 0.7721
nolife: 0.7531
suicidepeople: 0.6902
desparately: 0.6897
embellishing: 0.6721
questionfor: 0.6504
suicidetonight: 0.6408
killing: 0.6362
broadcasted: 0.6207


# GloVe

In [None]:
import os
import urllib.request
import zipfile

GLOVE_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'
GLOVE_ZIP = 'glove.6B.zip'
# The file from the archive that the indexing cell below reads
GLOVE_VECTORS = 'glove.6B.100d.txt'

# Re-running the notebook should not download and unpack the archive again,
# so only do the work when the vectors file is not already present.
if not os.path.exists(GLOVE_VECTORS):
    if not os.path.exists(GLOVE_ZIP):
        # Download under a temporary name and rename once complete, so an
        # interrupted download is never mistaken for a finished archive.
        partial_zip = GLOVE_ZIP + '.part'
        urllib.request.urlretrieve(GLOVE_URL, partial_zip)
        os.replace(partial_zip, GLOVE_ZIP)

    # Unzip the file
    with zipfile.ZipFile(GLOVE_ZIP, 'r') as z:
        z.extractall()

In [None]:
# Uncomment the lines below to list the working directory and print its path
#!ls
#!pwd

In [None]:
%%time
print('Indexing word vectors.')
# Map each word to its vector: every line of the file is split on whitespace,
# the first field is the word and the remaining fields are its coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.
CPU times: user 8.99 s, sys: 532 ms, total: 9.52 s
Wall time: 9.63 s


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM
from sklearn.model_selection import train_test_split
import numpy as np

texts = suicide_detection_df['text']
labels = suicide_detection_df['label']

# Step 1: Tokenize the text
MAX_NUM_WORDS = 20000  # This is the maximum number of words to consider
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Step 2: Create the embedding matrix
MAX_SEQUENCE_LENGTH = 1000  # This is the maximum sequence length
EMBEDDING_DIM = 100  # must match the GloVe file loaded above (glove.6B.100d)
# word_index lists every token seen, but a Tokenizer built with num_words only
# emits indices below MAX_NUM_WORDS, so rows beyond that would never be looked
# up. Size the matrix for the indices that can actually occur.
num_embeddings = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_embeddings, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_embeddings:
        # index is never produced by texts_to_sequences; no row needed
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Step 3: Define the neural network model
# NOTE: this rebinds `model`, which referred to the Word2Vec model until now.
model = Sequential()
model.add(Embedding(num_embeddings,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))  # keep the pretrained vectors frozen
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' if you have more than two classes

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',  # Use 'categorical_crossentropy' if you have more than two classes
              metrics=['acc'])

# Step 4: Prepare the data
# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH so the inputs are rectangular
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(labels)

Found 84261 unique tokens.


In [None]:
# Hold out 20% of the padded sequences for validation, stratified on the label
x_train, x_val, y_train, y_val = train_test_split(
    data,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=SEED,
)

# Step 5: Train the model
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=128,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ac06dee2500>

In [None]:
from keras.callbacks import EarlyStopping

# Early stopping callback: watch the validation loss with a patience of 3 epochs
early_stopping = EarlyStopping(patience=3, monitor='val_loss')

# NOTE(review): `model` was already fit in the previous cell, so this call
# trains that same model object again rather than a freshly built one.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ac06db267a0>

breakline