# **LSTM Language Model**



## **Imports**

In [None]:
import csv
import tensorflow as tf
from tqdm.auto import tqdm 
from gensim.models import Word2Vec

# Adapt this variable to the path of the cloned repository.
# It is the root prefix for the module, dataset, embedding, model-weight and
# evaluation paths that are built from it further down in this notebook.
path = "YourPathHere"

# For importing custom modules: put the repository's Modules folder on the import path
# (presumably where helper_functions, lstmlm and evaluation live — verify)
import sys
sys.path.append(f'{path}/Modules')

Mount Google Drive:

In [None]:
# Google Colab only: mount Google Drive under /content/drive.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## **Hyperparameters**

In [None]:
EMBEDDING_SIZE = 256   # Word2Vec vector size; also passed to LSTMLM as embedding_size
HIDDEN_SIZE = 1024     # passed to LSTMLM as hidden_size
LEARNING_RATE = 0.001  # passed to trainModel as learning_rate
NUM_EPOCHS = 20        # intended number of training epochs
BATCH_SIZE = 256       # batch size of the train and test tf.data pipelines

## **Dataset**

In [None]:
from helper_functions import dataset_cleanup

In [None]:
# Location of the preprocessed news corpus inside the repository
data_path = f"{path}/Dataset/news_data_preprocessed.csv"
# Sentence length bounds handed to dataset_cleanup
# (presumably sentences outside this token range are discarded — verify in helper_functions)
min_sent_len = 10
max_sent_len = 28

# cleaned_data: the tokenised sentences (fed to Word2Vec and sliced per sentence below)
# max_seq_length: sequence length limit passed on to training, inference and evaluation
cleaned_data, max_seq_length = dataset_cleanup(data_path=data_path, 
                                               min_sent_len=min_sent_len, 
                                               max_sent_len=max_sent_len)

In [None]:
# Build the (input, target) training pairs: the input is the sentence without its
# last token, the target is the same sentence shifted one token to the left
train_data = [(sent[:-1], sent[1:]) for sent in cleaned_data]

### **Train Word2Vec embeddings**


We use gensim's `Word2Vec` class to train a skip-gram model (with negative sampling) for 50 epochs, producing 256-dimensional word embeddings:

In [None]:
# Skip-gram (sg=1) with 50 negative samples and a context window of 5, trained for 50 iterations.
# min_count=1 keeps every token in the vocabulary, so each token of the corpus gets a vector.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4 calls them
# `vector_size` and `epochs`) — confirm the installed gensim version.
# NOTE(review): workers=24 is hard-coded — confirm it suits the machine this runs on.
word2vec_model = Word2Vec(sentences=cleaned_data, size=EMBEDDING_SIZE, window=5, min_count=1, workers=24, sg=1, negative=50, iter=50)
# Save the trained embeddings
word2vec_model.save(f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model")

In [None]:
# Load the previously saved embeddings
word2vec_model = Word2Vec.load(f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model")

print("Examine the trained embeddings: ")
# Query the nearest neighbours through the KeyedVectors object (`.wv`), as the rest of
# this notebook does for `index2word`; calling most_similar directly on the Word2Vec
# model is deprecated in gensim 3.x and no longer available in gensim 4.
word2vec_model.wv.most_similar("<NUM>", topn=10)

Import the custom function that converts the word vectors of the word2vec model into a numpy matrix suitable for insertion into our TensorFlow/Keras embedding layer:



In [None]:
from helper_functions import word2vec_to_matrix

In [None]:
# Both return values are passed to the LSTMLM constructor further down
# (embedding_matrix as embedding_matrix, vocab_size as vocab_size)
embedding_matrix, vocab_size = word2vec_to_matrix(word2vec_model=word2vec_model, embedding_size=EMBEDDING_SIZE)

Create a word2index dict in order to convert each token in our train_data dataset to its respective index in the embedding matrix:

In [None]:
# Map every vocabulary token to its position in the Word2Vec vocabulary list
word2index_dict = {token: token_index for token_index, token in enumerate(word2vec_model.wv.index2word)}

sent2index_input = []
sent2index_target = []

# The loop variables are deliberately not called `input`/`target`: a top-level
# variable named `input` would shadow the builtin for every later cell in the kernel.
for input_tokens, target_tokens in train_data:
    sent2index_input.append([word2index_dict[token] for token in input_tokens])
    sent2index_target.append([word2index_dict[token] for token in target_tokens])

# Take a look at one input, target pair (as indices and decoded back to tokens)
print("Input sentence: ")
print(sent2index_input[0])
print(" ".join([word2vec_model.wv.index2word[i] for i in sent2index_input[0]]))
print()
print("Target sentence: ")
print(sent2index_target[0])
print(" ".join([word2vec_model.wv.index2word[i] for i in sent2index_target[0]]))
print()

Extract the indices of the start-of-sentence (`<Start>`) and end-of-sentence (`<End>`) tokens, which are needed for the inference mode:

In [None]:
# Vocabulary indices of the sentence boundary markers, needed to start and stop generation
start_token, end_token = word2index_dict["<Start>"], word2index_dict["<End>"]
print(f"<Start>: {start_token}")
print(f"<End>: {end_token}")

<Start>: 0
<End>: 2


### **Data Pipeline**
We create `tf.data.Dataset` objects that are then cached, shuffled, batched and prefetched for efficient training.


Train data is of form (input, target), where:


*   **Input** contains the sentences that will be fed into our LSTMLM (serving also as teacher forcing input).

*   **Target** contains the sentences that will be used to calculate the loss of our LSTMLM.

In [None]:
# We split the data into train data (85%) and test data (15%)
train_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[:int(len(sent2index_input)*0.85)])
train_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[:int(len(sent2index_target)*0.85)])

train_dataset = tf.data.Dataset.zip((train_dataset_input, train_dataset_target)).cache().shuffle(buffer_size=50000, reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)


# Repeat for test data
test_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[int(len(sent2index_input)*0.85):-1])
test_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[int(len(sent2index_target)*0.85):-1])

test_dataset = tf.data.Dataset.zip((test_dataset_input, test_dataset_target)).cache().shuffle(buffer_size=50000, reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

## **Training LSTMLM**

Create and train the model:

In [None]:
from lstmlm import LSTMLM, trainModel

In [None]:
# Instantiate the language model with the Word2Vec-derived embedding matrix
# (presumably used to initialise its embedding layer — verify in lstmlm)
News_LSTMLM = LSTMLM(vocab_size=vocab_size, 
                     embedding_matrix=embedding_matrix,
                     embedding_size=EMBEDDING_SIZE, 
                     hidden_size=HIDDEN_SIZE)

In [None]:
# Adapt this variable to the folder where model weights and plots should be stored
save_path = "YourPathHere"
save_every = 10 # Number of epochs before saving model weights and plots 

# Train for NUM_EPOCHS epochs as configured in the hyperparameter cell. The epoch
# count was previously hard-coded to 1, which ignored NUM_EPOCHS and is below
# save_every, so a checkpoint interval of 10 epochs could never be reached.
trainModel(model=News_LSTMLM, 
           word2vec_model=word2vec_model,
           start_token=start_token,
           end_token=end_token, 
           max_seq_length=max_seq_length,
           save_every=save_every,
           save_path=save_path, 
           train_dataset=train_dataset, 
           test_dataset=test_dataset, 
           loss_function=tf.losses.SparseCategoricalCrossentropy(from_logits=True), 
           num_epochs=NUM_EPOCHS,
           learning_rate=LEARNING_RATE)

## **Evaluation**

Import evaluation module:

In [None]:
import evaluation

Load the trained model:

In [None]:
# Re-create the model with the same architecture that was used for training
News_LSTMLM = LSTMLM(vocab_size=vocab_size, 
                     embedding_matrix=embedding_matrix, 
                     embedding_size=EMBEDDING_SIZE, 
                     hidden_size=HIDDEN_SIZE)
News_LSTMLM.compile()

# Feed input through the network to ensure correct loading of the weights
News_LSTMLM.inference_mode(start_token=start_token, 
                           end_token=end_token, 
                           max_seq_length=max_seq_length)

# The separator after {path} was missing here, unlike in every other path of this
# notebook, which made the weights path resolve to "<path>Model Weights/...".
News_LSTMLM.load_weights(f"{path}/Model Weights/Thesis_Model_Weights/LSTMLM")

Model Summary:

In [None]:
# Display the summary of the model that was rebuilt and loaded in the previous cell
News_LSTMLM.summary()

Create a CSV file containing generated sentences for InferSent:

In [None]:
# Generate 10,000 sentences with the model and decode each one back to its tokens
sentences = []
for _ in tqdm(range(10000)):
    generated_ids = News_LSTMLM.inference_mode(start_token=start_token, 
                                               end_token=end_token, 
                                               max_seq_length=max_seq_length-1, 
                                               states=None)
    # Each generated element is converted with .numpy() and its first entry is the token index
    sentences.append([word2vec_model.wv.index2word[token.numpy()[0]] for token in generated_ids])

# Write one generated sentence per CSV row, one token per column
with open(f"{path}/Evaluation/FID/News_LSTMLM_InferSent.csv", "w", encoding='utf8', newline="") as output_file:
    csv.writer(output_file).writerows(sentences)

Generate Sentences:

In [None]:
# Generate 10 sentences with the loaded model and print them (print_sentences=True).
# index_decoder is the Word2Vec vocabulary list used to turn indices back into tokens;
# no latent sample generator is supplied for this model (latent_sample_gen=None).
evaluation.generate_sentences(model=News_LSTMLM, 
                              index_decoder=word2vec_model.wv.index2word, 
                              print_sentences=True, 
                              model_name="News_LSTMLM", 
                              latent_sample_gen=None, 
                              num_sent=10, 
                              start_token=start_token, 
                              end_token=end_token, 
                              max_seq_length=max_seq_length)

Average Sentence Length:

In [None]:
# Generate 10,000 sentences without printing them (print_sentences=False);
# presumably the helper then reports the average sentence length, as the heading
# of this section suggests — verify in evaluation.generate_sentences
evaluation.generate_sentences(model=News_LSTMLM,
                              index_decoder=word2vec_model.wv.index2word, 
                              print_sentences=False, 
                              model_name="News_LSTMLM", 
                              latent_sample_gen=None, 
                              num_sent=10000, 
                              start_token=start_token,
                              end_token=end_token, 
                              max_seq_length=max_seq_length)

Prepare the reference data used for the JS distance, Bleu and word frequency calculations:

In [None]:
# Take the first 10,000 target sentences after the 85% train split as reference data
ref_start = int(len(sent2index_target)*0.85)

reference_data = []
for sent in sent2index_target[ref_start:ref_start+10000]:
    # Keep only the tokens in front of the first <End> token (the whole sentence if there is none)
    cutoff = sent.index(end_token) if end_token in sent else len(sent)
    reference_data.append([word2vec_model.wv.index2word[token_id] for token_id in sent[:cutoff]])

Calculate JS Distance for sentence length frequencies and word counts:

In [None]:
# Compare generated sentences with the reference data; the helper returns two values,
# reported below as the distance over sentence length frequencies and over word counts
jsd_sents, jsd_words = evaluation.js_distance(model=News_LSTMLM, 
                                              index_decoder=word2vec_model.wv.index2word,
                                              reference_data=reference_data, 
                                              model_name="News_LSTMLM",
                                              latent_sample_gen=None,
                                              start_token=start_token,
                                              end_token=end_token, 
                                              max_seq_length=max_seq_length)

print(f"Jensen-Shannon distance for the sentence length frequencies: {jsd_sents}")
print(f"Jensen-Shannon distance for the word counts: {jsd_words}")

Calculate Bleu-4 Score:

In [None]:
# Bleu with n_grams=4 for 10,000 generated sentences against the reference data
# (how the score is reported — printed or returned — is decided inside evaluation.bleu_score)
evaluation.bleu_score(model=News_LSTMLM,
                      index_decoder=word2vec_model.wv.index2word,
                      reference_data=reference_data,
                      model_name="News_LSTMLM",
                      latent_sample_gen=None, 
                      num_sent=10000,
                      n_grams=4,
                      start_token=start_token,
                      end_token=end_token,
                      max_seq_length=max_seq_length)

Calculate Self-Bleu-4 Score:

In [None]:
# Self-Bleu with n_grams=4 for 10,000 generated sentences; no reference data is passed,
# so the generated sentences are presumably scored against each other — verify in evaluation
evaluation.self_bleu_score(model=News_LSTMLM, 
                           index_decoder=word2vec_model.wv.index2word, 
                           model_name="News_LSTMLM", 
                           latent_sample_gen=None, 
                           num_sent=10000, 
                           n_grams=4, 
                           start_token=start_token, 
                           end_token=end_token, 
                           max_seq_length=max_seq_length)

Count Word Frequency:

In [None]:
# Number of entries shown in the frequency listings and passed to the frequency plot below
top_k = 12
# ref_freq / gen_freq: dict-like word frequency tables (their .items() are listed below)
# for the reference data and for sentences generated by the model
ref_freq, gen_freq = evaluation.word_freq(model=News_LSTMLM, 
                                          index_decoder=word2vec_model.wv.index2word,
                                          reference_data=reference_data,
                                          model_name="News_LSTMLM",
                                          latent_sample_gen=None,
                                          start_token=start_token,
                                          end_token=end_token,
                                          max_seq_length=max_seq_length)

In [None]:
# First top_k entries of the reference frequency table, in the order the dict stores them
# (presumably sorted by descending frequency — verify in evaluation.word_freq)
list(ref_freq.items())[:top_k]

In [None]:
# First top_k entries of the generated-sentence frequency table, in the order the dict stores them
# (presumably sorted by descending frequency — verify in evaluation.word_freq)
list(gen_freq.items())[:top_k]

Word frequency plot:

In [None]:
# Adapt this variable to the folder where the plots should be stored.
# NOTE(review): save_plots=False is passed, so save_path is presumably not used — confirm in evaluation
save_path = "YourPathHere"
# Plot the top_k reference word frequencies against the generated ones
evaluation.word_freq_plots(reference_freq_dict=ref_freq, 
                           generated_freq_dict=gen_freq, 
                           top_k=top_k,
                           save_plots=False, 
                           save_path=save_path)