# **cVAELM**

## **Imports**

In [1]:
import csv
import tensorflow as tf
from tqdm.auto import tqdm 
from gensim.models import Word2Vec
import tensorflow_probability as tfp
tfd = tfp.distributions

# Adapt this variable to the path of the cloned repository.
# Later cells build the dataset, embedding, model-weight and evaluation paths from it.
path = "YourPathHere"

# For importing custom modules: put the repository's Modules folder on the import
# path (the helper_functions, cvaelm and evaluation imports in later cells
# presumably resolve from there -- confirm against the repository layout).
import sys
sys.path.append(f'{path}/Modules')

Mount Google Drive:

In [2]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even if a mount already exists -- confirm in the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **Hyperparameters**

In [3]:
# Dimensionality of the word2vec vectors; also passed to CVAELM as embedding_size
EMBEDDING_SIZE = 256 
# Passed to CVAELM as hidden_size; also the dimensionality of the prior distribution
HIDDEN_SIZE = 1024    
# Forwarded to trainModel as learning_rate
LEARNING_RATE = 0.0005
# Forwarded to trainModel as num_epochs
NUM_EPOCHS = 180
# Forwarded to trainModel as num_cycles
# NOTE(review): presumably the number of KL-weight annealing cycles -- confirm in cvaelm.trainModel
NUM_CYCLES = 30
# Batch size used for both the train and the test tf.data pipeline
BATCH_SIZE = 256

## **Dataset**

In [4]:
from helper_functions import dataset_cleanup

In [5]:
# Preprocessed news corpus inside the cloned repository
data_path = f"{path}/Dataset/news_data_preprocessed.csv"
# Sentence length bounds handed to dataset_cleanup
min_sent_len, max_sent_len = 10, 28

cleaned_data, max_seq_length = dataset_cleanup(
    data_path=data_path,
    min_sent_len=min_sent_len,
    max_sent_len=max_sent_len,
)

In [6]:
# Create data for training, consisting of (input, target, teacher) triples:
#   input   -> the full sentence
#   target  -> the sentence without its first token
#   teacher -> the sentence without its last token
train_data = [(sent, sent[1:], sent[:-1]) for sent in cleaned_data]

### **Train word2vec embeddings**

We use gensim's `Word2Vec` class to train a skip-gram model (with negative sampling) for 50 epochs, producing 256-dimensional word embeddings:

In [None]:
# Train skip-gram (sg=1) embeddings with negative sampling on the cleaned corpus
word2vec_model = Word2Vec(
    sentences=cleaned_data,
    size=EMBEDDING_SIZE,
    window=5,
    min_count=1,
    workers=24,
    sg=1,
    negative=50,
    iter=50,
)

# Save the trained embeddings
embeddings_file = f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model"
word2vec_model.save(embeddings_file)

In [None]:
# Load previously saved embeddings
word2vec_model = Word2Vec.load(f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model")

print("Examine the trained embeddings: ")
# Query the similarity through the model's word vectors (.wv): calling
# most_similar directly on the Word2Vec model is deprecated in gensim 3.x and
# removed in gensim 4.0, while the .wv form works in both.
word2vec_model.wv.most_similar("<NUM>", topn=10)

Import custom function that converts the word2vec model word vectors into a numpy matrix that is suitable for insertion into our TensorFlow/Keras embedding layer:

In [8]:
from helper_functions import word2vec_to_matrix

In [None]:
# Convert the trained word vectors into a matrix for the embedding layer;
# the helper also reports the resulting vocabulary size.
embedding_matrix, vocab_size = word2vec_to_matrix(
    word2vec_model=word2vec_model,
    embedding_size=EMBEDDING_SIZE,
    create_unk=True,
)

Create a word2index dict in order to convert each token in our train_data dataset to its respective index in the embedding matrix:

In [None]:
# Map every vocabulary token to its position in word2vec_model.wv.index2word
word2index_dict = {token: token_index for token_index, token in enumerate(word2vec_model.wv.index2word)}
# Look the <UNK> index up once instead of once per replaced token
unk_index = word2index_dict["<UNK>"]

sent2index_input = []
sent2index_target = []
sent2index_teacher_forcing = []

# Loop variables deliberately avoid the name `input`, which would shadow the
# Python builtin for the rest of the notebook session.
for input_sent, target_sent, teacher_sent in train_data:
    sent2index_input.append([word2index_dict[key] for key in input_sent])
    sent2index_target.append([word2index_dict[key] for key in target_sent])

    # Replace 50% of the teacher forcing words with the <UNK> token.
    # One uniform draw per token, but sampled in a single TensorFlow op per
    # sentence rather than one op per token (same distribution, far fewer ops).
    keep_mask = tf.random.uniform(shape=[len(teacher_sent)], dtype=tf.dtypes.float32).numpy() <= 0.5
    sent2index_teacher_forcing.append(
        [word2index_dict[key] if keep else unk_index for key, keep in zip(teacher_sent, keep_mask)]
    )

# Take a look at one input, target, teacher triple
print("Input sentence: ")
print(sent2index_input[0])
print(" ".join([word2vec_model.wv.index2word[i] for i in sent2index_input[0]]))
print()
print("Target sentence: ")
print(sent2index_target[0])
print(" ".join([word2vec_model.wv.index2word[i] for i in sent2index_target[0]]))
print()
print("Teacher sentence: ")
print(sent2index_teacher_forcing[0])
print(" ".join([word2vec_model.wv.index2word[i] for i in sent2index_teacher_forcing[0]]))

Extract the indices of the **sos** and **eos** tokens for the inference mode:

In [None]:
# Embedding-matrix indices of the sentence boundary markers, needed by the
# decoder's inference mode
start_token, end_token = (word2index_dict[marker] for marker in ("<Start>", "<End>"))

print(f"<Start>: {start_token}")
print(f"<End>: {end_token}")

### **Data Pipeline**
Create `tf.data.Dataset` objects that are then cached, shuffled, batched and prefetched for efficient training.


Train data is of form (input, target, teacher), where:


*   **Input** contains the sentences that will be fed into the encoder of our cVAELM.

*   **Target** contains the sentences that will be used to calculate the loss of our cVAELM.

*   **Teacher** contains the sentences that will be used as the input for the decoder during training since we use teacher forcing.








In [13]:
# We split the data into train data (85%) and test data (15%).
# The three index lists are filled in lockstep (one entry per sentence), so a
# single split index is valid for all of them.
split_index = int(len(sent2index_input) * 0.85)

train_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[:split_index])
train_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[:split_index])
train_dataset_teacher = tf.data.Dataset.from_tensor_slices(sent2index_teacher_forcing[:split_index])

train_dataset = (
    tf.data.Dataset.zip((train_dataset_input, train_dataset_target, train_dataset_teacher))
    .cache()
    .shuffle(buffer_size=500000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


# Repeat for test data. The slices run to the end of each list: the previous
# `[split:-1]` form silently dropped the last sentence from the test set.
test_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[split_index:])
test_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[split_index:])
test_dataset_teacher = tf.data.Dataset.from_tensor_slices(sent2index_teacher_forcing[split_index:])

test_dataset = (
    tf.data.Dataset.zip((test_dataset_input, test_dataset_target, test_dataset_teacher))
    .cache()
    .shuffle(buffer_size=50000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## **Training**

In [14]:
from cvaelm import CVAELM, trainModel

Define the prior distribution:

In [15]:
# Zero-mean, unit-scale normal over HIDDEN_SIZE dimensions; Independent folds
# the last batch dimension into the event so a sample is one latent vector.
unit_normal = tfd.Normal(loc=tf.zeros(HIDDEN_SIZE), scale=1)
prior = tfd.Independent(unit_normal, reinterpreted_batch_ndims=1)

Create and train the model:

*Note: The peaks in the training plots occur in the epoch after each KL weight reset.*

In [16]:
# Constructor arguments for the cVAELM, built from the embedding matrix,
# the prior and the hyperparameters defined above
cvaelm_kwargs = dict(
    vocab_size=vocab_size,
    prior=prior,
    embedding_matrix=embedding_matrix,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)
News_cVAELM = CVAELM(**cvaelm_kwargs)

In [None]:
# Destination for saved model weights and plots -- replace the placeholder
save_path = "YourPathHere"
# Number of epochs before saving model weights and plots
save_every = 1

# Use sum for loss, see: https://stats.stackexchange.com/questions/502314/variational-autoencoder-balance-kl-divergence-and-reconstructionloss
# loss: reduction=tf.keras.losses.Reduction.SUM
reconstruction_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.SUM
)

trainModel(
    model=News_cVAELM,
    word2vec_model=word2vec_model,
    save_every=save_every,
    save_path=save_path,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    loss_function=reconstruction_loss,
    num_epochs=NUM_EPOCHS,
    num_cycles=NUM_CYCLES,
    learning_rate=LEARNING_RATE,
)

## **Evaluation**

Import evaluation module:

In [18]:
import evaluation

Load the trained model:

In [None]:
# Re-create the model with the same constructor arguments as for training
News_cVAELM = CVAELM(
    vocab_size=vocab_size,
    prior=prior,
    embedding_matrix=embedding_matrix,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)
News_cVAELM.compile()

# Feed input through the network to ensure correct loading of the weights:
# one sentence goes through the encoder, and its output is used as the
# decoder state for a single inference run.
warmup_batch = tf.convert_to_tensor([sent2index_input[-5]])
hs = tf.convert_to_tensor(News_cVAELM.Encoder(warmup_batch))
out = News_cVAELM.Decoder.inference_mode(
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
    states=hs,
)

weights_path = f"{path}/Model Weights/Thesis_Model_Weights/cVAELM"
News_cVAELM.load_weights(weights_path)

Model Summaries:

In [21]:
# Call the full model once on a single training batch (encoder input plus
# teacher-forcing input); presumably this builds the layers so the summaries
# below can be printed. The target batch is not needed and is discarded.
# The names avoid shadowing the builtin `input` and replace the misspelled `taget`.
for input_batch, _, teacher_batch in train_dataset.take(1):
    News_cVAELM(input_batch, teacher_batch)

In [None]:
# Show the summary of the encoder sub-model (presumably Keras' Model.summary -- confirm in cvaelm)
News_cVAELM.Encoder.summary()

In [None]:
# Show the summary of the decoder sub-model (presumably Keras' Model.summary -- confirm in cvaelm)
News_cVAELM.Decoder.summary()

Create a CSV file containing generated sentences for InferSent:

In [None]:
# Generate 10000 sentences, each decoded from one latent vector sampled from the prior
sentences = []
for _ in tqdm(range(10000)):
    # Add a leading axis of size 1 to the sampled latent vector
    sample = tf.expand_dims(prior.sample(), axis=0)
    generated_ids = News_cVAELM.Decoder.inference_mode(
        start_token=start_token,
        end_token=end_token,
        max_seq_length=max_seq_length-1,
        states=sample,
    )
    # Translate each generated index back into its token string
    sentences.append([word2vec_model.wv.index2word[token.numpy()[0]] for token in generated_ids])

# One generated sentence per CSV row, one token per column
with open(f"{path}/Evaluation/FID/News_cVAELM_InferSent.csv", "w", encoding='utf8', newline="") as output_file:
    csv.writer(output_file).writerows(sentences)

Generate Sentences:

In [None]:
# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# Generate 10 sentences and print them
evaluation.generate_sentences(print_sentences=True, num_sent=10, **decoder_eval_kwargs)

Average Sentence Length:

In [None]:
# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# Generate 10000 sentences without printing them
evaluation.generate_sentences(print_sentences=False, num_sent=10000, **decoder_eval_kwargs)

Prepare the reference data used for the JS distance, BLEU and word frequency calculations:

In [26]:
# Take the 10000 target sentences that follow the 85% split point
reference_start = int(len(sent2index_target)*0.85)

reference_data = []
for indexed_sent in sent2index_target[reference_start:reference_start+10000]:
    # Keep only the tokens before the first <End> token (all tokens if there is none)
    cutoff = indexed_sent.index(end_token) if end_token in indexed_sent else len(indexed_sent)
    reference_data.append([word2vec_model.wv.index2word[token_id] for token_id in indexed_sent[:cutoff]])

Calculate JS Distance for sentence length frequencies and word counts:

In [None]:
# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# Compare generated sentences against the reference sentences
jsd_sents, jsd_words = evaluation.js_distance(reference_data=reference_data, **decoder_eval_kwargs)

print(f"Jensen-Shannon distance for the sentence length frequencies: {jsd_sents}")
print(f"Jensen-Shannon distance for the word counts: {jsd_words}")

Calculate Bleu-4 Score:

In [None]:
# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# 4-gram score for 10000 generated sentences against the reference data
evaluation.bleu_score(reference_data=reference_data, num_sent=10000, n_grams=4, **decoder_eval_kwargs)

Calculate Self-Bleu-4 Score:

In [None]:
# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# 4-gram self-score over 10000 generated sentences (no reference data is passed)
evaluation.self_bleu_score(num_sent=10000, n_grams=4, **decoder_eval_kwargs)

Count Word Frequency:

In [27]:
# Number of entries shown in the frequency listings and plots below
top_k = 12

# Arguments describing the decoder and how latent samples are drawn and decoded
decoder_eval_kwargs = dict(
    model=News_cVAELM.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_cVAELM",
    latent_sample_gen=prior,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

# Word frequencies for the reference data and for generated sentences
ref_freq, gen_freq = evaluation.word_freq(reference_data=reference_data, **decoder_eval_kwargs)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Display the first top_k (word, frequency) entries of the reference frequencies
# NOTE(review): presumably already sorted by descending frequency -- confirm in evaluation.word_freq
list(ref_freq.items())[:top_k]

In [None]:
# Display the first top_k (word, frequency) entries of the generated-sentence frequencies
# NOTE(review): presumably already sorted by descending frequency -- confirm in evaluation.word_freq
list(gen_freq.items())[:top_k]

Word frequency plot:

In [None]:
# Destination handed to word_freq_plots; save_plots=False is passed alongside it.
# Replace the placeholder before saving.
save_path = "YourPathHere"

evaluation.word_freq_plots(
    reference_freq_dict=ref_freq,
    generated_freq_dict=gen_freq,
    top_k=top_k,
    save_plots=False,
    save_path=save_path,
)