# **LaTextGAN**


## **Imports**

In [None]:
import csv
import tensorflow as tf
from tqdm.auto import tqdm 
from gensim.models import Word2Vec

# Adapt this variable to the path of the cloned repository.
# It is the root for every dataset, embedding, weight and evaluation path built with f"{path}/..." in this notebook.
path = "YourPathHere"

# For importing custom modules
# (the later `helper_functions`, `latextgan` and `evaluation` imports presumably resolve through this entry — confirm)
import sys
sys.path.append(f'{path}/Modules')

Mount Google Drive:

In [None]:
# Mount Google Drive at /content/drive; `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## **Hyperparameters**

In [None]:
# Dimensionality of the word2vec vectors; also passed to the AutoEncoder as `embedding_size`.
EMBEDDING_SIZE = 256 
# `hidden_size` of the AutoEncoder, Generator and Discriminator; also the width of the noise vectors sampled in this notebook.
HIDDEN_SIZE = 1024     
# Learning rates handed to train_AE and train_GAN respectively.
LEARNING_RATE_AE = 0.001
LEARNING_RATE_GAN = 0.0001
# Number of training epochs for the autoencoder and for the GAN.
NUM_EPOCHS_AE = 20
NUM_EPOCHS_GAN = 150
# Batch sizes of the autoencoder and GAN tf.data pipelines (both use drop_remainder=True).
BATCH_SIZE_AE = 256
BATCH_SIZE_GAN = 256
# Passed as `num_res_blocks` to both the Generator and the Discriminator.
NUM_RESIDUAL_BLOCKS = 40
# Passed to train_GAN as `gen_update`.
GEN_UPDATE = 10 # number of steps before the generator is updated 

## **Dataset**

In [None]:
from helper_functions import dataset_cleanup

In [None]:
# Sentence-length bounds and source file handed to the cleanup helper.
min_sent_len = 10
max_sent_len = 28
data_path = f"{path}/Dataset/news_data_preprocessed.csv"

# Returns the cleaned sentences together with the maximum sequence length,
# which is reused later for decoder inference.
cleaned_data, max_seq_length = dataset_cleanup(
    data_path=data_path,
    min_sent_len=min_sent_len,
    max_sent_len=max_sent_len,
)

In [None]:
# Create data for training, consisting of (input, target, teacher) triples:
#   input   - the full sentence
#   target  - the sentence without its first token
#   teacher - the sentence without its last token
train_data = [(sent, sent[1:], sent[:-1]) for sent in cleaned_data]

### **Train word2vec embeddings**

We use gensim's `Word2Vec` class to train a skip-gram model (with negative sampling) for 50 epochs, producing 256-dimensional word embeddings:

In [None]:
# Skip-gram (sg=1) with negative sampling, trained on the cleaned sentences.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4 renamed them
# to `vector_size` and `epochs`) - pin gensim<4 or migrate the whole notebook together.
word2vec_params = dict(
    size=EMBEDDING_SIZE,
    window=5,
    min_count=1,
    workers=24,
    sg=1,
    negative=50,
    iter=50,
)
word2vec_model = Word2Vec(sentences=cleaned_data, **word2vec_params)

# Save the trained embeddings
embeddings_file = f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model"
word2vec_model.save(embeddings_file)

In [None]:
# Load previously saved embeddings
word2vec_model = Word2Vec.load(f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model")

print("Examine the trained embeddings: ")
# Query the KeyedVectors (`.wv`) like the rest of the notebook does; the model-level
# `most_similar` shortcut is deprecated in gensim 3.x and no longer exists in gensim 4.
word2vec_model.wv.most_similar("<NUM>", topn=10)

Import custom function that converts the word2vec model word vectors into a numpy matrix that is suitable for insertion into our TensorFlow/Keras embedding layer:

In [None]:
from helper_functions import word2vec_to_matrix

In [None]:
# Convert the trained word vectors into a matrix plus the vocabulary size; both are passed to the AutoEncoder constructor later on.
embedding_matrix, vocab_size = word2vec_to_matrix(word2vec_model=word2vec_model, embedding_size=EMBEDDING_SIZE)

Create a word2index dict in order to convert each token in our train_data dataset to its respective index in the embedding matrix:

In [None]:
# Token -> index lookup, using the same ordering as the word2vec vocabulary list.
index2word = word2vec_model.wv.index2word
word2index_dict = {token: token_index for token_index, token in enumerate(index2word)}

sent2index_input = []
sent2index_target = []
sent2index_teacher_forcing = []

# Convert every (input, target, teacher) triple from tokens to vocabulary indices.
# The loop variables are deliberately not called `input`, so the builtin `input()`
# stays usable for the rest of the kernel session.
for input_tokens, target_tokens, teacher_tokens in train_data:
    sent2index_input.append([word2index_dict[token] for token in input_tokens])
    sent2index_target.append([word2index_dict[token] for token in target_tokens])
    sent2index_teacher_forcing.append([word2index_dict[token] for token in teacher_tokens])

# Take a look at one input, target, teacher triple: the index list, then the decoded tokens
examples = [
    ("Input sentence: ", sent2index_input[0]),
    ("Target sentence: ", sent2index_target[0]),
    ("Teacher sentence: ", sent2index_teacher_forcing[0]),
]
for position, (title, token_ids) in enumerate(examples):
    if position:
        print()  # blank line between consecutive examples
    print(title)
    print(token_ids)
    print(" ".join([index2word[i] for i in token_ids]))

Extract the indices of the **sos** and **eos** tokens for the inference mode:

In [None]:
# Vocabulary indices of the sentence-boundary markers, needed by the decoder's inference mode.
start_token, end_token = (word2index_dict[marker] for marker in ("<Start>", "<End>"))

print(f"<Start>: {start_token}")
print(f"<End>: {end_token}")

### **Data Pipeline**
Creating tf.Dataset objects that are then cached, shuffled, batched and prefetched for efficient training.



Train data is of form (input, target, teacher), where:


*   **Input** contains the sentences that will be fed into the encoder of our AE.

*   **Target** contains the sentences that will be used to calculate the loss of our AE.

*   **Teacher** contains the sentences that will be used as the input for the decoder during training since we use teacher forcing.



In [None]:
# We split the data into train data (85%) and test data (15%).
# The three index lists are filled in lockstep, so one split index is valid for all of them.
split_index = int(len(sent2index_input)*0.85)

train_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[:split_index])
train_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[:split_index])
train_dataset_teacher = tf.data.Dataset.from_tensor_slices(sent2index_teacher_forcing[:split_index])

# Zip into (input, target, teacher) elements, then cache, shuffle, batch and prefetch.
train_dataset = (
    tf.data.Dataset.zip((train_dataset_input, train_dataset_target, train_dataset_teacher))
    .cache()
    .shuffle(buffer_size=500000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE_AE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


# Repeat for test data. The slices run to the end of each list so that the final
# sentence belongs to the test split instead of being left out of both splits.
test_dataset_input = tf.data.Dataset.from_tensor_slices(sent2index_input[split_index:])
test_dataset_target = tf.data.Dataset.from_tensor_slices(sent2index_target[split_index:])
test_dataset_teacher = tf.data.Dataset.from_tensor_slices(sent2index_teacher_forcing[split_index:])

test_dataset = (
    tf.data.Dataset.zip((test_dataset_input, test_dataset_target, test_dataset_teacher))
    .cache()
    .shuffle(buffer_size=50000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE_AE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## **Training: Autoencoder**

In [None]:
from latextgan import AutoEncoder, train_AE

Create and train the model:

In [None]:
# Autoencoder initialised with the word2vec embedding matrix built above.
News_AE = AutoEncoder(
    vocab_size=vocab_size,
    embedding_matrix=embedding_matrix,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)

In [None]:
# Placeholder - set to the directory that should receive the saved weights and plots.
save_path = "YourPathHere"
save_every = 1 # Number of epochs before saving model weights and plots

# Loss object handed to the training routine.
reconstruction_loss = tf.keras.losses.SparseCategoricalCrossentropy()

train_AE(
    model=News_AE,
    word2vec_model=word2vec_model,
    save_every=save_every,
    save_path=save_path,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    loss_function=reconstruction_loss,
    num_epochs=NUM_EPOCHS_AE,
    learning_rate=LEARNING_RATE_AE,
)

## **Training: LaTextGAN**

In [None]:
from latextgan import Generator, Discriminator, train_GAN

Load AutoEncoder:

In [None]:
# Re-create the autoencoder with the same constructor arguments used in the training section.
News_AE = AutoEncoder(vocab_size=vocab_size, 
                      embedding_matrix=embedding_matrix, 
                      embedding_size=EMBEDDING_SIZE, 
                      hidden_size=HIDDEN_SIZE)
News_AE.compile()

# Feed a single sentence (batch of one) through the Encoder ...
# NOTE(review): this forward pass presumably exists only to create the model variables
# before the weights are restored - confirm before reordering or removing it.
hs = News_AE.Encoder(tf.convert_to_tensor([sent2index_input[-5]]))

# ... and its output through the Decoder's inference mode; `out` is not used again in this notebook.
out = News_AE.Decoder.inference_mode(start_token=start_token, 
                                     end_token=end_token, 
                                     max_seq_length=max_seq_length, 
                                     states=hs)

# Restore the stored autoencoder weights from the repository's weights folder.
News_AE.load_weights(f"{path}/Model Weights/Thesis_Model_Weights/AE")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2dc5f3dcd0>

Create a dataset containing the embeddings of real sentences to train our Discriminator on:

In [None]:
def _encode_single_sentence(token_ids):
    """Encode one sentence: add a leading batch axis, run the Encoder, squeeze size-1 axes away."""
    return tf.squeeze(News_AE.Encoder(tf.expand_dims(token_ids, axis=0)))


# Real-sentence encodings for the Discriminator, built from the unbatched training inputs.
train_dataset_GAN = (
    train_dataset_input
    .map(_encode_single_sentence)
    .cache()
    .shuffle(buffer_size=500000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE_GAN, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

Alternative pipeline that relies on intermediately saving and loading the dataset to free up memory:

In [None]:
# train_dataset_GAN = [tf.squeeze(News_AE.Encoder(tf.expand_dims(i, axis=0))) for i in tqdm(train_dataset_input)] 

# train_dataset_GAN = tf.data.Dataset.from_tensor_slices(train_dataset_GAN)

# tf.data.experimental.save(train_dataset_GAN, "drive/MyDrive/BA_2.0/Dataset/train_dataset_GAN_20", compression=None, shard_func=None)

In [None]:
# train_dataset_GAN = tf.data.experimental.load("drive/MyDrive/BA_2.0/Dataset/train_dataset_GAN_20", element_spec=None, compression=None, reader_func=None)
# train_dataset_GAN = train_dataset_GAN.cache().shuffle(buffer_size=500000, reshuffle_each_iteration=True).batch(BATCH_SIZE_GAN, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

Create and train the model:

In [None]:
# Generator and Discriminator are built with identical width and depth settings.
LaTextGAN_Generator = Generator(
    hidden_size=HIDDEN_SIZE,
    num_res_blocks=NUM_RESIDUAL_BLOCKS,
)

LaTextGAN_Discriminator = Discriminator(
    hidden_size=HIDDEN_SIZE,
    num_res_blocks=NUM_RESIDUAL_BLOCKS,
)

When using the mapping pipeline to create `train_dataset_GAN`, the first training epoch may take quite a while, as every element must actually be fed through the Encoder.

In [None]:
# Placeholder - set to the directory that should receive the saved weights and plots.
save_path = "YourPathHere"
save_every = 1 # Number of epochs before saving model weights and plots

# The autoencoder, vocabulary and boundary tokens are passed along with the two GAN networks.
train_GAN(
    generator=LaTextGAN_Generator,
    discriminator=LaTextGAN_Discriminator,
    autoencoder=News_AE,
    word2vec_model=word2vec_model,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
    save_every=save_every,
    save_path=save_path,
    train_dataset_GAN=train_dataset_GAN,
    gen_update=GEN_UPDATE,
    num_epochs=NUM_EPOCHS_GAN,
    learning_rate=LEARNING_RATE_GAN,
)

## **Evaluation**

Import evaluation module:

In [None]:
import evaluation

Load the trained model:

In [None]:
# Load Generator weights
LaTextGAN_Generator = Generator(hidden_size=HIDDEN_SIZE, 
                                num_res_blocks=NUM_RESIDUAL_BLOCKS)
LaTextGAN_Generator.compile()

LaTextGAN_Generator.load_weights(f"{path}/Model Weights/Thesis_Model_Weights/LaTextGAN")

Model Summaries:

In [None]:
# Call the autoencoder once on a single training batch before the summaries are printed below.
# Loop variables avoid the name `input` so the builtin is not shadowed in the kernel.
for input_batch, target_batch, teacher_batch in train_dataset.take(1):
    News_AE(input_batch, teacher_batch)

In [None]:
# Layer and parameter overview of the autoencoder's Encoder.
News_AE.Encoder.summary()

In [None]:
# Layer and parameter overview of the autoencoder's Decoder.
News_AE.Decoder.summary()

In [None]:
# A fresh Discriminator is sufficient here: it is only called once and then summarised.
LaTextGAN_Discriminator = Discriminator(
    hidden_size=HIDDEN_SIZE,
    num_res_blocks=NUM_RESIDUAL_BLOCKS,
)

# Call both networks once on a single batch before their summaries are printed below.
# The loop variable avoids the name `input` so the builtin is not shadowed in the kernel.
for real_latents in train_dataset_GAN.take(1):
    LaTextGAN_Generator(tf.random.normal([real_latents.shape[0], HIDDEN_SIZE]))
    LaTextGAN_Discriminator(real_latents)

In [None]:
# Layer and parameter overview of the Generator.
LaTextGAN_Generator.summary()

In [None]:
# Layer and parameter overview of the Discriminator.
LaTextGAN_Discriminator.summary()

Create csv containing sentences for InferSent: 

In [None]:
# Sample sentences from the trained LaTextGAN for the InferSent-based evaluation.
# The noise vector is first mapped through the Generator and only then decoded; passing the
# raw noise to the Decoder would bypass the GAN, unlike every other evaluation in this notebook.
index2word = word2vec_model.wv.index2word

sentences = []
for _ in tqdm(range(10000)):
    noise = tf.random.normal([1, HIDDEN_SIZE])
    latent_state = LaTextGAN_Generator(noise)
    generated_ids = News_AE.Decoder.inference_mode(start_token=start_token,
                                                   end_token=end_token,
                                                   max_seq_length=max_seq_length-1,
                                                   states=latent_state)
    # Each returned element holds a single token id; convert it to its vocabulary token.
    sentences.append([index2word[i.numpy()[0]] for i in generated_ids])

# One generated sentence per CSV row, one token per column.
with open(f"{path}/Evaluation/FID/News_LaTextGAN_InferSent.csv", "w", encoding='utf8', newline="") as output_file:
    writer = csv.writer(output_file)
    writer.writerows(sentences)

Generate Sentences:

In [None]:
# Request 2 sentences with printing enabled; latent samples come from the trained Generator.
evaluation.generate_sentences(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    print_sentences=True,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    num_sent=2,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

Average sentence length:

In [None]:
# Same helper as above, but for 10000 sentences and with printing of the sentences disabled.
evaluation.generate_sentences(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    print_sentences=False,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    num_sent=10000,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

  0%|          | 0/100 [00:00<?, ?it/s]


Average length of generated sentences: 18.24 tokens



Prepare the reference data used for Bleu, Self-Bleu and Word Frequency calculations:

In [None]:
# Reference sentences: up to 10000 target sentences starting at the 85% mark of the data,
# converted back to tokens and cut off right before the first end token.
reference_start = int(len(sent2index_target)*0.85)

reference_data = []
for token_ids in sent2index_target[reference_start:reference_start+10000]:
    if end_token in token_ids:
        cutoff = token_ids.index(end_token)
    else:
        cutoff = len(token_ids)
    reference_data.append([word2vec_model.wv.index2word[token_id] for token_id in token_ids[:cutoff]])

Calculate JS Distance for sentence length frequencies and word counts:

In [None]:
# Compare generated sentences against the reference data; two distances are returned.
jsd_sents, jsd_words = evaluation.js_distance(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    reference_data=reference_data,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

print(f"Jensen-Shannon distance for the sentence length frequencies: {jsd_sents}")
print(f"Jensen-Shannon distance for the word counts: {jsd_words}")

Calculate Bleu-4 Score:

In [None]:
# BLEU with 4-grams for 10000 generated sentences, scored against the reference data.
evaluation.bleu_score(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    reference_data=reference_data,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    num_sent=10000,
    n_grams=4,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

Calculate Self-Bleu-4 Score:

In [None]:
# Self-BLEU with 4-grams for 10000 generated sentences; no reference data is passed here.
evaluation.self_bleu_score(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    num_sent=10000,
    n_grams=4,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

Count Word Frequency:

In [None]:
# Number of entries shown in the previews and the plot further down.
top_k = 12

# Word-frequency mappings for the reference data and for generated sentences.
ref_freq, gen_freq = evaluation.word_freq(
    model=News_AE.Decoder,
    index_decoder=word2vec_model.wv.index2word,
    reference_data=reference_data,
    model_name="News_LaTextGAN",
    latent_sample_gen=LaTextGAN_Generator,
    start_token=start_token,
    end_token=end_token,
    max_seq_length=max_seq_length,
)

In [None]:
# First `top_k` (key, value) entries of the reference frequencies, in the mapping's own iteration order.
# NOTE(review): presumably already sorted by frequency inside evaluation.word_freq - confirm.
list(ref_freq.items())[:top_k]

In [None]:
# First `top_k` (key, value) entries of the generated-sentence frequencies, in the mapping's own iteration order.
# NOTE(review): presumably already sorted by frequency inside evaluation.word_freq - confirm.
list(gen_freq.items())[:top_k]

Word frequency plot:

In [None]:
# Placeholder - only relevant once save_plots is switched on.
save_path = "YourPathHere"

# Plot the reference and generated frequencies for the top_k entries.
evaluation.word_freq_plots(
    reference_freq_dict=ref_freq,
    generated_freq_dict=gen_freq,
    top_k=top_k,
    save_plots=False,
    save_path=save_path,
)