In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Wed Jan 27 01:15:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive into the Colab filesystem; the project folder that the
# notebook changes into afterwards lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/vin/NLP/nmt_attention2'

/content/drive/MyDrive/vin/NLP/nmt_attention2


In [None]:
!pip install tensorflow-addons==0.11.2

Collecting tensorflow-addons==0.11.2
[?25l  Downloading https://files.pythonhosted.org/packages/b3/f8/d6fca180c123f2851035c4493690662ebdad0849a9059d56035434bff5c9/tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5.8MB/s 
Installing collected packages: tensorflow-addons
  Found existing installation: tensorflow-addons 0.8.3
    Uninstalling tensorflow-addons-0.8.3:
      Successfully uninstalled tensorflow-addons-0.8.3
Successfully installed tensorflow-addons-0.11.2


In [None]:
import logging 
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import re
import numpy as np
import os
import io
import time
import pickle
import gc

 The versions of TensorFlow you are currently using is 2.4.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [None]:
# Append-mode handle for the training log. It stays open across cells and is
# closed by the `log_file.close()` cell after the training loop.
# NOTE(review): the only visible write to it (inside the training loop) is
# commented out, so nothing is currently written to this file.
log_file = open('logs/log_luong.log', 'a+')

In [None]:
# Restore the `en` / `vi` tokenizers from the pickled bundle.
with open('tokenizer/tokenizer.pickle', 'rb') as f:
  data = pickle.load(f)
en_tokenizer, vi_tokenizer = data['en_tokenizer'], data['vi_tokenizer']

In [None]:
def preprocess_sentence(s):
  """Lower-case and trim `s`, then wrap it in `<s>` ... `</s>` markers."""
  normalized = s.lower().strip()
  return ' '.join(('<s>', normalized, '</s>'))

# Pickled sequence tensors, one file per language.
en_data_tensor_path = 'sequences_data/en_data.pickle'
vi_data_tensor_path = 'sequences_data/vi_data.pickle'

# Each pickle is a dict holding train / dev / test entries for one language.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load files that were produced by a trusted step of this project.
with open(en_data_tensor_path, 'rb') as f:
  en_data = pickle.load(f)
  en_train = en_data['en_train_tensor']
  en_dev = en_data['en_dev_tensor']
  en_test = en_data['en_test_tensor']

with open(vi_data_tensor_path, 'rb') as f:
  vi_data = pickle.load(f)
  vi_train = vi_data['vi_train_tensor']
  vi_dev = vi_data['vi_dev_tensor']
  vi_test = vi_data['vi_test_tensor']

# The second dimension of each training tensor is taken as that language's
# maximum sequence length (presumably the padded length; confirm against the
# preprocessing step that produced the pickles).
max_length_en, max_length_vi = en_train.shape[1], vi_train.shape[1]

# Save run information: both max lengths, both tokenizers, the attention type
# and the first training row of each language.
with open('information/infor_luong.pickle', 'wb') as handle:
    pickle.dump(
        {'max_length_en': max_length_en, 
         'max_length_vi': max_length_vi, 
         'en_tokenizer': en_tokenizer,
         'vi_tokenizer': vi_tokenizer,
         'attention': 'luong',
         'en_example': en_train[0],
         'vi_example': vi_train[0]
        }, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# Training hyperparameters.
BUFFER_SIZE = 32000  # shuffle buffer size passed to Dataset.shuffle below
BATCH_SIZE = 128
steps_per_epoch = len(en_train)//BATCH_SIZE  # number of full batches in the training set
embedding_dim = 256
units = 1024
# +1 on top of the number of known words (presumably for the padding id 0,
# which loss_function masks out; confirm against the tokenizer setup).
vocab_en_size = len(en_tokenizer.word_index)+1
vocab_vi_size = len(vi_tokenizer.word_index)+1

# (source, target) pairs, shuffled, then grouped into batches. drop_remainder=True
# keeps every batch at exactly BATCH_SIZE rows, which the Encoder/Decoder rely on
# because they are constructed with a fixed batch size.
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, vi_train)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Inspect the shape of the encoder-side (en) training tensor.
en_train.shape

In [None]:
# Inspect the shape of the decoder-side (vi) training tensor.
vi_train.shape

In [None]:
# `train_dataset`, `steps_per_epoch` and the max lengths were already derived
# from these arrays above, so drop the names and ask the garbage collector to
# reclaim what it can. Cells that read en_train / vi_train cannot be re-run
# after this one without reloading the data.
del en_train
del vi_train
gc.collect()

In [None]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by a single LSTM over the input sequence.

  Args:
    vocab_size: input dimension of the embedding table.
    embedding_dim: width of each embedding vector.
    enc_units: number of LSTM units.
    batch_sz: batch size used when building the all-zero initial state.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, name="embedding")

    # The LSTM returns its per-step outputs as well as the final (h, c) state.
    self.lstm_layer = tf.keras.layers.LSTM(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
        name="lstm")

  def call(self, x, hidden):
    """Embed `x`, run the LSTM starting from `hidden`, return (outputs, h, c)."""
    embedded = self.embedding(x)
    sequence_out, state_h, state_c = self.lstm_layer(embedded, initial_state=hidden)
    return sequence_out, state_h, state_c

  def initialize_hidden_state(self):
    """Return `[h0, c0]`, two zero tensors of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return [tf.zeros(state_shape) for _ in range(2)]

In [None]:
class Decoder(tf.keras.Model):
  """Attention decoder assembled from TensorFlow Addons seq2seq components.

  It holds an embedding, an LSTMCell wrapped in an AttentionWrapper, a Dense
  projection to `vocab_size`, and a BasicDecoder driven by a TrainingSampler.

  The class reads the module-level names `max_length_en` (in `__init__`) and
  `max_length_vi` (in `call`), so both must be defined before it is used.

  Args:
    vocab_size: size of the embedding table and of the output projection.
    embedding_dim: width of each embedding vector.
    dec_units: LSTMCell units; also used as attention units and as the
      AttentionWrapper's attention_layer_size.
    batch_sz: batch size used for the fixed sequence-length lists.
    attention_type: 'bahdanau' selects BahdanauAttention; any other value
      selects LuongAttention.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer='uniform')

    # Output projection; handed to BasicDecoder as its output_layer.
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # The memory is None at construction time; callers attach the encoder
    # output later via `attention_mechanism.setup_memory(...)`. The length list
    # gives max_length_en for every one of the batch_sz rows.
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_en], self.attention_type)

    self.rnn_cell = self.build_rnn_cell(batch_sz)

    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)


  def build_rnn_cell(self, batch_sz):
    """Wrap the LSTMCell and the attention mechanism in an AttentionWrapper.

    `batch_sz` is accepted but not used by this method.
    """
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    """Create the attention mechanism selected by `attention_type`.

    Returns a BahdanauAttention when `attention_type == 'bahdanau'`, otherwise
    a LuongAttention; the remaining arguments are forwarded unchanged.
    """

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    """Return the wrapper's initial state with `encoder_state` as its cell state.

    Args:
      batch_sz: batch size passed to the wrapper's `get_initial_state`.
      encoder_state: state substituted in via `clone(cell_state=...)`; callers
        in this notebook pass the encoder's `[h, c]` pair.
      Dtype: dtype passed to `get_initial_state`.
    """
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state

  def call(self, inputs, initial_state):
    """Embed `inputs` and run the BasicDecoder from `initial_state`.

    The sequence length passed to the decoder is `max_length_vi - 1` for every
    one of the `batch_sz` rows. Returns the first element of the decoder's
    result (the caller reads `.rnn_output` from it).
    """
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_vi-1])
    return outputs

In [None]:
# Adam with the Keras default settings (no arguments are passed). The same
# object is used by train_step and is tracked by the checkpoint.
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
  """Sparse categorical cross-entropy on logits, ignoring targets equal to 0.

  Per-position losses whose target id is 0 (presumably padding) are multiplied
  by zero. The mean is then taken over every position, masked ones included.

  Args:
    real: integer target ids.
    pred: logits scored against `real`.
  """
  per_position = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true=real, y_pred=pred)
  is_zero_target = tf.math.equal(real, 0)
  weights = tf.cast(tf.logical_not(is_zero_target), dtype=per_position.dtype)
  return tf.reduce_mean(weights * per_position)

In [None]:
# Build the two halves of the model with the shared embedding width, unit
# count and batch size; the decoder uses Luong attention.
encoder = Encoder(vocab_en_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_vi_size, embedding_dim, units, BATCH_SIZE, 'luong')

# Checkpoint object tracking the optimizer and both models; the training loop
# saves it under `checkpoint_prefix` after every epoch.
checkpoint_dir = './checkpoints/luong_cp'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
# BasicDecoder
def evaluate_sentence(sentence):
  """Greedily decode a translation for one raw source sentence.

  The sentence is normalised with `preprocess_sentence`, mapped to ids with
  `en_tokenizer`, post-padded to `max_length_en`, encoded, and then decoded
  with a BasicDecoder that uses a GreedyEmbeddingSampler.

  Args:
    sentence: raw source text.

  Returns:
    `outputs.sample_id` as a numpy array (one row per input sentence; the
    batch here always holds a single sentence).

  Raises:
    KeyError: if a token is missing from `en_tokenizer.word_index`.
  """
  sentence = preprocess_sentence(sentence)

  # split() without an argument collapses runs of whitespace, so repeated
  # spaces inside the sentence do not produce empty tokens that would fail
  # the vocabulary lookup.
  token_ids = [en_tokenizer.word_index[token] for token in sentence.split()]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                          maxlen=max_length_en,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size, units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  start_tokens = tf.fill([inference_batch_size], vi_tokenizer.word_index['<s>'])
  end_token = vi_tokenizer.word_index['</s>']

  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

  # Reuse the trained cell and projection; only the sampler differs from training.
  # NOTE(review): no maximum_iterations is set, so decoding stops only when the
  # end token is produced; consider bounding it.
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
  decoder.attention_mechanism.setup_memory(enc_out)

  decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)

  # The sampler looks up the embeddings of its own predictions, so it is given
  # the embedding matrix rather than embedded inputs.
  decoder_embedding_matrix = decoder.embedding.variables[0]

  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
  return outputs.sample_id.numpy()

def translate(sentence):
  """Translate `sentence`, printing the raw ids, the input and the decoded text.

  Returns the list produced by `vi_tokenizer.sequences_to_texts`.
  """
  sampled_ids = evaluate_sentence(sentence)
  print(sampled_ids)

  decoded = vi_tokenizer.sequences_to_texts(sampled_ids)
  for line in ('Input: %s' % (sentence),
               'Predicted translation: {}'.format(decoded)):
    print(line)
  return decoded


In [None]:
def translate_eval(sentence):
  """Decode `sentence` without printing and return the detokenised texts."""
  return vi_tokenizer.sequences_to_texts(evaluate_sentence(sentence))

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a batch and return its loss.

  Args:
    inp: batch of source id sequences fed to the encoder.
    targ: batch of target id sequences.
    enc_hidden: initial encoder state.

  Returns:
    The value of `loss_function` for this batch.
  """
  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

    # Attention attends over this batch's encoder outputs.
    decoder.attention_mechanism.setup_memory(enc_output)
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)

    # The decoder is fed the target without its last column and is scored
    # against the target without its first column.
    logits = decoder(targ[:, :-1], decoder_initial_state).rnn_output
    loss = loss_function(targ[:, 1:], logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

# Number of passes over the training set.
EPOCHS = 50
for epoch in range(EPOCHS):
  start = time.time()

  # All-zero encoder state, rebuilt once per epoch; the same tensors are
  # passed to every train_step call within the epoch.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress line every 100 batches. It is only printed: the write to
    # log_file below is commented out.
    if batch % 100 == 0:
      log = 'Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy())
      print(log)
      # log_file.writelines(log+"\n")
  # One checkpoint per epoch, written under checkpoint_prefix.
  checkpoint.save(file_prefix = checkpoint_prefix)
  
  # Epoch summary: mean of the per-batch losses, then wall-clock time.
  log = 'Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch)
  print(log)
  log = 'Time taken for 1 epoch {} sec\n'.format(time.time() - start)
  print(log)

Epoch 1 Batch 0 Loss 0.7310
Epoch 1 Batch 100 Loss 0.7504
Epoch 1 Batch 200 Loss 0.8544
Epoch 1 Batch 300 Loss 0.8565
Epoch 1 Batch 400 Loss 0.8591
Epoch 1 Batch 500 Loss 0.8335
Epoch 1 Batch 600 Loss 0.8450
Epoch 1 Batch 700 Loss 0.8828
Epoch 1 Batch 800 Loss 0.7997
Epoch 1 Batch 900 Loss 0.7918
Epoch 1 Loss 0.8252
Time taken for 1 epoch 908.1572012901306 sec

Epoch 2 Batch 0 Loss 0.6463
Epoch 2 Batch 100 Loss 0.6953
Epoch 2 Batch 200 Loss 0.6714
Epoch 2 Batch 300 Loss 0.7454
Epoch 2 Batch 400 Loss 0.6402
Epoch 2 Batch 500 Loss 0.8249
Epoch 2 Batch 600 Loss 0.7705
Epoch 2 Batch 700 Loss 0.7133
Epoch 2 Batch 800 Loss 0.7321
Epoch 2 Batch 900 Loss 0.7391
Epoch 2 Loss 0.7402
Time taken for 1 epoch 908.9237003326416 sec

Epoch 3 Batch 0 Loss 0.6239
Epoch 3 Batch 100 Loss 0.5773
Epoch 3 Batch 200 Loss 0.6617
Epoch 3 Batch 300 Loss 0.6511
Epoch 3 Batch 400 Loss 0.7499
Epoch 3 Batch 500 Loss 0.6969
Epoch 3 Batch 600 Loss 0.6382
Epoch 3 Batch 700 Loss 0.6819
Epoch 3 Batch 800 Loss 0.7227
Epoc

In [None]:
# Release the log handle opened near the top of the notebook.
log_file.close()

In [None]:
# BeamSearchDecoder
def beam_evaluate_sentence(sentence, beam_width=3):
  """Translate one raw source sentence with beam search.

  The sentence is normalised, mapped to ids, post-padded to `max_length_en`
  and encoded. The encoder outputs and final state are tiled `beam_width`
  times before being handed to a TFA BeamSearchDecoder that reuses the trained
  decoder cell and output projection.

  Args:
    sentence: raw source text.
    beam_width: number of hypotheses kept per step.

  Returns:
    `(ids, scores)` as numpy arrays: the decoder's `predicted_ids` and its
    per-step `scores`, each transposed with perm (0, 2, 1), presumably giving
    [batch, beam, time] (confirm against the BeamSearchDecoder docs).

  Raises:
    KeyError: if a token is missing from `en_tokenizer.word_index`.
  """
  sentence = preprocess_sentence(sentence)

  # split() without an argument collapses runs of whitespace, so repeated
  # spaces do not produce empty tokens that would fail the vocabulary lookup.
  token_ids = [en_tokenizer.word_index[token] for token in sentence.split()]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                          maxlen=max_length_en,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size, units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  start_tokens = tf.fill([inference_batch_size], vi_tokenizer.word_index['<s>'])
  end_token = vi_tokenizer.word_index['</s>']

  # Every beam needs its own copy of the attention memory and of the state.
  enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
  decoder.attention_mechanism.setup_memory(enc_out)
  print("tiled encoder output [beam_width * batch_size, max_length_en, rnn_units]:", enc_out.shape)

  hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
  # Same construction as in greedy decoding, with the tiled batch size.
  decoder_initial_state = decoder.build_initial_state(
      beam_width * inference_batch_size, hidden_state, tf.float32)

  decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell, beam_width=beam_width, output_layer=decoder.fc)
  decoder_embedding_matrix = decoder.embedding.variables[0]

  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)

  final_outputs = tf.transpose(outputs.predicted_ids, perm=(0, 2, 1))
  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0, 2, 1))

  return final_outputs.numpy(), beam_scores.numpy()

def beam_translate(sentence):
  """Print every beam-search hypothesis for `sentence` with a summed score.

  For each batch element returned by `beam_evaluate_sentence`, the id rows are
  turned into text with `vi_tokenizer`, cut at the first `</s>` marker, and
  printed next to the sum of that hypothesis's per-step scores.

  Args:
    sentence: raw source text.
  """
  result, beam_scores = beam_evaluate_sentence(sentence)
  print(result.shape, beam_scores.shape)
  for beam, score in zip(result, beam_scores):
    print(beam.shape, score.shape)
    output = vi_tokenizer.sequences_to_texts(beam)
    # Keep the text before the first '</s>'. A hypothesis without an end
    # marker is kept whole instead of raising ValueError.
    output = [text.split('</s>', 1)[0] for text in output]
    beam_score = [a.sum() for a in score]
    print('Input: %s' % (sentence))
    for rank, (text, total) in enumerate(zip(output, beam_score), start=1):
      print('{} Predicted translation: {}  {}'.format(rank, text, total))

# Smoke-test beam search on a short sentence.
beam_translate('i love you .')

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
  """Show an attention matrix as a heat map with token labels on both axes.

  Args:
    attention: 2-D array-like drawn with `matshow`; presumably rows follow the
      predicted tokens and columns the source tokens (confirm against caller).
    sentence: list of source tokens, used as x tick labels.
    predicted_sentence: list of predicted tokens, used as y tick labels.
  """
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  # Fix the tick positions first and label them afterwards. Labelling ticks
  # whose positions come from a locator can put labels on the wrong cells and
  # needs a leading '' placeholder to line up.
  ax.set_xticks(list(range(len(sentence))))
  ax.set_xticklabels(sentence, fontdict=fontdict, rotation=90)
  ax.set_yticks(list(range(len(predicted_sentence))))
  ax.set_yticklabels(predicted_sentence, fontdict=fontdict)

  plt.show()

#BLEU score

In [None]:
# Load the tst2012 en/vi pair for evaluation. This rebinds `en_test` and
# `vi_test`, replacing the tensors read from the pickles earlier.
# NOTE(review): `load_data` is not defined in the cells visible here; confirm
# it is defined (or imported) before this cell runs.
en_test, vi_test = load_data('data/dev/tst2012.en', 'data/dev/tst2012.vi', max_length=50)