# EC2-A5: Attention mechanisms
### Jing Qian (jq2282)

## Step 1. Install packages and load libraries


In [1]:
!pip install -q tensorflow-gpu==2.0.0-alpha0
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU



In [0]:
import os
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

## Step 2. Load and preprocess data

### 2.1. Load data

In [0]:
ntrain = 1000
ntest = 100
nall = ntrain + ntest


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def load(fname):
  # Load the file using std open
  f = open(fname, 'r')
  text = []
  for line in f.readlines():
    text.append(line.replace('\n','').split('\t'))
    
  f.close()
  return text

data = load('/content/gdrive/My Drive/spa-eng/spa.txt')

In [6]:
print(data[10:15])
print(np.shape(data))

[['Fire!', '¡Disparad!'], ['Help!', '¡Ayuda!'], ['Help!', '¡Socorro! ¡Auxilio!'], ['Help!', '¡Auxilio!'], ['Jump!', '¡Salta!']]
(118964, 2)


In [7]:
np.random.seed(10)
shuffled_data = np.random.permutation(data)
selected_id = np.random.randint(len(data), size = nall)
train_data = shuffled_data[selected_id[:ntrain], :]
print(np.shape(train_data), np.shape(shuffled_data))

(500, 2) (118964, 2)


### 2.2. Preprocessing

In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [9]:
train_data = [(preprocess(eng), preprocess(spa)) for (eng, spa) in train_data]
print(train_data[0])
train_eng, train_spa = list(zip(*train_data))

('<start> How do you think that makes me feel ? <end>', '<start> ¿ Como crees que me hace sentir eso ? <end>')


In [10]:
eng_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
eng_tokenizer.fit_on_texts(train_eng)
train_eng = eng_tokenizer.texts_to_sequences(train_eng)
train_eng = tf.keras.preprocessing.sequence.pad_sequences(train_eng, padding='post')
print(train_eng[0])

[  1  47  20   7  88  16 369  17 103  10   2   0   0   0   0   0   0   0
   0   0   0   0   0   0]


In [11]:
spa_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
spa_tokenizer.fit_on_texts(train_spa)
train_spa = spa_tokenizer.texts_to_sequences(train_spa)
train_spa = tf.keras.preprocessing.sequence.pad_sequences(train_spa, padding='post')
print(train_spa[0])

[  1  10  35 127   4  15  47 193  36  11   2   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]


In [0]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1

In [13]:
# Demonstrate the correspondence between word and code, not used in model
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(train_eng[0], eng_tokenizer)

1 -> <start>
47 -> how
20 -> do
7 -> you
88 -> think
16 -> that
369 -> makes
17 -> me
103 -> feel
10 -> ?
2 -> <end>


In [14]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(train_spa.shape)
target_labels[:,0:train_spa.shape[1] -1] = train_spa[:,1:]

print("Target sequence", train_spa[0])
print("Target label", target_labels[0])

Target sequence [  1  10  35 127   4  15  47 193  36  11   2   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
Target label [ 10.  35. 127.   4.  15.  47. 193.  36.  11.   2.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.]


In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((train_eng, train_spa, target_labels)).batch(batch_size)

In [0]:
# Test code!
# example_batch = next(iter(dataset))
# source, target, taget_labels = example_batch
# print("Shapes:", source.shape, target.shape, taget_labels.shape)

# Step 3. Model with Bahdanau Attention Mechanism 

### 3.1. Model class

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units, batch_size):
    super(Encoder, self).__init__()
    self.batch_size = batch_size
    self.rnn_units = rnn_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.rnn_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def init_state(self):
    return tf.zeros((self.batch_size, self.rnn_units))

In [0]:
class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # hidden shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # we are doing this to perform addition to calculate the score
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, hidden_size)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)
    
  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)
    
    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [0]:
EMBEDDING_DIM = 32
RNN_SIZE = 64
BATCH_SIZE = 5
EPOCHS = 10

In [0]:
# Maximum sequence length
def max_length(tensor):
    return max(len(t) for t in tensor)
  
max_length_eng, max_length_spa = max_length(train_eng), max_length(train_spa)

In [0]:
class Model():
  def __init__(self, encoder, decoder, source_tokenizer, target_tokenizer, source_max_length, target_max_length, model_name, rnn_size):
    self.encoder = encoder
    self.decoder = decoder
    self.source_tokenizer = source_tokenizer
    self.target_tokenizer = target_tokenizer
    self.source_max_length = source_max_length
    self.target_max_length = target_max_length
    self.rnn_size = rnn_size
    # Initilize the optimizer:
    self.optimizer = tf.keras.optimizers.Adam()
    # Create checkpoint:
#     checkpoint_dir = './training_checkpoints'
#     self.checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_"+model_name)
#     self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
#                                      encoder=self.encoder,
#                                      decoder=self.decoder)

    # Initilize the loss function
    self.crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    
  @tf.function
  def train_step(self, source, target, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
      enc_output, enc_hidden = self.encoder(source, enc_hidden)
      dec_hidden = enc_hidden
      dec_input = tf.expand_dims([self.target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

      # Teacher forcing - feeding the target as the next input
      for t in range(1, target.shape[1]):
        # passing enc_output to the decoder
        predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
        loss += self.calc_loss(target[:, t], predictions)
        # using teacher forcing
        dec_input = tf.expand_dims(target[:, t], 1)
    batch_loss = (loss / int(target.shape[1]))
    variables = self.encoder.trainable_variables + self.decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

  def calc_loss(self, targets, logits):
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    return self.crossentropy(targets, logits, sample_weight=mask)
  
  def train(self, source_train, target_train, epochs, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((source_train, target_train))
    dataset = dataset.batch(batch_size, drop_remainder=True)

    # calc steps per epoch
    steps_per_epoch = len(source_train)//batch_size
    for epoch in range(epochs):
      start = time.time()

      enc_hidden = self.encoder.init_state()
      total_loss = 0
 

      for (batch, (source, target)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = self.train_step(source, target, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
     

      print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                          total_loss / steps_per_epoch))
      print('Time taken for 1 epoch {} sec'.format(time.time() - start))
     
 
    
  def evaluate(self, sentence):
    inputs = tf.expand_dims(sentence, axis=0)
    result = ''
    #print(inputs)
    hidden = [tf.zeros((1, self.rnn_size))]
    #print('1')
    enc_out, enc_hidden = self.encoder(inputs, hidden)
    #print('2')
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([self.target_tokenizer.word_index['<start>']], 0)
    #print('3')
    for t in range(self.target_max_length):
        predictions, dec_hidden, attention_weights = self.decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        #attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += self.target_tokenizer.index_word[predicted_id] + ' '

        if self.target_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence
    
        
  def translate(self, sentence):
    #print('Input: %s' % (sentence))
    result, sentence = self.evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    #attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    #self.plot_attention(attention_plot, sentence.split(' '), result.split(' '))

### 3.2. From Eng to Spa

In [0]:
eng_spa_encoder = Encoder(eng_vocab_size, EMBEDDING_DIM, RNN_SIZE, BATCH_SIZE)
eng_spa_decoder = Decoder(spa_vocab_size, EMBEDDING_DIM, RNN_SIZE, BATCH_SIZE)

In [24]:
eng_spa_model = Model(eng_spa_encoder, eng_spa_decoder, eng_tokenizer, spa_tokenizer, "eng_spa", max_length_eng, max_length_spa, RNN_SIZE)
eng_spa_model.train(train_eng, train_spa, EPOCHS, BATCH_SIZE)#, test, spa_data_val)

Epoch 1 Batch 0 Loss 1.9999
Epoch 1 Loss 1.7806
Time taken for 1 epoch 58.503260135650635 sec
Epoch 2 Batch 0 Loss 1.5379
Epoch 2 Loss 1.5351
Time taken for 1 epoch 3.4940223693847656 sec
Epoch 3 Batch 0 Loss 1.4216
Epoch 3 Loss 1.4554
Time taken for 1 epoch 3.491806983947754 sec
Epoch 4 Batch 0 Loss 1.3723
Epoch 4 Loss 1.4152
Time taken for 1 epoch 3.5640389919281006 sec
Epoch 5 Batch 0 Loss 1.3354
Epoch 5 Loss 1.3839
Time taken for 1 epoch 3.4316999912261963 sec
Epoch 6 Batch 0 Loss 1.3039
Epoch 6 Loss 1.3595
Time taken for 1 epoch 3.402456521987915 sec
Epoch 7 Batch 0 Loss 1.2812
Epoch 7 Loss 1.3409
Time taken for 1 epoch 3.7065794467926025 sec
Epoch 8 Batch 0 Loss 1.2695
Epoch 8 Loss 1.3243
Time taken for 1 epoch 4.243173122406006 sec
Epoch 9 Batch 0 Loss 1.2543
Epoch 9 Loss 1.3097
Time taken for 1 epoch 4.253550291061401 sec
Epoch 10 Batch 0 Loss 1.2425
Epoch 10 Loss 1.2932
Time taken for 1 epoch 3.564115524291992 sec


### 3.3. From Spa to Eng

In [25]:
spa_eng_encoder = Encoder(spa_vocab_size, EMBEDDING_DIM, RNN_SIZE, BATCH_SIZE)
spa_eng_decoder = Decoder(eng_vocab_size, EMBEDDING_DIM, RNN_SIZE, BATCH_SIZE)
spa_eng_model = Model(spa_eng_encoder, spa_eng_decoder, spa_tokenizer, eng_tokenizer, max_length_spa, max_length_eng, "spa_eng", RNN_SIZE)
spa_eng_model.train(train_spa, train_eng, EPOCHS, BATCH_SIZE)

Epoch 1 Batch 0 Loss 2.4667
Epoch 1 Loss 2.1850
Time taken for 1 epoch 45.3548321723938 sec
Epoch 2 Batch 0 Loss 1.9530
Epoch 2 Loss 1.8963
Time taken for 1 epoch 3.276911735534668 sec
Epoch 3 Batch 0 Loss 1.7959
Epoch 3 Loss 1.7981
Time taken for 1 epoch 3.5957908630371094 sec
Epoch 4 Batch 0 Loss 1.7317
Epoch 4 Loss 1.7503
Time taken for 1 epoch 3.5935492515563965 sec
Epoch 5 Batch 0 Loss 1.6906
Epoch 5 Loss 1.7078
Time taken for 1 epoch 3.1771514415740967 sec
Epoch 6 Batch 0 Loss 1.6470
Epoch 6 Loss 1.6697
Time taken for 1 epoch 2.898642063140869 sec
Epoch 7 Batch 0 Loss 1.6098
Epoch 7 Loss 1.6394
Time taken for 1 epoch 2.93833327293396 sec
Epoch 8 Batch 0 Loss 1.5809
Epoch 8 Loss 1.6125
Time taken for 1 epoch 2.9686360359191895 sec
Epoch 9 Batch 0 Loss 1.5571
Epoch 9 Loss 1.5874
Time taken for 1 epoch 3.618291139602661 sec
Epoch 10 Batch 0 Loss 1.5321
Epoch 10 Loss 1.5633
Time taken for 1 epoch 3.5522220134735107 sec


### 3.4. Back translation

In [0]:
test_data = shuffled_data[selected_id[ntrain:], :]
test_data = [(preprocess(eng), preprocess(spa)) for (eng, spa) in test_data]
test_eng, test_spa = list(zip(*test_data))
test_eng = eng_tokenizer.texts_to_sequences(test_eng)
test_eng = tf.keras.preprocessing.sequence.pad_sequences(test_eng, padding='post')




In [27]:
middle_spa = []
origin_eng = []
for i in range(ntest):
  translation, sentence = eng_spa_model.evaluate(test_eng[i])
  #print(input_sent)
  origin_eng.append(test_data[i][0])
  middle_spa.append(translation[:-5])
  
input_spa = spa_tokenizer.texts_to_sequences(middle_spa)
input_spa = tf.keras.preprocessing.sequence.pad_sequences(input_spa, padding='post')
#print(input_spa)

back_eng = []
for i in range(ntest):
  translation, sentence = spa_eng_model.evaluate(input_spa[i])
  #print(translation)
  back_eng.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(back_eng, [origin_eng])
print(results)  

BLEU(score=0.0, counts=[294, 87, 0, 0], totals=[500, 400, 300, 200], precisions=[58.8, 21.75, 0.0, 0.0], bp=0.4033300780671188, sys_len=500, ref_len=954)


# Step 4. Conclusion
In this assignment, I implement Bahdanau Attention mechanism in the translation model. 
Compared to the model without attention mechanism, one needs to modify the decoder and training process with "teaching".
Because of the limited training and testint set, the BLEU didn't get better with the Bahdanau attention.