# Assignment 5 Machine Translation
### Jing Qian (jq2282)

## Step 1. Install packages and load libraries


In [1]:
!pip install -q tensorflow-gpu==2.0.0-alpha0
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU



In [0]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

## Step 2. Load and preprocess data

### 2.1. Load and select data

In [0]:
ntrain = 10000
ntest = 1000
nall = ntrain + ntest
nepoch = 100

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def load(fname):
  # Load the file using std open
  f = open(fname, 'r')
  text = []
  for line in f.readlines():
    text.append(line.replace('\n','').split('\t'))
    
  f.close()
  return text

data = load('/content/gdrive/My Drive/spa-eng/spa.txt')

In [6]:
print(data[10:15])
print(np.shape(data))

[['Fire!', '¡Disparad!'], ['Help!', '¡Ayuda!'], ['Help!', '¡Socorro! ¡Auxilio!'], ['Help!', '¡Auxilio!'], ['Jump!', '¡Salta!']]
(118964, 2)


In [7]:
#np.random.seed(10)
shuffled_data = np.random.permutation(data)
selected_id = np.random.randint(len(data), size = nall)
train_data = shuffled_data[selected_id[:ntrain], :]
print(np.shape(train_data), np.shape(shuffled_data))

(10000, 2) (118964, 2)


### 2.2. Preprocessing

In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [9]:
train_data = [(preprocess(eng), preprocess(spa)) for (eng, spa) in train_data]
print(train_data[0])
train_eng, train_spa = list(zip(*train_data))

('<start> Yesterday was Tuesday , January , . <end>', '<start> Ayer fue martes de enero del . <end>')


In [10]:
eng_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
eng_tokenizer.fit_on_texts(train_eng)
train_eng = eng_tokenizer.texts_to_sequences(train_eng)
train_eng = tf.keras.preprocessing.sequence.pad_sequences(train_eng, padding='post')
print(train_eng[0])

[   1  170   20 2646   19 1482   19    3    2    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


In [11]:
spa_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
spa_tokenizer.fit_on_texts(train_spa)
train_spa = spa_tokenizer.texts_to_sequences(train_spa)
train_spa = tf.keras.preprocessing.sequence.pad_sequences(train_spa, padding='post')
print(train_spa[0])

[   1  125   48 2290    5 1725   45    3    2    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [0]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1

In [13]:
# Demonstrate the correspondence between word and code, not used in model
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(train_eng[0], eng_tokenizer)

1 -> <start>
170 -> yesterday
20 -> was
2646 -> tuesday
19 -> ,
1482 -> january
19 -> ,
3 -> .
2 -> <end>


## Step 3. Train Model 1: translate from English to Spanish


### 3.1. Prepare data for model1

In [14]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(train_spa.shape)
target_labels[:,0:train_spa.shape[1] -1] = train_spa[:,1:]

print("Target sequence", train_spa[0])
print("Target label", target_labels[0])

Target sequence [   1  125   48 2290    5 1725   45    3    2    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
Target label [1.250e+02 4.800e+01 2.290e+03 5.000e+00 1.725e+03 4.500e+01 3.000e+00
 2.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]


In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((train_eng, train_spa, target_labels)).batch(batch_size)

In [0]:
# Test code!
# example_batch = next(iter(dataset))
# source, target, taget_labels = example_batch
# print("Shapes:", source.shape, target.shape, taget_labels.shape)

### 3.2. General components for models

In [0]:
embedding_size = 32
rnn_size = 64

In [0]:
# Differ from example, add source_vocab_size to initialization
class Encoder(tf.keras.Model):
  def __init__(self, source_vocab_size):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

In [0]:
# Test code!
# Create a batch of one sentence
# ex_sentence = tf.expand_dims(train_eng[0], axis=0)
# ex_translation = tf.expand_dims(train_spa[0], axis=0)
# ex_labels = tf.expand_dims(target_labels[0], axis=0)
# print(ex_sentence.shape)


In [0]:
# Differ from example, add target_vocab_size to initialization
class Decoder(tf.keras.Model):
  def __init__(self, target_vocab_size):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

In [0]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

#print("Loss", calc_loss(ex_labels, decoder_output))

In [0]:
def translate(idx=None, flag=1):
  
    if idx == None: 
      idx = np.random.choice(len(train_data))
    
    if flag == 1:
      source_data = train_eng
      target_tokenizer = spa_tokenizer
      encoder = encoder1
      decoder = decoder1
      tmp0, tmp1 = train_data[idx][0], train_data[idx][1]
    elif flag == 2:
      source_data = train_spa
      target_tokenizer = eng_tokenizer
      encoder = encoder2
      decoder = decoder2
      tmp0, tmp1 = train_data[idx][1], train_data[idx][0] 
    elif flag == 3:
      source_data = test_eng
      target_tokenizer = spa_tokenizer
      encoder = encoder1
      decoder = decoder1
      tmp0, tmp1 = test_data[idx][0], test_data[idx][1]
    elif flag == 4:
      source_data = input_spa
      target_tokenizer = eng_tokenizer
      encoder = encoder2
      decoder = decoder2
      tmp0, tmp1 = middle_spa[idx], test_data[idx][0]       
      
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return tmp0, tmp1, translation

In [0]:
# Test code
# input_sent, target_sent, translation = translate()
# print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

In [0]:
optimizer = tf.keras.optimizers.Adam()

### 3.3. Train Model 1: from English to Spanish

In [0]:
@tf.function # remove this annotation when debugging
def train_step(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder1(source_seq, initial_state)
    logits, decoder_state = decoder1(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder1.trainable_variables + decoder1.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [25]:
EPOCHS = nepoch

encoder1 = Encoder(eng_vocab_size)
decoder1 = Decoder(spa_vocab_size)

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder1.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 0.6808, Time 27.48 sec
Input: <start> I ve finished reading the book . <end>
Target: <start> He acabado de leer el libro . <end>
Translation: ¿ no se que no se que tom que tom que tom que tom que tom que tom que tom

Epoch #10, Loss 0.3854, Time 21.28 sec
Input: <start> I asked Tom to sing . <end>
Target: <start> Le pedi a Tom que cantase . <end>
Translation: me gusta los huevos . <end>

Epoch #20, Loss 0.2837, Time 20.42 sec
Input: <start> Let s get started . <end>
Target: <start> Comencemos . <end>
Translation: dejame que charle un momento . <end>

Epoch #30, Loss 0.2079, Time 21.77 sec
Input: <start> Come back . <end>
Target: <start> Vuelve . <end>
Translation: bienvenido de vuelta , el es mi hermana . <end>

Epoch #40, Loss 0.1763, Time 20.40 sec
Input: <start> Let s sue Tom . <end>
Target: <start> Demandemos a Tom . <end>
Translation: comamos . <end>

Epoch #50, Loss 0.1604, Time 20.50 sec
Input: <start> I feel safe here . <end>
Target: <start> Me siento segura aca 

In [26]:
# Test code for BLEU score
references, hypotheses = [], []

for i in range(20):
  input_sent, target_sent, translation = translate()
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

BLEU(score=0.0, counts=[80, 26, 2, 0], totals=[182, 162, 142, 122], precisions=[43.956043956043956, 16.049382716049383, 1.408450704225352, 0.0], bp=0.9890711476293599, sys_len=182, ref_len=184)


## Step 4. Train Model 2: translate from Spanish to English

### 4.1. Prepare data for model2


In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(train_eng.shape)
target_labels[:,0:train_eng.shape[1] -1] = train_eng[:,1:]
dataset = tf.data.Dataset.from_tensor_slices((train_spa, train_eng, target_labels)).batch(batch_size)

### 4.2 Train Model2: from Spanish to English

In [0]:
@tf.function # remove this annotation when debugging
def train_step2(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder2(source_seq, initial_state)
    logits, decoder_state = decoder2(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder2.trainable_variables + decoder2.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [29]:
encoder2 = Encoder(spa_vocab_size)
decoder2 = Decoder(eng_vocab_size)

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder2.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      loss = train_step2(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate(flag=2)
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 0.6055, Time 24.35 sec
Input: <start> Tom debio haber hecho las cosas de otra forma . <end>
Target: <start> Tom should have done things differently . <end>
Translation: i m not a lot . <end>

Epoch #10, Loss 0.4340, Time 19.65 sec
Input: <start> Ella parece estar involucrada en ese caso de asesinato . <end>
Target: <start> She seems to be involved in that murder case . <end>
Translation: she was a very surprised . <end>

Epoch #20, Loss 0.2913, Time 20.71 sec
Input: <start> ¿ No puedes escuchar el sonido ? <end>
Target: <start> Can t you hear the sound ? <end>
Translation: can i ask you ? <end>

Epoch #30, Loss 0.2375, Time 20.46 sec
Input: <start> Nunca pense que dirias algo asi . <end>
Target: <start> I never thought you d say something like that . <end>
Translation: i never thought tom could not come at home . <end>

Epoch #40, Loss 0.1822, Time 19.75 sec
Input: <start> Dejadme salir ! <end>
Target: <start> Let me out ! <end>
Translation: let me know your baggage . <e

In [30]:
# Test code for BLEU score
references, hypotheses = [], []

for i in range(20):
  input_sent, target_sent, translation = translate(flag=2)
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

BLEU(score=27.35379629803985, counts=[109, 57, 28, 16], totals=[181, 161, 141, 121], precisions=[60.22099447513812, 35.40372670807454, 19.858156028368793, 13.223140495867769], bp=1.0, sys_len=181, ref_len=178)


## Step 5. Back-translate
Use your two models to translate a sentence from English to Spanish, and then back to English. Compare the original sentence, and the back-translated sentence. Repeat this using an evaluation corpus of 1,000 sentences, and report the BLEU score.

### 5.1. Preprocess test data

In [0]:
test_data = shuffled_data[selected_id[ntrain:], :]
test_data = [(preprocess(eng), preprocess(spa)) for (eng, spa) in test_data]
#print(train_data[0])
test_eng, test_spa = list(zip(*test_data))
test_eng = eng_tokenizer.texts_to_sequences(test_eng)
test_eng = tf.keras.preprocessing.sequence.pad_sequences(test_eng, padding='post')

### 5.2. Using Model 1 to translate test data from English to Spanish.

In [0]:
middle_spa = []
origin_eng = []
for i in range(ntest):
  input_sent, target_sent, translation = translate(idx=i, flag=3)
  #print(input_sent)
  origin_eng.append(input_sent)
  middle_spa.append(translation[:-5])

### 5.3. Using Model 2 to translate the output from Model 1 back to English

In [0]:
input_spa = spa_tokenizer.texts_to_sequences(middle_spa)
input_spa = tf.keras.preprocessing.sequence.pad_sequences(input_spa, padding='post')
#print(input_spa)

In [0]:
back_eng = []
for i in range(ntest):
  input_sent, target_sent, translation = translate(idx=i, flag=4)
  #print(translation)
  back_eng.append("<start> " + translation)

### 5.4. Calculate BLEU

In [35]:
results = sacrebleu.raw_corpus_bleu(back_eng, [origin_eng])
print(results)  

BLEU(score=1.7002882259851262, counts=[3378, 966, 20, 6], totals=[9849, 8849, 7849, 6849], precisions=[34.297898263783125, 10.91648773872754, 0.25480952987641736, 0.08760402978537013], bp=1.0, sys_len=9849, ref_len=9591)
