# **Neural Machine Translation using an LSTM Seq2Seq Model with Scaled Dot-Product Attention**

In [None]:
!pip install trax

Collecting trax
[?25l  Downloading https://files.pythonhosted.org/packages/a8/04/0c04116bbb372f459ad0a73bf306c5000f9fd63a8419bb179381f54773aa/trax-1.3.5-py2.py3-none-any.whl (416kB)
[K     |▉                               | 10kB 25.1MB/s eta 0:00:01[K     |█▋                              | 20kB 2.9MB/s eta 0:00:01[K     |██▍                             | 30kB 3.6MB/s eta 0:00:01[K     |███▏                            | 40kB 4.1MB/s eta 0:00:01[K     |████                            | 51kB 3.4MB/s eta 0:00:01[K     |████▊                           | 61kB 3.7MB/s eta 0:00:01[K     |█████▌                          | 71kB 4.2MB/s eta 0:00:01[K     |██████▎                         | 81kB 4.4MB/s eta 0:00:01[K     |███████                         | 92kB 4.7MB/s eta 0:00:01[K     |███████▉                        | 102kB 4.6MB/s eta 0:00:01[K     |████████▋                       | 112kB 4.6MB/s eta 0:00:01[K     |█████████▍                      | 122kB 4.6MB/s eta 0:00

In [None]:
from termcolor import colored
import random
import numpy as np

import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

# Print the installed trax version.
!pip list | grep trax

trax                          1.3.5                


In [None]:
# Get the generator function for the training set.
# Each example is read from the 'opus/medical' dataset with the ('en', 'de') keys.
# NOTE(review): eval_holdout_size=0.01 presumably holds out 1% of the data for evaluation — confirm against the trax.data.TFDS docs.
train_stream_fn = trax.data.TFDS('opus/medical', keys = ('en', 'de'), eval_holdout_size=0.01, train = True)

# Get the generator function for the validation set (same dataset and holdout, but train = False).
eval_stream_fn = trax.data.TFDS('opus/medical', keys = ('en', 'de'), eval_holdout_size=0.01, train = False)

[1mDownloading and preparing dataset opus/medical/0.1.0 (download: 34.29 MiB, generated: 188.85 MiB, total: 223.13 MiB) to /root/tensorflow_datasets/opus/medical/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/opus/medical/0.1.0.incompleteGUBXX1/opus-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1108752.0), HTML(value='')))

[1mDataset opus downloaded and prepared to /root/tensorflow_datasets/opus/medical/0.1.0. Subsequent calls will reuse this data.[0m


In [None]:
# Instantiate the training generator and show its first (en, de) pair.
# Note: next() consumes that example, so it is not seen again by later cells.
train_stream = train_stream_fn()
print(colored('train data (en, de) tuple:', 'red'), next(train_stream))

# Same for the validation generator.
eval_stream = eval_stream_fn()
print(colored('eval data (en, de) tuple:', 'red'), next(eval_stream))

[31mtrain data (en, de) tuple:[0m (b'During treatment with olanzapine, adolescents gained significantly more weight compared with adults.\n', b'W\xc3\xa4hrend der Behandlung mit Olanzapin nahmen die Jugendlichen im Vergleich zu Erwachsenen signifikant mehr Gewicht zu.\n')
[31meval data (en, de) tuple:[0m (b'Lutropin alfa Subcutaneous use.\n', b'Pulver zur Injektion Lutropin alfa Subkutane Anwendung\n')


In [None]:
# Colab-only: open the upload widget to bring a file into the runtime.
# The recorded output shows the subword vocabulary 'ende_32k.subword' being uploaded here.
from google.colab import files
uploaded = files.upload()

Saving ende_32k.subword to ende_32k.subword


In [None]:
# Subword vocabulary used for both English and German text.
vocab_file = 'ende_32k.subword'
# NOTE(review): the upload cell's output says the file is saved as 'ende_32k.subword'
# (no directory prefix) — confirm that 'dir/' really contains the vocabulary file.
vocab_dir = 'dir/'

# Tokenize the dataset: wrap each raw text stream so it yields token ids instead of bytes.
tokenized_train_stream = trax.data.Tokenize(vocab_file = vocab_file, vocab_dir = vocab_dir)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(vocab_file = vocab_file, vocab_dir = vocab_dir)(eval_stream)

# **Tokenization**

In [None]:
#Append the EOS at the end of each sentence
EOS = 1

def append_eos(stream):
  """Yield (inputs, targets) pairs from `stream` with each sequence terminated by EOS.

  The EOS id is appended only when the sequence does not already end with it,
  so wrapping a stream twice (e.g. by re-running the cell that rebinds the
  tokenized streams) does not produce a doubled `... 1 1` ending.

  Args:
    stream: iterable of (inputs, targets) pairs, each an iterable of token ids.

  Yields:
    tuple of two numpy arrays: (inputs ending in EOS, targets ending in EOS).
  """
  def _terminated(tokens):
    # Copy into a list so the caller's sequence is never mutated.
    tokens = list(tokens)
    if not tokens or tokens[-1] != EOS:
      tokens.append(EOS)
    return np.array(tokens)

  for (inputs, targets) in stream:
    yield _terminated(inputs), _terminated(targets)

#append EOS to training data and validation data
# Note: both names are rebound to a wrapper around their previous value, so
# re-running this cell wraps the streams in append_eos again.
tokenized_train_stream = append_eos(tokenized_train_stream)
tokenized_eval_stream = append_eos(tokenized_eval_stream)

# Pull one example to inspect; next() consumes it from the stream.
train_inputs, train_targets = next(tokenized_train_stream)

#print the tokenized sentences from training
print('Input sentence from training:', train_inputs)
print('Target sentence from training:', train_targets, '\n')

eval_inputs, eval_targets = next(tokenized_eval_stream)

#print the tokenized sentences from validation
print('Input sentence from validation:', eval_inputs)
print('Target sentence from validation:', eval_targets)

Input sentence from training: [ 5345   568   909 30650  4048  5701  3771   115   349  9935   115  8035
    16 10146  4644    36   909 30650  4048  5701  3771   115   135   208
     8   909 33287   913   349  9935  3550 30650  4729   992     1     1]
Target sentence from training: [ 4172  2020  6006   349  9935   115 18457     5    24  6438  7368    69
  6006   135   208    12   909 33287   913   349  9935  3550 30650  4729
   992     1     1] 

Input sentence from validation: [  118    16  9000    17     4  6826  7211  3853  8834 20293  1978     7
 17067 13658    23  9708 12106   596    16   615    15 19849  3550 30650
  4729   992     1     1]
Target sentence from validation: [  168    78    41    44   159 23385     5 15881 23486  6469  7174    11
  7975 13658    23    15 12718 16754   596  2850  3550 30650  4729   992
     1     1]


In [None]:
#Filter too long sentences to not run out of memory
#length_keys = [0,1] means filter both english and german sentences, 256 tokens for training and 512 tokens for eval
filtered_train_stream = trax.data.FilterByLength(max_length=256, length_keys=[0,1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(max_length=512, length_keys=[0,1])(tokenized_eval_stream)

# Pull one example from each filtered stream; next() consumes it.
train_inputs1, train_targets1 = next(filtered_train_stream)

#print the tokenized sentences drawn from the filtered training stream
# (train_inputs1/train_targets1, not the unfiltered example from the previous cell)
print('Filtered Input sentence from training:', train_inputs1)
print('Filtered Target sentence from training:', train_targets1, '\n')

eval_inputs1, eval_targets1 = next(filtered_eval_stream)

#print the tokenized sentences drawn from the filtered validation stream
print('Filtered Input sentence from validation:', eval_inputs1)
print('Filtered Target sentence from validation:', eval_targets1)

Filtered Input sentence from training: [ 5345   568   909 30650  4048  5701  3771   115   349  9935   115  8035
    16 10146  4644    36   909 30650  4048  5701  3771   115   135   208
     8   909 33287   913   349  9935  3550 30650  4729   992     1     1]
Filtered Target sentence from training: [ 4172  2020  6006   349  9935   115 18457     5    24  6438  7368    69
  6006   135   208    12   909 33287   913   349  9935  3550 30650  4729
   992     1     1] 

Filtered Input sentence from validation: [  118    16  9000    17     4  6826  7211  3853  8834 20293  1978     7
 17067 13658    23  9708 12106   596    16   615    15 19849  3550 30650
  4729   992     1     1]
Filtered Target sentence from validation: [  168    78    41    44   159 23385     5 15881 23486  6469  7174    11
  7975 13658    23    15 12718 16754   596  2850  3550 30650  4729   992
     1     1]


# **Tokenize and detokenize helper function**

In [None]:
#encodes a string to array of numbers
def tokenize(input_str, vocab_file = None, vocab_dir=None):
  """Encode a string as a batch of one token-id sequence terminated by EOS.

  Args:
    input_str: the text to encode.
    vocab_file: vocabulary file name, forwarded to trax.data.tokenize.
    vocab_dir: vocabulary directory, forwarded to trax.data.tokenize.

  Returns:
    numpy array of shape (1, n_tokens + 1): the token ids followed by EOS (1).
  """
  eos_id = 1

  # trax.data.tokenize works on streams, so feed it a one-element iterator.
  token_stream = trax.data.tokenize(iter([input_str]), vocab_file=vocab_file, vocab_dir=vocab_dir)
  token_ids = [*next(token_stream), eos_id]

  # a leading axis of size 1 acts as the batch dimension
  return np.array(token_ids).reshape(1, -1)

def detokenize(integers, vocab_file = None, vocab_dir=None):
  """Decode token ids back into a string, ignoring EOS and everything after it.

  Args:
    integers: token ids as a list or array; any size-1 batch axis is flattened away.
    vocab_file: vocabulary file name, forwarded to trax.data.detokenize.
    vocab_dir: vocabulary directory, forwarded to trax.data.detokenize.

  Returns:
    The value returned by trax.data.detokenize for the ids before the first EOS.
  """
  EOS = 1

  # Flatten to 1-D. np.squeeze would collapse a single-token input such as [1]
  # to a 0-d array, which cannot be converted to a list.
  integers = list(np.ravel(integers))

  #remove the EOS to decode only the original tokens
  if EOS in integers:
    integers = integers[:integers.index(EOS)]

  return trax.data.detokenize(integers, vocab_file = vocab_file, vocab_dir=vocab_dir)

In [None]:
# Round-trip check: decode the example drawn from the filtered training stream back to text.
print('SIngle detokenized example input:', detokenize(train_inputs1, vocab_file=vocab_file, vocab_dir=vocab_dir))
print('SIngle detokenized example target:', detokenize(train_targets1, vocab_file=vocab_file, vocab_dir=vocab_dir))

SIngle detokenized example input: Driving and using machines

SIngle detokenized example target: Verkehrstüchtigkeit und das Bedienen von Maschinen



# **Bucketing**

In [None]:
# Buckets are defined in terms of boundaries and batch sizes.
# batch_sizes[i] is the batch size paired with boundaries[i]: 256 sentences per batch
# for the shortest bucket (boundary 8), 128 for the next (boundary 16), and so on.
# NOTE(review): both lists have 8 entries, so the batch size of 2 pairs with the 512
# boundary and no batch size is listed for anything longer; the filtering step above
# caps lengths at 256 (train) / 512 (eval). Confirm against the BucketByLength docs.
boundaries = [8, 16, 24, 32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

#create the generators
train_batch_stream = trax.data.BucketByLength(boundaries, batch_sizes, length_keys= [0,1])(filtered_train_stream)
eval_batch_stream = trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0,1])(filtered_eval_stream)

#add masking for the padding 0's
# (each batch then unpacks into three items: inputs, targets, loss weights)
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

In [None]:
# Draw one batch (inputs, targets, loss-weight mask) and inspect its types and shapes.
input_batch, target_batch, mask_batch = next(train_batch_stream)

print('input_batch type:', type(input_batch))
print('target_batch type:', type(target_batch))

print('input_batch shape:', input_batch.shape)
print('target_batch shape:', target_batch.shape)

input_batch type: <class 'numpy.ndarray'>
target_batch type: <class 'numpy.ndarray'>
input_batch shape: (16, 64)
target_batch shape: (16, 64)


In [None]:
#pick a random index less than length of input_batch
# (no seed is set, so a different sentence is shown on each run)
index = random.randrange(len(input_batch))

# Show the chosen pair both as text and as padded token ids.
print('This is the english sentence:', detokenize(input_batch[index], vocab_file=vocab_file, vocab_dir=vocab_dir))
print('This is the tokenized version of the english sentence:', input_batch[index], '\n')

print('This is the german sentence:', detokenize(target_batch[index], vocab_file=vocab_file, vocab_dir=vocab_dir))
print('This is the tokenized version of the german sentence:', target_batch[index], '\n')

This is the english sentence: The carcinogenic potential of methoxy polyethylene glycol-epoetin beta has not been evaluated in long-term animal studies.

This is the tokenized version of the english sentence: [   29  4492 12488 16956   841  1424     7 17067  5544 14533     5 16876
 11220 22175   510 10255 11828  6078    15  4472  6434  5193     5 11342
    13    63    48   110 13693   103     6   326    15   601  4663  4398
  3550 30650  4729   992     1     1     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0] 

This is the german sentence: Das kanzerogene Potenzial von Methoxy-Polyethylenglycol-Epoetin beta wurde nicht in Langzeitstudien an Tieren untersucht.

This is the tokenized version of the german sentence: [  111  4120 26643 20171  7956    21 15946 19646   105    15 21235 11220
 22175 28200 11828  6078    15 18585  6434  5193     5 11342    13   169
    44     6 31731 27283    23    27  5623    

# **2. Scaled Dot-Product Attention**

## **2.1 Input encoder function**

In [None]:
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
  """Build the input encoder: an embedding followed by a stack of LSTM layers.

  Args:
    input_vocab_size: vocabulary size passed to the embedding layer.
    d_model: embedding size and LSTM size.
    n_encoder_layers: number of LSTM layers in the stack.

  Returns:
    tl.Serial: the encoder layer.
  """
  embedding = tl.Embedding(input_vocab_size, d_model)
  lstm_stack = [tl.LSTM(d_model) for _ in range(n_encoder_layers)]
  return tl.Serial(embedding, lstm_stack)


In [None]:
# w1_unittest is a local helper module with checks for the functions in this notebook;
# it must be available next to the notebook for these test cells to run.
import w1_unittest

w1_unittest.test_input_encoder_fn(input_encoder_fn)

[92m All tests passed


## **2.2 Pre-attention decoder function**

In [None]:
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
  """Build the pre-attention decoder.

  It runs on the targets and creates the activations that are used as queries
  in attention.

  Args:
    mode: mode string forwarded to tl.ShiftRight.
    target_vocab_size: vocabulary size passed to the embedding layer.
    d_model: embedding size and LSTM size.

  Returns:
    tl.Serial: ShiftRight -> Embedding -> LSTM.
  """
  layers = [
      tl.ShiftRight(mode = mode),
      tl.Embedding(target_vocab_size, d_model),
      tl.LSTM(d_model),
  ]
  return tl.Serial(*layers)

In [None]:
# Run the w1_unittest check for pre_attention_decoder_fn.
w1_unittest.test_pre_attention_decoder_fn(pre_attention_decoder_fn)

[92m All tests passed


## **2.3 Prepare attention input** 

In [None]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
  """Arrange encoder/decoder activations as queries, keys, values and a padding mask.

  Args:
    encoder_activations: output of the input encoder; used as both keys and values.
    decoder_activations: output of the pre-attention decoder; used as queries.
    inputs: input token ids, indexed as [batch, encoder-len]; id 0 marks padding.

  Returns:
    tuple (queries, keys, values, mask); mask has shape
    [batch size, attention heads (1), decoder-len, encoder-len].
  """
  # non-zero where the input holds a real token, zero on padding
  is_token = (inputs != 0)

  # insert singleton axes for the attention heads and the decoder length
  padding_mask = fastnp.reshape(is_token, (is_token.shape[0], 1, 1, is_token.shape[1]))

  # adding zeros broadcasts the mask along the decoder-length axis
  decoder_len = decoder_activations.shape[1]
  padding_mask = padding_mask + fastnp.zeros((1, 1, decoder_len, 1))

  # order is queries, keys, values, mask
  return decoder_activations, encoder_activations, encoder_activations, padding_mask

In [None]:
# Run the w1_unittest check for prepare_attention_input.
w1_unittest.test_prepare_attention_input(prepare_attention_input)

[92m All tests passed


# **3. LSTM seq2seq model with Attention**

In [None]:
def NMTAttn(input_vocab_size = 33300,
                 target_vocab_size = 33300,
                 d_model = 1024,
                 n_encoder_layers = 2,
                 n_decoder_layers = 2,
                 n_attention_heads = 4,
                 attention_dropuout = 0.0,
                 mode = 'train'):
  """Return an LSTM sequence-to-sequence model with attention.

  Args:
    input_vocab_size: vocabulary size of the input language.
    target_vocab_size: vocabulary size of the target language.
    d_model: size of the embeddings, LSTMs and attention.
    n_encoder_layers: number of LSTM layers in the input encoder.
    n_decoder_layers: number of LSTM layers after attention.
    n_attention_heads: number of heads passed to tl.AttentionQKV.
    attention_dropuout: dropout passed to tl.AttentionQKV (the spelling is
      part of the public signature and is kept as-is).
    mode: mode string passed to the ShiftRight and AttentionQKV layers.

  Returns:
    tl.Serial: the assembled model.
  """
  # Sub-networks built by the helper functions.
  input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)
  pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

  # Duplicate input and target tokens: one copy feeds the encoders, the other is needed later.
  copy_tokens = tl.Select([0, 1, 0, 1])

  # Input encoder runs on the inputs, pre-attention decoder on the targets.
  encode = tl.Parallel(input_encoder, pre_attention_decoder)

  # Produce queries, keys, values and mask for attention.
  to_attention_input = tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out = 4)

  # Attention wrapped in a Residual so its output is added to the queries
  # (the pre-attention decoder activations).
  attention = tl.Residual(
      tl.AttentionQKV(d_model, n_heads = n_attention_heads, dropout = attention_dropuout, mode = mode))

  # Drop the attention mask.
  drop_mask = tl.Select([0,2])

  # Remaining RNN decoder layers.
  decoder_lstms = [tl.LSTM(d_model) for _ in range(n_decoder_layers)]

  # Project to the target vocabulary and turn the result into log-probabilities.
  to_vocab = tl.Dense(target_vocab_size)
  log_softmax = tl.LogSoftmax()

  return tl.Serial(
      copy_tokens,
      encode,
      to_attention_input,
      attention,
      drop_mask,
      decoder_lstms,
      to_vocab,
      log_softmax,
  )

In [None]:
# Run the w1_unittest check for NMTAttn.
w1_unittest.test_NMTAttn(NMTAttn)

[92m All tests passed


In [None]:
# Build a model with the default hyperparameters and print its layer structure.
# Note: `model` is rebound later when the trained weights are loaded.
model = NMTAttn()
print(model)

Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_33300_1024
      LSTM_1024
      LSTM_1024
    ]
    Serial[
      ShiftRight(1)
      Embedding_33300_1024
      LSTM_1024
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        Parallel_in3_out3[
          Dense_1024
          Dense_1024
          Dense_1024
        ]
        PureAttention_in4_out2
        Dense_1024
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_1024
  LSTM_1024
  Dense_33300
  LogSoftmax
]


# **3. Training**

### **3.1 Train Task**

In [None]:
# Training task: batches come from train_batch_stream, the loss is cross-entropy,
# and the optimizer is Adam constructed with 0.01.
train_task = training.TrainTask(
    labeled_data = train_batch_stream,
    loss_layer = tl.CrossEntropyLoss(),
    optimizer = trax.optimizers.Adam(0.01),
    # use the `trax.lr.warmup_and_rsqrt_decay` as the learning rate schedule have 1000 warmup steps with a max value of 0.01
    lr_schedule= trax.lr.warmup_and_rsqrt_decay(1000, .01),
    # checkpoint interval in steps; the training cell below runs exactly 10 steps
    n_steps_per_checkpoint = 10,
)

In [None]:
# Run the w1_unittest check for the training task configuration.
w1_unittest.test_train_task(train_task)

[92m All tests passed


### **3.2 Eval Task**

In [None]:
# Evaluation task: batches come from eval_batch_stream; reports cross-entropy loss and accuracy.
eval_task = training.EvalTask(
    labeled_data = eval_batch_stream,
    metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
)

### **3.3 Loop**

In [None]:
output_dir = '/content/model1'
 
!rm -f ~/content/model1/model.pkl.gz  

training_loop = training.Loop(
    NMTAttn(mode = 'train'),
    train_task,
    eval_tasks = [eval_task],
    output_dir = output_dir)

training_loop.run(10)


Step      1: Ran 1 train steps in 69.40 secs
Step      1: train CrossEntropyLoss |  10.44394588
Step      1: eval  CrossEntropyLoss |  10.44402790
Step      1: eval          Accuracy |  0.00000000

Step     10: Ran 9 train steps in 82.39 secs
Step     10: train CrossEntropyLoss |  10.30433750
Step     10: eval  CrossEntropyLoss |  10.05722523
Step     10: eval          Accuracy |  0.02896341


# **4. Testing**

In [None]:
# Rebuild the network in eval mode and load the weights from the training checkpoint.
model = NMTAttn(mode = 'eval')

# NOTE(review): this path is hard-coded; it matches output_dir ('/content/model1') used by the training cell.
model.init_from_file('/content/model1/model.pkl.gz', weights_only = True)
# Wrap the model with tl.Accelerate before running inference.
model = tl.Accelerate(model)

## **4.1 Decoding**

In [None]:
def next_symbol(model, input_tokens, cur_output_tokens, temperature):
    """Return the next token of the translation and its log probability.

    Args:
        model: the translation model; called with a tuple (inputs, targets).
        input_tokens: tokenized input sentence with a leading batch axis.
        cur_output_tokens (list of int): tokens generated so far.
        temperature (float): sampling parameter ranging from 0.0 to 1.0.
            0.0 is the same as argmax (always pick the most probable token);
            1.0 samples from the distribution (can sometimes say random things).

    Returns:
        tuple (int, float): index of the next token and its log probability.
    """
    n_generated = len(cur_output_tokens)

    # Pad the targets with zeros up to a power of two that leaves at least
    # one free slot after the tokens generated so far.
    target_length = 2**int(np.ceil(np.log2(n_generated + 1)))
    zero_padding = [0] * (target_length - n_generated)

    # The model expects a batch axis in front: shape (1, target_length).
    batched_targets = np.expand_dims(cur_output_tokens + zero_padding, axis=0)

    # The model returns a pair; only the first item (the log probabilities) is used.
    output, _ = model((input_tokens, batched_targets))

    # Row `n_generated` holds the prediction for the first slot not yet filled.
    next_log_probs = output[0, n_generated, :]

    symbol = int(tl.logsoftmax_sample(next_log_probs, temperature))

    return symbol, float(next_log_probs[symbol])


In [None]:
# Run the w1_unittest check for next_symbol using the loaded model.
# NOTE(review): the recorded output shows one failing test — presumably because the
# model was trained for only 10 steps rather than because of next_symbol itself; confirm.
w1_unittest.test_next_symbol(next_symbol, model)

Expected output:  [140, -0.000217437744]
[92m 1  Tests passed
[91m 1  Tests failed


In [None]:
def sampling_decode(input_sentence, model = None, temperature = 0.0, vocab_file = None, vocab_dir=None):
  """Translate `input_sentence` by calling next_symbol() until it returns EOS.

  With temperature 0.0 this picks the most probable token at each step.

  Args:
    input_sentence (str): sentence to translate.
    model: the translation model, forwarded to next_symbol().
    temperature (float): sampling parameter, forwarded to next_symbol().
    vocab_file: vocabulary file name used to tokenize and detokenize.
    vocab_dir: vocabulary directory used to tokenize and detokenize.

  Returns:
    tuple (list of int, float, str), in this order:
      - tokenized version of the translated sentence (including the final EOS),
      - log probability returned by the last next_symbol() call, i.e. that of
        the final token only (it is overwritten at every step, not accumulated),
      - the translated sentence.
  """
  eos_id = 1

  # encode the input sentence
  input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)

  cur_output_tokens = []

  # Keep sampling until the model emits EOS; there is no cap on the number of steps.
  while True:
    symbol, log_prob = next_symbol(model, input_tokens, cur_output_tokens, temperature)
    cur_output_tokens.append(symbol)
    if symbol == eos_id:
      break

  # detokenize the output tokens
  sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir)

  return cur_output_tokens, log_prob, sentence

In [None]:
# Run the w1_unittest check for sampling_decode using the loaded model.
# NOTE(review): the recorded output shows both tests failing — presumably because the
# model was trained for only 10 steps; confirm before treating it as a decoding bug.
w1_unittest.test_sampling_decode(sampling_decode, model)

Test 1 fails
Test 2 fails
[92m 0  Tests passed
[91m 2  Tests failed


In [None]:
# Greedy decoding (temperature=0) of a sample sentence; displays (tokens, log_prob, sentence).
sampling_decode("I love languages.", model, temperature=0, vocab_file=vocab_file, vocab_dir = vocab_dir)

([752, 1], -9.70961856842041, 'Bei')

In [None]:
def greedy_decode_test(sentence, model = None, vocab_file = None, vocab_dir = None):
  """Translate `sentence` greedily, print both versions and return the translation.

  Args:
    sentence (str): sentence to translate.
    model: the translation model, forwarded to sampling_decode().
    vocab_file: vocabulary file name.
    vocab_dir: vocabulary directory.

  Returns:
    str: the translated sentence.
  """
  # temperature 0 means the most probable token is picked at every step
  _, _, translated_sentence = sampling_decode(sentence, model, 0, vocab_file, vocab_dir)

  for label, text in (('English sentence:', sentence), ('German sentence:', translated_sentence)):
    print(label, text)

  return translated_sentence

In [None]:
# Translate a sample sentence greedily; the trailing ';' hides the returned string.
your_sentence = 'I love languages.'
greedy_decode_test(your_sentence, model, vocab_file=vocab_file, vocab_dir = vocab_dir);

English sentence: I love languages.
German sentence: Bei


In [None]:
# A second greedy-decoding example; without a trailing ';' the returned string is displayed too.
greedy_decode_test('You are almost done with the assignment!', model, vocab_file=vocab_file, vocab_dir = vocab_dir)

English sentence: You are almost done with the assignment!
German sentence: Bei


'Bei'

## **4.2 Minimum Bayes Risk Decoding**

In [None]:
# Picking the most probable token at each step does not necessarily give the best
# translation. An alternative is Minimum Bayes Risk (MBR) decoding:
#   1. take several random samples
#   2. score each sample against all other samples
#   3. select the one with the highest score
def generate_samples(sentence, n_samples, model = None, temperature = 0.6, vocab_file = None, vocab_dir = None):
  """Draw `n_samples` translations of `sentence` with sampling_decode().

  Args:
    sentence (str): sentence to translate.
    n_samples (int): number of samples to generate.
    model: the translation model, forwarded to sampling_decode().
    temperature (float): sampling parameter, forwarded to sampling_decode().
    vocab_file: vocabulary file name.
    vocab_dir: vocabulary directory.

  Returns:
    tuple (list of token lists, list of float): the samples and, for each one,
    the log probability value returned by sampling_decode().
  """
  decoded = [
      sampling_decode(sentence, model, temperature, vocab_file = vocab_file, vocab_dir = vocab_dir)
      for _ in range(n_samples)
  ]
  # each result is (tokens, log_prob, sentence); the sentence is not needed here
  samples = [tokens for tokens, _, _ in decoded]
  log_probs = [logp for _, logp, _ in decoded]
  return samples, log_probs

## **4.3 Jaccard Similarity to compare a sample against other sample**

In [None]:
#we will be calculating scores for unigram overlaps
def jaccard_similarity(candidate, reference):
  """Return the Jaccard similarity between the unigram sets of two token lists.

  Args:
    candidate: tokenized version of the candidate translation.
    reference: tokenized version of the reference translation.

  Returns:
    float: |intersection| / |union| of the unique tokens; 0.0 when both
    inputs are empty (the ratio is undefined there and would otherwise raise
    ZeroDivisionError).
  """
  # convert the lists to sets to get the unique tokens
  can_unigram_set, ref_unigram_set = set(candidate), set(reference)

  # all tokens found in either candidate or reference
  all_elems = can_unigram_set | ref_unigram_set
  if not all_elems:
    return 0.0

  # tokens common to both candidate and reference
  joint_elems = can_unigram_set & ref_unigram_set

  return len(joint_elems) / len(all_elems)

## **4.4 Rouge1 Similarity for unigrams**

In [None]:
from collections import Counter

def rouge1_similarity(system, reference):
  """Return the ROUGE-1 (unigram F1) score between two token lists.

  Args:
    system: tokenized version of the system translation.
    reference: tokenized version of the reference translation.

  Returns:
    float: 2 * precision * recall / (precision + recall), where precision is
    overlap / number of system tokens and recall is overlap / number of
    reference tokens. Returns 0.0 when either list is empty or nothing
    overlaps (an empty list would otherwise raise ZeroDivisionError).
  """
  # frequency tables of the system and reference tokens
  sys_counter = Counter(system)
  ref_counter = Counter(reference)

  n_system = sum(sys_counter.values())
  n_reference = sum(ref_counter.values())
  if n_system == 0 or n_reference == 0:
    return 0.0

  # Counter intersection keeps, per token, the smaller of the two counts
  overlap = sum((sys_counter & ref_counter).values())

  precision = overlap / n_system
  recall = overlap / n_reference

  if precision + recall == 0:
    return 0.0

  # f1-score
  return 2 * ((precision * recall) / (precision + recall))

## **4.5 Overall Score**

In [None]:
#We will now build a function to generate the overall score for a particular sample.
#these will be the steps to generate the scores of a 4-sample list.
#Get similarity score between sample 1 and sample 2
#Get similarity score between sample 1 and sample 3
#Get similarity score between sample 1 and sample 4
#Get average score of the first 3 steps. This will be the overall score of sample 1.
#Iterate and repeat until samples 1 to 4 have overall scores
def average_overlap(similarity_fn, samples, *ignore_params):
  """Return, for each sample, its mean similarity to all the other samples.

  Args:
    similarity_fn (function): computes the overlap between a pair of samples.
    samples (list of lists): tokenized version of the translated sentences.
    *ignore_params: additional parameters are accepted and ignored, so this
      function can be called with the same arguments as weighted_avg_overlap.

  Returns:
    dict: sample index -> arithmetic mean of its similarity to the other
    samples. With a single sample there is nothing to compare against and its
    score is 0.0; with no samples the dict is empty.
  """
  scores = {}

  # every candidate is compared against all samples except itself
  n_others = len(samples) - 1

  for index_candidate, candidate in enumerate(samples):
    overlap = sum(
        similarity_fn(candidate, sample)
        for index_sample, sample in enumerate(samples)
        if index_sample != index_candidate
    )

    # average over the other samples; guard the single-sample case
    scores[index_candidate] = overlap / n_others if n_others > 0 else 0.0

  return scores

In [None]:
def weighted_avg_overlap(similarity_fn, samples,log_probs):
  """Return, for each sample, its probability-weighted mean similarity to the others.

  Args:
    similarity_fn (function): computes the overlap between a pair of samples.
    samples (list of lists): tokenized version of the translated sentences.
    log_probs (list of float): log probability of each translated sentence.

  Returns:
    dict: sample index -> sum(p_j * similarity(candidate, sample_j)) / sum(p_j)
    over all other samples j, with p_j = exp(log_probs[j]). When the weights
    sum to zero (e.g. a single sample) the score is 0.0 instead of raising
    ZeroDivisionError.
  """
  scores = {}

  for index_candidate, candidate in enumerate(samples):
    overlap, weights_sum = 0.0, 0.0

    for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):

      # a candidate is never scored against itself
      if index_candidate == index_sample:
        continue

      # convert log probability to linear scale
      sample_p = float(np.exp(logp))

      weights_sum += sample_p

      # weight the unigram overlap by the probability of the other sample
      overlap += sample_p * similarity_fn(candidate, sample)

    # normalise by the total weight; guard against an empty comparison set
    scores[index_candidate] = overlap / weights_sum if weights_sum > 0 else 0.0

  return scores

In [None]:
# Putting everything together: generate samples, score each sample, pick the
# highest-scoring one and detokenize it to get the translated sentence.
def mbr_decode(sentence, n_samples, score_fn, similarity_fn, model = None, temperature = 0.6, vocab_file = None, vocab_dir = None):
  """Return the translated sentence using Minimum Bayes Risk decoding.

  Args:
    sentence (str): sentence to translate.
    n_samples (int): number of samples to generate.
    score_fn (function): generates the score for each sample; called as
      score_fn(similarity_fn, samples, log_probs).
    similarity_fn (function): computes the overlap between a pair of samples.
    model: the translation model, forwarded to generate_samples().
    temperature (float): sampling parameter, forwarded to generate_samples().
    vocab_file: vocabulary file name.
    vocab_dir: vocabulary directory.

  Returns:
    tuple (str, int, dict): the translated sentence, the index of the chosen
    sample, and the scores of all samples keyed by index.
  """
  samples, log_probs = generate_samples(sentence, n_samples, model, temperature, vocab_file, vocab_dir)

  # dictionary of sample index -> score
  scores = score_fn(similarity_fn, samples, log_probs)

  # index of the best-scoring sample
  best_index = max(scores, key=lambda idx: scores.get(idx))

  best_sentence = detokenize(samples[best_index], vocab_file, vocab_dir)

  return (best_sentence, best_index, scores)

In [None]:
# temperature 1.0 samples from the model's distribution, so the candidates differ between runs.
temperature = 1.0

your_sentence = 'She speaks English and German.'

# MBR decoding over 4 samples scored with the weighted Jaccard overlap; [0] keeps only the sentence.
mbr_decode(your_sentence, 4, weighted_avg_overlap, jaccard_similarity, model, temperature, vocab_file=vocab_file, vocab_dir = vocab_dir)[0]

In [None]:
# MBR decoding over 4 samples scored with the unweighted ROUGE-1 overlap; [0] keeps only the sentence.
# Uses the `temperature` variable set in the previous cell (the name `TEMPERATURE` is never defined)
# and passes vocab_dir like every other decoding call in this notebook.
mbr_decode('You have completed the assignment!', 4, average_overlap, rouge1_similarity, model, temperature, vocab_file=vocab_file, vocab_dir=vocab_dir)[0]