# Neural machine translation

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

## Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/

In [5]:
!curl http://www.manythings.org/anki/rus-eng.zip -o rus-eng.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 13.7M    0 41928    0     0  41928      0  0:05:43  0:00:01  0:05:42 35774
  4 13.7M    4  634k    0     0   634k      0  0:00:22  0:00:01  0:00:21  335k
 17 13.7M   17 2517k    0     0  1258k      0  0:00:11  0:00:02  0:00:09  870k
 48 13.7M   48 6743k    0     0  2247k      0  0:00:06  0:00:03  0:00:03 1733k
 96 13.7M   96 13.2M    0     0  3394k      0  0:00:04  0:00:04 --:--:-- 2776k
100 13.7M  100 13.7M    0     0  3512k      0  0:00:04  0:00:04 --:--:-- 3720k


In [7]:
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/

A subdirectory or file rus-eng already exists.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
!Expand-Archive rus-eng.zip rus-eng/

'Expand-Archive' is not recognized as an internal or external command,
operable program or batch file.


In [17]:
!dir "./rus-eng/"

 Volume in drive C has no label.
 Volume Serial Number is 205C-D26E

 Directory of C:\Users\Ifl\Documents\GitHub\GB_NLP\l10\rus-eng

16.10.2021  20:17    <DIR>          .
16.10.2021  20:17    <DIR>          ..
14.07.2021  10:16        71ÿ485ÿ809 rus.txt
14.07.2021  10:16             1ÿ441 _about.txt
               2 File(s)     71ÿ487ÿ250 bytes
               2 Dir(s)  60ÿ790ÿ181ÿ888 bytes free


In [18]:
# Download the file
path_to_file = "./rus-eng/rus.txt"

In [19]:
def preprocess_sentence(w):
  w = w.lower().strip()

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [20]:
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [21]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [22]:
en, ru = create_dataset(path_to_file, None)
print(en[0])
print(ru[0])

<start> go . <end>
<start> марш ! <end>


In [23]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [24]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

### Limit the size of the dataset to experiment faster (optional)


In [25]:
len(en), len(ru)

(431097, 431097)

In [26]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [27]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [28]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [29]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
6 ----> том
548 ----> сильно
2563 ----> напился
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
5 ----> tom
67 ----> got
48 ----> very
246 ----> drunk
3 ----> .
2 ----> <end>


### Create a tf.data dataset

In [30]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [31]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 15]), TensorShape([64, 11]))

In [32]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=False,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [33]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [34]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden):
    # enc_output shape == (batch_size, max_length, hidden_size)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state

In [35]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)



In [36]:
decoder_sample_x.shape

TensorShape([64, 7260])

In [37]:
decoder_sample_h.shape

TensorShape([64, 1024])

## Define the optimizer and the loss function

In [38]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [39]:
checkpoint_dir = './training_nmt_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [40]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [41]:
EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6350
Epoch 1 Batch 100 Loss 2.1084
Epoch 1 Batch 200 Loss 1.7281
Epoch 1 Batch 300 Loss 1.7200
Epoch 1 Batch 400 Loss 1.6572
Epoch 1 Batch 500 Loss 1.4465
Epoch 1 Batch 600 Loss 1.5117
Epoch 1 Batch 700 Loss 1.3957
Epoch 1 Batch 800 Loss 1.3340
Epoch 1 Batch 900 Loss 1.2833
Epoch 1 Batch 1000 Loss 1.2737
Epoch 1 Batch 1100 Loss 1.1699
Epoch 1 Batch 1200 Loss 1.1603
Epoch 1 Loss 1.5303
Time taken for 1 epoch 701.5899829864502 sec

Epoch 2 Batch 0 Loss 1.0605
Epoch 2 Batch 100 Loss 0.9324
Epoch 2 Batch 200 Loss 1.0338
Epoch 2 Batch 300 Loss 1.0016
Epoch 2 Batch 400 Loss 0.9530
Epoch 2 Batch 500 Loss 0.9638
Epoch 2 Batch 600 Loss 0.9520
Epoch 2 Batch 700 Loss 0.8883
Epoch 2 Batch 800 Loss 0.8971
Epoch 2 Batch 900 Loss 0.8489
Epoch 2 Batch 1000 Loss 0.7748
Epoch 2 Batch 1100 Loss 0.8026
Epoch 2 Batch 1200 Loss 0.8204
Epoch 2 Loss 0.8954
Time taken for 1 epoch 718.1259777545929 sec

Epoch 3 Batch 0 Loss 0.5892
Epoch 3 Batch 100 Loss 0.6179
Epoch 3 Batch 200 Loss 0.538

Epoch 18 Batch 1100 Loss 0.1134
Epoch 18 Batch 1200 Loss 0.0631
Epoch 18 Loss 0.0713
Time taken for 1 epoch 750.8179750442505 sec

Epoch 19 Batch 0 Loss 0.0466
Epoch 19 Batch 100 Loss 0.0318
Epoch 19 Batch 200 Loss 0.0390
Epoch 19 Batch 300 Loss 0.0370
Epoch 19 Batch 400 Loss 0.0656
Epoch 19 Batch 500 Loss 0.0986
Epoch 19 Batch 600 Loss 0.0858
Epoch 19 Batch 700 Loss 0.0753
Epoch 19 Batch 800 Loss 0.0906
Epoch 19 Batch 900 Loss 0.0525
Epoch 19 Batch 1000 Loss 0.0754
Epoch 19 Batch 1100 Loss 0.0668
Epoch 19 Batch 1200 Loss 0.0665
Epoch 19 Loss 0.0698
Time taken for 1 epoch 727.7256755828857 sec

Epoch 20 Batch 0 Loss 0.0388
Epoch 20 Batch 100 Loss 0.0453
Epoch 20 Batch 200 Loss 0.0780
Epoch 20 Batch 300 Loss 0.0224
Epoch 20 Batch 400 Loss 0.0604
Epoch 20 Batch 500 Loss 0.0528
Epoch 20 Batch 600 Loss 0.0655
Epoch 20 Batch 700 Loss 0.0753
Epoch 20 Batch 800 Loss 0.1030
Epoch 20 Batch 900 Loss 0.0576
Epoch 20 Batch 1000 Loss 0.0690
Epoch 20 Batch 1100 Loss 0.1278
Epoch 20 Batch 1200 Loss 0

Epoch 36 Batch 200 Loss 0.0451
Epoch 36 Batch 300 Loss 0.0389
Epoch 36 Batch 400 Loss 0.0778
Epoch 36 Batch 500 Loss 0.0359
Epoch 36 Batch 600 Loss 0.0370
Epoch 36 Batch 700 Loss 0.1105
Epoch 36 Batch 800 Loss 0.0722
Epoch 36 Batch 900 Loss 0.0887
Epoch 36 Batch 1000 Loss 0.0535
Epoch 36 Batch 1100 Loss 0.0544
Epoch 36 Batch 1200 Loss 0.0440
Epoch 36 Loss 0.0539
Time taken for 1 epoch 699.8970670700073 sec

Epoch 37 Batch 0 Loss 0.0629
Epoch 37 Batch 100 Loss 0.0223
Epoch 37 Batch 200 Loss 0.0424
Epoch 37 Batch 300 Loss 0.0725
Epoch 37 Batch 400 Loss 0.0798
Epoch 37 Batch 500 Loss 0.0809
Epoch 37 Batch 600 Loss 0.0410
Epoch 37 Batch 700 Loss 0.0634
Epoch 37 Batch 800 Loss 0.0343
Epoch 37 Batch 900 Loss 0.0703
Epoch 37 Batch 1000 Loss 0.0809
Epoch 37 Batch 1100 Loss 0.0794
Epoch 37 Batch 1200 Loss 0.0585
Epoch 37 Loss 0.0540
Time taken for 1 epoch 677.2793025970459 sec

Epoch 38 Batch 0 Loss 0.0251
Epoch 38 Batch 100 Loss 0.0425
Epoch 38 Batch 200 Loss 0.0489
Epoch 38 Batch 300 Loss 0.0

## Translate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [42]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # storing the attention weights to plot later on
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [43]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint and test

In [44]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x25e9306f310>

In [45]:
translate('Здесь хорошо.')

Input: <start> здесь хорошо . <end>
Predicted translation: it's here here . <end> 


In [46]:
translate('Я не смогу поехать.')

Input: <start> я не смогу поехать . <end>
Predicted translation: i can't go . <end> 


In [47]:
translate(u'Вы еще дома?')

Input: <start> вы еще дома ? <end>
Predicted translation: are you still at home ? <end> 


In [48]:
translate(u'Вы все еще дома?')

Input: <start> вы все еще дома ? <end>
Predicted translation: are you still at home ? <end> 


In [49]:
translate(u'Попробуй сделать это.')

Input: <start> попробуй сделать это . <end>
Predicted translation: try and do it . <end> 


In [50]:
translate(u'Я люблю, когда идет снег.')

Input: <start> я люблю , когда идет снег . <end>
Predicted translation: i like playing silk . <end> 


In [51]:
translate(u'Я никогда такого не делаю.')

Input: <start> я никогда такого не делаю . <end>
Predicted translation: i never do that . <end> 
