<a href="https://colab.research.google.com/github/MinhDg00/en-vi-translation/blob/master/en_vi_translation%20with%20attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Translation Eng-Viet

The Project uses the EVBCorpus - a English and Vietnamese parallel translations and bitexts 

More information about the data can be found [here](https://github.com/qhungngo/EVBCorpus)

### Load Files and Create Dataset

In [1]:
!pwd

/content


In [0]:
# Packages for loading file
import os
from zipfile import ZipFile
from bs4 import BeautifulSoup
import re
import glob
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Set seed
import numpy as np
import random
SEED = 46
np.random.seed(SEED)
random.seed(SEED)

In [0]:
with ZipFile('data.zip', 'r') as data:
   # Extract all the contents of zip file in current directory
   data.extractall()

In [0]:
data = []
path = 'data/*.sgml'
files = glob.glob(path)
for file in files:
    data.append(BeautifulSoup(open(file), 'lxml'))

In [189]:
data[1].find_all('s')[99].text

'Các phép đo bao gồm độ dẫn da , mạch thể tích máu , các dạng sóng não ( sử dụng điện não đồ ) , hoạt động cơ ( sử dụng ghi điện đồ cơ ) , hoạt động thở , và sự giãn nở đồng tử .'

In [0]:
# Create 2 list contains english and vietnamese texts
english_sentences = []
vietnamese_sentences = []
for d in data:
    i = 0
    for text in d.find_all('s'):
        if i%2 == 0:
            english_sentences.append(text.text)
        else:
            vietnamese_sentences.append(text.text)
        i += 1 

In [7]:
print(vietnamese_sentences[0])
print(english_sentences[0])

ADN của ông Dominique Strauss-Kahn " có dính líu đến cô phục vụ phòng "
Dominique Strauss-Kahn DNA " linked to maid "


### Vocabulary

In [0]:
# Create a counter
import collections

english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
vietnamese_words_counter = collections.Counter([word for sentence in vietnamese_sentences for word in sentence.split()])

In [9]:
# Inverstigate unique and most common words in both texts
print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')


880216 English words.
40903 unique English words.
10 Most common words in the English dataset:
"." "," "the" "to" "of" "and" "a" "in" """ "is"


In [10]:
print('{} Vietnamese words.'.format(len([word for sentence in vietnamese_sentences for word in sentence.split()])))
print('{} unique Vietnamese words.'.format(len(vietnamese_sentences)))
print('10 Most common words in the Vietnamese dataset:')
print('"' + '" "'.join(list(zip(*vietnamese_words_counter.most_common(10)))[0]) + '"')

1201737 Vietnamese words.
45308 unique Vietnamese words.
10 Most common words in the Vietnamese dataset:
"." "," "và" "có" "của" "là" "một" """ "cho" "các"


###Sentiment Classification


In [11]:
# install if not
!pip install spacy
!pip install transformers==2.3.0
!pip install tensorflow==2.1.0

Collecting transformers==2.3.0
  Using cached https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl
[31mERROR: spacy-transformers 0.5.1 has requirement transformers<2.1.0,>=2.0.0, but you'll have transformers 2.3.0 which is incompatible.[0m
Installing collected packages: transformers
  Found existing installation: transformers 2.0.0
    Uninstalling transformers-2.0.0:
      Successfully uninstalled transformers-2.0.0
Successfully installed transformers-2.3.0


In [0]:
from transformers import pipeline
import pandas as pd

In [0]:
sentiment_classifier = pipeline('sentiment-analysis')

We will only investigate the first 100 english texts

In [0]:
s = ([sentiment_classifier(sentence) for sentence in english_sentences[:100]])

In [0]:
sentiment_df = pd.DataFrame({'English_text':english_sentences[:100], 'Sentiment': s}, index = range(1,101))

In [16]:
sentiment_df.head()

Unnamed: 0,English_text,Sentiment
1,"Dominique Strauss-Kahn DNA "" linked to maid ""","[{'label': 'POSITIVE', 'score': 0.6252788}]"
2,Dominique Strauss-Kahn is being held under hou...,"[{'label': 'NEGATIVE', 'score': 0.9738129}]"
3,DNA found on the clothes of a New York hotel m...,"[{'label': 'NEGATIVE', 'score': 0.98521155}]"
4,These unconfirmed reports cited sources close ...,"[{'label': 'NEGATIVE', 'score': 0.8924382}]"
5,More tests from the room where the alleged att...,"[{'label': 'NEGATIVE', 'score': 0.99367356}]"


### Named Entity Recognition
For performance sake, I will only investigate the first 100 english sentences

In [0]:
import spacy

In [0]:
tag = spacy.load('en_core_web_sm') 

In [0]:
tag = ([tag(sentence) for sentence in english_sentences[:100]])

In [38]:
print('\nNamed Entity recognized from the text 1 to 3\n')
for ent in tag[2].ents:
  print(ent.text, ent.label_)


Named Entity recognized from the text 1 to 3

New York GPE
Dominique Strauss-Kahn PERSON
US GPE


### Preprocess


In [39]:
from __future__ import absolute_import, division, print_function
import tensorflow as tf
import time
import os 
import io
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [0]:
# Lowercase, trim and remove non-letter characters
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r'[-]+', " ", s)
    #s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.rstrip().strip()
    s = '<sos> ' + s + ' <eos>'
    
    return s

In [179]:
english_sentences[13]

'Guinea , told authorities that Mr Strauss-Kahn had accosted her after she entered his hotel room to clean it .'

In [0]:
# Normalize both english and vietnamese texts
eng_text = []
viet_text = []
for (eng, viet) in zip(english_sentences, vietnamese_sentences):
  eng_text.append(normalizeString(eng))
  viet_text.append(normalizeString(viet))

In [202]:
eng_text[202]

'<sos> worldwide , it is thought that at least half a million women are raped each year after consuming adulterated drinks . <eos>'

In [0]:
def tokenize(x):
    x_tk = Tokenizer(filters = '')
    x_tk.fit_on_texts(x)
    
    return x_tk.texts_to_sequences(x), x_tk

In [0]:
def pad(x, length = None):

    if length is None:
        length = max([len(sentence) for sentence in x])
    
    return pad_sequences(x, maxlen = length, padding = 'post')  # maxlen = length

In [0]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x, 30)
    preprocess_y = pad(preprocess_y, 30)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    # preprocess_y = preprocess_y.reshape(*preprocess_y.shape)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [0]:
# Reduce the data size to speed up training
eng_text = eng_text[:10000]
viet_text = viet_text[:10000]

In [260]:
eng_text[13]

'<sos> guinea , told authorities that mr strauss kahn had accosted her after she entered his hotel room to clean it . <eos>'

In [0]:
preproc_english_sentences, preproc_vietnamese_sentences, english_tokenizer, vietnamese_tokenizer =\
    preprocess(eng_text, viet_text)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_vietnamese_sequence_length = preproc_vietnamese_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
vietnamese_vocab_size = len(vietnamese_tokenizer.word_index)

In [262]:
print(max_english_sequence_length)
print(max_vietnamese_sequence_length)
print(english_vocab_size)
print(vietnamese_vocab_size)

30
30
31498
14849


In [263]:
# Split data into training and validation set
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val =  train_test_split(preproc_vietnamese_sentences, preproc_english_sentences, test_size = 0.2)

print(len(input_tensor_train), len(input_tensor_val), len(target_tensor_train), len(target_tensor_val))

36246 9062 36246 9062


In [0]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [265]:
convert(english_tokenizer, target_tensor_train[0])

1 ----> <sos>
28 ----> from
9 ----> a
1772 ----> tiny
6475 ----> wooden
1523 ----> table
17 ----> with
15705 ----> pint
942 ----> size
2912 ----> plastic
334 ----> white
5740 ----> chairs
5 ----> ,
9 ----> a
360 ----> woman
2424 ----> served
87 ----> us
2779 ----> giant
8071 ----> bowls
4 ----> .
2 ----> <eos>


In [0]:
# Create a tf.data dataset
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 64
units = 256
vocab_inp_size = len(vietnamese_tokenizer.word_index) + 1
vocab_tar_size = len(english_tokenizer.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [267]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape


(TensorShape([64, 30]), TensorShape([64, 30]))

### Machine Translation Model with Attention
The model is inspired by NMT with Attention Model from TensorFlow tutorial

For more information, please visit https://www.tensorflow.org/tutorials/text/nmt_with_attention

Encoder Model

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [269]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))


Encoder output shape: (batch size, sequence length, units) (64, 30, 256)
Encoder Hidden state shape: (batch size, units) (64, 256)


##### Attention model

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights


In [271]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))


Attention result shape: (batch size, units) (64, 256)
Attention weights shape: (batch_size, sequence_length, 1) (64, 30, 1)


##### Decoder

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [273]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))


Decoder output shape: (batch_size, vocab size) (64, 31499)


##### Optimizer and Loss function

In [0]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([english_tokenizer.word_index['<sos>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss


In [0]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 6.0528
Epoch 1 Batch 100 Loss 4.1949
Epoch 1 Batch 200 Loss 4.1404
Epoch 1 Batch 300 Loss 4.4094
Epoch 1 Batch 400 Loss 4.0954
Epoch 1 Batch 500 Loss 4.1793
Epoch 1 Loss 4.2349
Time taken for 1 epoch 186.51109504699707 sec

Epoch 2 Batch 0 Loss 4.0469
Epoch 2 Batch 100 Loss 3.8184
Epoch 2 Batch 200 Loss 4.0111
Epoch 2 Batch 300 Loss 3.7580
Epoch 2 Batch 400 Loss 3.8417
Epoch 2 Batch 500 Loss 4.0206
Epoch 2 Loss 3.9303
Time taken for 1 epoch 148.71146774291992 sec

Epoch 3 Batch 0 Loss 3.5319


In [179]:
max_english_sequence_length

70

In [243]:
english_tokenizer.word_index['the']

1

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_english_sequence_length, max_vietnamese_sequence_length))

  sentence = normalizeString(sentence)

  inputs = [vietnamese_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen= max_vietnamese_sequence_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([english_tokenizer.word_index['<sos>']], 0)

  for t in range(max_english_sequence_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    #attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += english_tokenizer.index_word[predicted_id] + ' '

    if english_tokenizer.index_word[predicted_id] == '<eos>':
      return result, sentence #, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence #, attention_plot

In [0]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))