In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
# Location of the parallel corpus, relative to the notebook's working directory.
# create_dataset() reads it as UTF-8 text and splits every line on tab characters.
path_to_file = "./text_data/rus-eng/rus.txt"


In [3]:
def unicode_to_ascii(s):
    """Strip accent marks from ``s`` without damaging Cyrillic letters.

    Plain "NFD + drop combining marks" also decomposes Cyrillic letters such
    as "й" (-> "и") and "ё" (-> "е"), which silently rewrites Russian words
    ("мой" would become "мои"). Characters whose base letter is Cyrillic are
    therefore kept in their composed form; every other character still loses
    its combining marks (Unicode category ``Mn``).

    :param s: input text
    :return: ``s`` with accents removed from non-Cyrillic letters and stray
             combining marks dropped
    """
    pieces = []
    # Compose first so a base letter followed by a combining mark that has a
    # precomposed form (e.g. "и" + U+0306) is examined as one character.
    for ch in unicodedata.normalize('NFC', s):
        decomposed = unicodedata.normalize('NFD', ch)
        if 'CYRILLIC' in unicodedata.name(decomposed[0], ''):
            # Precomposed Cyrillic letters are distinct letters, not accented variants.
            pieces.append(ch)
        else:
            pieces.append(''.join(c for c in decomposed if unicodedata.category(c) != "Mn"))
    return ''.join(pieces)


def preprocess_sentence(w):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lower-case and trim, remove accents via ``unicode_to_ascii``,
    isolate the punctuation marks ``? . ! ,`` as separate tokens, collapse
    runs of spaces / double quotes into a single space, trim again and add the
    sequence markers.

    Example: ``"Hi,there!"`` -> ``"<start> hi , there ! <end>"``

    :param w: raw sentence
    :return: normalised sentence with whitespace-separated tokens
    """
    w = unicode_to_ascii(w.lower().strip())

    # Pad punctuation on BOTH sides. Padding only on the left leaves the mark
    # glued to a following word ("hi,there" -> "hi ,there"), producing a fused
    # ",there" token; the surplus spaces are collapsed just below.
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # The character class contains a space and a double quote, so runs of
    # either (or both) become one space, which also drops double quotes.
    w = re.sub(r'[" "]+', " ", w)

    # A Latin-only filter is left disabled because it would replace every
    # Cyrillic character with a space:
    # w = re.sub(r"[^a-zA-Z.?!,]+", " ", w)

    w = w.strip()

    # Add start and end token
    return "<start> %s <end>" % w

In [4]:
# Quick check of the preprocessing on one Russian and one English sentence.
ru_sentence = u"Я любим."
en_sentence = u"I'm loved."

for sample_sentence in (ru_sentence, en_sentence):
    print(preprocess_sentence(sample_sentence))

<start> я любим . <end>
<start> i'm loved . <end>


In [5]:
def create_dataset(path, num_examples):
    """Read the tab-separated corpus and return its preprocessed sentence columns.

    :param path: path to a UTF-8 text file with one example per line and
                 tab-separated fields
    :param num_examples: number of leading lines to use; ``None`` uses all lines
    :return: a ``zip`` iterator over the columns: first a tuple with every
             first-field sentence, then a tuple with every second-field sentence
    """
    # The context manager closes the file handle as soon as the text is read.
    with io.open(path, encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # Only the first two tab-separated fields of a line are kept ([:2]);
    # anything after them is ignored.
    word_pairs = [[preprocess_sentence(field) for field in line.split('\t')[:2]]
                  for line in lines[:num_examples]]

    # Transpose the list of pairs into one tuple per column.
    return zip(*word_pairs)

# Preview: load the first 10 lines and show the last sentence of each column.
en, ru = create_dataset(path_to_file, 10)
for column_sentences in (en, ru):
    print(column_sentences[-1])

<start> run ! <end>
<start> бегите ! <end>


In [6]:
def tokenize(lang):
    """
    Fit a Keras tokenizer on the sentences and encode them as a padded id matrix.
    :param lang: the sentences to tokenize (sequence of strings)
    :return: tuple of (padded matrix of token ids, fitted tokenizer)
    """
    preprocessing = tf.keras.preprocessing

    # filters='' switches off the tokenizer's character filtering, so the
    # "<start>" / "<end>" markers and punctuation tokens are kept.
    fitted_tokenizer = preprocessing.text.Tokenizer(filters='')
    fitted_tokenizer.fit_on_texts(lang)

    # padding="post" appends the padding after each sentence.
    padded_ids = preprocessing.sequence.pad_sequences(
        fitted_tokenizer.texts_to_sequences(lang), padding="post")

    return padded_ids, fitted_tokenizer


def load_dataset(path, num_examples=None):
    """Load the corpus and tokenize both sides of every sentence pair.

    :param path: corpus file handed to ``create_dataset``
    :param num_examples: number of leading lines to use; ``None`` uses all lines
    :return: ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``
    """
    # The first column returned by create_dataset is treated as the target
    # language and the second as the input language.
    target_sentences, input_sentences = create_dataset(path, num_examples)

    input_ids, input_tokenizer = tokenize(input_sentences)
    target_ids, target_tokenizer = tokenize(target_sentences)

    return input_ids, target_ids, input_tokenizer, target_tokenizer

In [7]:
# Only the first NUM_EXAMPLES lines of the corpus are used.
NUM_EXAMPLES = 10000
# inp_lang / tar_lang are the fitted tokenizers returned by load_dataset, not sentence lists.
input_tensor, target_tensor, inp_lang, tar_lang = load_dataset(path_to_file, num_examples=NUM_EXAMPLES)

# Axis 1 of each padded id matrix is the padded sentence length.
max_len_tar, max_len_inp = target_tensor.shape[1], input_tensor.shape[1]

# Bare expression so the notebook displays both lengths.
max_len_tar, max_len_inp

(8, 14)

In [8]:
# lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
# lang_tokenizer.fit_on_texts(ru)
#
# tensor = lang_tokenizer.texts_to_sequences(ru)
# print(tensor)
# tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

In [9]:
# Hold out 20% of the sentence pairs for validation. random_state pins the
# shuffle so the split is the same on every Restart & Run All.
inp_train, inp_val, tar_train, tar_val = train_test_split(
    input_tensor, target_tensor, test_size=.2, random_state=42)

len(inp_train), len(inp_val)


(8000, 2000)

In [10]:
def convert(lang, tensor):
    """Print every non-zero token id of ``tensor`` next to the word it stands for.

    :param lang: object whose ``index_word`` mapping translates a token id into a word
    :param tensor: iterable of token ids; entries equal to 0 are skipped
    """
    non_zero_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in non_zero_ids:
        word = lang.index_word[token_id]
        print("%d\t ---> %s" % (token_id, word))

# Show the id -> word mapping of the first training pair:
# input sentence, a blank separator line, then the target sentence.
convert(inp_lang, inp_train[0])
print("")
convert(tar_lang, tar_train[0])

1	 ---> <start>
39	 ---> оно
870	 ---> белое
3	 ---> .
2	 ---> <end>

1	 ---> <start>
17	 ---> it's
386	 ---> white
3	 ---> .
2	 ---> <end>


In [11]:
# Build dataset
BUFFER_SIZE = len(inp_train)   # shuffle buffer spans the whole training set
BATCH_SIZE = 64
steps_per_epoch = len(inp_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 because the tokenizer's word ids start at 1; id 0 is the padding value.
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(tar_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((inp_train, tar_train))
# cache() must come BEFORE shuffle(): caching after shuffle()/batch() stores the
# first epoch's shuffled batches and replays exactly those in every later epoch.
# Caching the raw examples lets shuffle() draw a new order each epoch.
dataset = (dataset
           .cache()
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True)
           .prefetch(tf.data.experimental.AUTOTUNE))

In [12]:
# Pull a single batch from a fresh iterator; later cells use it for shape checks.
example_inp_batch, example_tar_batch = next(iter(dataset))
# Bare expression so the notebook displays both batch shapes.
example_inp_batch.shape, example_tar_batch.shape

(TensorShape([64, 14]), TensorShape([64, 8]))

In [13]:
# Encoder and Decoder
# The input is put through an encoder model, which gives the encoder output of shape
# (batch_size, max_length, hidden_size) and the encoder hidden state of shape (batch_size, hidden_size)
# a_ts = exp(score(ht, hs)) / sum_s'(exp(score(ht, hs')))   Attention weights
# ct = sum_s(a_ts * hs)                                     Context vector
# at = f(ct, ht) = tanh(Wc[ct;ht])                          Attention vector
# score(ht, hs) = ht.T * W * hs  or  v_a.T * tanh(W1*ht + W2*hs)

# score = FC(tanh(FC(EO) + FC(H))) (EO: encoder output)
# attention weights = softmax(score, axis=1) (axis 1 is max_length of input)
# context vector = sum(attention weights * EO, axis=1)
# embedding_output = the input to the decoder X is passed through an embedding layer
# merged vector = concat(embedding output, context vector)
# then feed to GRU

class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that returns all outputs and the final state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        """
        :param vocab_size: number of rows in the embedding table
        :param embedding_dim: size of each embedding vector
        :param enc_units: number of GRU units
        :param batch_size: batch size used by ``initialize_hidden_state``
        """
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def get_config(self):
        """Placeholder override; no configuration is produced (returns ``None``)."""
        return None

    def call(self, x, hidden=None, *args):
        """Embed the token ids and run them through the GRU.

        :param x: batch of token-id sequences
        :param hidden: initial GRU state, or ``None`` for the layer's default
        :return: ``(output, state)`` - the per-step GRU outputs and the final state
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, enc_units)``."""
        zero_state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(zero_state_shape)


encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Shape check on the example batch. sample_hidden starts as the all-zero
# initial state and is then overwritten with the state returned by the encoder.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_inp_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 14, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [14]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(query) + W2(values)))``."""

    def __init__(self, units):
        """
        :param units: width of the two Dense projections ``W1`` and ``W2``
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values=None, *args, **kwargs):
        """
        Compute attention weights over ``values`` and the resulting context vector.

        :param query: hidden state, shape (batch_size, hidden_size)
        :param values: sequence to attend over, shape (batch_size, max_len, hidden_size)
        :param args: unused
        :param kwargs: unused
        :return: (context_vector, attention_weights) with shapes
                 (batch_size, hidden_size) and (batch_size, max_len, 1)
        """
        # (batch_size, 1, hidden_size): the added time axis lets the query
        # projection broadcast against every time step of the values projection.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected_query = self.W1(query_with_time_axis)
        projected_values = self.W2(values)

        # The sum is (batch_size, max_len, units); V maps it to one score per
        # time step, giving (batch_size, max_len, 1).
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        # Normalise over the time axis (axis 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over time -> (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights


# Shape check only: 10 is the width of the layer's internal W1/W2 projections.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 14, 1)


In [15]:
class Decoder(tf.keras.Model):
    """One-step attention decoder: attention -> embedding -> GRU -> Dense logits."""

    def get_config(self):
        # Placeholder override; no configuration is produced (returns None).
        pass

    def __init__(self, vocab_size, embedding_dim, decode_units, batch_size):
        """
        :param vocab_size: target vocabulary size (embedding rows and number of logits)
        :param embedding_dim: size of each embedding vector
        :param decode_units: number of GRU units, also the attention projection width
        :param batch_size: stored on the instance; not used by ``call``
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decode_units = decode_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.decode_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.decode_units)

    def call(self, x, hidden=None, enc_output=None, *args, **kwargs):
        """Run a single decoding step.

        :param x: decoder input token ids, shape (batch_size, 1)
        :param hidden: previous decoder state, shape (batch_size, decode_units);
                       used as the attention query AND as the GRU's initial state
        :param enc_output: encoder outputs, shape (batch_size, max_length, hidden_size)
        :return: ``(logits, state, attention_weights)`` where logits has shape
                 (batch_size, vocab)
        """
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation = (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Feed the previous state into the GRU. Without initial_state the GRU
        # restarts from zeros on every call, so the recurrent state carried
        # between decoding steps would never reach the GRU itself.
        output, state = self.gru(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, shape=(-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights


decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
# Shape check only: random values stand in for a batch of one-token decoder inputs.
# NOTE(review): tf.random.uniform yields floats by default rather than integer token
# ids - acceptable here only because the cell inspects the output shape.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 1748)


In [16]:
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer is created without an activation.
# reduction='none' keeps one loss value per example so loss_func can mask padding.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(target, predict):
    """Cross-entropy for one decoding step with padding positions zeroed out.

    :param target: true token ids for this step; id 0 marks padding
    :param predict: logits produced by the decoder for this step
    :return: mean of the masked per-example losses (masked entries count as 0
             and are still included in the mean)
    """
    per_example_loss = loss_object(target, predict)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    is_real_token = tf.math.logical_not(tf.math.equal(target, 0))
    per_example_loss *= tf.cast(is_real_token, dtype=per_example_loss.dtype)

    return tf.reduce_mean(per_example_loss)

In [17]:
# Checkpoints are written under checkpoint_dir with the file prefix "ckpt".
checkpoint_dir = './machine_translate_seq2seq_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so all three are saved by checkpoint.save().
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [18]:
# Training
# Pass the input through the encoder, which returns the encoder output and the encoder hidden state.
# The encoder output, the encoder hidden state and the decoder input are passed to the decoder.
# The decoder returns the predictions and the decoder hidden state.
# The decoder hidden state is then passed back into the model, and the predictions
# are used to calculate the loss.
# Teacher forcing: the target word is used as the next input to the decoder.
# Finally, calculate the gradients and apply them with the optimizer (backpropagation).

@tf.function
def train_step(inp, target, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    :param inp: batch of input token-id sequences
    :param target: batch of target token-id sequences
    :param enc_hidden: initial hidden state for the encoder
    :return: the summed step losses divided by the target sequence length
    """
    start_id = tar_lang.word_index['<start>']
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_state = encoder(inp, enc_hidden)

        # The decoder starts from the encoder's final state ...
        dec_hidden = enc_state
        # ... and from a column of <start> tokens, one per example.
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        # Teacher forcing: position 0 of the target is <start>, so the
        # predictions are scored against positions 1..end and the true token
        # becomes the next decoder input.
        for step in range(1, target.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_func(target[:, step], predictions)
            dec_input = tf.expand_dims(target[:, step], 1)

    batch_loss = loss / int(target.shape[1])

    # Gradients are taken of the summed loss, not of the length-normalised batch_loss.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [19]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()

    # The same all-zero encoder state is handed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, target)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, target, enc_hidden)
        total_loss += batch_loss

        # Progress line every 100 batches.
        if batch % 100 == 0:
            print('Epoch %d Batch %d Loss %.4f' % (epoch + 1, batch, batch_loss.numpy()))


    # Save a checkpoint every 5th epoch.
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    # Average of the per-batch losses over the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 4.3016
Epoch 1 Batch 100 Loss 1.8870


KeyboardInterrupt: 