In [None]:
# Mount Google Drive inside the Colab filesystem at /content/data; the dataset
# and checkpoint paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/data")

Mounted at /content/data


In [None]:
import os
import numpy as np
import unicodedata
import tensorflow as tf
import re
import io
import time
from sklearn.model_selection import train_test_split

In [None]:
# Location of the tab-separated sentence-pair file on the mounted Drive
# (read by create_dataset below).
path_data_file = "/content/data/MyDrive/data/rus-eng/rus.txt"

In [None]:
def unicode_to_ascii(w):
    """Remove combining marks (accents) from a string.

    The whole string is first decomposed with NFD so that precomposed
    characters (e.g. "é") are split into a base character followed by
    combining marks; every character of Unicode category ``Mn`` is then
    dropped. Normalising *before* filtering is essential: filtering the raw
    characters first lets precomposed accents through untouched.

    Despite the name, non-ASCII base characters are kept as they are; only
    the marks are removed. Note that this also folds the Cyrillic letters
    "й" and "ё" to "и" and "е", because they decompose to a base letter plus
    a combining mark.

    :param w: the text to clean
    :return: ``w`` without combining marks
    """
    decomposed = unicodedata.normalize("NFD", w)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

word = "Оно там?"
unicode_to_ascii(word)

'Оно там?'

In [None]:
def preprocess_word(w):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: strip combining marks via ``unicode_to_ascii``, put a space in
    front of ``. ! ? ,`` so each becomes its own token, replace double quotes
    and collapse every run of spaces/quotes into a single space, trim the
    ends, and finally add the sentence boundary tokens.

    :param w: the raw sentence
    :return: the cleaned sentence as ``"<start> ... <end>"``
    """
    w = unicode_to_ascii(w)

    # Detach sentence punctuation from the preceding word.
    w = re.sub(r"([.!?,])", r" \1", w)
    # The "+" matters: without it "word ," turns into "word  ," (two spaces),
    # which yields empty tokens for any consumer that splits on a single space.
    w = re.sub(r'[" ]+', " ", w)

    w = w.strip()
    return "<start> %s <end>" % w

# Demo: run the full preprocessing on the sample sentence `word` defined earlier.
preprocess_word(word)

'<start> Оно там ? <end>'

In [None]:
def create_dataset(path, num_instance):
    """Read tab-separated sentence pairs and return them grouped by column.

    Each line of the file is split on tabs; only the first two fields are
    kept and passed through ``preprocess_word``. Any further fields on the
    line are ignored.

    :param path: path of a UTF-8 text file with one tab-separated pair per line
    :param num_instance: maximum number of lines to use (``None`` uses all)
    :return: a ``zip`` iterator yielding one tuple per column, i.e. the
             preprocessed first fields followed by the preprocessed second fields
    """
    # Use a context manager so the file handle is closed deterministically.
    with io.open(path, encoding="utf-8") as data_file:
        lines = data_file.read().strip().split("\n")

    pairs = [[preprocess_word(w) for w in line.split("\t")[:2]] for line in lines[:num_instance]]
    # Transpose [[a0, b0], [a1, b1], ...] into (a0, a1, ...), (b0, b1, ...).
    return zip(*pairs)

# Smoke test on the first 3 lines: `a` receives the first column of the file
# and `b` the second column.
a, b = create_dataset(path_data_file, 3)
a, b

(('<start> Go . <end>', '<start> Go . <end>', '<start> Go . <end>'),
 ('<start> Марш ! <end>', '<start> Иди . <end>', '<start> Идите . <end>'))

In [None]:
def tokenize(texts):
    """
    Fit a Keras tokenizer on ``texts`` and encode them as one padded id matrix.

    :param texts: the sentences to tokenize
    :return: the post-padded sequence matrix and the fitted tokenizer
    """
    # filters='' disables Keras' default character filtering.
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    text_tokenizer.fit_on_texts(texts)

    # Encode to id sequences, then pad on the right in one go.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        text_tokenizer.texts_to_sequences(texts),
        padding="post",
    )

    return padded, text_tokenizer


def load_dataset(path, num_instance):
    """Load sentence pairs from ``path`` and tokenize both sides.

    The first column returned by ``create_dataset`` is treated as the target
    side and the second column as the input side.

    :param path: path of the sentence-pair file
    :param num_instance: maximum number of lines to load
    :return: ``(tar_tensor, inp_tensor, tar_tokenizer, inp_tokenizer)``
    """
    target_texts, input_texts = create_dataset(path, num_instance)

    # Target side is tokenized first, then the input side.
    target_encoded = tokenize(target_texts)
    input_encoded = tokenize(input_texts)

    return target_encoded[0], input_encoded[0], target_encoded[1], input_encoded[1]



In [None]:
# Upper bound on the number of lines read from the data file.
NUM_EXAMPLES = 150000

# Tokenize both sides of the corpus; each *_tensor is a padded id matrix.
tar_tensor, inp_tensor, tar_tokenizer, inp_tokenizer = load_dataset(path_data_file, NUM_EXAMPLES)

# Padded width (second axis) of each matrix; reused later when evaluating.
max_len_tar = tar_tensor.shape[1]
max_len_inp = inp_tensor.shape[1]

max_len_tar, max_len_inp

(12, 15)

In [None]:
# Split inputs and targets into train / validation parts with the library's
# default split ratio.
# NOTE(review): no random_state is passed, so the split differs between runs;
# consider fixing a seed if reproducibility matters.
inp_tensor_train, inp_tensor_val, tar_tensor_train, tar_tensor_val = train_test_split(inp_tensor, tar_tensor)

len(inp_tensor_train), len(inp_tensor_val)

(112500, 37500)

In [None]:
def print_convert(tokenizer, tensor):
    """Print every non-zero id of ``tensor`` next to the word it maps to.

    :param tokenizer: object exposing an ``index_word`` id-to-word mapping
    :param tensor: iterable of token ids; zeros are skipped
    """
    for token_id in tensor:
        if token_id == 0:
            # Zero entries are not printed.
            continue
        print("%d\t--->\t%s" % (token_id, tokenizer.index_word[token_id]))

# Show the id -> word mapping for the first training input sequence.
print_convert(inp_tokenizer, inp_tensor_train[0])

1	--->	<start>
10	--->	ты
523	--->	делаешь
17	--->	меня
150	--->	такой
989	--->	счастливой
3	--->	.
2	--->	<end>


In [None]:
# Size of the shuffle buffer and number of sequence pairs per training batch.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

dataset = tf.data.Dataset.from_tensor_slices((inp_tensor_train, tar_tensor_train))
# cache() has to come *before* shuffle(): a cache replays exactly the elements
# it stored, so caching an already-shuffled stream would freeze one order and
# repeat it every epoch. drop_remainder=True keeps every batch at BATCH_SIZE.
dataset = dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(1)
dataset

<PrefetchDataset shapes: ((64, 15), (64, 12)), types: (tf.int32, tf.int32)>

In [None]:
# Model sizes: embedding width and GRU units for encoder / decoder.
EMBEDDING_DIM = 256
ENC_UNITS = 1024
DEC_UNITS = 1024

# +1 leaves room for id 0, which the padded matrices use as filler
# (print_convert skips it for that reason).
vocab_inp_size = len(inp_tokenizer.word_index) + 1
vocab_tar_size = len(tar_tokenizer.word_index) + 1

In [None]:
# Sanity check: pull a single batch and print the input / target batch shapes.
for exam_inp, exam_tar in dataset.take(1):
    print(exam_inp.shape, exam_tar.shape)

(64, 15) (64, 12)


In [None]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder.

    Maps a batch of token ids to the GRU output at every time step plus the
    final GRU state.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        """
        :param vocab_size: number of rows of the embedding table
        :param embedding_dim: width of each embedding vector
        :param encoder_units: number of GRU units
        :param batch_size: batch size used by ``initialize_hidden``
        """
        # Initialise the Keras base class before assigning any attribute so
        # attribute tracking is set up first.
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = tf.keras.layers.GRU(encoder_units, return_sequences=True, return_state=True,
                                       recurrent_initializer="glorot_uniform")

    def get_config(self):
        """Return the constructor arguments so the model can be re-created."""
        return {
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "encoder_units": self.encoder_units,
            "batch_size": self.batch_size,
        }

    def call(self, x, hidden, *args):
        """
        :param x: batch of token ids
        :param hidden: initial GRU state
        :return: ``(output, state)`` - per-step GRU outputs and the final state
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden(self):
        """Return an all-zero initial state of shape (batch_size, encoder_units)."""
        return tf.zeros((self.batch_size, self.encoder_units))


# Smoke test: run one batch through a throw-away encoder and inspect the
# shapes of its outputs. sample_output / sample_hidden are reused by the
# attention and decoder demos further down.
exam_inp, exam_tar = next(iter(dataset))
encoder_test = Encoder(vocab_inp_size, EMBEDDING_DIM, ENC_UNITS, batch_size=BATCH_SIZE)

init_hidden = encoder_test.initialize_hidden()
sample_output, sample_hidden = encoder_test(exam_inp, init_hidden)

"Encoder Output Shape: ", sample_output.shape, "Encoder Hidden shape: ", sample_hidden.shape


('Encoder Output Shape: ',
 TensorShape([64, 15, 1024]),
 'Encoder Hidden shape: ',
 TensorShape([64, 1024]))

In [None]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention over a sequence of encoder outputs."""

    def __init__(self, fc_units):
        """
        :param fc_units: width of the two projection layers feeding the score
        """
        super(BahdanauAttention, self).__init__()
        self.fc1 = tf.keras.layers.Dense(fc_units)
        self.fc2 = tf.keras.layers.Dense(fc_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values, *args):
        """
        Score every time step of ``values`` against ``query`` and return the
        weighted sum of ``values``.

        :param query: hidden state, shape (batch_size, hidden size)
        :param values: encoder output, shape (batch_size, max_len, hidden size)
        :param args: ignored
        :return: ``(context_vector, attention_weights)`` with shapes
                 (batch_size, hidden size) and (batch_size, max_len, 1)
        """
        # Insert a time axis -> (batch_size, 1, hidden size) so the projected
        # query broadcasts against every time step of the projected values.
        expanded_query = tf.expand_dims(query, axis=1)

        # (batch_size, 1, units) + (batch_size, max_len, units)
        #   -> (batch_size, max_len, units)
        projected = self.fc2(expanded_query) + self.fc1(values)

        # One scalar score per time step: (batch_size, max_len, 1).
        score = self.V(tf.tanh(projected))

        # Normalise over the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Element-wise weighting (not a dot product), then sum over time.
        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)

        return context_vector, attention_weights

# Smoke test: apply a small attention layer to the encoder demo outputs and
# inspect the result shapes.
# NOTE(review): this binds the module-level names `attention_context` and
# `attention_weights`; later code should not rely on these demo values.
attention = BahdanauAttention(10)
attention_context, attention_weights = attention(sample_hidden, sample_output)

"Attention context shape: ", attention_context.shape, "attention weights shape: ", attention_weights.shape

('Attention context shape: ',
 TensorShape([64, 1024]),
 'attention weights shape: ',
 TensorShape([64, 15, 1]))

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder output."""

    def __init__(self, vocab_size, embedding_dim, dec_units):
        """
        :param vocab_size: size of the target vocabulary (also the logits width)
        :param embedding_dim: width of each embedding vector
        :param dec_units: number of GRU units, also used as the attention width
        """
        super(Decoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = tf.keras.layers.GRU(units=dec_units, return_state=True, return_sequences=True,
                                       recurrent_initializer="glorot_uniform")

        self.final_fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        """
        :param x: decoder input token ids
        :param hidden: hidden state used as the attention query
        :param enc_output: output of the encoder (attention values)
        :return: ``(logits, state, attention_weights)`` where the weights are
                 the ones computed in this call
        """
        # hidden: (batch_size, hidden_units), enc_output: (batch_size, seq_len, hidden_units)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # embedded: (batch_size, ..., embedding_dim)
        x = self.embedding(x)
        # concat: (batch_size, ..., embedding_dim + units)
        x = tf.concat([x, tf.expand_dims(context_vector, axis=1)], axis=-1)

        # NOTE(review): `hidden` only drives the attention query; it is not
        # passed to the GRU as initial_state - confirm that this is intended.
        # gru -> out (batch_size, ..., units), state (batch_size, units)
        out, state = self.gru(x)

        # Merge batch and time axes -> (batch_size, units)
        out = tf.reshape(out, shape=(-1, out.shape[2]))

        # fc -> (batch_size, vocab_size)
        x = self.final_fc(out)

        # Return the weights computed above, not a same-named module-level variable.
        return x, state, attention_weights


# Smoke test: feed a dummy (BATCH_SIZE, 1) input through a decoder together
# with the encoder demo outputs and inspect the logits shape.
decoder = Decoder(vocab_tar_size, embedding_dim=EMBEDDING_DIM, dec_units=DEC_UNITS)

sample_decode_output, _, _ = decoder(tf.random.uniform(shape=(BATCH_SIZE, 1)), sample_hidden, sample_output)

sample_decode_output.shape

TensorShape([64, 8952])

In [None]:
# Optimizer shared by encoder and decoder.
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that loss_func can
# zero out padded positions before averaging; predictions are raw logits.
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(real, pred):
    """Cross-entropy for one decoding step with padded targets zeroed out.

    :param real: target ids for this step; id 0 marks padding
    :param pred: logits predicted for this step
    :return: mean over the batch of the per-example losses, where the losses
             of padded positions have been set to zero
    """
    per_example_loss = sparse_loss(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    is_real_token = tf.math.logical_not(tf.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_example_loss.dtype)

    per_example_loss *= weights

    return tf.reduce_mean(per_example_loss)



In [None]:
# Build the encoder / decoder that are actually trained. This rebinds
# `decoder`, replacing the instance created by the earlier smoke test.
encoder = Encoder(vocab_size=vocab_inp_size, embedding_dim=EMBEDDING_DIM, encoder_units=ENC_UNITS, batch_size=BATCH_SIZE)
decoder = Decoder(vocab_size=vocab_tar_size, embedding_dim=EMBEDDING_DIM, dec_units=DEC_UNITS)

# Checkpoint object tracking both models, the optimizer and a step counter;
# the manager keeps only the most recent checkpoint in checkpoint_dir.
checkpoint_dir = "/content/data/MyDrive/checkpoints/text_translate"
checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer, step=tf.Variable(1))
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=1)
# NOTE(review): restoring is currently disabled, so training always starts
# from freshly initialised weights.
# checkpoint.restore(manager.latest_checkpoint)
# if manager.latest_checkpoint:
    # print("Restored from {}".format(manager.latest_checkpoint))
# else:
    # print("Initializing from scratch.")

checkpoint

<tensorflow.python.training.tracking.util.Checkpoint at 0x7f182b881f90>

In [None]:
@tf.function
def train_step(inp, tar, enc_hidden):
    """
    Run one optimisation step on a single batch using teacher forcing.

    The input is passed through the encoder, which returns the encoder output
    and the encoder hidden state. The decoder is then unrolled one target
    position at a time: it receives the previous target token (teacher
    forcing, starting from ``<start>``), the current decoder hidden state and
    the encoder output, and returns the predictions plus its new hidden state.
    That new state is fed back in at the next step, and the predictions are
    used to accumulate the loss. Finally the gradients of the summed loss are
    computed and applied with the optimizer.

    :param inp: batch of input token ids
    :param tar: batch of target token ids
    :param enc_hidden: initial encoder hidden state
    :return: the summed step loss divided by the target sequence length
    """
    loss = 0.0

    with tf.GradientTape() as g:

        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_input = tf.expand_dims([tar_tokenizer.word_index['<start>']] * BATCH_SIZE, axis=1)
        # The decoder starts from the final encoder state.
        dec_hidden = enc_hidden
        for t in range(1, tar.shape[1]):
            # Carry the decoder state forward: binding it to a throw-away name
            # would leave dec_hidden stuck at the encoder state for every step,
            # unlike evaluate(), which advances it.
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_func(tar[:, t], predictions)

            # Teacher forcing: the true token at position t is the next input.
            dec_input = tf.expand_dims(tar[:, t], axis=1)

    batch_loss = (loss / int(tar.shape[1]))

    train_variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = g.gradient(loss, train_variables)

    optimizer.apply_gradients(zip(gradients, train_variables))

    return batch_loss


In [None]:
# Number of passes over the training data and number of full batches per pass.
EPOCHS = 20
steps_per_epoch = len(inp_tensor_train) // BATCH_SIZE

for epoch in range(EPOCHS):
    total_loss = 0
    # The same zero encoder state is passed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden()
    start = time.time()
    for (batch, (inp, tar)) in enumerate(dataset.take(steps_per_epoch)):
        loss = train_step(inp, tar, enc_hidden)
        total_loss += loss

        # Progress report every 100 batches.
        if batch % 100 == 0:
            print('Epoch %d Batch %d Loss %.4f' % (epoch + 1, batch, loss.numpy()))
    
    
    # NOTE(review): checkpoint saving is disabled, so no weights are persisted
    # between sessions.
    # checkpoint.step.assign_add(1)
    # manager.save()
    # checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.7514
Epoch 1 Batch 100 Loss 2.2475
Epoch 1 Batch 200 Loss 1.9592
Epoch 1 Batch 300 Loss 1.8101
Epoch 1 Batch 400 Loss 1.7925
Epoch 1 Batch 500 Loss 1.5098
Epoch 1 Batch 600 Loss 1.4591
Epoch 1 Batch 700 Loss 1.4134
Epoch 1 Batch 800 Loss 1.3025
Epoch 1 Batch 900 Loss 1.2295
Epoch 1 Batch 1000 Loss 1.2663
Epoch 1 Batch 1100 Loss 1.1056
Epoch 1 Batch 1200 Loss 0.9932
Epoch 1 Batch 1300 Loss 1.0235
Epoch 1 Batch 1400 Loss 0.9037
Epoch 1 Batch 1500 Loss 0.9304
Epoch 1 Batch 1600 Loss 0.8729
Epoch 1 Batch 1700 Loss 0.6965
Epoch 1 Loss 1.3523
Time taken for 1 epoch 158.07966089248657 sec

Epoch 2 Batch 0 Loss 0.7573
Epoch 2 Batch 100 Loss 0.8084
Epoch 2 Batch 200 Loss 0.7277
Epoch 2 Batch 300 Loss 0.6855
Epoch 2 Batch 400 Loss 0.8240
Epoch 2 Batch 500 Loss 0.5633
Epoch 2 Batch 600 Loss 0.5963
Epoch 2 Batch 700 Loss 0.5648
Epoch 2 Batch 800 Loss 0.5593
Epoch 2 Batch 900 Loss 0.5656
Epoch 2 Batch 1000 Loss 0.5720
Epoch 2 Batch 1100 Loss 0.5546
Epoch 2 Batch 1200 Loss 0.4

KeyboardInterrupt: ignored

In [None]:
# Translate
# The input to the decoder at each time step is its previous prediction along with the hidden state and
# the encoder output
# Stop predicting when the end token is reached
# The encoder output is calculated only once for one input

def evaluate(sentence):
    """Greedily translate one sentence with the trained encoder / decoder.

    :param sentence: raw input sentence
    :return: ``(result, sentence, attention_plot)`` - the predicted words
             joined by spaces (each followed by a space), the preprocessed
             input sentence, and a (max_len_tar, max_len_inp) array that is
             currently left as all zeros
    """
    sentence = sentence.lower()
    attention_plot = np.zeros((max_len_tar, max_len_inp))

    sentence = preprocess_word(sentence)

    # Encode with the fitted tokenizer, i.e. exactly the way the training data
    # was encoded. Words that are not in the vocabulary are dropped instead of
    # raising KeyError, and stray empty tokens cannot occur.
    inputs = inp_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_len_inp, padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Zero initial state for a batch of one, sized from the encoder itself.
    hidden = [tf.zeros((1, encoder.encoder_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([tar_tokenizer.word_index['<start>']], 0)

    for t in range(max_len_tar):
        # NOTE(review): recording the per-step attention weights into
        # attention_plot[t] is disabled, so the plot is returned as zeros.
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

        prediction_id = tf.argmax(predictions[0]).numpy()
        predicted_word = tar_tokenizer.index_word[prediction_id]

        result += predicted_word + ' '

        if predicted_word == '<end>':
            return result, sentence, attention_plot

        # Feed back into the model the predicted id
        dec_input = tf.expand_dims([prediction_id], 0)

    return result, sentence, attention_plot

In [None]:
def translate(sentence):
    """Translate ``sentence`` and print the preprocessed input and the prediction.

    :param sentence: raw input sentence
    :return: None; the two lines are written to stdout
    """
    # The attention matrix returned by evaluate() is not used: plotting is
    # disabled, e.g.
    #   attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    #   plot_attention(attention_plot, sentence.split(' '), result.split(' '))
    predicted_text, processed_sentence, _ = evaluate(sentence)

    print('Input: %s' % processed_sentence)
    print("Predicted translation: {}".format(predicted_text))

In [None]:
# Try the model on a sample sentence; translate() prints the preprocessed
# input followed by the predicted translation.
# Other inputs to try: translate(u'Go'), "Позвони Тому." ("Call Tom."),
# "Вот это да" ("Wow").
# Vocabulary lookups, presumably kept from checking how casing is handled:
# inp_tokenizer.word_index['Тому']
# inp_tokenizer.word_index[u'тому']
translate(u'Я совсем о ней забыл.')


Input: <start> я совсем о ней забыл . <end>
Predicted translation: i forgot all . <end> 
