In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [None]:
from model import *

In [None]:
# Model-size settings. Neither name is referenced by any cell visible in this
# notebook -- TODO confirm whether they are still needed.
embedding_dim, units = 256, 128

# Number of sentence pairs per training batch; used below both to batch the
# tf.data pipeline and to derive steps_per_epoch.
BATCH_SIZE = 16

In [None]:
# Download (or reuse the cached copy of) the spa-eng archive and extract it.
# The archive is fetched over HTTPS rather than plain HTTP so the download
# cannot be tampered with in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The sentence-pair file sits in a "spa-eng" folder next to the downloaded zip.
# os.path.join keeps the path separator correct on every platform.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

# Sanity check: show what preprocess_sentence does to one sentence per language.
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

# Sanity check: read the file with no example limit (None) and show the last
# pair returned by create_dataset.
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

# Try experimenting with the size of that dataset
# num_examples = 118964
num_examples = 30000

# Build the tensors used for training; load_dataset also returns one
# language object per side (inp_lang, targ_lang), used later for vocabulary
# sizes and by the model.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

In [None]:
# Keep only the trailing 30000 rows (all columns) of both tensors. When the
# tensors already hold 30000 rows or fewer this leaves them unchanged.
input_tensor, target_tensor = input_tensor[-30000:, :], target_tensor[-30000:, :]

In [None]:
# Calculate max_length of both the target and the input tensors; these are
# passed to my_model.translate later in the notebook.
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

# Creating training and validation sets using an 80-20 split.
# random_state pins the shuffle so the same rows land in train/validation on
# every Restart-and-Run-All; without it the partition changes on each run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# Shuffle over the whole training set (buffer == number of training rows),
# then emit fixed-size batches. drop_remainder=True discards a trailing
# partial batch, which matches the floor division used for steps_per_epoch.
BUFFER_SIZE = len(input_tensor_train)
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Vocabulary size per language = number of entries in word_index plus one.
# NOTE(review): the extra slot is presumably for a reserved padding index 0 --
# confirm against how inp_lang / targ_lang are built.
vocab_inp_size, vocab_tar_size = (
    len(lang.word_index) + 1 for lang in (inp_lang, targ_lang)
)

In [None]:
# Instantiate the NMT model from the (input, target) vocabulary sizes and the
# two language objects returned by load_dataset.
my_model = NMT(
    (vocab_inp_size, vocab_tar_size),
    inp_lang,
    targ_lang,
)

In [None]:
# Number of full passes over the training dataset.
EPOCHS = 5

for epoch in range(EPOCHS):

    # Wall-clock start of this epoch; used only for the end-of-epoch report.
    start = time.time()

    # Running sum of the per-batch losses returned by my_model.train.
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):

        # Timestamp this individual step so the "1 batch" line below reports
        # the duration of this batch rather than the time elapsed since the
        # epoch began (which is what measuring from `start` would give).
        batch_start = time.time()

        batch_loss = my_model.train(inp, targ)
        total_loss += batch_loss

        # Report progress every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
            print('Time taken for 1 batch {:.1f} sec\n'.format(time.time() - batch_start))

    # Mean loss over the epoch and total epoch duration.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                  total_loss / steps_per_epoch))
    print('Time taken for an epoch: {:.1f} sec\n'.format(time.time() - start))

In [None]:
# Run the trained model on a sample sentence, passing the target and input
# maximum lengths computed earlier.
my_model.translate(
    u'hace mucho frio aqui.',
    max_length_targ,
    max_length_inp,
)

In [None]:
# Second sample sentence, translated with the same length limits.
my_model.translate(
    u'esta es mi vida.',
    max_length_targ,
    max_length_inp,
)

In [None]:
# Third sample: a question, translated with the same length limits.
my_model.translate(
    u'¿todavia estan en casa?',
    max_length_targ,
    max_length_inp,
)

In [None]:
# Known failure case: the original author marked the model's output for this
# sentence as a wrong translation.
my_model.translate(
    u'¿Hay algún problema que pueda ayudar?',
    max_length_targ,
    max_length_inp,
)