# 1. Encoder-Decoder
Building a Neural Machine Translation (English → Spanish) system using an Encoder-Decoder (sequence-to-sequence) network.

In [41]:
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt

def cls(seed=42):
    """Seed TensorFlow's global random generator and clear the Keras session.

    Call this right before building a model so that repeated runs of the
    same cell start from the same TensorFlow seed and a fresh Keras state.

    Args:
        seed: Value passed to ``tf.random.set_seed``. Defaults to 42, so
            existing ``cls()`` calls behave exactly as before.
    """
    tf.random.set_seed(seed)
    tf.keras.backend.clear_session()

In [8]:
##############################################
# Download the English-Spanish sentence      #
# pairs used to train the translation model  #
##############################################
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
# The archive is stored under the local "datasets" cache directory and
# unpacked there (extract=True); `path` is what get_file returns for it.
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [9]:
# Load the whole corpus file into memory as a single UTF-8 string.
corpus_file = Path(path).with_name("spa-eng") / "spa.txt"
text = corpus_file.read_text(encoding='utf-8')

In [11]:
# Peek at the first 100 characters of the raw corpus.
print(text[:100])

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Fire!	¡Fueg


In [16]:
# First ten lines: each one is an English sentence, a tab, then a Spanish translation.
text.splitlines()[:10]

['Go.\tVe.',
 'Go.\tVete.',
 'Go.\tVaya.',
 'Go.\tVáyase.',
 'Hi.\tHola.',
 'Run!\t¡Corre!',
 'Run.\tCorred.',
 'Who?\t¿Quién?',
 'Fire!\t¡Fuego!',
 'Fire!\t¡Incendio!']

In [17]:
# Drop the Spanish inverted punctuation marks; they only open a sentence.
text = text.replace("¡", "").replace("¿", "")
# One pair per line: split the corpus on newlines, then each line on the tab.
pairs = [line.split("\t") for line in text.splitlines()]
# Seed NumPy before shuffling so the order (and therefore the train/validation
# split taken from it later) is the same on every run.
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separate the pairs into two tuples

In [19]:
# Show the first five shuffled pairs as "English => Spanish".
for english, spanish in zip(sentences_en[:5], sentences_es[:5]):
    print(english, "=>", spanish)

Do you know when Tom will be home? => Saben cuándo va a estar en casa Tom?
The time has passed very quickly. => El tiempo pasó muy rápido.
Tom asked me what I needed. => Tom me preguntó qué necesitaba.
Tom found me a taxi. => Tom me consiguió un taxi.
You drive too fast. => Manejás demasiado rápido.


In [34]:
# Count the unique whitespace-separated tokens in the raw English sentences
# (no lowercasing or punctuation removal yet): 23,848 for this dataset.
len(set(" ".join(sentences_en).split()))

23848

In [35]:
# After the TextVectorization layer's default preprocessing (lowercasing,
# punctuation removal, etc.) the vocabulary shrinks to 13,634 entries.

# Throwaway layer with no size limit, used only to measure the full vocabulary.
text_vec_layer_en = tf.keras.layers.TextVectorization()
text_vec_layer_en.adapt(sentences_en)
len(text_vec_layer_en.get_vocabulary())

13634

In [37]:
# Build one vectorizer per language, each limited to a 1000-token vocabulary.
# output_sequence_length fixes every output at 50 token ids: longer sentences
# are truncated and shorter ones are padded (they are not dropped).
vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)
# Wrap every Spanish sentence in "startofseq" / "endofseq" so both marker
# tokens become part of the Spanish vocabulary.
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [38]:
# First ten entries of the English vocabulary ('' and '[UNK]' come first).
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [39]:
# First ten entries of the Spanish vocabulary; the two sequence markers are in it.
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [40]:
# The first 100,000 shuffled pairs are for training; the rest are for validation.
n_train = 100_000
en_train, en_valid = sentences_en[:n_train], sentences_en[n_train:]
es_train, es_valid = sentences_es[:n_train], sentences_es[n_train:]

# Encoder inputs: the raw English sentences.
X_train = tf.constant(en_train)
X_valid = tf.constant(en_valid)

# Decoder inputs: the Spanish sentences prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in es_train])
X_valid_dec = tf.constant([f"startofseq {s}" for s in es_valid])

# Targets: token ids of the Spanish sentences suffixed with the end marker.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in es_train])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in es_valid])

In [43]:
# Reset the TensorFlow seed and the Keras session before building the model.
cls()

# Inputs: one raw string per example for the encoder and for the decoder.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Text -> token ids -> embeddings. Each language has its own embedding layer.
embed_size = 12 # Deliberately tiny; raise to 128 for better results
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
# mask_zero=True makes the embedding layers treat token id 0 as padding.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Recurrent layers: the encoder's final states (return_state=True) become the
# decoder's initial state; the decoder emits one output per time step.
encoder = tf.keras.layers.LSTM(2, return_state=True) # Deliberately tiny; raise to 512
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
decoder = tf.keras.layers.LSTM(2, return_sequences=True) # Deliberately tiny; raise to 512
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Output layer: softmax over the Spanish vocabulary at every decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

# Assemble the two-input model, compile it and print its summary.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 50)          0           ['input_1[0][0]']                
 orization)                                                                                       
                                                                                                  
 text_vectorization_3 (TextVect  (None, 50)          0           ['input_2[0][0]']            

In [44]:
# Train for a single epoch; the encoder and decoder inputs are passed together
# as a tuple, for both the training and the validation data.
model.fit((X_train, X_train_dec), Y_train, epochs=1,
          validation_data=((X_valid, X_valid_dec), Y_valid))



<keras.callbacks.History at 0x2a86b6b71c0>

In [51]:
# Greedy decoding by hand: predict one word at a time and feed the words
# produced so far back into the decoder.
sentence_en = "I love my life"  # not named `str`, which would shadow the builtin
X_enc = np.array([sentence_en])  # encoder input is the same at every step
vocab_es = text_vec_layer_es.get_vocabulary()  # look the vocabulary up once
translation = ""
for i in range(10):
    X_dec = np.array(['startofseq ' + translation])
    y_proba = model.predict((X_enc, X_dec))
    # Position i of the decoder output holds the probabilities for word i.
    predicted_word = vocab_es[np.argmax(y_proba[0, i])]
    translation += " " + predicted_word
print(translation)

 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]


In [54]:
# Wrap the greedy decoding loop in a reusable function.
def translate(sentence_en, max_length=50):
    """Greedily translate an English sentence with the trained model.

    One word is predicted per step; the words produced so far are fed back
    to the decoder (after the "startofseq" marker) to predict the next one.
    Decoding stops when "endofseq" is predicted, after ``max_length`` words,
    or when the decoder has no further time steps to read from.

    Args:
        sentence_en: English sentence to translate.
        max_length: Maximum number of words to generate.

    Returns:
        The generated words joined by single spaces (no sequence markers).
    """
    X = np.array([sentence_en])  # encoder input, identical at every step
    vocab_es = text_vec_layer_es.get_vocabulary()  # look the vocabulary up once
    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])  # decoder input
        step_probas = model.predict((X, X_dec))[0]  # one row per time step
        if word_idx >= len(step_probas):
            # max_length exceeds the decoder's sequence length: stop here
            # instead of indexing past the last time step.
            break
        predicted_word_id = np.argmax(step_probas[word_idx])
        predicted_word = vocab_es[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
        print(translation)  # show the partial translation after each step
    return translation.strip()

In [55]:
# Try the translator on a short sentence, generating at most 5 words.
translate("I love my life", 5)

 [UNK]
 [UNK] [UNK]
 [UNK] [UNK] [UNK]
 [UNK] [UNK] [UNK] [UNK]
 [UNK] [UNK] [UNK] [UNK] [UNK]


'[UNK] [UNK] [UNK] [UNK] [UNK]'

In [57]:
# The model has not learned anything yet (every prediction is [UNK]), so we need to increase its complexity.

# 2. BiDirectional RNNs