# ***Required Imports***

In [1]:
import pandas as pd 
import numpy as np
import re
import string
from string import digits
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow 
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding,Dropout,LSTM
from tensorflow.keras.optimizers import RMSprop

# ***Loading data***

In [2]:
# Read the tab-separated sentence-pair file. It has no header row, so the
# three column names are supplied explicitly.
data = pd.read_csv(
    "por.txt",
    sep="\t",
    header=None,
    names=["EN", "PT", "Attribution"],
)

In [3]:
# Shuffle all rows and renumber the index 0..n-1.
# The seed is fixed so that the row order -- and therefore the validation
# split (Keras takes the *last* fraction of the data) and any example picked
# by positional index later on -- is the same on every Restart & Run All.
df = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Peek at the first five shuffled rows (five is the default for head()).
df.head()

Unnamed: 0,EN,PT,Attribution
0,What do you think of those Japanese writers?,O que você acha desses escritores japoneses?,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
1,Tom hasn't finished his homework yet.,Tom ainda não terminou o dever de casa.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
2,I like you as a friend.,Eu gosto de você como amigo.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,I know that guy. His name's Tom.,Eu conheço esse cara. O nome dele é Tom.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,He wants one.,Ele quer um.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [4]:
# Pull out the English (source) and Portuguese (target) sentence columns.
english_sentences, portuguese_sentences = df['EN'], df['PT']

In [5]:
# Wrap every Portuguese sentence in the marker words: "s" (start) in front
# and "e" (end) behind, each separated from the sentence by one space.
portuguese_sentences = "s " + portuguese_sentences
portuguese_sentences = portuguese_sentences + " e"

In [6]:
# Marker words that delimit each target sentence. The start marker carries a
# trailing space; the look-ups in the tokenizer's word index strip it first.
# NOTE(review): "e" is also an ordinary Portuguese word, so the end marker
# presumably shares a token with real text -- consider a unique marker.
mark_start, mark_end = 's ', 'e'

# ***Preprocessing phase***

In [None]:
# Lowercase every sentence in both corpora.
english_sentences = english_sentences.map(lambda sentence: sentence.lower())
portuguese_sentences = portuguese_sentences.map(lambda sentence: sentence.lower())

In [None]:
# Delete apostrophes (single quotes) from every sentence.
english_sentences = english_sentences.apply(lambda sentence: sentence.replace("'", ""))
portuguese_sentences = portuguese_sentences.apply(lambda sentence: sentence.replace("'", ""))

In [None]:
# Set of all ASCII punctuation characters (string.punctuation).
exclude = set(string.punctuation)

# Drop every punctuation character from both corpora.
# The Portuguese result used to be assigned to a misspelt name
# (`portuguese_sentencesi`), so the cleaned text was silently thrown away;
# it is now written back to `portuguese_sentences`.
english_sentences = english_sentences.apply(
    lambda x: ''.join(ch for ch in x if ch not in exclude))
portuguese_sentences = portuguese_sentences.apply(
    lambda x: ''.join(ch for ch in x if ch not in exclude))

In [None]:
# Translation table that deletes every ASCII digit.
remove_digits = str.maketrans('', '', digits)
english_sentences = english_sentences.map(lambda s: s.translate(remove_digits))
portuguese_sentences = portuguese_sentences.map(lambda s: s.translate(remove_digits))

# Additionally strip Devanagari digits from the target sentences.
portuguese_sentences = portuguese_sentences.map(lambda s: re.sub("[२३०८१५७९४६]", "", s))

# Trim leading/trailing whitespace, then collapse runs of spaces into one.
english_sentences = english_sentences.map(lambda s: re.sub(" +", " ", s.strip()))
portuguese_sentences = portuguese_sentences.map(lambda s: re.sub(" +", " ", s.strip()))

In [7]:
# Vocabulary cap: passed as `num_words` to both tokenizers, and reused as the
# `input_dim` of both Embedding layers and the width of the output Dense layer.
num_words = 5000

Create a class called `TokenizerWrap` that wraps the Keras `Tokenizer`. Its helper methods are used in the prediction phase, and the padded integer tokens it produces are the inputs to the word-embedding layers.

In [8]:
class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also stores the tokenised, padded corpus.

    On construction the vocabulary is fitted on ``texts`` and the following
    attributes are populated:

    * ``index_to_word`` -- inverse of ``word_index`` (integer -> word).
    * ``tokens``        -- list of integer sequences (reversed if requested).
    * ``num_tokens``    -- length of every sequence in ``tokens``.
    * ``max_tokens``    -- ``int(mean + 2 * std)`` of those lengths.
    * ``tokens_padded`` -- ``pad_sequences`` output of length ``max_tokens``.
    """

    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """Fit the vocabulary on ``texts`` and pre-compute padded sequences.

        Args:
            texts: Iterable of strings to fit on and convert.
            padding: ``'pre'`` or ``'post'``, forwarded to ``pad_sequences``.
            reverse: If true, every token sequence is reversed and over-long
                sequences are truncated at the front instead of the back.
            num_words: Vocabulary size limit forwarded to ``Tokenizer``.
        """
        super().__init__(num_words=num_words)

        # Build the word -> integer vocabulary.
        self.fit_on_texts(texts)

        # Inverse mapping: integer -> word.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        # Integer sequences, one per text; lengths vary.
        sequences = self.texts_to_sequences(texts)
        if reverse:
            sequences = [seq[::-1] for seq in sequences]
        self.tokens = sequences

        # After reversing, the front of a sequence is the end of the original
        # text, so that is the side to cut when a sequence is too long.
        truncating = 'pre' if reverse else 'post'

        # Length of every sequence.
        self.num_tokens = list(map(len, self.tokens))

        # Common sequence length: mean + 2 standard deviations, truncated to
        # an int. A memory/coverage compromise -- only a small share of the
        # sequences should end up being cut.
        self.max_tokens = int(np.mean(self.num_tokens)
                              + 2 * np.std(self.num_tokens))

        # Pad / truncate everything to `max_tokens`.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Return the word for ``token``; token 0 maps to a single space."""
        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero ``tokens`` with single spaces."""
        return " ".join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """Convert one text string to an array of integer tokens.

        Args:
            text: A single string; it is wrapped in a list before conversion,
                so the result holds exactly one sequence.
            reverse: If true, flip the sequence along axis 1 and truncate at
                the front when padding.
            padding: If true, pad / truncate to ``self.max_tokens``. Padding
                is always applied at the front (``'pre'``) here.

        Returns:
            The token array (padded only when ``padding`` is true).
        """
        tokens = np.array(self.texts_to_sequences([text]))

        truncating = 'post'
        if reverse:
            tokens = np.flip(tokens, axis=1)
            # The front now holds the end of the original text.
            truncating = 'pre'

        if not padding:
            return tokens

        return pad_sequences(tokens,
                             maxlen=self.max_tokens,
                             padding='pre',
                             truncating=truncating)

# ***Word Embedding and data splitting***

In [9]:
# Source-side (English) tokenizer: sequences are reversed and padded at the front.
tokenizer_src = TokenizerWrap(
    texts=english_sentences,
    num_words=num_words,
    reverse=True,
    padding='pre',
)

In [10]:
# Target-side (Portuguese) tokenizer: natural order, padded at the end.
tokenizer_dest = TokenizerWrap(
    texts=portuguese_sentences,
    num_words=num_words,
    reverse=False,
    padding='post',
)

In [11]:
# Padded integer matrices for both languages, followed by their shapes
# (one shape per line).
tokens_src, tokens_dest = tokenizer_src.tokens_padded, tokenizer_dest.tokens_padded
print(tokens_src.shape, tokens_dest.shape, sep="\n")

(168903, 10)
(168903, 12)


In [12]:
# Integer token the target tokenizer assigned to the start marker.
# The marker is stripped first because `mark_start` carries a trailing space.
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start

2

In [13]:
# Integer token the target tokenizer assigned to the end marker.
token_end = tokenizer_dest.word_index[mark_end.strip()]
token_end

1

In [14]:
# The encoder is fed the padded source-language token matrix as-is.
encoder_input_data = tokens_src

In [15]:
# Decoder input: every target sequence without its last column.
# Together with `tokens_dest[:, 1:]` as the output this forms a one-step
# shift, i.e. the decoder sees token t and is trained to emit token t+1.
decoder_input_data = tokens_dest[:, :-1]
decoder_input_data.shape

(168903, 11)

In [16]:
# Decoder target: every target sequence without its first column, i.e. the
# same data as the decoder input shifted one step to the left.
decoder_output_data = tokens_dest[:, 1:]
decoder_output_data.shape

(168903, 11)

# ***Model Creation***

In [18]:
# Encoder input layer; shape (None,) leaves the sequence length unspecified.
encoder_input = Input(shape=(None, ), name='encoder_input')

In [19]:
# Length of each embedding vector (`output_dim` of both Embedding layers).
embedding_size = 512

In [20]:
# Embedding layer mapping source tokens (< num_words) to embedding_size-long vectors.
encoder_embedding = Embedding(
    name='encoder_embedding',
    input_dim=num_words,
    output_dim=embedding_size,
)

In [21]:
# Number of units in every GRU layer; also the width of the
# `decoder_initial_state` input, since the encoder output seeds the decoder GRUs.
state_size = 512

In [22]:
# Encoder stack: three GRU layers, each paired with a 20% dropout layer.
# The first two GRUs return the full sequence so the next GRU can consume it;
# the third returns only its final output.
encoder_gru1 = GRU(units=state_size, return_sequences=True, name='encoder_gru1')
dropout_encoder1 = Dropout(rate=0.2, name='dropout_encoder1')

encoder_gru2 = GRU(units=state_size, return_sequences=True, name='encoder_gru2')
dropout_encoder2 = Dropout(rate=0.2, name='dropout_encoder2')

encoder_gru3 = GRU(units=state_size, return_sequences=False, name='encoder_gru3')
dropout_encoder3 = Dropout(rate=0.2, name='dropout_encoder3')

In [23]:
def connect_encoder():
    """Chain the encoder layers and return the resulting output tensor.

    Starting from the module-level ``encoder_input``, the data flows through
    the embedding layer and then through the three GRU/Dropout pairs, in
    order. The value produced by the last dropout layer is returned.
    """
    encoder_layers = (
        encoder_embedding,
        encoder_gru1, dropout_encoder1,
        encoder_gru2, dropout_encoder2,
        encoder_gru3, dropout_encoder3,
    )

    net = encoder_input
    for layer in encoder_layers:
        net = layer(net)

    return net

In [24]:
# Build the encoder graph once; its output is reused by the training model
# and by the stand-alone encoder model.
encoder_output = connect_encoder()

In [25]:
# Extra input used only by the stand-alone decoder model: it supplies the
# GRU initial state (width `state_size`) directly instead of running the encoder.
decoder_initial_state = Input(shape=(state_size,),name='decoder_initial_state')

In [26]:
# Decoder token input; shape (None,) leaves the sequence length unspecified.
decoder_input = Input(shape=(None, ), name='decoder_input')

In [27]:
# Embedding layer mapping target tokens (< num_words) to embedding_size-long vectors.
decoder_embedding = Embedding(
    name='decoder_embedding',
    input_dim=num_words,
    output_dim=embedding_size,
)

In [28]:
# Decoder stack: three GRU layers, each paired with a 20% dropout layer.
# All three return full sequences, because the final Dense layer needs an
# output for every time step.
decoder_gru1 = GRU(units=state_size, return_sequences=True, name='decoder_gru1')
dropout_decoder1 = Dropout(rate=0.2, name='dropout_decoder1')

decoder_gru2 = GRU(units=state_size, return_sequences=True, name='decoder_gru2')
dropout_decoder2 = Dropout(rate=0.2, name='dropout_decoder2')

decoder_gru3 = GRU(units=state_size, return_sequences=True, name='decoder_gru3')
dropout_decoder3 = Dropout(rate=0.2, name='dropout_decoder3')

In [29]:
# Output layer: a softmax over the num_words-sized vocabulary.
decoder_dense = Dense(
    units=num_words,
    name='decoder_output',
    activation='softmax',
)

In [30]:
def connect_decoder(initial_state):
    """Chain the decoder layers and return the softmax output tensor.

    Args:
        initial_state: Value passed as ``initial_state`` to each of the
            three decoder GRU layers (the same one for all of them).

    Returns:
        The output of ``decoder_dense`` applied to the last dropout layer.
    """
    # Embed the module-level decoder input.
    net = decoder_embedding(decoder_input)

    # Three GRU -> Dropout stages, every GRU seeded with the same state.
    stages = (
        (decoder_gru1, dropout_decoder1),
        (decoder_gru2, dropout_decoder2),
        (decoder_gru3, dropout_decoder3),
    )
    for gru, dropout in stages:
        net = dropout(gru(net, initial_state=initial_state))

    # Project every time step onto the vocabulary.
    return decoder_dense(net)

In [31]:
# Training graph: the decoder GRUs start from the encoder's output.
decoder_output = connect_decoder(initial_state=encoder_output)

In [32]:
# End-to-end training model: source tokens + shifted target tokens in,
# per-step vocabulary distribution out.
model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

In [33]:
# Stand-alone encoder used at prediction time to turn source tokens into
# the state that seeds the decoder.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

In [34]:
# Re-connect the *same* decoder layer objects, this time seeded from the
# explicit `decoder_initial_state` input instead of the encoder output.
# Note that this rebinds the module-level name `decoder_output`.
decoder_output = connect_decoder(initial_state=decoder_initial_state)

# Stand-alone decoder used at prediction time.
model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

# ***Compiling the model and putting the data in dicts***

In [None]:
# Compile the training model.
# `learning_rate` replaces the deprecated `lr` argument name, which current
# Keras optimizers no longer accept.
# The sparse loss is used because the targets are integer tokens rather than
# one-hot vectors.
model_train.compile(optimizer=RMSprop(learning_rate=1e-3),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

In [36]:
# Model inputs keyed by the names of the corresponding Input layers.
x_data = dict(
    encoder_input=encoder_input_data,
    decoder_input=decoder_input_data,
)

In [37]:
# Training targets keyed by the name of the output Dense layer.
y_data = dict(decoder_output=decoder_output_data)

In [None]:
# Fraction of the data that corresponds to 10,000 validation samples.
n_validation = 10_000
validation_split = n_validation / len(encoder_input_data)
validation_split

# ***Fit the model***

In [None]:
# `summary()` prints the layer table itself and returns None, so it must not
# be wrapped in print() -- that only appended a stray "None" to the output.
model_train.summary()

# Train on the dict-keyed inputs/targets; the last `validation_split`
# fraction of the samples is held out for validation.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=384,
                epochs=150,
                validation_split=validation_split,
                )
                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

# ***Make Predictions***

In [41]:
def translate(input_text, true_output_text=None):
    """Greedily translate ``input_text`` and print the result.

    The text is tokenised with ``tokenizer_src`` (reversed and padded), run
    through ``model_encoder`` to obtain the decoder's initial state, and then
    decoded one token at a time with ``model_decoder``: at every step the
    arg-max token is taken and fed back in as the next decoder input.
    Decoding stops once the end token is produced or after
    ``tokenizer_dest.max_tokens`` steps.

    Args:
        input_text: Source-language sentence to translate.
        true_output_text: Optional reference translation; printed after the
            prediction when given.

    Returns:
        None. The input, the predicted text and (optionally) the reference
        are printed. The predicted text includes the word of the last sampled
        token, i.e. the end marker when decoding terminated on it.
    """
    # Convert the input-text to integer-tokens.
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # The encoder output is used as the initial state of the decoder's GRUs.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Pre-allocate the decoder input: a batch holding one sequence.
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24, so the builtin is used directly.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # Decoding starts from the special start-token.
    token_int = token_start

    # Accumulated output text and number of decoded tokens so far.
    output_text = ''
    count_tokens = 0

    # Continue until the end-token is sampled or the length limit is hit.
    while token_int != token_end and count_tokens < max_tokens:
        # Feed back the most recently sampled token (the start-token in the
        # first iteration).
        decoder_input_data[0, count_tokens] = token_int

        # Inputs keyed by Input-layer name. A local name is used so the
        # notebook-level `x_data` is not shadowed.
        decoder_inputs = {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }

        # Predicted distribution for every position of the sequence.
        decoder_output = model_decoder.predict(decoder_inputs)

        # Take the distribution at the current position and pick the most
        # likely token.
        token_onehot = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_onehot)

        # Append the corresponding word to the output text.
        sampled_word = tokenizer_dest.token_to_word(token_int)
        output_text += " " + sampled_word

        count_tokens += 1

    # Print the input-text.
    print("Input text:")
    print(input_text)
    print()

    # Print the translated output-text.
    print("Translated text:")
    print(output_text)
    print()

    # Optionally print the true translated text.
    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()

In [None]:
# Translate one sentence from the corpus and show its reference translation.
idx = 9988
translate(
    input_text=english_sentences[idx],
    true_output_text=portuguese_sentences[idx],
)

In [None]:
# Translate a free-form sentence; no reference translation is available
# (true_output_text defaults to None).
translate(input_text='I know how to do it')