https://towardsdatascience.com/how-to-build-an-encoder-decoder-translation-model-using-lstm-with-python-and-keras-a31e9d864b9b

In [13]:
import pandas as pd
import numpy as np
import string

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [14]:
# Load the tab-separated English/French corpus and keep the first MAX_PAIRS
# usable records. Each record is the list of tab-separated fields of one line
# (field 0 = English, field 1 = French).
MAX_PAIRS = 100000

with open('./fra-eng/fra.txt', 'r', encoding='utf-8') as f:
    raw_data = f.read().split('\n')

# Skip blank / tab-less lines (e.g. the empty string left after a trailing
# newline): they would produce a record with no French field and make
# `pair[1]` fail in the cleaning cell.
pairs = [sentence.split('\t') for sentence in raw_data if '\t' in sentence]

pairs = pairs[:MAX_PAIRS]

In [15]:
def clean_sentence(sent):
    """Return ``sent`` lower-cased with punctuation characters deleted.

    The characters removed are everything in ``string.punctuation`` plus the
    inverted marks "¡" and "¿". Whitespace, digits and letters are untouched.
    """
    strip_chars = string.punctuation + "¡" + '¿'
    removal_table = str.maketrans('', '', strip_chars)
    return sent.lower().translate(removal_table)

In [16]:
def tokenize(sent):
    """Fit a fresh Keras ``Tokenizer`` on ``sent`` and encode the same texts.

    Args:
        sent: the texts handed to ``fit_on_texts`` and ``texts_to_sequences``
            (in this notebook, a list of cleaned sentences).

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sent)`` and the fitted tokenizer itself.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sent)
    sequences = fitted.texts_to_sequences(sent)
    return sequences, fitted

In [17]:
# Demo: fit a throwaway tokenizer on the fields of the first raw record to
# see what the word index and the encoded sequences look like.
text_tokenizer = Tokenizer()
first_record = pairs[0]
text_tokenizer.fit_on_texts(first_record)

vocabulary = text_tokenizer.word_index
print(vocabulary.items())
print(len(vocabulary))
print(text_tokenizer.texts_to_sequences(first_record))

dict_items([('go', 1), ('va', 2), ('cc', 3), ('by', 4), ('2', 5), ('0', 6), ('france', 7), ('attribution', 8), ('tatoeba', 9), ('org', 10), ('2877272', 11), ('cm', 12), ('1158250', 13), ('wittydev', 14)])
14
[[1], [2], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]]


In [18]:
# Clean each side of the corpus exactly once; the cleaned lists are reused
# both for tokenization here and for display in the prediction cell.
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
french_sentences = [clean_sentence(pair[1]) for pair in pairs]

# One tokenizer per language, fitted on that language's cleaned sentences.
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
fre_text_tokenized, fre_text_tokenizer = tokenize(french_sentences)

# Longest encoded sentence per language (in tokens); used as padding length.
max_eng_len = max(len(seq) for seq in eng_text_tokenized)
max_fre_len = max(len(seq) for seq in fre_text_tokenized)

print('Maximum length English sentence: {}'.format(max_eng_len))
print('Maximum length French sentence: {}'.format(max_fre_len))

# +1 because Tokenizer word indices start at 1; index 0 is left for padding.
french_vocab = len(fre_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("French vocabulary is of {} unique words".format(french_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length English sentence: 9
Maximum length French sentence: 14
French vocabulary is of 21525 unique words
English vocabulary is of 9071 unique words


In [19]:
# Pad every encoded sentence at the end ('post') up to the longest sentence
# of its language so each side becomes a rectangular array.
fre_pad_sentence = pad_sequences(fre_text_tokenized, maxlen=max_fre_len, padding='post')
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_eng_len, padding='post')

In [20]:
# Sanity check after padding: expected to be (number of pairs, max_eng_len).
eng_pad_sentence.shape

(100000, 9)

In [21]:
# Give both padded arrays a trailing singleton axis:
# (samples, timesteps) -> (samples, timesteps, 1).
# The target shape is spelled out explicitly (instead of `*arr.shape, 1`) so
# that re-running this cell is a no-op rather than appending another axis
# each time, and so a length mismatch raises immediately.
eng_pad_sentence = eng_pad_sentence.reshape(-1, max_eng_len, 1)
fre_pad_sentence = fre_pad_sentence.reshape(-1, max_fre_len, 1)

In [22]:
# --- Encoder -------------------------------------------------------------
# `shape` must be a tuple; the original `shape=max_fre_len,` placed the comma
# outside the value and passed a bare int.
input_seq = Input(shape=(max_fre_len,))
# Maps each French token id to a 128-feature vector
# (embedding matrix of shape (french_vocab, 128)).
embedding = Embedding(input_dim=french_vocab, output_dim=128)(input_seq)
# Only the final hidden vector is needed from the encoder, so the per-step
# outputs are discarded (return_sequences=False).
encoder = LSTM(64, return_sequences=False)(embedding)

# --- Decoder -------------------------------------------------------------
# Feed the same encoder vector to every one of the max_eng_len decoder steps.
r_vec = RepeatVector(max_eng_len)(encoder)
# The decoder emits a vector at every step so a word can be predicted per step.
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
# The same Dense layer scores the English vocabulary at each time step.
logits = TimeDistributed(Dense(english_vocab))(decoder)

<font size="3">
As we can see in the image, the hidden vector is repeated n times, so each time step of the decoder LSTM receives the same vector.<br>
To feed this same vector to every time step we use the RepeatVector layer: as its name implies, it repeats the vector it receives, and the only parameter we need to define is n, the number of repetitions.<br>
This number is equal to the number of time steps of the decoder, in other words the maximum English sentence length (max_eng_len, which is 9 for this dataset).</font>

<font size="3">Once we have the input ready, we continue with the decoder. This is also built with an LSTM layer; the difference is the parameter return_sequences, which in this case is ‘True’.<br> 
What is this parameter for? In the encoder we only wanted the single vector from the last time step and ignored all the others; here we want an output vector at every time step so that the Dense layer can make a prediction for each one.</font>

<font size="3">We have just seen how to apply the Dense layer and predict one word, but how do we make the prediction for the whole sentence? Because we are using return_sequences=True, the LSTM layer outputs a vector at every time step, so we need to apply the previously explained Dense layer at every time step and predict one word at a time. To do this, Keras provides a specific layer called TimeDistributed, which applies the same Dense layer to every time step.</font>

In [23]:
# Put a softmax on top of the per-step logits so the model outputs a
# probability distribution over the English vocabulary at each time step,
# then compile with sparse (integer-label) cross-entropy.
model = Model(inputs=input_seq, outputs=Activation('softmax')(logits))
model.compile(
    loss=sparse_categorical_crossentropy,
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 14)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 14, 128)           2755200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 9, 64)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 9, 64)             33024     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9, 9071)           589615    
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9071)           0   

In [24]:
# Train French -> English; `result` keeps the History object returned by fit.
result = model.fit(
    x=fre_pad_sentence,
    y=eng_pad_sentence,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
 569/3125 [====>.........................] - ETA: 4:04 - loss: 3.0519 - accuracy: 0.5378

In [None]:
# Run inference once on a single-sentence batch and inspect the result,
# instead of calling model.predict three times on the same input.
sample_probs = model.predict(fre_pad_sentence[14:15])

print(sample_probs.shape)      # batch axis included
print(sample_probs[0].shape)   # one sentence: (time steps, vocabulary)
print(sample_probs[0])

(1, 11, 7749)
(11, 7749)
[[3.6618917e-03 1.5258732e-04 9.6696804e-06 ... 1.4072783e-06
  1.5732898e-06 5.1300553e-06]
 [5.7763761e-01 8.2470177e-07 3.9365095e-05 ... 6.7855808e-08
  6.8940295e-08 1.6914657e-07]
 [8.2410157e-01 2.3936357e-07 3.3217275e-06 ... 4.4366232e-08
  4.2616811e-08 2.5443842e-07]
 ...
 [9.9857342e-01 3.8039929e-07 4.5467485e-05 ... 1.5797486e-10
  1.5894412e-10 2.8378189e-09]
 [9.9867356e-01 3.6835235e-07 4.3847383e-05 ... 1.4109483e-10
  1.4133751e-10 2.6176972e-09]
 [9.9873930e-01 3.6272976e-07 4.1970387e-05 ... 1.3185808e-10
  1.3160180e-10 2.4757236e-09]]


`fre_pad_sentence[14]` is a 2-D array (a single sentence without the batch axis), but the model expects a 3-D batch, so we pass the slice `fre_pad_sentence[14:15]`, which keeps the leading batch dimension.

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-time-step scores into a space-joined string of words.

    Args:
        logits: 2-D array-like; the arg-max along axis 1 of each row is taken
            as the predicted word index for that time step.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The predicted words joined by single spaces. Index 0 decodes to the
        empty string, so padding steps show up as extra spaces.
    """
    vocabulary = {0: ''}
    for word, idx in tokenizer.word_index.items():
        vocabulary[idx] = word

    best_indices = np.argmax(logits, 1)
    words = [vocabulary[best] for best in best_indices]
    return ' '.join(words)
# Translate one training example and show it next to its reference pair.
index = 135
source_batch = fre_pad_sentence[index:index+1]  # slice keeps the batch axis

print("The english sentence is: {}".format(english_sentences[index]))
print("The french sentence is: {}".format(french_sentences[index]))
print('The predicted sentence is :')
predicted_probs = model.predict(source_batch)[0]
print(logits_to_sentence(predicted_probs, eng_text_tokenizer))


The English sentence is: thanks
The French sentence is: merci 
The predicted sentence is :
restez          


In [None]:
import pickle

# Persist the trained model. The file is opened in a `with` block so the
# handle is flushed and closed (the original passed a bare open() and never
# closed it).
# NOTE(review): pickling a Keras model is not supported by every TensorFlow
# version; Keras' own model.save(...) / load_model(...) is the documented
# route - confirm which format downstream loaders expect before switching.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)