In [1]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
from utils.preprocessing import *
from utils.model import *
from utils.config import *
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# File locations used by the notebook. All paths are relative, so the notebook
# must be run from the directory that contains `data/` and `weights/`.
PATH_ENG = "data/small_vocab_en"        # passed to read_english
PATH_FR = "data/small_vocab_fr"         # passed to read_french
PATH_GLOVE = "data/glove.6B.100d.txt"   # passed to glove_embedding
MODEL_SAVE_PATH = "weights/model.h5"    # weights file given to load_weights

In [3]:
# Reading dataset
# `english` and `french` are indexed per sentence later on; the demo output
# shows each `french` entry ending with an "<eos>" marker.
# `french_inputs` is presumably the decoder-input variant of the same
# sentences -- TODO confirm in utils.preprocessing.
# NOTE(review): the recorded translations display "Ã©" where "é" is expected,
# which looks like UTF-8 text decoded with a different codec. Verify that
# read_french opens the file with an explicit encoding='utf-8'.
english = read_english(PATH_ENG)
french, french_inputs = read_french(PATH_FR)

Reading English Lines
Reading French Lines


In [4]:
# Find the maximum length of an input sentence.
# NOTE(review): `english` entries print as plain sentences in the demo cell, so
# len() here appears to count characters, whereas max_len_target is measured
# on token sequences. The saved weights were presumably produced with this
# value, so confirm before changing it.
max_len_input = max(map(len, english))

In [5]:
# Tokenizing English
# Returns the tokenized English sentences together with the word -> index
# vocabulary (199 unique tokens according to the recorded output).
input_sequence, word2idx_english = tokenize_english(english)

Tokenizing English Texts
Found 199 unique english tokens


In [6]:
# Tokenizing French
# Tokenizes both French variants and returns one word -> index vocabulary
# (353 unique tokens according to the recorded output). Presumably a single
# tokenizer is shared by both variants -- TODO confirm in utils.preprocessing.
target_sequence, target_sequence_inputs, word2idx_french = tokenize_french(french, french_inputs)

Tokenizing French Texts
Found 353 unique french tokens


In [7]:
# Size of the French output vocabulary; the extra slot is presumably for the
# padding index 0 that the tokenizer never assigns -- TODO confirm.
num_words_output = 1 + len(word2idx_french)
# Length (in tokens) of the longest tokenized French sentence.
max_len_target = len(max(target_sequence, key=len))

In [8]:
# Padding all inputs for encoder and decoders
# The English sequences are padded against max_len_input and the two French
# sequences against max_len_target (presumably -- verify in
# utils.preprocessing). Note that the return order (decoder_inputs before
# decoder_targets) differs from the argument order (target_sequence before
# target_sequence_inputs).
encoder_inputs, decoder_inputs, decoder_targets = padding(input_sequence,
                                                          target_sequence, 
                                                          target_sequence_inputs, 
                                                          max_len_input, 
                                                          max_len_target)

Padding..


In [9]:
# Loading GloVe Word Embedding
# Loads the pre-trained vectors from PATH_GLOVE (400000 according to the
# recorded output) and fills an embedding matrix for the English vocabulary.
# `word2vec` is not used again in the visible cells.
word2vec, embedding_matrix = glove_embedding(word2idx_english, PATH_GLOVE)

Loading GloVe word embedding
Found 400000 word vectors
Filling pre-trained embeddings...


In [10]:
# Encoder vocabulary size, capped at MAX_NUM_WORDS (which is not defined in
# this notebook; presumably it comes from utils.config). The extra slot is
# presumably for the padding index 0 -- TODO confirm.
english_vocab_size = len(word2idx_english) + 1
num_words = min(MAX_NUM_WORDS, english_vocab_size)

In [11]:
# creating object of model class
# `model` is not defined in this notebook (presumably the class from
# utils.model); `x` is the instance that later builds the training model and
# the prediction model and performs decoding.
x = model(num_words, embedding_matrix, max_len_input, max_len_target, num_words_output)

In [12]:
# creating model
# Builds the seq2seq network; weights are loaded into it and it is compiled
# in the following cells, so presumably it is a Keras model.
train_model = x.Seq2SeqModel()

In [13]:
# loading weights
# Restores previously saved weights from MODEL_SAVE_PATH; presumably the
# architecture built above must match the one that produced the file.
train_model.load_weights(MODEL_SAVE_PATH)

In [14]:
# compile the model
# Uses RMSprop with categorical cross-entropy and tracks accuracy.
# NOTE(review): only inference follows in the visible cells, so compiling is
# presumably needed only if training or evaluation is resumed -- confirm.
train_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
# Reverse lookups (index -> word) for both vocabularies. Only idx2word_trans
# is used in the visible cells, where it is passed to decode_sequence.
idx2word_eng = dict(zip(word2idx_english.values(), word2idx_english.keys()))
idx2word_trans = dict(zip(word2idx_french.values(), word2idx_french.keys()))

In [16]:
# prediction model
# Inference-time model that is handed to x.decode_sequence in the demo loop;
# presumably it shares layers/weights with train_model -- verify in
# utils.model.
prediction_model = x.prediction()

In [18]:
# Interactive spot check: translate randomly drawn training sentences until
# the user replies with something that starts with "n" (case-insensitive).
while True:
    # Unseeded draw, so the sampled sentences differ from run to run.
    i = np.random.choice(len(english))
    # Slice rather than index so a length-1 batch is passed on.
    input_seq = encoder_inputs[i:i + 1]
    translation = x.decode_sequence(
        input_seq,
        word2idx_french,
        prediction_model,
        idx2word_trans,
    )

    print('-')
    print('Input sentence:', english[i])
    print('Predicted translation:', translation)
    # french[i] still ends with the "<eos>" marker, as the recorded output shows.
    print('Actual translation:', french[i])

    ans = input("Continue? [Y/n]")
    # An empty reply never starts with 'n', so plain Enter keeps the loop going.
    if ans.lower().startswith('n'):
        break

-
Input sentence: india is sometimes nice during summer , and it is quiet in february .
Predicted translation: l' inde est parfois agrã©able pendant l' ã©tã© , et il est calme en fã©vrier .
Actual translation: l' inde est parfois agrÃ©able pendant l' Ã©tÃ© , et il est calme en fÃ©vrier . <eos>
Continue? [Y/n]y
-
Input sentence: the united states is never warm during december , and it is sometimes freezing in september .
Predicted translation: les ã©tats-unis est jamais chaud en dã©cembre , et il est parfois le gel en septembre .
Actual translation: les Ã©tats-unis est jamais chaud en dÃ©cembre , et il est parfois le gel en septembre . <eos>
Continue? [Y/n]y
-
Input sentence: this dog is your least favorite animal .
Predicted translation: ce chien est votre animal prã©fã©rã© moins .
Actual translation: ce chien est votre animal prÃ©fÃ©rÃ© moins . <eos>
Continue? [Y/n]y
-
Input sentence: the mango is my least favorite fruit , but the grapefruit is her least favorite .
Predicted translati