# Train the model

In [1]:
from keras.models import Model

#tmp order
from keras.layers import Input, CuDNNLSTM, Embedding, Dense, Dropout, RNN, LSTMCell, TimeDistributed
import keras
from keras import backend as K
from keras.utils import plot_model
import tensorflow as tf
###

from keras.utils import to_categorical
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
from keras.models import load_model
import numpy as np
import sentencepiece as spm
from tqdm import tqdm

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
batch_size = 64
epochs = 100
hidden_dims = 256

data_file = "jpn.txt"
enc_input_tokens = []
dec_input_tokens = []
dec_target_tokens = []
start_token_id = 1
end_token_id = 2
pad_token_id = 3

with open(data_file, "r", encoding="utf-8") as f:
    lines_list = f.read().split("\n")

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("sentencepiece/spm_for_nmt.model")

for line in lines_list:
    #for the last black data, we need to skip
    if line == "":
        break
    source_text, target_text = line.split("\t")
    tokenized_source_text = tokenizer.EncodeAsPieces(source_text)
    #test = tokenizer.EncodeAsIds(source_text)
    #print(test)
    tokenized_target_text = tokenizer.EncodeAsPieces(target_text)
    
    int_tokenized_source = []
    int_tokenized_input_target = []
    int_tokenized_output_target = []
    for token in tokenized_source_text:
        int_tokenized_source.append(tokenizer.piece_to_id(token))
    for i, token in enumerate(tokenized_target_text):
        if i == 0:
            int_tokenized_input_target.append(start_token_id)
            continue
        int_tokenized_input_target.append(tokenizer.piece_to_id(token))
        int_tokenized_output_target.append(tokenizer.piece_to_id(token))
        
    int_tokenized_output_target.append(end_token_id)
    
    if len(int_tokenized_output_target) != len(int_tokenized_input_target):
        print("Error")
        
    enc_input_tokens.append(int_tokenized_source)
    dec_input_tokens.append(int_tokenized_input_target)
    dec_target_tokens.append(int_tokenized_output_target)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

(45093, 45093, 45093)

In [3]:
max_enc_seq = max([len(i) for i in enc_input_tokens])
max_dec_seq = max([len(i) for i in dec_input_tokens])

max_enc_seq, max_dec_seq

(128, 61)

In [4]:
def pad_or_truncate_inputs(data, max_len):
    new_data = []
        
    for sample in tqdm(data):
        if len(sample) >= max_len:
            tmp = sample[:max_len]
        else:
            tmp = sample
            num_of_pads_needed = max_len - len(sample)
            for _ in range(num_of_pads_needed):
                tmp.append(pad_token_id)
                
        new_data.append(tmp)
        
    return new_data

In [5]:
enc_input_tokens = pad_or_truncate_inputs(enc_input_tokens, max_enc_seq)
dec_input_tokens = pad_or_truncate_inputs(dec_input_tokens, max_dec_seq)
dec_target_tokens = pad_or_truncate_inputs(dec_target_tokens, max_dec_seq)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

100%|██████████| 45093/45093 [00:00<00:00, 45910.31it/s]
100%|██████████| 45093/45093 [00:00<00:00, 84701.79it/s]
100%|██████████| 45093/45093 [00:00<00:00, 85467.92it/s]


(45093, 45093, 45093)

In [6]:
print(dec_input_tokens[0],"\n", dec_target_tokens[0])

[1, 168, 421, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 
 [168, 421, 6, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [7]:
np.random.seed(1234)

def shuffle_dataset_and_split_into_train_test(enc_input, dec_input, dec_target, test_ratio=0.2):
    dataset_list = list(zip(enc_input, dec_input, dec_target))
    np.random.shuffle(dataset_list)
    split_point = int(len(enc_input) * test_ratio)
    test = dataset_list[:split_point]
    train = dataset_list[split_point:]
    return train, test

In [8]:
train, test = shuffle_dataset_and_split_into_train_test(enc_input_tokens, dec_input_tokens, dec_target_tokens)
len(train), len(test), len(train[0]), len(test[0])

(36075, 9018, 3, 3)

In [9]:
len(train[0][0]), len(train[0][1]), len(train[0][2])

(128, 61, 61)

In [10]:
def generate_data(data_list, batch_size, shuffle=False):
    while True:
        if shuffle:
            np.random.shuffle(data_list)
            
        for i in range(0, len(data_list), batch_size):
            enc_input_list = []
            dec_input_list = []
            dec_target_list = []
            batch_list_inside_tuples = data_list[i: i + batch_size]
            
            for sample in batch_list_inside_tuples:
                e_inp, d_inp, d_tar = sample[0], sample[1], sample[2]
                enc_input_list.append(e_inp)
                dec_input_list.append(d_inp)
                dec_target_list.append(d_tar)
            np_batch_enc_input = np.vstack(enc_input_list)
            np_batch_dec_input = np.vstack(dec_input_list)
            np_batch_dec_target = np.vstack(dec_target_list)
            np_batch_dec_target = np_batch_dec_target.reshape((np_batch_dec_target.shape[0], np_batch_dec_target.shape[1], 1))
            #np_batch_dec_target_one_hot = to_categorical(np_batch_dec_target, num_classes=vocab_in_size)
            ##input values are inside of [], and the rest is output value
            #yield [np_batch_enc_input, np_batch_dec_input], np_batch_dec_target_one_hot
            yield [np_batch_enc_input, np_batch_dec_input], np_batch_dec_target

In [11]:
train_on_batch = generate_data(train, batch_size, shuffle=True)
test_on_batch = generate_data(test, batch_size)
train_steps_per_epoch = len(train) // batch_size
test_steps_per_epoch = len(test) // batch_size
train_steps_per_epoch, test_steps_per_epoch

(563, 140)

In [84]:
vocab_in_size = 8000
vocab_out_size = 8000
units = 256
embedding_dim = 300
len_input = max_enc_seq

encoder_inputs = Input(shape=(len_input,))
encoder_emb = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
encoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb(encoder_inputs))
encoder_states = [state_h, state_c]

# Now create the Decoder layers.
decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(input_dim=vocab_out_size, output_dim=embedding_dim)
decoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs), initial_state=encoder_states)
# Two dense layers added to this model to improve inference capabilities.
decoder_d1 = Dense(units, activation="relu")
decoder_d2 = Dense(vocab_out_size, activation="softmax")
# Drop-out is added in the dense layers to help mitigate overfitting in this part of the model. Astute developers
# may want to add the same mechanism inside the LSTMs.
decoder_out = decoder_d2(Dropout(rate=.4)(decoder_d1(Dropout(rate=.4)(decoder_lstm_out))))

# Finally, create a training model which combines the encoder and the decoder.
# Note that this model has three inputs:
#  encoder_inputs=[batch,encoded_words] from input language (English)
#  decoder_inputs=[batch,encoded_words] from output language (Spanish). This is the "teacher tensor".
#  decoder_out=[batch,encoded_words] from output language (Spanish). This is the "target tensor".
model = Model([encoder_inputs, decoder_inputs], decoder_out)
plot_model(model, to_file="check.png", show_shapes=True)
# We'll use sparse_categorical_crossentropy so we don't have to expand decoder_out into a massive one-hot array.
#  Adam is used because it's, well, the best.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", #metrics=['sparse_categorical_accuracy'])
             )

In [82]:
list, a = next(train_on_batch)
type(a[0])

numpy.ndarray

In [85]:
earlystopping = EarlyStopping(monitor="val_loss", patience=2, verbose=1, mode="auto")

model.fit_generator(
    generator=train_on_batch,
    steps_per_epoch=train_steps_per_epoch,
    epochs=epochs,
    verbose=1,
    callbacks=[earlystopping],
    validation_data=test_on_batch,
    validation_steps=test_steps_per_epoch
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 00036: early stopping


<keras.callbacks.History at 0x7fff48d84a58>

In [86]:
# Create the encoder model from the tensors we previously declared.
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Generate a new set of tensors for our new inference decoder. Note that we are using new tensors, 
# this does not preclude using the same underlying layers that we trained on. (e.g. weights/biases).
inf_decoder_inputs = Input(shape=(None,), name="inf_decoder_inputs")
# We'll need to force feed the two state variables into the decoder each step.
state_input_h = Input(shape=(units,), name="state_input_h")
state_input_c = Input(shape=(units,), name="state_input_c")
decoder_res, decoder_h, decoder_c = decoder_lstm(
    decoder_emb(inf_decoder_inputs), 
    initial_state=[state_input_h, state_input_c])
inf_decoder_out = decoder_d2(decoder_d1(decoder_res))
inf_model = Model(inputs=[inf_decoder_inputs, state_input_h, state_input_c], 
                  outputs=[inf_decoder_out, decoder_h, decoder_c])

In [18]:
def translate(input_sentence, infenc_model, infmodel, attention=False):
    print("[input sentence]", input_sentence)
    sv = tokenizer.EncodeAsIds(input_sentence)
    sv = np.array(sv).reshape(1, len(sv)).tolist()
    sv = pad_or_truncate_inputs(sv, max_enc_seq)
    sv = np.array(sv)
    # Reshape so we can use the encoder model. New shape=[samples,sequence length]
    #sv = sv.reshape(1,len(sv))
    [emb_out, sh, sc] = infenc_model.predict(x=sv)
    
    i = 0
    start_vec = start_token_id
    stop_vec = end_token_id
    # We will continuously feed cur_vec as an input into the decoder to produce the next word,
    # which will be assigned to cur_vec. Start it with "<start>".
    cur_vec = np.zeros((1,1))
    cur_vec[0,0] = start_vec
    #cur_word = "<start>"
    output_sentence = ""
    output_sequence = []
    len_target = max_dec_seq
    # Start doing the feeding. Terminate when the model predicts an "<end>" or we reach the end
    # of the max target language sentence length.
    while cur_vec[0,0] != end_token_id and i < (len_target-1):
        i += 1
        #if cur_word != "<start>":
        #    output_sentence = output_sentence + " " + cur_word
        x_in = [cur_vec, sh, sc]
        # This will allow us to accomodate attention models, which we will talk about later.
        if attention:
            x_in += [emb_out]
        [nvec, sh, sc] = infmodel.predict(x=x_in)
        # The output of the model is a massive softmax vector with one spot for every possible word. Convert
        # it to a word ID using argmax().
        cur_vec[0,0] = np.argmax(nvec[0,0])
        output_sequence.append(int(cur_vec[0,0]))
    
    result = tokenizer.DecodeIds(output_sequence)
    return "[output sentence]" + result

In [90]:
print(translate("This park reminds me of my childhood.", encoder_model, inf_model))

100%|██████████| 1/1 [00:00<00:00, 3418.34it/s]

[input sentence] This park reminds me of my childhood.
[output sentence]その少年は私の兄を養うのが好きだ。





In [12]:
vocab_in_size = 8000
vocab_out_size = 8000
units = 256
embedding_dim = 300
len_input = max_enc_seq

In [13]:
# RNN "Cell" classes in Keras perform the actual data transformations at each timestep. Therefore, in order
# to add attention to LSTM, we need to make a custom subclass of LSTMCell.
class AttentionLSTMCell(LSTMCell):
    def __init__(self, **kwargs):
        self.attentionMode = False
        super(AttentionLSTMCell, self).__init__(**kwargs)
    
    # Build is called to initialize the variables that our cell will use. We will let other Keras
    # classes (e.g. "Dense") actually initialize these variables.
    #@tf_utils.shape_type_conversion
    def build(self, input_shape):        
        # Converts the input sequence into a sequence which can be matched up to the internal
        # hidden state.
        self.dense_constant = TimeDistributed(Dense(self.units, name="AttLstmInternal_DenseConstant"))
        
        # Transforms the internal hidden state into something that can be used by the attention
        # mechanism.
        self.dense_state = Dense(self.units, name="AttLstmInternal_DenseState")
        
        # Transforms the combined hidden state and converted input sequence into a vector of
        # probabilities for attention.
        self.dense_transform = Dense(1, name="AttLstmInternal_DenseTransform")
        
        # We will augment the input into LSTMCell by concatenating the context vector. Modify
        # input_shape to reflect this.
        batch, input_dim = input_shape[0]
        batch, timesteps, context_size = input_shape[-1]
        lstm_input = (batch, input_dim + context_size)
        
        # The LSTMCell superclass expects no constant input, so strip that out.
        return super(AttentionLSTMCell, self).build(lstm_input)
    
    # This must be called before call(). The "input sequence" is the output from the 
    # encoder. This function will do some pre-processing on that sequence which will
    # then be used in subsequent calls.
    def setInputSequence(self, input_seq):
        self.input_seq = input_seq
        self.input_seq_shaped = self.dense_constant(input_seq)
        self.timesteps = tf.shape(self.input_seq)[-2]
    
    # This is a utility method to adjust the output of this cell. When attention mode is
    # turned on, the cell outputs attention probability vectors across the input sequence.
    def setAttentionMode(self, mode_on=False):
        self.attentionMode = mode_on
    
    # This method sets up the computational graph for the cell. It implements the actual logic
    # that the model follows.
    def call(self, inputs, states, constants):
        # Separate the state list into the two discrete state vectors.
        # ytm is the "memory state", stm is the "carry state".
        ytm, stm = states
        # We will use the "carry state" to guide the attention mechanism. Repeat it across all
        # input timesteps to perform some calculations on it.
        stm_repeated = K.repeat(self.dense_state(stm), self.timesteps)
        # Now apply our "dense_transform" operation on the sum of our transformed "carry state" 
        # and all encoder states. This will squash the resultant sum down to a vector of size
        # [batch,timesteps,1]
        # Note: Most sources I encounter use tanh for the activation here. I have found with this dataset
        # and this model, relu seems to perform better. It makes the attention mechanism far more crisp
        # and produces better translation performance, especially with respect to proper sentence termination.
        combined_stm_input = self.dense_transform(
            keras.activations.relu(stm_repeated + self.input_seq_shaped))
        # Performing a softmax generates a log probability for each encoder output to receive attention.
        score_vector = keras.activations.softmax(combined_stm_input, 1)
        # In this implementation, we grant "partial attention" to each encoder output based on 
        # it's log probability accumulated above. Other options would be to only give attention
        # to the highest probability encoder output or some similar set.
        context_vector = K.sum(score_vector * self.input_seq, 1)
        
        # Finally, mutate the input vector. It will now contain the traditional inputs (like the seq2seq
        # we trained above) in addition to the attention context vector we calculated earlier in this method.
        inputs = K.concatenate([inputs, context_vector])
        
        # Call into the super-class to invoke the LSTM math.
        res = super(AttentionLSTMCell, self).call(inputs=inputs, states=states)
        
        # This if statement switches the return value of this method if "attentionMode" is turned on.
        if(self.attentionMode):
            return (K.reshape(score_vector, (-1, self.timesteps)), res[1])
        else:
            return res

# Custom implementation of the Keras LSTM that adds an attention mechanism.
# This is implemented by taking an additional input (using the "constants" of the
# RNN class) into the LSTM: The encoder output vectors across the entire input sequence.
class LSTMWithAttention(RNN):
    def __init__(self, units, **kwargs):
        cell = AttentionLSTMCell(units=units)
        self.units = units
        super(LSTMWithAttention, self).__init__(cell, **kwargs)
        
    #@tf_utils.shape_type_conversion
    def build(self, input_shape):
        self.input_dim = input_shape[0][-1]
        self.timesteps = input_shape[0][-2]
        return super(LSTMWithAttention, self).build(input_shape) 
    
    # This call is invoked with the entire time sequence. The RNN sub-class is responsible
    # for breaking this up into calls into the cell for each step.
    # The "constants" variable is the key to our implementation. It was specifically added
    # to Keras to accomodate the "attention" mechanism we are implementing.
    def call(self, x, constants, **kwargs):
        if isinstance(x, list):
            self.x_initial = x[0]
        else:
            self.x_initial = x
        
        # The only difference in the LSTM computational graph really comes from the custom
        # LSTM Cell that we utilize.
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        self.cell.setInputSequence(constants[0])
        return super(LSTMWithAttention, self).call(inputs=x, constants=constants, **kwargs)

# Below is test code to validate that this LSTM class and the associated cell create a
# valid computational graph.
test = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
test.cell.setAttentionMode(True)
attenc_inputs2 = Input(shape=(len_input,))
attenc_emb2 = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
test(inputs=attenc_emb2(attenc_inputs2), constants=attenc_emb2(attenc_inputs2), initial_state=None)

[<tf.Tensor 'lstm_with_attention_1/transpose_1:0' shape=(?, ?, ?) dtype=float32>,
 <tf.Tensor 'lstm_with_attention_1/while/Exit_3:0' shape=(?, 256) dtype=float32>,
 <tf.Tensor 'lstm_with_attention_1/while/Exit_4:0' shape=(?, 256) dtype=float32>]

In [14]:
# Re-create an entirely new model and set of layers for the attention model

# Encoder Layers
attenc_inputs = Input(shape=(len_input,), name="attenc_inputs")
attenc_emb = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
attenc_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
attenc_outputs, attstate_h, attstate_c = attenc_lstm(attenc_emb(attenc_inputs))
attenc_states = [attstate_h, attstate_c]

attdec_inputs = Input(shape=(None,))
attdec_emb = Embedding(input_dim=vocab_out_size, output_dim=embedding_dim)
attdec_lstm = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
# Note that the only real difference here is that we are feeding attenc_outputs to the decoder now.
# Nice and clean!
attdec_lstm_out, _, _ = attdec_lstm(inputs=attdec_emb(attdec_inputs), 
                                    constants=attenc_outputs, 
                                    initial_state=attenc_states)
attdec_d1 = Dense(units, activation="relu")
attdec_d2 = Dense(vocab_out_size, activation="softmax")
attdec_out = attdec_d2(Dropout(rate=.4)(attdec_d1(Dropout(rate=.4)(attdec_lstm_out))))

attmodel = Model([attenc_inputs, attdec_inputs], attdec_out)
attmodel.compile(optimizer="adam", loss="sparse_categorical_crossentropy", #metrics=['sparse_categorical_accuracy']
                )

In [15]:
attmodel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attenc_inputs (InputLayer)      (None, 128)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 128, 300)     2400000     attenc_inputs[0][0]              
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 300)    2400000     input_2[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm

In [28]:
plot_model(attmodel, to_file="attention_nmt_model.png", show_shapes=True, show_layer_names=True)

In [16]:
epochs = 100
#atthist = attmodel.fit([input_data, teacher_data], target_data,
#                 batch_size=BATCH_SIZE,
#                 epochs=epochs,
#                 validation_split=0.2)
#
## Plot the results of the training.
#plt.plot(atthist.history['sparse_categorical_accuracy'], label="Training loss")
#plt.plot(atthist.history['val_sparse_categorical_accuracy'], label="Validation loss")
#plt.show()

earlystopping = EarlyStopping(monitor="val_loss", patience=2, verbose=1, mode="auto")

attmodel.fit_generator(
    generator=train_on_batch,
    steps_per_epoch=train_steps_per_epoch,
    epochs=epochs,
    verbose=1,
    callbacks=[earlystopping],
    validation_data=test_on_batch,
    validation_steps=test_steps_per_epoch
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 00034: early stopping


<keras.callbacks.History at 0x7fff50dab5c0>

In [19]:
def createAttentionInference(attention_mode=False):
    # Create an inference model using the layers already trained above.
    attencoder_model = Model(attenc_inputs, [attenc_outputs, attstate_h, attstate_c])
    state_input_h = Input(shape=(units,), name="state_input_h")
    state_input_c = Input(shape=(units,), name="state_input_c")
    attenc_seq_out = Input(shape=attenc_outputs.get_shape()[1:], name="attenc_seq_out")
    inf_attdec_inputs = Input(shape=(None,), name="inf_attdec_inputs")
    attdec_lstm.cell.setAttentionMode(attention_mode)
    attdec_res, attdec_h, attdec_c = attdec_lstm(attdec_emb(inf_attdec_inputs), 
                                                 initial_state=[state_input_h, state_input_c], 
                                                 constants=attenc_seq_out)
    attinf_model = None
    if not attention_mode:
        inf_attdec_out = attdec_d2(attdec_d1(attdec_res))
        attinf_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out], 
                             outputs=[inf_attdec_out, attdec_h, attdec_c])
    else:
        attinf_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out], 
                             outputs=[attdec_res, attdec_h, attdec_c])
    return attencoder_model, attinf_model

attencoder_model, attinf_model = createAttentionInference()
print(translate("This park reminds me of my childhood.", attencoder_model, attinf_model, True))

100%|██████████| 1/1 [00:00<00:00, 1869.12it/s]

[input sentence] This park reminds me of my childhood.





[output sentence]車を修理するのは私のことを確信している。


In [20]:
print(translate("I am feeling good today.", attencoder_model, attinf_model, True))

100%|██████████| 1/1 [00:00<00:00, 5924.16it/s]

[input sentence] I am feeling good today.
[output sentence]今日は忙しい。





In [21]:
print(translate("Can I play tennis today?", attencoder_model, attinf_model, True))

100%|██████████| 1/1 [00:00<00:00, 3738.24it/s]

[input sentence] Can I play tennis today?
[output sentence]今日はテニスをする?





In [22]:
print(translate("I am feeling tired.", attencoder_model, attinf_model, True))

100%|██████████| 1/1 [00:00<00:00, 1401.84it/s]

[input sentence] I am feeling tired.
[output sentence]私はとても疲れている。





In [23]:
print(translate("I want to eat some food now because I am so hungry.", attencoder_model, attinf_model, True))

100%|██████████| 1/1 [00:00<00:00, 5857.97it/s]

[input sentence] I want to eat some food now because I am so hungry.
[output sentence]私はまだ泳ぐのは楽しみに行きたい。





In [31]:
input_text = "Go."
tokenized_ids = tokenizer.EncodeAsIds(input_text)
tokenized_ids = np.array(tokenized_ids).reshape(1, len(tokenized_ids)).tolist()
enc_input = pad_or_truncate_inputs(tokenized_ids, max_enc_seq)
enc_input = np.array(enc_input).reshape(1, len(tokenized_ids[0]))
_, sh, sc = encoder_model.predict(enc_input)
output_seq = []
cur_token = [start_token_id]
#cur_token = np.array(cur_token).reshape(1, len(cur_token)).tolist()
#cur_token = pad_or_truncate_inputs(cur_token, max_dec_seq)
i = 0
while cur_token != end_token_id and i < (max_dec_seq -1):
    i += 1
    #print("i", cur_token)
    #print(cur_token)
    dec_inputs = [cur_token] + [sh, sc]
    [out_dist_vec, sh, sc] = inf_model.predict(dec_inputs)
    #print(out_dist_vec.shape)
    output_token = np.argmax(out_dist_vec[0,0], axis=-1)
    #print("o", output_token)
    output_seq.append(int(output_token))
    cur_token = [output_token]
    #cur_token = np.array(output_token).reshape(1, 1).tolist()
    #cur_token = pad_or_truncate_inputs(cur_token, max_dec_seq)

result = tokenizer.DecodeIds(output_seq)
print(result)

100%|██████████| 1/1 [00:00<00:00, 4169.29it/s]

お願いします。はいつもりです。はいつもりです。はいつもりだ。はいつもりだ。、私は私に言った。はいけない。、私は私





In [182]:
model.save("seq2seq_modified.h5", include_optimizer=False)
print("The model is saved!")

  '. They will not be included '


The model is saved!


# Inference by the model

In [183]:
model = load_model("seq2seq_modified.h5", compile=False)

In [16]:
##inference network architecture
enc_model = Model(enc_inputs, enc_states)
enc_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 300)          2400000   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     [(None, 256), (None, 256) 571392    
Total params: 2,971,392
Trainable params: 2,971,392
Non-trainable params: 0
_________________________________________________________________


In [17]:
plot_model(enc_model, to_file="modified_enc_model_for_inference.png", show_shapes=True)

In [1]:
dec_input_state_h = Input(shape=(hidden_dims,))
dec_input_state_c = Input(shape=(hidden_dims,))
dec_input_states = [dec_input_state_h, dec_input_state_c]
dec_outputs, state_h, state_c = dec(dec_emb(dec_inputs), initial_state=dec_input_states)
dec_states = [state_h, state_c]
dec_outputs = dec_dense(dec_outputs)
dec_model = Model([dec_inputs] + dec_input_states,
                 [dec_outputs] + dec_states)
dec_model.summary()

NameError: name 'Input' is not defined

In [19]:
plot_model(enc_model, to_file="modified_dec_model_for_inference.png", show_shapes=True)

In [27]:
input_text = "We listened to her for some time."
tokenized_ids = tokenizer.EncodeAsIds(input_text)
tokenized_ids = np.array(tokenized_ids).reshape(1, len(tokenized_ids)).tolist()
enc_input = pad_or_truncate_inputs(tokenized_ids, max_enc_seq)
enc_input = np.array(enc_input).reshape(1, len(tokenized_ids[0]))
enc_states = enc_model.predict(enc_input)
output_seq = []
cur_token = [start_token_id]
#cur_token = np.array(cur_token).reshape(1, len(cur_token)).tolist()
#cur_token = pad_or_truncate_inputs(cur_token, max_dec_seq)
i = 0
while cur_token != end_token_id and i < (max_dec_seq -1):
    i += 1
    #print("i", cur_token)
    dec_inputs = [cur_token] + enc_states
    [out_dist_vec, sh, sc] = dec_model.predict(dec_inputs)
    print(out_dist_vec.shape)
    output_token = np.argmax(out_dist_vec[0,0], axis=-1)
    #print("o", output_token)
    output_seq.append(tokenizer.id_to_piece(int(output_token)))
    cur_token = [output_token]
    #cur_token = np.array(output_token).reshape(1, 1).tolist()
    #cur_token = pad_or_truncate_inputs(cur_token, max_dec_seq)

print(output_seq)

100%|██████████| 1/1 [00:00<00:00, 1297.74it/s]

(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
(1, 1, 8000)
['私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 'と', '私', 'も', '彼女', 'いない', 


