# Train the model

In [1]:
from keras.models import Model
from keras.layers import Input, CuDNNLSTM, Embedding, Dense
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
from keras.models import load_model
import numpy as np
import sentencepiece as spm

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
batch_size = 64
epochs = 100
hidden_dims = 256

data_file = "jpn.txt"
enc_input_tokens = []
dec_input_tokens = []
dec_target_tokens = []
start_token_id = 1
end_token_id = 2

with open(data_file, "r", encoding="utf-8") as f:
    lines_list = f.read().split("\n")

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("spm.model")

for line in lines_list:
    #for the last black data, we need to skip
    if line == "":
        break
    source_text, target_text = line.split("\t")
    tokenized_source_text = tokenizer.EncodeAsPieces(source_text)
    #test = tokenizer.EncodeAsIds(source_text)
    #print(test)
    tokenized_target_text = tokenizer.EncodeAsPieces(target_text)
    
    int_tokenized_source = []
    int_tokenized_input_target = []
    int_tokenized_output_target = []
    for token in tokenized_source_text:
        int_tokenized_source.append(tokenizer.piece_to_id(token))
    for i, token in enumerate(tokenized_target_text):
        if i == 0:
            int_tokenized_input_target.append(start_token_id)
            continue
        int_tokenized_input_target.append(tokenizer.piece_to_id(token))
        int_tokenized_output_target.append(tokenizer.piece_to_id(token))
        
    int_tokenized_output_target.append(end_token_id)
    
    if len(int_tokenized_output_target) != len(int_tokenized_input_target):
        print("Error")
        
    enc_input_tokens.append(int_tokenized_source)
    dec_input_tokens.append(int_tokenized_input_target)
    dec_target_tokens.append(int_tokenized_output_target)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

(45093, 45093, 45093)

In [3]:
max_enc_seq = max([len(i) for i in enc_input_tokens])
max_dec_seq = max([len(i) for i in dec_input_tokens])

max_enc_seq, max_dec_seq

(128, 61)

In [4]:
from tqdm import tqdm

def pad_or_truncate_inputs(data, max_len):
    new_data = []
    pad_id = 0
        
    for sample in tqdm(data):
        if len(sample) >= max_len:
            tmp = sample[:max_len]
        else:
            tmp = sample
            num_of_pads_needed = max_len - len(sample)
            for _ in range(num_of_pads_needed):
                tmp.append(pad_id)
                
        new_data.append(tmp)
        
    return new_data

In [5]:
enc_input_tokens = pad_or_truncate_inputs(enc_input_tokens, max_enc_seq)
dec_input_tokens = pad_or_truncate_inputs(dec_input_tokens, max_dec_seq)
dec_target_tokens = pad_or_truncate_inputs(dec_target_tokens, max_dec_seq)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

100%|██████████| 45093/45093 [00:00<00:00, 46074.02it/s]
100%|██████████| 45093/45093 [00:00<00:00, 84133.82it/s]
100%|██████████| 45093/45093 [00:00<00:00, 87501.76it/s]


(45093, 45093, 45093)

In [6]:
np.random.seed(1234)

def shuffle_dataset_and_split_into_train_test(enc_input, dec_input, dec_target, test_ratio=0.2):
    dataset_list = list(zip(enc_input, dec_input, dec_target))
    np.random.shuffle(dataset_list)
    split_point = int(len(enc_input) * test_ratio)
    test = dataset_list[:split_point]
    train = dataset_list[split_point:]
    return train, test

In [7]:
train, test = shuffle_dataset_and_split_into_train_test(enc_input_tokens, dec_input_tokens, dec_target_tokens)
len(train), len(test), len(train[0]), len(test[0])

(36075, 9018, 3, 3)

In [8]:
len(train[0][0]), len(train[0][1]), len(train[0][2])

(128, 61, 61)

In [9]:
def generate_data(data_list, batch_size, shuffle=False):
    while True:
        if shuffle:
            np.random.shuffle(data_list)
            
        for i in range(0, len(data_list), batch_size):
            enc_input_list = []
            dec_input_list = []
            dec_target_list = []
            batch_list_inside_tuples = data_list[i: i + batch_size]
            
            for sample in batch_list_inside_tuples:
                e_inp, d_inp, d_tar = sample[0], sample[1], sample[2]
                enc_input_list.append(e_inp)
                dec_input_list.append(d_inp)
                dec_target_list.append(d_tar)
            np_batch_enc_input = np.vstack(enc_input_list)
            np_batch_dec_input = np.vstack(dec_input_list)
            np_batch_dec_target = np.vstack(dec_target_list)
            np_batch_dec_target_one_hot = to_categorical(np_batch_dec_target, num_classes=vocab_size)
            ##input values are inside of [], and the rest is output value
            yield [np_batch_enc_input, np_batch_dec_input], np_batch_dec_target_one_hot

In [10]:
train_on_batch = generate_data(train, batch_size, shuffle=True)
test_on_batch = generate_data(test, batch_size)
train_steps_per_epoch = len(train) // batch_size
test_steps_per_epoch = len(test) // batch_size
train_steps_per_epoch, test_steps_per_epoch

(563, 140)

In [11]:
vocab_size = 8000
hidden_dims = 256
emb_dims = 300

##training network architecture
enc_inputs = Input(shape=(max_enc_seq,))
enc_emb = Embedding(vocab_size, emb_dims)(enc_inputs)
enc = CuDNNLSTM(hidden_dims, return_state=True)
_, state_h, state_c = enc(enc_emb)
enc_states = [state_h, state_c]

dec_inputs = Input(shape=(max_dec_seq,))
dec_emb = Embedding(vocab_size, emb_dims)(dec_inputs)
#return_state is used when the model inferences
dec = CuDNNLSTM(hidden_dims, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec(dec_emb, initial_state=enc_states)
dec_dense = Dense(8000, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], dec_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 128)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 61)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 128, 300)     2400000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 61, 300)      2400000     input_2[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm

In [12]:
plot_model(model, to_file="seq2seq.png", show_shapes=True)

In [14]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])

In [15]:
earlystopping = EarlyStopping(monitor="val_loss", patience=1, verbose=1, mode="auto")

model.fit_generator(
    generator=train_on_batch,
    steps_per_epoch=train_steps_per_epoch,
    epochs=epochs,
    verbose=1,
    callbacks=[earlystopping],
    validation_data=test_on_batch,
    validation_steps=test_steps_per_epoch
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 00027: early stopping


<keras.callbacks.History at 0x7fff4bcad390>

In [16]:
model.save("nmt_seq2seq.h5", include_optimizer=False)
print("The model is saved!")

The model is saved!


  '. They will not be included '


# Inference by the model

In [None]:
model = load_model("nmt_seq2seq.h5", compile=False)

In [24]:
##inference network architecture
enc_model = Model(enc_inputs, enc_states)
enc_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 300)          2400000   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     [(None, 256), (None, 256) 571392    
Total params: 2,971,392
Trainable params: 2,971,392
Non-trainable params: 0
_________________________________________________________________


In [27]:
plot_model(enc_model, to_file="enc_model_for_inference.png", show_shapes=True)

In [29]:
dec_input_state_h = Input(shape=(hidden_dims,))
dec_input_state_c = Input(shape=(hidden_dims,))
dec_input_states = [dec_input_state_h, dec_input_state_c]
dec_outputs, state_h, state_c = dec(dec_emb, initial_state=dec_input_states)
dec_states = [state_h, state_c]
dec_outputs = dec_dense(dec_outputs)
dec_model = Model([dec_inputs] + dec_input_states,
                 [dec_outputs] + dec_states)
dec_model.summary()

ValueError: Layer cu_dnnlstm_2 expects 7 inputs, but it received 3 input tensors. Input received: [<tf.Tensor 'embedding_2/embedding_lookup:0' shape=(?, 61, 300) dtype=float32>, <tf.Tensor 'input_7:0' shape=(?, 256) dtype=float32>, <tf.Tensor 'input_8:0' shape=(?, 256) dtype=float32>]

In [21]:
input_text = input("翻訳したい英語を入力してください。:(応答文)")

翻訳したい英語を入力してください。:(応答文)翻訳の精度をテストしたいです。


In [22]:
test = tokenizer.EncodeAsIds(input_text)

[6, 4205, 4, 1752, 338, 8, 3031, 17, 41, 14, 156, 5]


In [None]:
def preprocess_text(input_text):
    tokenized_ids = tokenizer.EncodeAsIds(input_text)
    enc_input = pad_or_truncate_inputs(tokenized_ids, max_enc_seq)
    
    
    enc_states = enc_model.predict(enc_input)
    output_seq = [start_token_id]
    