# Machine Translation (Seq2Seq, Attention) EN -> JA

In [1]:
def Attention(train_X, train_Y, tokenizer_en, tokenizer_ja):
    """Train an EN -> JA encoder-decoder LSTM with attention and return a decoder.

    The function builds a Keras training graph (embedding + LSTM encoder,
    embedding + LSTM decoder initialised from the encoder states, and an
    attention layer over the encoder outputs), fits it in two stages with
    RMSprop (learning rate 0.001, then 0.0003), and finally re-wires the
    trained layers into three single-step inference models.

    Args:
        train_X: Source token-id sequences. Only ``len(train_X[0])`` and the
            array itself (as model input) are used here; all rows are assumed
            to be padded to the same length.
        train_Y: Target token-id sequences. Must support 2-D slicing
            (``train_Y[:, 1:]``), i.e. a 2-D NumPy array of padded rows.
        tokenizer_en: Source tokenizer; only ``word_index`` is read, to size
            the encoder embedding.
        tokenizer_ja: Target tokenizer; only ``word_index`` is read, to size
            the decoder embedding and the output softmax.

    Returns:
        The nested ``decode_sequence`` function, which greedily translates
        one encoded source sequence using the trained models.
    """
    import numpy as np
    from keras.models import Model
    # NOTE: `regularizers`, `Permute`, `BatchNormalization`, `Adam`'s instances
    # below and `K` are not used by any active line in this function
    # (some are referenced only from commented-out code).
    from keras import regularizers
    from keras.layers import Input, Permute, Activation, Embedding, Dense, LSTM, concatenate, dot, BatchNormalization
    from keras.optimizers import RMSprop, Adam
    from keras.callbacks import EarlyStopping
    from keras import backend as K

    # Hyper-parameters: embedding size, LSTM state size, attention layer size.
    emb_dim = 256
    hid_dim = 256
    att_dim = 256
    weight_decay = 1e-4  # only referenced by the commented-out regularizer below

    # +1 leaves room for index 0, which is not present in word_index.
    en_vocab_size = len(tokenizer_en.word_index) + 1
    ja_vocab_size = len(tokenizer_ja.word_index) + 1

    # Fixed sequence lengths, taken from the first row of each array.
    seqX_len = len(train_X[0])
    seqY_len = len(train_Y[0])
    
    # Building a model architecture -----------------------------------------------------------------------------------
    # encoder
    encoder_inputs = Input(shape=(seqX_len,))
    # mask_zero=True is set on the encoder embedding only (not on the decoder one).
    encoder_embedded = Embedding(en_vocab_size, emb_dim, mask_zero=True)(encoder_inputs)
#     encoder_embedded_BN = BatchNormalization()(encoder_embedded)
    # encoded_seq: per-step encoder outputs; encoder_states: the remaining
    # values returned because of return_state=True (the LSTM's final states).
    encoded_seq, *encoder_states = LSTM(hid_dim, return_sequences=True, return_state=True)(encoder_embedded)

    # decoder
    decoder_inputs = Input(shape=(seqY_len,))
    # The embedding and LSTM layer objects are kept in variables so the same
    # trained weights can be reused by the inference models further down.
    decoder_embedding = Embedding(ja_vocab_size, emb_dim)
    decoder_embedded = decoder_embedding(decoder_inputs)
#     decoder_embedded_BN = BatchNormalization()(decoder_embedded)
    decoder_lstm = LSTM(hid_dim, return_sequences=True, return_state=True)
    # The decoder starts from the encoder's final states.
    decoded_seq, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)

    # Attention
    score_dense = Dense(hid_dim)
    score = score_dense(decoded_seq)  # project decoder outputs before scoring
    score = dot([score, encoded_seq], axes=(2,2))  # one score per (decoder step, encoder step)
    attention = Activation('softmax')(score)  # normalise the scores into attention weights
    context = dot([attention, encoded_seq], axes=(2,1))  # attention-weighted sum of encoder outputs
    concat = concatenate([context, decoded_seq], axis=2)
    attention_dense = Dense(att_dim, activation='tanh')
#                             kernel_regularizer=regularizers.l2(weight_decay))
    attentional = attention_dense(concat)
    output_dense = Dense(ja_vocab_size, activation='softmax')
    outputs = output_dense(attentional)
    
    # training
    # Target = train_Y shifted left by one step, with a column of zeros
    # appended so it keeps the same length as the decoder input.
    train_target = np.hstack((train_Y[:, 1:], np.zeros((len(train_Y),1), dtype=np.int32)))
    
    # Four optimizers are created; only rms1 and rms2 are used below.
    adam1 = Adam(lr=0.001)
    adam2 = Adam(lr=0.0003)
    rms1 = RMSprop(lr=0.001)
    rms2 = RMSprop(lr=0.0003)
    # patience=0: stop as soon as the monitored value fails to improve
    # (the monitored quantity is left at the Keras default).
    early_stopping = EarlyStopping(patience=0, verbose=1)
    
    # Stage 1: RMSprop with lr=0.001, at most 15 epochs.
    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer=rms1, loss='sparse_categorical_crossentropy')
    # expand_dims adds a trailing axis to the integer targets for the sparse loss.
    model.fit([train_X, train_Y], np.expand_dims(train_target, -1), 
              batch_size=128, epochs=15, verbose=2, validation_split=0.2, callbacks=[early_stopping])
    
    # Stage 2: same model re-compiled with the smaller learning rate, at most 10 epochs.
    model.compile(optimizer=rms2, loss='sparse_categorical_crossentropy')
    model.fit([train_X, train_Y], np.expand_dims(train_target, -1), 
              batch_size=128, epochs=10, verbose=2, validation_split=0.2,  callbacks=[early_stopping])
    
    # Prediction -----------------------------------------------------------------------------------------------------
    # encoder
    # Returns the per-step encoder outputs followed by the encoder states.
    encoder_model = Model(encoder_inputs, [encoded_seq]+encoder_states)

    # decoder
    # Single-step decoder: the LSTM states are fed in and returned explicitly
    # so they can be carried from one generated token to the next.
    decoder_states_inputs = [Input(shape=(hid_dim,)), Input(shape=(hid_dim,))]

    # NOTE: from here on several training-graph names (decoder_inputs,
    # decoder_embedded, decoded_seq, score, attention, ...) are rebound to
    # their inference-graph counterparts.
    decoder_inputs = Input(shape=(1,))
    decoder_embedded = decoder_embedding(decoder_inputs)
    decoded_seq, *decoder_states = decoder_lstm(decoder_embedded, initial_state=decoder_states_inputs)

    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoded_seq] + decoder_states)

    # Attention
    # Same layer objects as in training (score_dense, attention_dense,
    # output_dense), applied to one decoder step at a time.
    encoded_seq_in, decoded_seq_in = Input(shape=(seqX_len, hid_dim)), Input(shape=(1, hid_dim))
    score = score_dense(decoded_seq_in)
    score = dot([score, encoded_seq_in], axes=(2,2))
    attention = Activation('softmax')(score)
    context = dot([attention, encoded_seq_in], axes=(2,1))
    concat = concatenate([context, decoded_seq_in], axis=2)
    attentional = attention_dense(concat)
    attention_outputs = output_dense(attentional)

    # Outputs both the token distribution and the attention weights.
    attention_model = Model([encoded_seq_in, decoded_seq_in], [attention_outputs, attention])
    
    def decode_sequence(input_seq, bos_eos, max_output_length=1000):
        """Greedily decode one source sequence into target token ids.

        Args:
            input_seq: Source token ids passed straight to the encoder model.
            bos_eos: Two-element sequence; ``bos_eos[0]`` is the list of
                start-token ids used to seed decoding and ``bos_eos[1]`` is
                the list the sampled token is compared against to stop.
            max_output_length: Decoding stops once the output list is longer
                than this value, so the result can hold up to
                ``max_output_length + 1`` ids.

        Returns:
            A list starting with a copy of ``bos_eos[0]`` followed by the
            argmax token chosen at each step, including the final one.
        """
        encoded_seq, *states_value = encoder_model.predict(input_seq)

        target_seq = np.array(bos_eos[0])  # first decoder input: the id(s) of "<s>"
        output_seq = bos_eos[0][:]  # slice copy, so the caller's list is not mutated by += below
#         attention_seq = np.empty((0,len(input_seq[0])))

        while True:
            # One decoder step; the returned states feed the next iteration.
            decoded_seq, *states_value = decoder_model.predict([target_seq] + states_value)
            output_tokens, attention = attention_model.predict([encoded_seq, decoded_seq])
            # Greedy choice, wrapped in a list so it can be compared with bos_eos[1].
            sampled_token_index = [np.argmax(output_tokens[0, -1, :])]
            output_seq += sampled_token_index
#             attention_seq = np.append(attention_seq, attention[0], axis=0)

            # Stop on the end token or when the length cap is exceeded.
            if (sampled_token_index == bos_eos[1] or len(output_seq) > max_output_length):
                break

            # The sampled token becomes the next decoder input.
            target_seq = np.array(sampled_token_index)

        return output_seq
    
    return decode_sequence

In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

# Random state handed to train_test_split in load_dataset so the split is repeatable.
seed = 42

# Load data & tokenize
def load_texts(file_path):
    """Read a one-sentence-per-line corpus and convert it to token-id sequences.

    Each line is stripped of surrounding whitespace and wrapped as
    ``"<s> ... </s>"`` before the tokenizer is fitted, so the start and end
    markers become ordinary vocabulary entries.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the list of token-id lists for
        every line, and the fitted ``Tokenizer``.
    """
    # filters="" overrides the tokenizer's default character filter; the
    # "<s>" / "</s>" markers are expected to survive tokenization this way.
    tokenizer = Tokenizer(filters="")

    # The context manager closes the file even if reading fails part-way.
    with open(file_path, encoding='utf-8') as text_file:
        whole_texts = ["<s> " + line.strip() + " </s>" for line in text_file]

    tokenizer.fit_on_texts(whole_texts)

    return tokenizer.texts_to_sequences(whole_texts), tokenizer

def load_dataset(en_path='./data/train.en', ja_path='./data/train.ja', test_size=0.02):
    """Load the parallel corpus, pad it, and split it into train and test parts.

    Args:
        en_path: Path to the source-language file (one sentence per line).
        ja_path: Path to the target-language file, line-aligned with
            ``en_path``.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(train_X, test_X, train_Y, test_Y, tokenizer_en, tokenizer_ja)``
        where the X/Y arrays are post-padded token-id matrices.
    """
    train_X, tokenizer_en = load_texts(en_path)
    train_Y, tokenizer_ja = load_texts(ja_path)

    # Pad before splitting so the train and test rows share one width.
    train_X = pad_sequences(train_X, padding='post')
    train_Y = pad_sequences(train_Y, padding='post')

    train_X, test_X, train_Y, test_Y = train_test_split(
        train_X, train_Y, test_size=test_size, random_state=seed)

    return train_X, test_X, train_Y, test_Y, tokenizer_en, tokenizer_ja

def compute_bleu(refs, preds):
    """Return the mean sentence-level BLEU over paired references and predictions.

    Args:
        refs: Iterable whose items are the reference argument given to
            ``sentence_bleu`` for one sentence.
        preds: Iterable of hypotheses, aligned with ``refs``.

    Returns:
        ``np.mean`` of the per-sentence scores; pairs are formed with ``zip``.
    """
    sentence_scores = []
    for reference, hypothesis in zip(refs, preds):
        sentence_scores.append(
            sentence_bleu(reference, hypothesis, emulate_multibleu=True))
    return np.mean(sentence_scores)

def score():
    """Train the model, translate the held-out split, and print the mean BLEU.

    The predicted id sequences are turned back into words with the inverse
    of the target tokenizer's ``word_index``. The leading start token is
    always removed; the trailing token is removed only when it is the end
    token, so a translation cut off by the length cap keeps its last word.
    Predicted ids that have no entry in the vocabulary (such as the padding
    index 0) are skipped instead of raising ``KeyError``.
    """
    train_X, test_X, train_Y, test_Y, tokenizer_en, tokenizer_ja = load_dataset()
    decode_sequence = Attention(train_X, train_Y, tokenizer_en, tokenizer_ja)

    bos_eos = tokenizer_ja.texts_to_sequences(["<s>", "</s>"])
    eos_ids = bos_eos[1]
    # Each source row gets a leading batch axis before being decoded.
    output = [decode_sequence(source[np.newaxis, :], bos_eos, 1000) for source in test_X]

    # Inverse vocabulary: token id -> word.
    detokenizer_ja = {index: word for word, index in tokenizer_ja.word_index.items()}

    preds = []
    for decoded in output:
        tokens = decoded[1:]  # drop the leading "<s>"
        if tokens[-1:] == eos_ids:
            tokens = tokens[:-1]  # drop "</s>" only when decoding actually ended on it
        preds.append([detokenizer_ja[i] for i in tokens if i in detokenizer_ja])

    refs = []
    for target in test_Y:
        pad_count = np.count_nonzero(target == 0)
        # Strip the leading "<s>" and the trailing "</s>" plus padding zeros.
        words = [detokenizer_ja[i] for i in target[1:-(pad_count + 1)]]
        # sentence_bleu takes a list of references per hypothesis.
        refs.append([words])

    print(compute_bleu(refs, preds))
    
# Run the full train-and-evaluate pipeline only when executed as a script.
if __name__ == '__main__':
    score()

Using TensorFlow backend.


Train on 39200 samples, validate on 9800 samples
Epoch 1/15
 - 101s - loss: 2.9512 - val_loss: 2.3274
Epoch 2/15
 - 97s - loss: 2.1430 - val_loss: 2.0275
Epoch 3/15
 - 97s - loss: 1.8982 - val_loss: 1.8367
Epoch 4/15
 - 97s - loss: 1.7110 - val_loss: 1.7062
Epoch 5/15
 - 96s - loss: 1.5639 - val_loss: 1.5958
Epoch 6/15
 - 96s - loss: 1.4429 - val_loss: 1.5098
Epoch 7/15
 - 96s - loss: 1.3397 - val_loss: 1.4272
Epoch 8/15
 - 96s - loss: 1.2489 - val_loss: 1.3750
Epoch 9/15
 - 96s - loss: 1.1679 - val_loss: 1.3236
Epoch 10/15
 - 96s - loss: 1.0955 - val_loss: 1.2900
Epoch 11/15
 - 96s - loss: 1.0285 - val_loss: 1.2659
Epoch 12/15
 - 96s - loss: 0.9680 - val_loss: 1.2385
Epoch 13/15
 - 96s - loss: 0.9128 - val_loss: 1.2282
Epoch 14/15
 - 96s - loss: 0.8634 - val_loss: 1.2130
Epoch 15/15
 - 96s - loss: 0.8172 - val_loss: 1.2111
Train on 39200 samples, validate on 9800 samples
Epoch 1/10
 - 97s - loss: 0.7386 - val_loss: 1.1910
Epoch 2/10
