In [4]:
from keras.models import Model
from keras.layers import LSTM, Dense, Input, Embedding, TimeDistributed, Masking
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from collections import Counter
import nltk
import numpy as np
import pandas as pd
import re
import json
import MeCab

In [2]:
np.random.seed(42)

In [5]:
tagger = MeCab.Tagger('-Owakati')

def ja_tokenizer(text):
    result = tagger.parse(text)
    words = result.split()
    if len(words) ==0: return []
    if words[-1] == '\n':return words[:-1]
    return words

In [6]:
def make_corpus(corpus_path):
    corpus = open(corpus_path,'r').readlines()
    en,ja = [],[]
    pat = r'#ID.+\n'
    for c in corpus:
        if 'A: ' in c:
            clean_c = c.replace('A: ','')
            res = re.search(pat,clean_c)
            clean_c = clean_c.replace(res.group(0),'').split('\t')
            ja.append(clean_c[0])
            en.append(clean_c[1])
    with open('en.txt','w') as f:
        for t in en:
            f.write(t+'\n')
    with open('ja.txt','w') as f:
        for t in en:
            f.write(t+'\n')
    return en,ja

In [7]:
make_corpus('../data/examples.utf')

(['Muiriel is 20 now.',
  'I will be back soon.',
  'I may give up soon and just nap instead.',
  'I love you.',
  "I shouldn't have logged off.",
  'Everyone has both strong and weak points.',
  '"What\'s the matter?" asked the little white rabbit.',
  '"Trust me," he said.',
  '"This is what I was looking for!" he exclaimed.',
  '"This looks pretty interesting," Hiroshi says.',
  'Their communication may be much more complex than we thought.',
  "Someday I'll run like the wind.",
  "Please don't cry.",
  'It may be that the happiness awaiting us is not at all the sort of happiness we would want.',
  'It is up to you to decide whether we will go there or not.',
  "Class doesn't begin until eight-thirty.",
  'A Japanese person would never do such a thing.',
  'How do you spell "pretty"?',
  "Why don't we go home?",
  "I'm sorry, I can't stay long.",
  'Ten years is a long time to wait.',
  'One million people lost their lives in the war.',
  'I came to Tokyo three years ago and have be

In [15]:
BATCH_SIZE = 32
NUM_EPOCHS = 40
HIDDEN_UNITS = 128
MAX_INPUT_SEQ_LENGTH = 17
MAX_TARGET_SEQ_LENGTH = 24
MAX_VOCAB_SIZE = 30000
EN = '../data/en.txt'
JA = '../data/ja.txt'

In [16]:
input_counter = Counter()
target_counter = Counter()

In [19]:
# loading data
with open(EN, 'r', encoding='utf8') as f:
    en = f.read().split('\n')
with open(JA, 'r', encoding='utf8') as f:
    ja = f.read().split('\n')

In [21]:
input_texts = []
target_texts = []

In [22]:
prev_words = []
#for line in questions:
for line in en:
    next_words = [w.lower() for w in nltk.word_tokenize(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1

    prev_words = next_words

In [23]:
prev_words = []
#for line in answers:
for line in ja:
    next_words = [w for w in ja_tokenizer(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        target_words = next_words[:]
        target_words.insert(0, '<SOS>')
        target_words.append('<EOS>')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)

    prev_words = next_words

In [24]:
input_word2idx = {}
target_word2idx = {}
for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1
    

In [25]:
input_word2idx['<PAD>'] = 0
input_word2idx['<UNK>'] = 1
target_word2idx['<UNK>'] = 0

In [26]:
input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

In [27]:
encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

In [30]:
num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)

In [28]:
lens = []
for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        w2idx = 1  # default [UNK]
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)
    lens.append(len(encoder_input_wids))
    encoder_input_data.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)
#print(sum(lens)/len(lens))

In [31]:
context = dict()
context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

In [80]:
def generate_batch(input_data, output_text_data):
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length, padding='post')
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0  # default [UNK]
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [33]:
# embedding matrx
embeddings_index = {}
f = open('../data/wiki-news-300d-1M.vec')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [36]:
count = 0
embedding_matrix = np.zeros((len(input_word2idx) + 1, 300))
for word, index in input_word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
    else:
        count += 1
print("Unknown: ", count) 

Unknown:  1360


In [37]:
# embedding layer
em_sz = 300
encoder_embedding = Embedding(len(input_word2idx) + 1,
                            em_sz,
                            weights=[embedding_matrix],
                            input_length=encoder_max_seq_length,
                            trainable=True, name='encoder_embedding') 

In [38]:
# embedding matrx
embeddings_index = {}
f = open('../data/cc.ja.300.vec')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [60]:
count = 0
embedding_matrix = np.zeros((len(target_word2idx) + 1, 300))
for word, index in target_word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
    else:
        count += 1
print("Unknown: ", count) 

Unknown:  3951


In [86]:
# embedding layer
em_sz = 300
decoder_embedding = Embedding(len(target_word2idx) + 1,
                            em_sz,
                            weights=[embedding_matrix],
                            input_length=decoder_max_seq_length,
                            trainable=True, name='decoder_embedding')

In [62]:
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]
print('encoder',encoder_outputs)

encoder Tensor("encoder_lstm_2/TensorArrayReadV3:0", shape=(?, 128), dtype=float32)


In [88]:
decoder_inputs = Input(shape=(None,num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)
print('decoder',decoder_outputs)

decoder Tensor("decoder_dense_4/truediv:0", shape=(?, ?, 24301), dtype=float32)


In [89]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [90]:
optimizer = Adam(lr=5e-3)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, target_texts, test_size=0.1, random_state=42)

In [92]:
print(len(X_train))
print(len(X_test))

134807
14979


In [93]:
train_gen = generate_batch(X_train, y_train)
test_gen = generate_batch(X_test, y_test)

In [94]:
train_num_batches = len(X_train) // BATCH_SIZE
test_num_batches = len(X_test) // BATCH_SIZE

In [95]:
checkpoint = ModelCheckpoint(filepath='../data/keras-seq2seq.h5', save_best_only=True)
model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                    epochs=20,use_multiprocessing=True,
                    verbose=1, validation_data=test_gen, validation_steps=test_num_batches, callbacks=[checkpoint])

Epoch 1/20
 160/4212 [>.............................] - ETA: 2:38:26 - loss: 2.5884

Process ForkPoolWorker-9:
Process ForkPoolWorker-10:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
  File "/home/paperspace/anaconda3/lib/python3.6/multiprocessing/queues.py", line 341, in put
    obj = _ForkingPickler.dumps(obj)




KeyboardInterrupt: 

In [None]:
from keras.layers import Activation, dot, concatenate

In [None]:
encoder_max_seq_length = decoder_max_seq_length = 27

In [None]:
encoder_inputs = Input(shape=(encoder_max_seq_length,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length,name='encoder_embedding')
#use embedding layer from before
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True,return_state=True,name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]
print('encoder', encoder_outputs)

In [None]:
decoder_inputs = Input(shape=(decoder_max_seq_length,num_decoder_tokens), name='decoder_inputs')
decoder = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c= decoder(decoder_inputs, initial_state=encoder_states)

print('decoder', decoder_outputs)
print('decoder c', decoder_state_c)
print('decoder h', decoder_state_h)

Upto here its mostly the same as regular seq2seq. You'll notice the encoder input shape (`encoder inputs`) has changed. 

Previously, we were not interested in the output generated by the encoder, only the hidden state (as a representation of the encoding information). This time for our encoder LSTM we add the parameter `return sequences=True` to the initialization to get all the outputs back instead of just the most recent one. It will become a little clear why in the next step. 

### Attention Layer
We look at the decoder's output, and also use "context” information. This “context” is computed by “looking back” into the outputs generated by the encoder and scoring them based on their relevancy to the decoder outputs. The context information (scores) is concatenated with the decoder outputs.

Since in this implementation we use the dot product scoring mechanism, the encoder outputs with the decoder outputs need to have the same size. 

In [None]:
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, decoder_outputs], axes=[2,1])
print('context', context)

decoder_combined_context = concatenate([context, decoder_outputs])
print('decoder_combined_context', decoder_combined_context)

output = TimeDistributed(Dense(HIDDEN_UNITS, activation="tanh"))(decoder_combined_context)
print('output', output)
output = TimeDistributed(Dense(num_decoder_tokens, activation="softmax"))(output)
print('output', output)

In [None]:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[output])
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[perplexity])
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                    epochs=5,
                    verbose=1, validation_data=test_gen, validation_steps=test_num_batches)