In [None]:
### Reference tutorial: https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/

In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input, Add, Activation, Embedding, Reshape
from keras.layers import Input
from keras.callbacks import ModelCheckpoint

In [None]:
# Helper Functions

def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as handle:
        return load(handle)

In [None]:
# Common prefix of the pickle files; the '-both', '-train' and '-test'
# suffixes plus the '.pkl' extension are appended below.
filename = "phoenix"

dataset = load_clean_sentences(f"{filename}-both.pkl")
train = load_clean_sentences(f"{filename}-train.pkl")
test = load_clean_sentences(f"{filename}-test.pkl")

In [None]:
# Encoding Functions

# Constructs the Tokenizer for each language in the Translation
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Determines the Longest Sentences contained in the Dataset
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(line.split())`` over all lines, or ``0`` when
        ``lines`` is empty.
    """
    # default=0 keeps an empty collection from raising ValueError in max().
    return max((len(line.split()) for line in lines), default=0)

# Encodes and Pad word sequences to fill the maximum length
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer sequences and pad them to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences`` (a fitted
            tokenizer in this notebook).
        length: Value passed as ``maxlen`` to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

# One-hot Encode the output sequences
def encode_output(sequences, vocab_size):
    """One-hot encode every row of ``sequences``.

    Args:
        sequences: 2-D array of integer ids; its first two ``shape`` entries
            define the leading dimensions of the result.
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [
        to_categorical(row, num_classes=vocab_size) for row in sequences
    ]
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return array(one_hot_rows).reshape(n_samples, n_steps, vocab_size)

In [None]:
# prepare english tokenizer (column 0 of the dataset)
eng_sentences = dataset[:, 0]
eng_tokenizer = create_tokenizer(eng_sentences)
eng_length = max_length(eng_sentences)
# One more than the number of known words.
# NOTE(review): presumably index 0 is reserved for padding - confirm.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# prepare german tokenizer (column 1 of the dataset)
ger_sentences = dataset[:, 1]
ger_tokenizer = create_tokenizer(ger_sentences)
ger_length = max_length(ger_sentences)
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

In [None]:
# Model inputs: padded integer sequences from column 1 (German tokenizer).
# Model targets: column 0 (English tokenizer), padded and then one-hot encoded.
x_train = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
y_train = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

x_test = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
y_test = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [None]:
# NMT Model

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence ``Sequential`` model.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of units of the softmax output layer.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: Repeat count for every ``RepeatVector`` layer.
        n_units: Embedding size and number of units of every LSTM layer.

    Returns:
        The ``Sequential`` model; compiling it is left to the caller.
    """
    layer_stack = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # Three LSTM -> RepeatVector stages: each LSTM emits a single vector
        # which is then repeated tar_timesteps times for the next layer.
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        # The last LSTM keeps one output per timestep for the per-step softmax.
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layer_stack)

In [None]:
model = define_model(
    ger_vocab_size,
    eng_vocab_size,
    ger_length,
    eng_length,
    1000,  # n_units
)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# fit model
# NOTE(review): `filename` is rebound here from the dataset prefix to the
# checkpoint path; kept as-is because later cells may read it.
filename = 'model3.h5'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
# NOTE(review): the test split doubles as validation data, so the saved
# "best" checkpoint is selected on the test set - confirm this is intended.
model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=1,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint],
    verbose=2,
)