In [None]:
import numpy as np
import tensorflow as tf
import random as rn
import re
import json

In [None]:
# --- Experiment configuration -------------------------------------------
# Number of samples per training/validation batch.
batch_size = 64
# Number of trailing lines of the data file that are used (a whole number of batches).
num_data = batch_size * 50
# Number of passes over the training data.
epochs = 100
# Units of each encoder LSTM direction; the decoder uses twice this size.
latent_dim = 128

# Data location and the name under which the trained model is saved.
language_tag = 'en'
data_path = 'data/{}/original.txt'.format(language_tag)
model_name = 'nmt'

# Fraction of the samples held out for validation.
validation_split = 0.2

# Punctuation marks: appended to the target vocabulary and used as token delimiters.
punctuation = list('!?.,')
# Start ('~') and end ('^') markers wrapped around every target sequence.
sentinels = list('~^')

In [None]:

def encode_target(text):
    """Wrap a target string in the start ('~') and end ('^') sentinel characters."""
    return ''.join(('~', text, '^'))


def find_max_seq_data_len(data):
    """Return the longest source and target lengths found in ``data``.

    Each sample is a ``source<TAB>target`` string. The source length is the raw
    character count; the target length is measured after ``encode_target`` has
    added the sentinel characters.

    :param data: iterable of tab-separated sample strings
    :return: tuple ``(max_source_seq_len, max_target_seq_len)``; ``(0, 0)`` for empty input
    """
    longest_source = 0
    longest_target = 0

    for line in data:
        src, tgt = line.split('\t')
        longest_source = max(longest_source, len(src))
        longest_target = max(longest_target, len(encode_target(tgt)))

    return longest_source, longest_target


def split_data(data, split_fraction=None):
    """Split ``data`` into a leading training part and a trailing validation part.

    :param data: sequence of samples (anything that supports ``len`` and slicing)
    :param split_fraction: fraction of the samples to hold out for validation;
        when ``None`` the module-level ``validation_split`` is used
    :return: tuple ``(data_train, data_validation)``
    """
    fraction = validation_split if split_fraction is None else split_fraction

    # Clamp so that a fraction outside [0, 1] cannot produce a bogus slice index.
    num_validation = min(max(int(fraction * len(data)), 0), len(data))

    # Slice from an explicit index: ``data[-0:]`` would be the WHOLE sequence,
    # which used to put every sample into validation (and none into training)
    # whenever the validation share rounded down to zero.
    split_at = len(data) - num_validation
    return data[:split_at], data[split_at:]


def get_voc_from_data(data):
    """Build the source and target vocabularies from the source side of ``data``.

    :param data: iterable of ``source<TAB>target`` sample strings
    :return: tuple ``(source_voc, target_voc)`` where ``source_voc`` is the sorted
        list of distinct characters seen in the source texts and ``target_voc`` is
        ``source_voc`` followed by the module-level ``sentinels`` and ``punctuation``
    """
    # A set gives constant-time de-duplication; the original scanned a list for
    # every character and used a comprehension purely for its side effects.
    symbols = set()
    for sample in data:
        source, _ = sample.split('\t')
        symbols.update(source)

    source_voc = sorted(symbols)
    return source_voc, (source_voc + sentinels + punctuation)


def split_with_keep_delimiters(string, delimiters):
    """Split ``string`` on any of ``delimiters``, keeping the delimiters in the result.

    The delimiters are regex-escaped and placed in a single capturing group, so
    ``re.split`` returns them as separate list items between the text pieces
    (empty strings appear where a delimiter sits at an edge or next to another).
    """
    escaped = [re.escape(delimiter) for delimiter in delimiters]
    pattern = '(' + '|'.join(escaped) + ')'
    return re.split(pattern, string)


def get_bi_grams(paired_data, freq):
    """Collect the frequent word bi-grams found in the target side of ``paired_data``.

    Every target text is split on whitespace, and each piece is further split
    around the module-level ``punctuation`` marks (the marks become tokens of
    their own). Adjacent token pairs are counted unless either token is a
    punctuation mark.

    :param paired_data: iterable of ``source<TAB>target`` sample strings
    :param freq: only bi-grams seen strictly more than ``freq`` times are kept
    :return: list of ``"first second"`` strings ordered by descending count
        (ties keep first-seen order)
    """
    bi_gram_statistics = dict()
    for sample in paired_data:
        # Only the target half of the pair is analysed.
        target_text = sample.split('\t')[1]

        # Compare by value: the original ``token is not ''`` tested object identity
        # against a literal, which only works by accident of string interning and
        # triggers a SyntaxWarning on Python 3.8+.
        tokens = [
            token
            for word in target_text.split()
            for token in split_with_keep_delimiters(word, punctuation)
            if token != ''
        ]

        for first, second in zip(tokens, tokens[1:]):
            if first in punctuation or second in punctuation:
                continue
            bi_gram = first + ' ' + second
            bi_gram_statistics[bi_gram] = bi_gram_statistics.get(bi_gram, 0) + 1

    frequent = [(bi_gram, count) for bi_gram, count in bi_gram_statistics.items() if count > freq]
    # Stable sort: equally frequent bi-grams stay in first-seen order.
    frequent.sort(key=lambda item: item[1], reverse=True)

    return [bi_gram for bi_gram, _ in frequent]


# Load the corpus: one "source<TAB>target" sample per line, keeping only the
# last `num_data` samples.
with open(data_path, 'r', encoding='utf-8') as f:
    # Drop blank lines first (a trailing newline leaves an empty last item):
    # they cannot be unpacked into a (source, target) pair by the helpers below
    # and would abort the run with a ValueError.
    data = [line for line in f.read().split('\n') if line.strip()][-num_data:]


# Sequence lengths are measured over ALL samples so that both the training and
# the validation suppliers can encode every sample they receive.
max_source_seq_len, max_target_seq_len = find_max_seq_data_len(data)
# Shuffle before splitting so the validation tail is a random subset.
rn.shuffle(data)
source_voc, target_voc = get_voc_from_data(data)
data_train, data_valid = split_data(data)
print('len(data):', len(data), 'max_source_seq_len:', max_source_seq_len, 'max_target_seq_len:', max_target_seq_len)
print('source_voc:', source_voc)
print('target_voc:', target_voc)
# print('len(source_voc):', len(source_voc))
# print('len(target_voc):', len(target_voc))

In [None]:

class DataSupplier(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves one-hot encoded batches for the seq2seq model.

    Every element of ``data`` is a ``source<TAB>target`` string. A batch is
    returned as ``([encoder_input, decoder_input], decoder_target)`` where each
    array is one-hot encoded per character and zero-padded along the time axis.
    """

    def __init__(self, batch_size, max_source_seq_len, max_target_seq_len, data, source_voc, target_voc):
        """Store the configuration and shuffle a private copy of ``data``.

        :param batch_size: number of samples per batch
        :param max_source_seq_len: time-axis length of the encoder input array
        :param max_target_seq_len: time-axis length of the decoder input/target arrays
        :param data: sequence of ``source<TAB>target`` sample strings
        :param source_voc: list of source symbols; a symbol's list position is its one-hot index
        :param target_voc: list of target symbols; a symbol's list position is its one-hot index
        """
        # Newer Keras releases expect Sequence/PyDataset subclasses to run the base
        # constructor; on older releases this is a harmless no-op.
        super().__init__()
        self.batch_size = batch_size
        # Work on a private copy so that shuffling never reorders the caller's list.
        self.data = list(data)
        self.source_voc = source_voc
        self.target_voc = target_voc
        self.max_source_seq_len = max_source_seq_len
        self.max_target_seq_len = max_target_seq_len
        rn.shuffle(self.data)

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is not counted)."""
        return len(self.data) // self.batch_size

    def __getitem__(self, ndx):
        """Return batch number ``ndx`` as ``([encoder_input, decoder_input], decoder_target)``."""
        source, target = self.extract_batch(ndx, self.batch_size, self.data)
        return self.encode_data(source, target)

    def on_epoch_end(self):
        """Reshuffle the samples so the next epoch sees different batches."""
        rn.shuffle(self.data)

    # secondary auxiliary methods
    def encode_data(self, source, target):
        """One-hot encode parallel lists of source and target strings.

        :param source: list of source strings
        :param target: list of target strings (already wrapped by ``encode_target``)
        :return: ``([encoder_input_data, decoder_input_data], decoder_target_data)`` with shapes
            ``(len(source), max_source_seq_len, len(source_voc))`` for the encoder input and
            ``(len(target), max_target_seq_len, len(target_voc))`` for both decoder arrays
        :raises ValueError: if a symbol is missing from the corresponding vocabulary
        :raises IndexError: if a text is longer than the configured maximum length
        """
        encoder_input_data = np.zeros(
            (len(source), self.max_source_seq_len, len(self.source_voc)), dtype="float32"
        )
        decoder_input_data = np.zeros(
            (len(target), self.max_target_seq_len, len(self.target_voc)), dtype="float32"
        )
        decoder_target_data = np.zeros(
            (len(target), self.max_target_seq_len, len(self.target_voc)), dtype="float32"
        )

        for i, (source_text, target_text) in enumerate(zip(source, target)):
            for t, symbol in enumerate(source_text):
                encoder_input_data[i, t, self.source_voc.index(symbol)] = 1.

            for t, symbol in enumerate(target_text):
                symbol_ndx = self.target_voc.index(symbol)
                decoder_input_data[i, t, symbol_ndx] = 1.
                if t > 0:
                    # The target is the input shifted one step to the left: at step
                    # t - 1 the decoder has to predict the symbol at step t.
                    decoder_target_data[i, t - 1, symbol_ndx] = 1.

        return [encoder_input_data, decoder_input_data], decoder_target_data

    @staticmethod
    def append_sample(sample, source, target):
        """Split one ``source<TAB>target`` sample and append its halves to the given lists.

        The target half is wrapped by ``encode_target`` before it is appended.

        :return: the same ``(source, target)`` lists, for convenience
        """
        source_item, target_item = sample.split('\t')
        source.append(source_item)
        target.append(encode_target(target_item))
        return source, target

    def extract_batch(self, ndx, batch_size, data):
        """Collect the source and target strings of batch number ``ndx``.

        When the batch is the trailing partial one, it is topped up with randomly
        chosen samples from the preceding part of ``data``.

        :param ndx: zero-based batch index
        :param batch_size: number of samples per batch
        :param data: sequence of ``source<TAB>target`` sample strings
        :return: tuple ``(source, target)`` of parallel string lists
        """
        source = []
        target = []
        ndx_from = ndx * batch_size
        ndx_to = min(ndx_from + batch_size, len(data))

        for sample in data[ndx_from:ndx_to]:
            source, target = self.append_sample(sample, source, target)

        if ndx_to % batch_size != 0:
            pool = data[:ndx_from]
            num_missing = batch_size - len(data) % batch_size
            # random.sample raises ValueError when asked for more items than the pool
            # holds (e.g. a data set smaller than one batch), so cap the request; the
            # batch is then simply shorter than ``batch_size``.
            for sample in rn.sample(pool, min(num_missing, len(pool))):
                source, target = self.append_sample(sample, source, target)

        return source, target


In [None]:
# Encoder-decoder model with attention, built with the Keras functional API.

# Encoder input: variable-length sequences with one feature per source vocabulary symbol.
encoder_inputs = tf.keras.Input(shape=(None, len(source_voc)))
bidirectional = tf.keras.layers.Bidirectional
# Bidirectional LSTM encoder that returns both the per-step outputs and the final states.
encoder = bidirectional(
    tf.keras.layers.LSTM(
        latent_dim,
        return_sequences=True,
        return_state=True
    )
)
# Five outputs: the per-step output sequence, then the final hidden (h) and cell (c)
# states of the forward and of the backward LSTM.
encoder_stack_h, forward_last_h, forward_last_c, backward_last_h, backward_last_c = encoder(encoder_inputs)

# Join the forward and backward final states into one h and one c state of size 2 * latent_dim.
encoder_last_h = tf.keras.layers.Concatenate()([forward_last_h, backward_last_h])
encoder_last_c = tf.keras.layers.Concatenate()([forward_last_c, backward_last_c])

# Initial state handed to the decoder.
encoder_states = [encoder_last_h, encoder_last_c]

# Decoder input: variable-length sequences with one feature per target vocabulary symbol.
decoder_inputs = tf.keras.Input(shape=(None, len(target_voc)))

# The decoder uses latent_dim*2 units so that its state size matches the concatenated encoder states.
decoder = tf.keras.layers.LSTM(latent_dim*2, return_sequences=True, return_state=True)
# Only the per-step outputs are used here; the decoder's final states are discarded.
decoder_stack_h, _, _ = decoder(decoder_inputs, initial_state=encoder_states)

# Attention: the decoder outputs are the query, the encoder outputs are the value.
context = tf.keras.layers.Attention()([decoder_stack_h, encoder_stack_h])
# Each decoder step is represented by its attention context joined with its own output.
decoder_concat_input = tf.keras.layers.concatenate([context, decoder_stack_h])

# Per-step softmax over the target vocabulary.
dense = tf.keras.layers.Dense(len(target_voc), activation='softmax')
# NOTE: from here on `decoder_stack_h` is rebound to the softmax output, not the raw LSTM output.
decoder_stack_h = tf.keras.layers.TimeDistributed(dense)(decoder_concat_input)

# Training model: maps [source sequence, decoder input sequence] to per-step symbol probabilities.
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_stack_h)

In [None]:
# Configure training: per-step classification over the target vocabulary.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Batch suppliers for the training and validation subsets. Both use the same
# sequence lengths and vocabularies, so their arrays have matching shapes.
train_supplier = DataSupplier(
    batch_size=batch_size,
    max_source_seq_len=max_source_seq_len,
    max_target_seq_len=max_target_seq_len,
    data=data_train,
    source_voc=source_voc,
    target_voc=target_voc,
)

valid_supplier = DataSupplier(
    batch_size=batch_size,
    max_source_seq_len=max_source_seq_len,
    max_target_seq_len=max_target_seq_len,
    data=data_valid,
    source_voc=source_voc,
    target_voc=target_voc,
)

# Early stopping is currently disabled; to enable it, create the callback below
# and pass `callbacks=[es]` to `model.fit`.
# es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.fit(
    train_supplier,
    validation_data=valid_supplier,
    epochs=epochs,
    shuffle=True,
)

# Persist the trained model in HDF5 format.
model.save("models/" + model_name + ".h5")