In [4]:
import random
import os
import keras as k
import numpy as np
import re

In [5]:
# Not referenced by the visible cells; presumably sample counts per data
# distribution -- TODO confirm against the data-generation code.
DISTRIBUTION_DATA_COUNT = [20 * 1000, 40 * 1000, 60 * 1000]
# Punctuation characters stripped from a sentence before it is encoded as encoder input.
VOCABULARY_PUNCTUATION = ['!', '?', '.', ',']
# File-name suffixes of the data files loaded by get_data (eng_<suffix>.txt).
DATA_SUFFIXES = ['one', 'two', 'three']
# Start ('^') and stop ('~') tokens wrapped around every target sentence.
SENTINELS = ['^', '~']

# File the trained model is saved to and, when it already exists, restored from.
MODEL_NAME = 'seq2seq_with_attention.h5'
# Not referenced by the visible cells; presumably the raw corpus -- TODO confirm.
DATA_NAME = 'data/original.txt'

# Not referenced by the visible cells (the decoder-input shift of one position is hard-coded).
SHIFTED_SEQ_COUNT = 1
# Number of LSTM units passed to create_model.
LATENT_DIMENSIONS = 128
# Samples per batch produced by DataSupplier; also used to derive the step counts.
BATCH_SIZE = 64
# Number of training epochs.
EPOCHS = 100
# Time dimension of every batch array, and the length cap used by decode_sequence.
MAX_SEQUENCE = 100
# Absolute tolerance used by the linear_regression_equality metric.
ACCEPTED_DIFF = .01

In [6]:
def get_lines(path, formatted):
    """Read a UTF-8 text file and return its lines, each passed through ``formatted``.

    :param path: path of the text file to read.
    :param formatted: callable applied to every raw line (the line terminator,
        if any, is still attached when it is called).
    :return: list with one formatted entry per line, in file order.
    """
    with open(path, "r", encoding='utf-8') as file:
        # Iterate the file object directly: no intermediate readlines() list and
        # no comprehension executed only for its append side effect.
        return [formatted(line) for line in file]


def split_with_keep_delimiters(string, delimiters):
    """Split ``string`` on any of ``delimiters`` and keep the delimiters in the result.

    The delimiters are regex-escaped and joined into one capturing group, so
    ``re.split`` returns them as separate list items between the text pieces.
    """
    alternatives = '|'.join(re.escape(delimiter) for delimiter in delimiters)
    pattern = '(' + alternatives + ')'
    return re.split(pattern, string)


def seq_to_text(encoded_input, voc):
    """Turn the first sample of a batch of per-step score vectors into text.

    For every time step of ``encoded_input[0]`` the entry of ``voc`` at the
    arg-max position is taken; the entries are concatenated without a separator.
    """
    pieces = []
    for step in range(len(encoded_input[0])):
        best_index = np.argmax(encoded_input[0, step, :])
        pieces.append(voc[best_index])
    return ''.join(pieces)


def punctuation_translate(x):
    """Return ``x`` with every character listed in VOCABULARY_PUNCTUATION removed."""
    # Mapping a code point to None makes str.translate delete that character.
    removal_table = dict.fromkeys(map(ord, VOCABULARY_PUNCTUATION))
    return x.translate(removal_table)


def find_max_sequence(samples):
    """Return the length of the longest item in ``samples``.

    Raises ValueError when ``samples`` is empty.
    """
    lengths = list(map(len, samples))
    return max(lengths)


def decode_sequence(input_seq, encoder_model, decoder_model, vocabulary, vocabulary_len):
    """Greedily decode one input sequence, one token per decoder call.

    :param input_seq: input handed to ``encoder_model.predict``.
    :param encoder_model: model whose prediction is used as the list of initial decoder states.
    :param decoder_model: model whose ``predict`` takes ``[target_seq] + states`` and
        returns ``(output_tokens, h, c)``.
    :param vocabulary: list of tokens; a token's list position is its one-hot index.
    :param vocabulary_len: size of the one-hot axis of the decoder input.
    :return: the concatenated sampled tokens, including the stop sentinel when it was produced.
    """

    def one_hot(token_index):
        # Single-sample, single-step decoder input with exactly one position switched on.
        step = np.zeros((1, 1, vocabulary_len))
        step[0, 0, token_index] = 1.
        return step

    # The encoder prediction seeds the decoder state.
    states_value = encoder_model.predict(input_seq)

    # Decoding starts from the start sentinel.
    target_seq = one_hot(vocabulary.index(SENTINELS[0]))

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the highest-scoring token of the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = vocabulary[sampled_token_index]
        decoded_sentence += sampled_char

        # The sampled token and the returned states become the next decoder input.
        target_seq = one_hot(sampled_token_index)
        states_value = [h, c]

        # Finish on the stop sentinel or once the text exceeds the length cap.
        if sampled_char == SENTINELS[1] or len(decoded_sentence) > MAX_SEQUENCE:
            return decoded_sentence


def linear_regression_equality(y_true, y_pred):
    """Metric: fraction of elements whose absolute error is below ACCEPTED_DIFF."""
    backend = k.backend
    within_tolerance = backend.abs(y_true - y_pred) < ACCEPTED_DIFF
    # Cast the boolean mask to floats so that its mean is the hit ratio.
    return backend.mean(backend.cast(within_tolerance, 'float32'))


def get_vocabulary(data):
    """Collect every distinct whitespace-separated token of ``data`` plus the sentinels.

    :param data: mapping of cluster name to a list of sentences (strings).
    :return: tuple ``(voc, voc_size)``; ``voc`` is the sorted list of unique tokens
        including the sentinels, ``voc_size`` is its length.
    """
    # Start from a fresh set instead of binding the module-level SENTINELS list:
    # appending to that list would add every vocabulary token to SENTINELS and
    # break later ``token in SENTINELS`` checks. A set also keeps the uniqueness
    # test constant-time.
    tokens = set(SENTINELS)
    for sentences in data.values():
        for sentence in sentences:
            tokens.update(sentence.split())

    voc = sorted(tokens)
    return voc, len(voc)


def split_data(data, coefficient):
    """Split every cluster of ``data`` into a training head and a validation tail.

    For each cluster the last ``int(len(cluster) * coefficient // len(data))``
    items go to validation and the rest to training. Both result lists are
    shuffled in place with ``random.shuffle`` before being returned.

    :param data: mapping of cluster name to a list of samples.
    :param coefficient: validation share; note that it is additionally divided
        by the number of clusters.
    :return: tuple ``(train, validation)`` of shuffled lists.
    """
    validation = list()
    train = list()

    for cluster in data.values():
        cluster_len = int(len(cluster) * coefficient // len(data))
        # Keep the validation size inside [0, len(cluster)].
        cluster_len = min(max(cluster_len, 0), len(cluster))

        # Use an explicit split index: with cluster_len == 0 the slice
        # cluster[-0:] is the WHOLE cluster, which would put every sample into
        # both the training and the validation set.
        split_at = len(cluster) - cluster_len
        validation.extend(cluster[split_at:])
        train.extend(cluster[:split_at])

    random.shuffle(validation)
    random.shuffle(train)
    return train, validation


def calculate_steps(train, validation):
    """Return how many full BATCH_SIZE batches fit into each sample list.

    :return: tuple ``(steps_per_epoch, validation_steps)``.
    """
    def full_batches(samples):
        return int(len(samples) // BATCH_SIZE)

    return full_batches(train), full_batches(validation)

In [7]:
class DataSupplier(k.utils.Sequence):
    """Keras ``Sequence`` that turns sentences into one-hot encoded training batches.

    Every batch consists of three float32 arrays of shape
    ``(batch_size, MAX_SEQUENCE, voc_size)``:

    * encoder input  -- the sentence with punctuation removed;
    * decoder input  -- the sentence wrapped in the sentinels, shifted one step to the right;
    * decoder output -- the sentence wrapped in the sentinels, not shifted.
    """

    def __init__(self, batch_size, sentences, voc, voc_size):
        """Store the data set description and draw the first sample order.

        :param batch_size: number of sentences per batch.
        :param sentences: list of sentences made of whitespace-separated tokens.
        :param voc: list of tokens; a token's list position is its one-hot index.
        :param voc_size: size of the one-hot axis.
        """
        self.batch_size = batch_size
        self.sentences = sentences

        self.voc_size = voc_size
        self.voc = voc

        self.on_epoch_end()

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        batches = np.floor(len(self.sentences) / self.batch_size)
        return int(batches)

    def __getitem__(self, index):
        """Build batch number ``index`` from the current sample order."""
        first = index * self.batch_size
        last = first + self.batch_size
        return self.__data_generation(self.indexes[first:last])

    def on_epoch_end(self):
        """Draw a new random order of the sentence indexes."""
        self.indexes = np.arange(len(self.sentences))
        np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """One-hot encode the sentences selected by ``indexes``.

        :return: ``([encoded_input, decoded_input], decoded_output)``.
        """
        shape = (self.batch_size, MAX_SEQUENCE, self.voc_size)
        encoded_input = np.zeros(shape, dtype='float32')
        decoded_input = np.zeros(shape, dtype='float32')
        decoded_output = np.zeros(shape, dtype='float32')

        cluster = [self.sentences[i] for i in indexes]

        for row, sentence in enumerate(cluster):
            # Target side: the sentence framed by the start and stop sentinels.
            framed = (SENTINELS[0] + ' ' + sentence + ' ' + SENTINELS[1]).split()
            for position, token in enumerate(framed):
                token_index = self.voc.index(token)
                # The expected output holds the token at its own position ...
                decoded_output[row, position, token_index] = 1.
                # ... and the decoder input holds it one step later.
                decoded_input[row, position + 1, token_index] = 1.

            # Source side: the same sentence without punctuation.
            plain = punctuation_translate(sentence).split()
            for position, token in enumerate(plain):
                encoded_input[row, position, self.voc.index(token)] = 1.

        return [encoded_input, decoded_input], decoded_output

In [None]:
def get_data(count_coefficient, shuffle, split_coefficient):
    """Load the sentence files and build the training and validation suppliers.

    For every suffix in DATA_SUFFIXES the files
    ``data/normalized/eng_<suffix>.txt`` and ``data/incomplete/eng_<suffix>.txt``
    are read, cut down to the first ``count_coefficient`` share of their lines
    and concatenated into one cluster.

    :param count_coefficient: share of each file's lines to keep.
    :param shuffle: when exactly ``True``, every cluster is shuffled in place.
    :param split_coefficient: validation share handed to ``split_data``.
    :return: ``((generator, validation_generator), (steps_per_epoch, validation_steps),
        (voc, voc_size))``.
    """

    def strip_newline(line):
        # Remove only an actual line terminator. Slicing off the last character
        # unconditionally would eat a real character of a final line that does
        # not end with a newline.
        return line[:-1] if line.endswith('\n') else line

    data = dict()
    for suffix in DATA_SUFFIXES:
        normalized_data = get_lines('data/normalized/eng_' + suffix + '.txt', strip_newline)
        incomplete_data = get_lines('data/incomplete/eng_' + suffix + '.txt', strip_newline)

        normalized_data = normalized_data[:int(len(normalized_data) * count_coefficient)]
        incomplete_data = incomplete_data[:int(len(incomplete_data) * count_coefficient)]
        data[suffix] = normalized_data + incomplete_data

    if shuffle is True:
        for cluster in data.values():
            random.shuffle(cluster)

    voc, voc_size = get_vocabulary(data)

    train, validation = split_data(data, split_coefficient)
    validation_generator = DataSupplier(BATCH_SIZE, validation, voc, voc_size)
    generator = DataSupplier(BATCH_SIZE, train, voc, voc_size)
    print('Data', str(len(train)) + '/' + str(len(validation)), voc_size)
    return (generator, validation_generator), calculate_steps(train, validation), (voc, voc_size)


def compile_model(model):
    """Compile ``model`` with the notebook's training settings and return it.

    Uses the Adamax optimizer, categorical cross-entropy loss and the
    ``linear_regression_equality`` metric.
    """
    settings = {
        'optimizer': 'Adamax',
        'loss': 'categorical_crossentropy',
        'metrics': [linear_regression_equality],
    }
    model.compile(**settings)
    return model


def create_model(n_input, n_units):
    """Build and compile the encoder-decoder LSTM model with dot-product attention.

    :param n_input: size of the one-hot axis of both inputs and of the output.
    :param n_units: number of units of both LSTM layers and of the hidden dense layer.
    :return: compiled model taking ``[encoder_input, decoder_input]``.
    """
    # Encoder: keeps the per-step outputs (needed for attention) and the final states.
    encoder_input = k.Input(shape=(None, n_input,))
    encoder = k.layers.LSTM(n_units, return_sequences=True, return_state=True)
    encoder_output, state_h, state_c = encoder(encoder_input)
    encoder_states = [state_h, state_c]

    # Decoder: starts from the encoder's final states and returns every step's output.
    decoder_input = k.Input(shape=(None, n_input,))
    decoder = k.layers.LSTM(n_units, return_sequences=True)
    decoder_output = decoder(decoder_input, initial_state=encoder_states)

    # Earlier output head without attention, left here for reference:
    # decoder_dense = K.layers.Dense(n_input, activation="softmax")
    # output = decoder_dense(decoder_output)

    # Attention: score every decoder step against every encoder step (dot product
    # over the feature axis), normalise the scores with softmax and use them to
    # build a weighted sum of the encoder outputs for each decoder step.
    attention = k.layers.dot([decoder_output, encoder_output], axes=(2, 2))
    attention = k.layers.Activation('softmax', name='attention')(attention)
    context = k.layers.dot([attention, encoder_output], axes=(2, 1))
    # The context vector is appended to the decoder output it belongs to.
    decoder_combined_context = k.layers.concatenate([context, decoder_output])

    # Per-step head: hidden ReLU layer, then a softmax over the one-hot axis.
    output = k.layers.TimeDistributed(k.layers.Dense(n_units, activation="relu"))(decoder_combined_context)
    output = k.layers.TimeDistributed(k.layers.Dense(n_input, activation="softmax"))(output)

    model = k.Model([encoder_input, decoder_input], output)
    model = compile_model(model)
    return model


def restore_model():
    """Load the model stored in MODEL_NAME and compile it with this notebook's settings.

    The file is loaded with ``compile=False`` and compiled by ``compile_model``
    afterwards.
    """
    return compile_model(k.models.load_model(MODEL_NAME, compile=False))


(train_data, validation_data), (steps_per_epoch, validation_steps), (voc, voc_size) = get_data(.15, True, .2)

if os.path.isfile('./' + MODEL_NAME):
    # A trained model exists: show its predictions for the first samples of one
    # validation batch.
    model = restore_model()

    # The batch is the same for every sample, so it is built once instead of
    # once per loop iteration.
    encoded_input, decoded_input = validation_data[0][0]

    # Never index past the end of the batch.
    for sample in range(min(20, len(encoded_input))):
        input_seq = [encoded_input[sample: sample + 1], decoded_input[sample: sample + 1]]
        print(seq_to_text(input_seq[0], voc))
        # The first predicted position is dropped before the text is shown.
        output = seq_to_text(model.predict(input_seq), voc)[1:]

        # Print the prediction up to the first sentinel. The inner loop has its
        # own variable so that it does not overwrite the sample counter.
        for position in range(len(output)):
            if output[position] in SENTINELS:
                print(output[:position])
                print()
                break
        else:
            # No sentinel was predicted: show the whole text instead of
            # silently printing nothing.
            print(output)
            print()

else:
    # No saved model yet: train one from scratch and save it.
    model = create_model(voc_size, LATENT_DIMENSIONS)

    # NOTE(review): fit_generator is deprecated in newer Keras releases in favour
    # of Model.fit -- confirm against the installed version before changing it.
    model.fit_generator(generator=train_data,
                        validation_data=validation_data,
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps,
                        epochs=EPOCHS,
                        verbose=2,
                        use_multiprocessing=False,
                        shuffle=True)

    model.save(MODEL_NAME)



Data 10500/750 19
Epoch 1/100
