In [4]:
import numpy as np
import re
import random
import tensorflow as tf

In [5]:
# --- Data description ------------------------------------------------------
# NOTE(review): DISTRIBUTION_DATA_COUNT is not read anywhere in the visible
# cells; presumably the sample count per data file -- confirm before relying on it.
DISTRIBUTION_DATA_COUNT = [20000, 40000, 60000]
# Tokens filtered out of the encoder input by DataSupplier.
VOCABULARY_PUNCTUATION = ['!', '?', '.', ',']
# File-name suffixes of the normalized corpora ('data/normalized/eng_<suffix>.txt').
DATA_SUFFIXES = ['one', 'two', 'three']
# Start ('^') and stop ('~') markers wrapped around every target sequence.
SENTINELS = ['^', '~']

# --- Paths -----------------------------------------------------------------
MODEL_NAME = 'seq2seq_with_attention'
MODEL_PATH = 'models/' + MODEL_NAME + '.h5'
DATA_NAME = 'data/original.txt'

# --- Model / training hyper-parameters ---------------------------------------
LATENT_DIMENSIONS = 128   # number of LSTM units
BATCH_SIZE = 64
EPOCHS = 50
# Number of time steps every encoded sequence is padded to; may be raised by
# get_raw_data(assign_max_sequence=True).
MAX_SEQUENCE = 142
# Added to MAX_SEQUENCE together with the sentinel count when it is re-assigned.
SHIFTED_SEQ_COUNT = 1
# Absolute-error tolerance used by the linear_regression_equality metric.
ACCEPTED_DIFF = 0.01

In [6]:
def get_lines(path, formatted):
    """Read a UTF-8 text file and return one formatted entry per line.

    Args:
        path: Path of the text file to read.
        formatted: Callable applied to every raw line (the line terminator is
            still attached when the line has one); its result is stored.

    Returns:
        list: The results of ``formatted`` in file order (empty for an empty file).
    """
    with open(path, "r", encoding='utf-8') as file:
        # Build the result directly while streaming the file, instead of
        # materialising readlines() and appending from a throw-away comprehension.
        return [formatted(line) for line in file]


def split_with_keep_delimiters(string, delimiters):
    """Split ``string`` on any of ``delimiters`` and keep the delimiters.

    The delimiters are regex-escaped and placed in one capturing group, so
    ``re.split`` returns them as separate items between the pieces.
    """
    alternatives = '|'.join(re.escape(delimiter) for delimiter in delimiters)
    pattern = '(' + alternatives + ')'
    return re.split(pattern, string)


def tokenize_sequence(seq):
    """Return the whitespace-separated tokens of ``seq`` as a list."""
    tokens = seq.split()
    return tokens


def encode_seq(seq, voc):
    """One-hot encode ``seq`` into a ``(MAX_SEQUENCE, len(voc))`` float32 matrix.

    Row ``i`` has a single 1 in the column given by the position of ``seq[i]``
    in ``voc``; rows beyond ``len(seq)`` stay all-zero (padding).

    Raises:
        ValueError: if a token is not contained in ``voc``.
        IndexError: if ``seq`` holds more than ``MAX_SEQUENCE`` tokens.
    """
    one_hot = np.zeros((MAX_SEQUENCE, len(voc)), dtype='float32')
    for position, token in enumerate(seq):
        # axis 0: position inside the sentence, axis 1: index in the vocabulary
        one_hot[position, voc.index(token)] = 1.
    return one_hot


def decompose_tokens(tokens, shuffle):
    """Return every non-empty prefix of ``tokens``, shortest first.

    When ``shuffle`` is truthy the list of prefixes is shuffled in place with
    ``random.shuffle`` before it is returned.
    """
    prefixes = [tokens[:end] for end in range(1, len(tokens) + 1)]
    if shuffle:
        random.shuffle(prefixes)
    return prefixes


def clothe_to(str_list, symbols):
    """Wrap ``str_list`` in place with ``symbols[0]`` in front and ``symbols[1]`` at the end.

    The list passed in is mutated and the very same object is returned.
    """
    str_list[:0] = [symbols[0]]
    str_list.append(symbols[1])
    return str_list


def seq_to_tokens(seq, voc):
    """Map each row of ``seq`` to the vocabulary entry at the row's arg-max.

    ``seq`` is indexed as ``seq[i, :]``, so it must support NumPy-style
    two-dimensional indexing.
    """
    tokens = []
    for row_index in range(len(seq)):
        best_column = np.argmax(seq[row_index, :])
        tokens.append(voc[best_column])
    return tokens


def decode_seq(input_seq, encoder_model, decoder_model, voc):
    """Greedily decode one input sequence with separate encoder/decoder models.

    Args:
        input_seq: Encoded input passed unchanged to ``encoder_model.predict``.
        encoder_model: Object whose ``predict`` returns the initial decoder
            states. The result is concatenated with ``[target_seq]``, so it is
            expected to be a list -- presumably ``[h, c]``; confirm against
            the inference models.
        decoder_model: Object whose ``predict`` returns
            ``(output_tokens, h, c)`` for ``[target_seq] + states``.
        voc: Vocabulary list; must contain both ``SENTINELS``.

    Returns:
        str: The sampled tokens concatenated without a separator. The stop
        sentinel is included when it was produced.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    voc_size = len(voc)
    # Target sequence of length 1 holding only the start sentinel
    # (batch of size 1 is assumed throughout).
    target_seq = np.zeros((1, 1, voc_size))
    target_seq[0, 0, voc.index(SENTINELS[0])] = 1.

    decoded_sentence = ''
    # Vocabulary entries may be longer than one character, so the number of
    # emitted tokens is tracked separately: MAX_SEQUENCE is a token count and
    # must not be compared with the character length of the decoded string.
    decoded_token_count = 0
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy sampling: take the most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = voc[sampled_token_index]
        decoded_sentence += sampled_token
        decoded_token_count += 1

        # Exit condition: stop sentinel found or maximum length exceeded.
        if sampled_token == SENTINELS[1] or decoded_token_count > MAX_SEQUENCE:
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1, voc_size))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the recurrent states over to the next step.
        states_value = [h, c]

    return decoded_sentence


def linear_regression_equality(y_true, y_pred):
    """Keras metric: share of elements predicted within ``ACCEPTED_DIFF``.

    Returns the mean, over all elements, of the indicator
    ``|y_true - y_pred| < ACCEPTED_DIFF`` cast to float32.
    """
    import tensorflow as tf
    backend = tf.keras.backend
    within_tolerance = backend.abs(y_true - y_pred) < ACCEPTED_DIFF
    return backend.mean(backend.cast(within_tolerance, 'float32'))


def get_voc(data):
    """Build the sorted vocabulary of all sentences in ``data``.

    Args:
        data: Mapping of cluster name to a list of sentence strings.

    Returns:
        list: Sorted unique items: the two ``SENTINELS`` plus every piece
        produced by splitting each sentence on single spaces. The split keeps
        its delimiters, so ``' '`` (and ``''`` for consecutive spaces) can be
        part of the vocabulary.
    """
    # Work on a copy: binding the module-level SENTINELS list directly would
    # append the whole vocabulary to it and corrupt len(SENTINELS) for every
    # later reader.
    voc = list(SENTINELS)
    # Set for O(1) membership tests instead of scanning the growing list.
    seen = set(voc)
    delimiters = [' ']
    for k in data:
        for sentence in data[k]:
            for piece in split_with_keep_delimiters(sentence, delimiters):
                if piece not in seen:
                    seen.add(piece)
                    voc.append(piece)
    return sorted(voc)


def split_data(data, coefficient):
    """Split every cluster of ``data`` into a training and a validation part.

    For each cluster the last
    ``int(len(cluster) * coefficient // len(data))`` items go to validation
    and the remaining leading items go to training. Both result lists are
    shuffled with ``random.shuffle``.

    Args:
        data: Mapping of cluster name to a list of samples.
        coefficient: Validation fraction; note that it is divided by the
            number of clusters.

    Returns:
        tuple: ``(train, validation)`` lists without any shared sample position.
    """
    validation = list()
    train = list()

    for k in data:
        cluster = data[k]
        cluster_len = int(len(cluster) * coefficient // len(data))
        # Clamp and split on an explicit index. A negative slice start breaks
        # for cluster_len == 0 (cluster[-0:] is the WHOLE cluster), which put
        # every sample into both train and validation.
        cluster_len = max(0, min(cluster_len, len(cluster)))
        split_at = len(cluster) - cluster_len
        validation.extend(cluster[split_at:])
        train.extend(cluster[:split_at])

    random.shuffle(validation)
    random.shuffle(train)
    return train, validation


def calculate_steps(train, validation):
    """Return ``(steps_per_epoch, validation_steps)``: full batches per data set."""

    def full_batches(samples):
        # Only complete batches of BATCH_SIZE are counted.
        return int(len(samples) // BATCH_SIZE)

    return full_batches(train), full_batches(validation)

In [7]:
def get_raw_data(count_coefficient, decompose, assign_max_sequence=False):
    """Load, optionally augment, shuffle and truncate the normalized corpora.

    One file ``data/normalized/eng_<suffix>.txt`` is read per entry of
    ``DATA_SUFFIXES``.

    Args:
        count_coefficient: Fraction of the shuffled sentences kept per file.
        decompose: When truthy, every proper token prefix of every sentence is
            added as an extra sentence.
        assign_max_sequence: When truthy, the global ``MAX_SEQUENCE`` is raised
            to the longest token count found (if larger) and then increased by
            ``len(SENTINELS) + SHIFTED_SEQ_COUNT``.
            NOTE(review): that increment is applied on every such call, so
            repeated calls keep growing ``MAX_SEQUENCE`` -- confirm intended.

    Returns:
        dict: suffix -> list of sentence strings.
    """
    global MAX_SEQUENCE
    raw_data = dict()
    for suffix in DATA_SUFFIXES:
        # Remove only an actual line terminator: the last line of a file may
        # lack one, and blindly cutting a character would drop its final
        # letter or punctuation mark.
        sequences = get_lines('data/normalized/eng_' + suffix + '.txt',
                              lambda l: l[:-1] if l.endswith('\n') else l)

        if assign_max_sequence:
            # default=0 keeps an empty file from raising ValueError.
            max_seq = max((len(tokenize_sequence(seq)) for seq in sequences), default=0)
            if max_seq > MAX_SEQUENCE:
                MAX_SEQUENCE = max_seq

        if decompose:
            prefixes = []
            for seq in sequences:
                # [:-1] skips the full sentence, which is already in the list.
                for tokens in decompose_tokens(tokenize_sequence(seq), False)[:-1]:
                    prefixes.append(' '.join(tokens))
            sequences.extend(prefixes)

        random.shuffle(sequences)
        raw_data[suffix] = sequences[:int(len(sequences) * count_coefficient)]

    if assign_max_sequence:
        MAX_SEQUENCE += len(SENTINELS) + SHIFTED_SEQ_COUNT
        print('assigned_max_sequence(' + str(MAX_SEQUENCE) + ')')

    return raw_data


def get_fit_data(count_coefficient, split_coefficient):
    """Prepare everything ``fit`` needs: generators, step counts and vocabulary.

    Args:
        count_coefficient: Passed to ``get_raw_data`` (fraction of data kept).
        split_coefficient: Passed to ``split_data`` (validation fraction).

    Returns:
        tuple: ``((train_generator, validation_generator),
        (steps_per_epoch, validation_steps), voc)``.
    """
    corpus = get_raw_data(count_coefficient, decompose=False)
    voc = get_voc(corpus)
    train, validation = split_data(corpus, split_coefficient)

    # The validation supplier is created before the training supplier.
    validation_generator = DataSupplier(BATCH_SIZE, validation, voc)
    generator = DataSupplier(BATCH_SIZE, train, voc)

    data_summary = '\ndata(' + str(len(train)) + ', ' + str(len(validation)) + '),'
    voc_size_summary = 'voc_size(' + str(len(voc)) + '),'
    max_sequence_summary = 'max_sequence(' + str(MAX_SEQUENCE) + ')\n'
    voc_summary = 'voc(' + str(voc) + ')\n'
    print(data_summary, voc_size_summary, max_sequence_summary, voc_summary)

    steps = calculate_steps(train, validation)
    return (generator, validation_generator), steps, voc

In [None]:
class DataSupplier(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding one-hot encoded seq2seq batches.

    Each batch is ``([encoder_input, decoder_input], decoder_target)`` where
    all three arrays have shape ``(batch_size, MAX_SEQUENCE, voc_size)``.
    """

    def __init__(self, batch_size, sentences, voc):
        self.batch_size = batch_size
        self.sentences = sentences

        self.voc = voc
        self.voc_size = len(voc)

        # Create the first shuffled sample order.
        self.on_epoch_end()

    def __len__(self):
        # Number of complete batches; a trailing partial batch is dropped.
        batches = np.floor(len(self.sentences) / self.batch_size)
        return int(batches)

    def __getitem__(self, index):
        start = index * self.batch_size
        stop = start + self.batch_size
        return self.__data_generation(self.indexes[start:stop])

    def on_epoch_end(self):
        # Draw a fresh random permutation of the sample indices.
        order = np.arange(len(self.sentences))
        np.random.shuffle(order)
        self.indexes = order

    def __data_generation(self, indexes):
        batch_shape = (self.batch_size, MAX_SEQUENCE, self.voc_size)
        encoded_input = np.zeros(batch_shape, dtype='float32')
        decoded_input = np.zeros(batch_shape, dtype='float32')
        decoded_output = np.zeros(batch_shape, dtype='float32')

        batch_sentences = [self.sentences[sample] for sample in indexes]

        for n, sentence in enumerate(batch_sentences):
            tokens = tokenize_sequence(sentence)
            # Computed before the sentinels are added: the encoder sees the
            # sentence without punctuation and without start/stop markers.
            tokens_without_punctuation = [token for token in tokens
                                          if token not in VOCABULARY_PUNCTUATION]
            tokens = clothe_to(tokens, SENTINELS)

            decoded_output[n] = encode_seq(tokens, self.voc)

            # Decoder input = target shifted right by one time step; step 0
            # stays all-zero and the last target step is dropped.
            decoded_input[n, 1:] = decoded_output[n, :-1]

            encoded_input[n] = encode_seq(tokens_without_punctuation, self.voc)

        return [encoded_input, decoded_input], decoded_output

Data 10500/750 19
Epoch 1/100


In [None]:
def compile_model(model):
    """Compile ``model`` in place and return it.

    Uses the Adamax optimizer, categorical cross-entropy loss and the
    ``linear_regression_equality`` metric.
    """
    model.compile(
        optimizer='Adamax',
        loss='categorical_crossentropy',
        metrics=[linear_regression_equality],
    )
    return model


def create_model(n_input, n_units):
    """Build and compile the encoder-decoder LSTM model with attention.

    Args:
        n_input: Size of the one-hot token vectors (vocabulary size); used for
            both model inputs and for the softmax output.
        n_units: Number of units of both LSTMs and of the hidden dense layer.

    Returns:
        The compiled ``tf.keras.Model`` taking ``[encoder_input, decoder_input]``
        and producing a per-time-step softmax over ``n_input`` classes.
    """
    encoder_input = tf.keras.layers.Input(shape=(None, n_input,))
    encoder = tf.keras.layers.LSTM(n_units, return_sequences=True, return_state=True)
    encoder_output, encoder_state_h, encoder_state_c = encoder(encoder_input)
    encoder_state = [encoder_state_h, encoder_state_c]

    # The decoder starts from the final encoder states.
    decoder_input = tf.keras.layers.Input(shape=(None, n_input,))
    decoder = tf.keras.layers.LSTM(n_units, return_sequences=True, return_state=True)
    decoder_output, decoder_state_h, decoder_state_c = decoder(decoder_input, initial_state=encoder_state)

    # seq2seq
    # decoder_dense = tf.keras.layers.Dense(n_input, activation="softmax")
    # output = decoder_dense(decoder_output)

    # seq2seq with attention
    # Attention expects [query, value]. Each decoder step is the query and
    # attends over the encoder outputs, so the context has one vector per
    # DECODER time step. With the arguments the other way round the context
    # is indexed by encoder step and mixes in decoder outputs of all
    # (including future) steps.
    context = tf.keras.layers.Attention()([decoder_output, encoder_output])
    decoder_combined_context = tf.keras.layers.concatenate([context, decoder_output])
    output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_units, activation="relu"))(decoder_combined_context)
    output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_input, activation="softmax"))(output)

    model = tf.keras.Model([encoder_input, decoder_input], output)
    model = compile_model(model)
    return model

In [None]:
def train_model():
    """Train the attention seq2seq model and save it to ``MODEL_PATH``.

    Builds the data generators with ``get_fit_data(.15, .2)``, creates the
    model with ``LATENT_DIMENSIONS`` units, trains for ``EPOCHS`` epochs and
    writes the result to ``MODEL_PATH``.
    """
    import os

    (train_data, validation_data), (steps_per_epoch, validation_steps), voc = get_fit_data(.15, .2)
    model = create_model(len(voc), LATENT_DIMENSIONS)

    # Create the output directory up front so that a missing folder cannot
    # make model.save() fail after the whole training run has finished.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Model.fit accepts keras Sequence objects directly; fit_generator is
    # deprecated. use_multiprocessing is left at its default (False).
    model.fit(train_data,
              validation_data=validation_data,
              steps_per_epoch=steps_per_epoch,
              validation_steps=validation_steps,
              epochs=EPOCHS,
              verbose=2,
              shuffle=True)

    model.save(MODEL_PATH)

In [None]:
# Run the full pipeline: prepare the data generators, train the model and
# save it to MODEL_PATH.
train_model()