In [2]:
print('Imports')
import io
import os
import random
import sys

import keras
import numpy as np

from settings.data_norm_constants import MIN_WORD_FREQUENCY, SEQUENCE_LEN
from settings.model_constants import BATCH_SIZE
from utils.model_utils import generator, get_model


Imports


Using TensorFlow backend.


Data tools

In [3]:
def corpus_to_dictionary(path: str):
    """Read a text corpus and build its word-level vocabulary.

    The file is lower-cased, line breaks (and backslashes) are turned into a
    standalone ``'\\n'`` token, and the text is split on spaces. Words that
    occur fewer than ``MIN_WORD_FREQUENCY`` times are collected as "ignored"
    and left out of the vocabulary.

    Args:
        path: Path of the corpus text file.

    Returns:
        A 5-tuple ``(text_in_words, ignored, word_indices, indices_word, words)``:
        the full token list, the set of too-rare words, a word -> index dict,
        an index -> word dict, and the sorted list of vocabulary words.
    """
    print('corupus to dict')
    with io.open(path) as f:
        # Surround line breaks with spaces so that '\n' survives the split on
        # ' ' as a token of its own; backslashes are treated as line breaks.
        text = f.read().lower().replace('\n', ' \n ').replace('\\', ' \n ')

    # The filter list does not contain '\n', so line-break tokens are kept.
    text_in_words = keras.preprocessing.text.text_to_word_sequence(
        text,
        filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t',
        lower=True,
        split=' ')
    print('Corpus length in words:', len(text_in_words))

    # Count how many times each word appears in text_in_words
    word_freq = {}
    for word in text_in_words:
        word_freq[word] = word_freq.get(word, 0) + 1

    # Words that are too rare to be part of the vocabulary
    ignored = {word for word, count in word_freq.items() if count < MIN_WORD_FREQUENCY}

    # The keys of word_freq are exactly the distinct words of the corpus
    print('Unique words:', len(word_freq))

    # Remove ignored words; sorting makes the index assignment deterministic
    words = sorted(set(word_freq) - ignored)
    print('Unique words after removing ignored words:', len(words))

    # Two lookup tables: word -> index and index -> word
    word_indices = {word: index for index, word in enumerate(words)}
    indices_word = dict(enumerate(words))

    print('EOF: corpus_to_dictionary()')
    return text_in_words, ignored, word_indices, indices_word, words


def create_and_filter_sequences(text_in_words, ignored_words, sequence_len=None, step=1):
    """Cut the corpus into fixed-length input windows with their next word.

    A window of ``sequence_len`` words plus the word that follows it is kept
    only if none of those ``sequence_len + 1`` words is in ``ignored_words``.

    Args:
        text_in_words: Sequence of tokens making up the corpus.
        ignored_words: Iterable of words that must not appear in a kept
            window or in its target word.
        sequence_len: Number of words per input window. Defaults to the
            module-level ``SEQUENCE_LEN`` when omitted.
        step: Distance between the start positions of consecutive windows.

    Returns:
        ``(sentences, next_words)`` where ``sentences[k]`` is a list of
        ``sequence_len`` words and ``next_words[k]`` is the word following it.
    """
    print('start: create_and_filter_sequences')
    if sequence_len is None:
        sequence_len = SEQUENCE_LEN

    # Build the lookup once; isdisjoint() then scans each window directly and
    # stops at the first ignored word instead of building a set per window.
    ignored_lookup = frozenset(ignored_words)

    sentences = []
    next_words = []
    ignored = 0

    for start in range(0, len(text_in_words) - sequence_len, step):
        end = start + sequence_len
        # The checked span covers the input words AND the target word at `end`.
        if ignored_lookup.isdisjoint(text_in_words[start: end + 1]):
            sentences.append(text_in_words[start:end])
            next_words.append(text_in_words[end])
        else:
            ignored += 1
    print('Ignored sequences:', ignored)
    print('Remaining sequences:', len(sentences))

    return sentences, next_words


def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    """Shuffle sentences and their next words together, then split them.

    Both lists are reordered with the same permutation, drawn from a
    ``RandomState`` seeded with 42, so the result is the same on every call.
    The last ``percentage_test`` percent of the shuffled data becomes the
    test partition.

    Args:
        sentences_original: List of input word sequences.
        next_original: List with the next word for each sequence.
        percentage_test: Share of the data, in percent, used for testing.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``.
    """
    print('Shuffling sentences')

    total = len(sentences_original)
    # One permutation applied to both lists keeps each sentence aligned
    # with its next word.
    order = np.random.RandomState(seed=42).permutation(total)
    shuffled_sentences = [sentences_original[j] for j in order]
    shuffled_next_words = [next_original[j] for j in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(total * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next_words[:cut_index]
    y_test = shuffled_next_words[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))

    print('end: shuffle_and_split_training_set')

    return (x_train, y_train), (x_test, y_test)

In [4]:
# Data pipeline: build the vocabulary from the corpus, cut the corpus into
# SEQUENCE_LEN-word windows free of ignored words, then shuffle and split
# those windows into train and test partitions.
corpus_path = 'data/lyrics.txt'
(text_in_words,
 ignored_words,
 word_indices,
 indices_word,
 words) = corpus_to_dictionary(corpus_path)

sequences, next_words = create_and_filter_sequences(text_in_words, ignored_words)

(sentences_train, next_words_train), (sentences_test, next_words_test) = \
    shuffle_and_split_training_set(sequences, next_words)

corupus to dict
Corpus length in words: 20576252
Unique words: 284587
Unique words after removing ignored words: 33942
EOF: corpus_to_dictionary()
start: create_and_filter_sequences
Ignored sequences: 7690908
Remaining sequences: 12885314
Shuffling sentences
Size of training set = 12627607
Size of test set = 257707
end: shuffle_and_split_training_set


In [5]:
# Sanity check: show the first window kept by create_and_filter_sequences
# (a list of word tokens; line breaks appear as '\n' tokens).
print(sequences[0])

['\n', 'from', 'cancer', 'in', 'the', 'brain', '\n', 'our', 'bombs', 'they', 'crush', '\n', 'destroying', 'all', 'of', 'us', '\n', 'and', 'now', "we're", 'dead', '\n', 'bodies', 'full', 'of', 'lead', '\n', '\n', 'inner', 'transformation']


In [6]:
# Build the network for the filtered vocabulary.
model = get_model(words)
# The accuracy metric is requested under the short name 'acc' on purpose:
# the checkpoint filename template and the ModelCheckpoint / EarlyStopping
# callbacks in this notebook read the log keys 'acc' and 'val_acc'.
# Keras < 2.3 logs 'accuracy' as 'acc', but Keras >= 2.3 logs a metric under
# exactly the name given here, so 'accuracy' would leave those keys undefined.
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['acc'])

model.summary()


W0729 19:46:09.254709 139993619598976 deprecation_wrapper.py:119] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0729 19:46:09.265489 139993619598976 deprecation_wrapper.py:119] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0729 19:46:09.266838 139993619598976 deprecation_wrapper.py:119] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



Build model...


W0729 19:46:13.862148 139993619598976 deprecation_wrapper.py:119] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0729 19:46:13.867522 139993619598976 deprecation.py:506] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0729 19:46:13.887973 139993619598976 deprecation_wrapper.py:119] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0729 19:46:13.905623 139993619598976 deprecatio

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 1024)        34756608  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               2623488   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 33942)             17412246  
_________________________________________________________________
activation_1 (Activation)    (None, 33942)             0         
Total params: 54,792,342
Trainable params: 54,792,342
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Checkpoint filename template. The %d placeholders are filled in right here;
# the {epoch}/{loss}/{acc}/{val_loss}/{val_acc} placeholders are left for
# ModelCheckpoint to fill in from the training logs each time it saves.
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % (
    len(words),
    SEQUENCE_LEN,
    MIN_WORD_FREQUENCY
)
# Make sure the target directory exists up front, so that saving the first
# checkpoint cannot fail because the directory is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Keep only checkpoints that improve validation accuracy.
checkpoint = keras.callbacks.ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
# Constructed without any hooks, so this callback currently does nothing;
# it is kept so that `print_callback` stays defined.
print_callback = keras.callbacks.LambdaCallback()
# Stop training after 5 epochs without improvement in validation accuracy.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]


In [None]:
# Train from batch generators; `h1` keeps the value returned by fit_generator.
h1 = model.fit_generator(
    generator(sentences_train, next_words_train, BATCH_SIZE, word_indices),
    # Size one epoch from the TRAINING split that feeds the generator, not
    # from `sequences`, which also contains the held-out test windows.
    steps_per_epoch=int(len(sentences_train) / BATCH_SIZE) + 1,
    epochs=50,
    callbacks=callbacks_list,
    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE, word_indices),
    validation_steps=int(len(sentences_test) / BATCH_SIZE) + 1)


W0729 19:46:14.234670 139993619598976 deprecation.py:323] From /home/markus/.venv/mnist_cnn/metalRNN/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/50


In [13]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    exponentiated and renormalised; a single multinomial draw is then made
    from the resulting distribution.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The index selected by the draw.
    """
    # float64 is used for the whole computation, including the probabilities
    # handed to np.random.multinomial.
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    # A single trial yields a one-hot row; argmax recovers the drawn index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [14]:
# Generate 200 words from a random seed sequence for each sampling
# temperature ("diversity").
n_sequences = len(sequences)
n_seed_candidates = n_sequences + len(sentences_test)
for diversity in [0.2, 0.5, 1.0, 1.2]:
    # Draw the seed from the same index space as `sequences + sentences_test`
    # without building that multi-million element list: indices past the end
    # of `sequences` map into `sentences_test`.
    seed_index = np.random.randint(n_seed_candidates)
    if seed_index < n_sequences:
        seed = sequences[seed_index]
    else:
        seed = sentences_test[seed_index - n_sequences]
    # Work on a copy so the stored sequence can never be modified.
    sentence = list(seed)
    print('----- diversity:', diversity)

    print('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')

    for _ in range(200):
        # Encode the current window as a (1, SEQUENCE_LEN) row of word indices.
        x_pred = np.zeros((1, SEQUENCE_LEN))
        for t, word in enumerate(sentence):
            x_pred[0, t] = word_indices[word]

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_word = indices_word[next_index]

        # Slide the window: drop the oldest word, append the generated one.
        sentence = sentence[1:]
        sentence.append(next_word)

        sys.stdout.write(" " + next_word)
        sys.stdout.flush()
    print()
    print()

----- diversity: 0.2
----- Generating with seed:
"
 we bathe in the blood of the unlucky stiffs 
 keep their eyes tongues and brains in glass cases 
 smear our naked writhing bodies in the grue and"

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 


----- diversity: 0.5
----- Generating with seed:
"
 
 we make the world go round in tears 
 we are the shadow when faith disappears 
 we are anxiety and fearless scum 
 awaking your sorrow like"

 
 
 
 
 
 
 
 
 
 this 
 
 
 i 
 
 so 
 is 
 
 and 
 
 
 
 
 
 the 
 
 
 the 
 
 the 
 
 
 the this 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 to 
 
 is 
 
 
 
 
 
 
 
 
 
 and 
 
 
 
 
 
 
 the