In [106]:
import collections
import os
import time
import numpy as np
import tensorflow as tf
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:

url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'

dir_name = 'stories'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

def download_stories(filename):
    print("Downloading file: ", dir_name + os.sep + filename)
    if not os.path.exists(os.path.join(dir_name, filename)):
        filename, _ = urlretrieve(url + filename, dir_name+os.sep+filename)
    else:
        print("File %s already exits" % filename)
    return filename


filenames = [format(i, '03d') + '.txt' for i in range(1, 101)]

for fn in filenames:
    download_stories(fn)

Downloading file:  stories\001.txt
File 001.txt already exits
Downloading file:  stories\002.txt
File 002.txt already exits
Downloading file:  stories\003.txt
File 003.txt already exits
Downloading file:  stories\004.txt
File 004.txt already exits
Downloading file:  stories\005.txt
File 005.txt already exits
Downloading file:  stories\006.txt
File 006.txt already exits
Downloading file:  stories\007.txt
File 007.txt already exits
Downloading file:  stories\008.txt
File 008.txt already exits
Downloading file:  stories\009.txt
File 009.txt already exits
Downloading file:  stories\010.txt
File 010.txt already exits
Downloading file:  stories\011.txt
File 011.txt already exits
Downloading file:  stories\012.txt
File 012.txt already exits
Downloading file:  stories\013.txt
File 013.txt already exits
Downloading file:  stories\014.txt
File 014.txt already exits
Downloading file:  stories\015.txt
File 015.txt already exits
Downloading file:  stories\016.txt
File 016.txt already exits
Download

In [3]:

def read_data(filename):
    with open(filename) as f:
        data =  tf.compat.as_str(f.read())
        data = data.lower()
        data = list(data)
    return data

global documents
documents = []
num_files = 100
for i in range(num_files):
    print("processing file %s" % os.path.join(dir_name, filenames[i]))
    chars = read_data(os.path.join(dir_name, filenames[i]))

    # break into bigrams
    two_grams = [''.join(chars[ch_i:ch_i+2]) for ch_i in range(0, len(chars)-2, 2)]
    # Create document
    documents.append(two_grams)
    print("Data size (chars) (document %d) %d" % (i, len(two_grams)))
    print("Sample string %s\n" % (two_grams[:50]))

processing file stories\001.txt
Data size (chars) (document 0) 3667
Sample string ['in', ' o', 'ld', 'en', ' t', 'im', 'es', ' w', 'he', 'n ', 'wi', 'sh', 'in', 'g ', 'st', 'il', 'l ', 'he', 'lp', 'ed', ' o', 'ne', ', ', 'th', 'er', 'e ', 'li', 've', 'd ', 'a ', 'ki', 'ng', '\nw', 'ho', 'se', ' d', 'au', 'gh', 'te', 'rs', ' w', 'er', 'e ', 'al', 'l ', 'be', 'au', 'ti', 'fu', 'l,']

processing file stories\002.txt
Data size (chars) (document 1) 4928
Sample string ['ha', 'rd', ' b', 'y ', 'a ', 'gr', 'ea', 't ', 'fo', 're', 'st', ' d', 'we', 'lt', ' a', ' w', 'oo', 'd-', 'cu', 'tt', 'er', ' w', 'it', 'h ', 'hi', 's ', 'wi', 'fe', ', ', 'wh', 'o ', 'ha', 'd ', 'an', '\no', 'nl', 'y ', 'ch', 'il', 'd,', ' a', ' l', 'it', 'tl', 'e ', 'gi', 'rl', ' t', 'hr', 'ee']

processing file stories\003.txt
Data size (chars) (document 2) 9745
Sample string ['a ', 'ce', 'rt', 'ai', 'n ', 'fa', 'th', 'er', ' h', 'ad', ' t', 'wo', ' s', 'on', 's,', ' t', 'he', ' e', 'ld', 'er', ' o', 'f ', 'wh', 'om', ' w

In [4]:

# Build dictionaries
# dictionary: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
# reverse_dictionary: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school}
# count: List of list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)]
# data : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])

def build_dataset(documents):
    chars = []
    # list of lists
    data_list = []

    for d in documents:
        chars.extend(d)
    print('%d character found.' % len(chars))

    count = []
    # bigrams sorted by their frequency
    count.extend(collections.Counter(chars).most_common())

    # Create dict map word to id by given the current length of the dictionary
    # UNK is for two rare word
    dictionary = dict({'UNK': 0})
    for char, c in count:
        # Only add if its frequency is more than 10
        if c > 10:
            dictionary[char] = len(dictionary)
    unk_count = 0
    # replace word with id of word
    for d in documents:
        data = list()
        for char in d:
            # if word in dictionary use the id of word
            # otherwise use id of UNK
            if char in dictionary:
                index = dictionary[char]
            else:
                index = dictionary['UNK']
                unk_count += 1
            data.append(index)
        data_list.append(data)

    # dict map id to word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data_list, count, dictionary, reverse_dictionary

data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ',len(dictionary))
vocabulary_size = len(dictionary)
del documents  # To reduce memory.

449177 character found.
Most common words (+UNK) [('e ', 15229), ('he', 15164), (' t', 13443), ('th', 13076), ('d ', 10687)]
Least common words (+UNK) [('bj', 1), ('ii', 1), ('i?', 1), ('z ', 1), ('c.', 1), ('"k', 1), ('pw', 1), ('f?', 1), (' z', 1), ('xq', 1), ('nm', 1), ('m?', 1), ('\t"', 1), ('\tw', 1), ('tz', 1)]
Sample data [15, 28, 86, 23, 3, 95, 74, 11, 2, 16]
Sample data [22, 156, 25, 37, 82, 185, 43, 9, 90, 19]
Vocabulary:  544


In [5]:
idx2char = np.array([i for i in dictionary.keys()])
idx2char[1]

'e '

In [6]:

class DataGeneratorOHE(object):
    def __init__(self, text, batch_size, num_unroll):
        # text bigrams by its id
        self._text = text
        # number of bigrams in text
        self._text_size = len(self._text)
        self._batch_size = batch_size
        # Num unroll is the number of steps unroll the RNN
        # in a single training step
        self._num_unroll = num_unroll
        # Break text into several segments and the batch data
        # is sampled by sampling a single item from a single segment
        self._segments = self._text_size // self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        """

        :return: next batch of data
        """
        # train inputs (one hot encoded) and train outputs (one hot encoded)
        batch_data = np.zeros((self._batch_size, vocabulary_size,), dtype=np.float32)
        batch_label = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)

        for b in range(self._batch_size):
            # reset back to begin when exceed batch_size
            if self._cursor[b] + 1 >= self._text_size:
                self._cursor[b] = b * self._segments

            # Add text at cursor as input
            batch_data[b, self._text[self._cursor[b]]] = 1.0

            # Add preceding bigrams as the label
            batch_label[b, self._text[self._cursor[b] + 1]] = 1.0

            self._cursor[b] = (self._cursor[b] + 1) % self._text_size

        return batch_data, batch_label

    def unroll_batches(self):
        """

        :return: a list of num_unroll batches required by training of the RNN
        """
        unroll_data, unroll_labels = [], []
        for ui in range(self._num_unroll):
            data, labels = self.next_batch()
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels

    def reset_indices(self):
        """
        Reset indices
        :return:
        """
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

data_gen = DataGeneratorOHE(data_list[0][25:50], 5, 5)
u_data_unroll, u_label_unroll = data_gen.unroll_batches()

for ui, (data, label) in enumerate(zip(u_data_unroll, u_label_unroll)):
    print('\n\nUnrolled index %d'%ui)
    dat_ind = np.argmax(data,axis=1)
    lbl_ind = np.argmax(label,axis=1)
    print('\tInputs:')
    for sing_dat in dat_ind:
        print('\t%s (%d)'%(reverse_dictionary[sing_dat],sing_dat),end=", ")
    print('\n\tOutput:')
    for sing_lbl in lbl_ind:
        print('\t%s (%d)'%(reverse_dictionary[sing_lbl],sing_lbl),end=", ")



Unrolled index 0
	Inputs:
	e  (1), 	ki (131), 	 d (48), 	 w (11), 	be (70), 
	Output:
	li (98), 	ng (33), 	au (195), 	er (14), 	au (195), 

Unrolled index 1
	Inputs:
	li (98), 	ng (33), 	au (195), 	er (14), 	au (195), 
	Output:
	ve (41), 	
w (169), 	gh (106), 	e  (1), 	ti (112), 

Unrolled index 2
	Inputs:
	ve (41), 	
w (169), 	gh (106), 	e  (1), 	ti (112), 
	Output:
	d  (5), 	ho (62), 	te (61), 	al (84), 	fu (228), 

Unrolled index 3
	Inputs:
	d  (5), 	ho (62), 	te (61), 	al (84), 	fu (228), 
	Output:
	a  (82), 	se (58), 	rs (137), 	l  (57), 	l, (257), 

Unrolled index 4
	Inputs:
	a  (82), 	se (58), 	rs (137), 	l  (57), 	be (70), 
	Output:
	ki (131), 	 d (48), 	 w (11), 	be (70), 	au (195), 

In [6]:
embedding_dim = 256
rnn_units = 1024
BATCH_SIZE = 64

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=(batch_size, None)),
        tf.keras.layers.GRU(units=rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model


model = build_model(vocabulary_size, embedding_dim, rnn_units, BATCH_SIZE)
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           139264    
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 544)           557600    
Total params: 4,635,168
Trainable params: 4,635,168
Non-trainable params: 0
_________________________________________________________________


In [94]:
len(data_list)
x = []
for i in data_list:
    x.extend(i)
    # print(len(x))
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [95]:
seq_length = 100
sequences = dataset.batch(seq_length + 1, drop_remainder=True)

# for item in sequences.take(5):
#     for i in item.numpy():
#         print(reverse_dictionary[i], end='')
#
for item in sequences.take(5):
    # print(item)
    # print(idx2char[item.numpy()])
    print(repr(''.join(idx2char[item.numpy()])))

'in olden times when wishing still helped one, there lived a king\nwhose daughters were all beautiful, but the youngest was so beautiful\nthat the sun itself, which has seen so much, was astonished wheneve'
"r\nit shone in her face.  close by the king's castle lay a great dark\nforest, and under an old lime-tree in the forest was a well, and when\nthe day was very warm, the king's child went out into the fores"
't and\nsat down by the side of the cool fountain, and when she was bored she\ntook a golden ball, and threw it up on high and caught it, and this\nball was her favorite plaything.\n\nnow it so happened that '
"on one occasion the princess's golden ball\ndid not fall into the little hand which she was holding up for it,\nbut on to the ground beyond, and rolled straight into the water.  the\nking's daughter follow"
'ed it with her eyes, but it vanished, and the\nwell was deep, so deep that the bottom could not be seen.  at this\nshe began to cry, and cried louder and louder, and 

In [96]:
# Map text to input and target (both input and target have
# the same seq_length but target is shifted to right one character)
def split_input_target(chunk):
    input_text = chunk[:-1] # take all except the last character
    target_text = chunk[1:] # take all except the first character

    return input_text, target_text

dataset = sequences.map(split_input_target)
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [97]:

for input_exp, target_exp in dataset.take(1):
    print('Input data', repr(''.join(idx2char[input_exp.numpy()])))
    print("Target data", repr(''.join(idx2char[target_exp.numpy()])))


Input data 'in olden times when wishing still helped one, there lived a king\nwhose daughters were all beautiful, but the youngest was so beautiful\nthat the sun itself, which has seen so much, was astonished whene'
Target data ' olden times when wishing still helped one, there lived a king\nwhose daughters were all beautiful, but the youngest was so beautiful\nthat the sun itself, which has seen so much, was astonished wheneve'


In [98]:
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 100

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(1)



In [99]:
dataset.take(1)


<TakeDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [101]:
"""
    For each character the model looks up the
    embedding, runs the GRU one timestep with
    the embedding as input, and applies the dense
    layer to generate logits predicting the log-likelihood of the next character:
"""
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 544) # (batch_size, sequence_length, vocab_size)


In [102]:

sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([  1, 113, 226, 419, 212, 118,   3, 463, 397, 452, 122, 391, 333,
       322, 185, 454, 422, 328, 519, 514, 402, 301, 410,  23, 211, 135,
       181, 303, 151, 204, 339, 397, 345, 174, 468, 202, 256,  36, 233,
       325, 467, 338, 250, 235, 364, 372, 239, 494, 208, 532, 379, 356,
        61, 155, 180, 399, 405,  95,  90, 318, 306, 139, 497, 105,  32,
       196, 384, 182, 120,  48, 279,  41, 239, 414,  73, 149, 449, 165,
       463, 219, 253, 326, 320, 211, 239, 255,  31, 171,  65, 535,  41,
       251, 522, 167, 160, 402, 144, 448, 184, 343], dtype=int64)

In [103]:

print("Input: \n", repr(''.join(idx2char[input_example_batch[0]])))
print("Next Char Predictions: \n", repr(''.join(idx2char[sampled_indices])))

Input: 
 '\nawhile, and looked at everything in amazement, then she touched the\nlight a little with her finger, and her finger became quite golden.\nimmediately a great fear fell on her.  she shut the door violen'
Next Char Predictions: 
 'e ce\nbccapbu t-glpekdoyaglocgrp-xeoexamng-mbk\nenr.d\nag\nplymisclph.fat-pr." fosl.bslt\nio\ngu.\'cr"hdrmr-mrgteraawggf.imfoiajuolnrkeitivn\'od k dclvecr"tilam\'ibr-g" uc\n a\nr.crspatigidt?veop-attdig-k ebt\nud'


In [104]:

def loss_sparse(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss = loss_sparse(target_example_batch, example_batch_predictions)
example_batch_loss.numpy().mean()


6.2987657

In [105]:

# model.compile(optimizer='adam', loss=loss)
#
# checkpoint_dir = './training_checkpoint'
#
# checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
#
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_predix,
#     save_weights_only=True
# )
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(inp, target):
    with tf.GradientTape() as g:
        predictions = model(inp)
        loss = tf.reduce_mean(loss_sparse(target, predictions))

    gradients = g.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


In [107]:

EPOCHS = 10
checkpoint_dir = './training_checkpoint'

checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_predix,
    save_weights_only=True
)

for epoch in range(EPOCHS):
    start = time.time()

    # reset hidden state
    model.reset_states()

    for (batch_n, (inp, target)) in enumerate(dataset):
        loss = train_step(inp, target)

        if batch_n % 100 == 0:
            print('Epoch {} Batch {} Loss {}'.format(epoch + 1, batch_n, loss))

    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_predix.format(epoch=epoch))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


model.save_weights(checkpoint_predix.format(epoch=epoch))


Epoch 1 Batch 0 Loss 6.29873514175415
Epoch 1 Loss 4.2748
Time taken for 1 epoch 175.19048285484314 sec

Epoch 2 Batch 0 Loss 4.160902500152588
Epoch 2 Loss 3.5232
Time taken for 1 epoch 164.40199971199036 sec

Epoch 3 Batch 0 Loss 3.5004918575286865
Epoch 3 Loss 3.2167
Time taken for 1 epoch 165.2790002822876 sec

Epoch 4 Batch 0 Loss 3.1772332191467285
Epoch 4 Loss 3.0115
Time taken for 1 epoch 186.88235330581665 sec

Epoch 5 Batch 0 Loss 2.9823694229125977
Epoch 5 Loss 2.8274
Time taken for 1 epoch 213.66966199874878 sec

Epoch 6 Batch 0 Loss 2.8156843185424805
Epoch 6 Loss 2.7111
Time taken for 1 epoch 216.92969584465027 sec

Epoch 7 Batch 0 Loss 2.7132556438446045
Epoch 7 Loss 2.5780
Time taken for 1 epoch 214.7644681930542 sec

Epoch 8 Batch 0 Loss 2.560864210128784
Epoch 8 Loss 2.4978
Time taken for 1 epoch 215.11507606506348 sec

Epoch 9 Batch 0 Loss 2.4635009765625
Epoch 9 Loss 2.3251
Time taken for 1 epoch 214.786940574646 sec

Epoch 10 Batch 0 Loss 2.3294990062713623
Epoch 1

In [117]:

def generate_text(model, start_string):
    """
    Chose a start string, init RNN state and set the number
    of characters to generate
    Get the prediction distribution of next character using the start string and RNN state
    Use categorical distribution to calculate the index of predicted character
    and use this predicted character as our next input
    The RNN state returned by the model is fed back into the model so that it now has more context,
    After predicting the next character, the modified RNN states are again fed back into the model
    :param model:
    :param start_string:
    :return:
    """
    num_generate = 1000
    start_string = start_string.lower()
    # two_grams = [''.join(chars[ch_i:ch_i+2]) for ch_i in range(0, len(chars)-2, 2)]
    two_grams = []
    for ch_i in range(0, len(start_string) - 2, 2):
        two_grams.append(start_string[ch_i: ch_i+2])

    # fuck = [ch_i for ch_i in range(0, len(start_string)-2, 2) for s in start_string[ch_i:ch_i+2]]
    input_eval = [dictionary[s] for s in two_grams]
    print(input_eval)
    input_eval = tf.expand_dims(input_eval, 0) # convert to 2d tensor

    text_generated = []

    # Love results in more predictable text
    # High otherwise
    temperature = 1.0

    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)

        # remove batch dimension
        predictions = tf.squeeze(predictions, axis=0)

        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], axis=0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

print(generate_text(model, start_string="In olden times"))





[15, 28, 86, 23, 3, 95]


ValueError: Input 0 is incompatible with layer gru: expected shape=(64, None, 256), found shape=(1, 6, 256)