In [1]:
import collections
import matplotlib.pyplot as plt
import math
import numpy as np
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import csv
import os

In [2]:
url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'

dir_name = 'stories'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

def download_stories(filename):
    print("Downloading file: ", dir_name + os.sep + filename)
    if not os.path.exists(os.path.join(dir_name, filename)):
        filename, _ = urlretrieve(url + filename, dir_name+os.sep+filename)
    else:
        print("File %s already exits" % filename)
    return filename


filenames = [format(i, '03d') + '.txt' for i in range(1, 101)]

for fn in filenames:
    download_stories(fn)

Downloading file:  stories\001.txt
File 001.txt already exits
Downloading file:  stories\002.txt
File 002.txt already exits
Downloading file:  stories\003.txt
File 003.txt already exits
Downloading file:  stories\004.txt
File 004.txt already exits
Downloading file:  stories\005.txt
File 005.txt already exits
Downloading file:  stories\006.txt
File 006.txt already exits
Downloading file:  stories\007.txt
File 007.txt already exits
Downloading file:  stories\008.txt
File 008.txt already exits
Downloading file:  stories\009.txt
File 009.txt already exits
Downloading file:  stories\010.txt
File 010.txt already exits
Downloading file:  stories\011.txt
File 011.txt already exits
Downloading file:  stories\012.txt
File 012.txt already exits
Downloading file:  stories\013.txt
File 013.txt already exits
Downloading file:  stories\014.txt
File 014.txt already exits
Downloading file:  stories\015.txt
File 015.txt already exits
Downloading file:  stories\016.txt
File 016.txt already exits
Download

In [3]:
def read_data(filename):
    with open(filename) as f:
        data =  tf.compat.as_str(f.read())
        data = data.lower()
        data = list(data)
    return data

global documents
documents = []
num_files = 100
for i in range(num_files):
    print("processing file %s" % os.path.join(dir_name, filenames[i]))
    chars = read_data(os.path.join(dir_name, filenames[i]))

    # break into bigrams
    two_grams = [''.join(chars[ch_i:ch_i+2]) for ch_i in range(0, len(chars)-2, 2)]
    # Create document
    documents.append(two_grams)
    print("Data size (chars) (document %d) %d" % (i, len(two_grams)))
    print("Sample string %s\n" % (two_grams[:50]))

processing file stories\001.txt
Data size (chars) (document 0) 3667
Sample string ['in', ' o', 'ld', 'en', ' t', 'im', 'es', ' w', 'he', 'n ', 'wi', 'sh', 'in', 'g ', 'st', 'il', 'l ', 'he', 'lp', 'ed', ' o', 'ne', ', ', 'th', 'er', 'e ', 'li', 've', 'd ', 'a ', 'ki', 'ng', '\nw', 'ho', 'se', ' d', 'au', 'gh', 'te', 'rs', ' w', 'er', 'e ', 'al', 'l ', 'be', 'au', 'ti', 'fu', 'l,']

processing file stories\002.txt
Data size (chars) (document 1) 4928
Sample string ['ha', 'rd', ' b', 'y ', 'a ', 'gr', 'ea', 't ', 'fo', 're', 'st', ' d', 'we', 'lt', ' a', ' w', 'oo', 'd-', 'cu', 'tt', 'er', ' w', 'it', 'h ', 'hi', 's ', 'wi', 'fe', ', ', 'wh', 'o ', 'ha', 'd ', 'an', '\no', 'nl', 'y ', 'ch', 'il', 'd,', ' a', ' l', 'it', 'tl', 'e ', 'gi', 'rl', ' t', 'hr', 'ee']

processing file stories\003.txt
Data size (chars) (document 2) 9745
Sample string ['a ', 'ce', 'rt', 'ai', 'n ', 'fa', 'th', 'er', ' h', 'ad', ' t', 'wo', ' s', 'on', 's,', ' t', 'he', ' e', 'ld', 'er', ' o', 'f ', 'wh', 'om', ' w

In [4]:
# Build dictionaries
# dictionary: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
# reverse_dictionary: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school}
# count: List of list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)]
# data : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])

def build_dataset(documents):
    chars = []
    # list of lists
    data_list = []

    for d in documents:
        chars.extend(d)
    print('%d character found.' % len(chars))

    count = []
    # bigrams sorted by their frequency
    count.extend(collections.Counter(chars).most_common())

    # Create dict map word to id by given the current length of the dictionary
    # UNK is for two rare word
    dictionary = dict({'UNK': 0})
    for char, c in count:
        # Only add if its frequency is more than 10
        if c > 10:
            dictionary[char] = len(dictionary)
    unk_count = 0
    # replace word with id of word
    for d in documents:
        data = list()
        for char in d:
            # if word in dictionary use the id of word
            # otherwise use id of UNK
            if char in dictionary:
                index = dictionary[char]
            else:
                index = dictionary['UNK']
                unk_count += 1
            data.append(index)
        data_list.append(data)

    # dict map id to word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data_list, count, dictionary, reverse_dictionary

data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ',len(dictionary))
vocabulary_size = len(dictionary)
del documents  # To reduce memory.

449177 character found.
Most common words (+UNK) [('e ', 15229), ('he', 15164), (' t', 13443), ('th', 13076), ('d ', 10687)]
Least common words (+UNK) [('bj', 1), ('ii', 1), ('i?', 1), ('z ', 1), ('c.', 1), ('"k', 1), ('pw', 1), ('f?', 1), (' z', 1), ('xq', 1), ('nm', 1), ('m?', 1), ('\t"', 1), ('\tw', 1), ('tz', 1)]
Sample data [15, 28, 86, 23, 3, 95, 74, 11, 2, 16]
Sample data [22, 156, 25, 37, 82, 185, 43, 9, 90, 19]
Vocabulary:  544


In [5]:
class DataGeneratorOHE(object):
    def __init__(self, text, batch_size, num_unroll):
        # text bigrams by its id
        self._text = text
        # number of bigrams in text
        self._text_size = len(self._text)
        self._batch_size = batch_size
        # Num unroll is the number of steps unroll the RNN
        # in a single training step
        self._num_unroll = num_unroll
        # Break text into several segments and the batch data
        # is sampled by sampling a single item from a single segment
        self._segments = self._text_size // self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        """

        :return: next batch of data
        """
        # train inputs (one hot encoded) and train outputs (one hot encoded)
        batch_data = np.zeros((self._batch_size, vocabulary_size,), dtype=np.float32)
        batch_label = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)

        for b in range(self._batch_size):
            # reset back to begin when exceed batch_size
            if self._cursor[b] + 1 >= self._text_size:
                self._cursor[b] = b * self._segments

            # Add text at cursor as input
            batch_data[b, self._text[self._cursor[b]]] = 1.0

            # Add preceding bigrams as the label
            batch_label[b, self._text[self._cursor[b] + 1]] = 1.0

            self._cursor[b] = (self._cursor[b] + 1) % self._text_size

        return batch_data, batch_label

    def unroll_batches(self):
        """

        :return: a list of num_unroll batches required by training of the RNN
        """
        unroll_data, unroll_labels = [], []
        for ui in range(self._num_unroll):
            data, labels = self.next_batch()
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels

    def reset_indices(self):
        """
        Reset indices
        :return:
        """
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

data_gen = DataGeneratorOHE(data_list[0][25:50], 5, 5)
u_data_unroll, u_label_unroll = data_gen.unroll_batches()

for ui, (data, label) in enumerate(zip(u_data_unroll, u_label_unroll)):
    print('\n\nUnrolled index %d'%ui)
    dat_ind = np.argmax(data,axis=1)
    lbl_ind = np.argmax(label,axis=1)
    print('\tInputs:')
    for sing_dat in dat_ind:
        print('\t%s (%d)'%(reverse_dictionary[sing_dat],sing_dat),end=", ")
    print('\n\tOutput:')
    for sing_lbl in lbl_ind:
        print('\t%s (%d)'%(reverse_dictionary[sing_lbl],sing_lbl),end=", ")



Unrolled index 0
	Inputs:
	e  (1), 	ki (131), 	 d (48), 	 w (11), 	be (70), 
	Output:
	li (98), 	ng (33), 	au (195), 	er (14), 	au (195), 

Unrolled index 1
	Inputs:
	li (98), 	ng (33), 	au (195), 	er (14), 	au (195), 
	Output:
	ve (41), 	
w (169), 	gh (106), 	e  (1), 	ti (112), 

Unrolled index 2
	Inputs:
	ve (41), 	
w (169), 	gh (106), 	e  (1), 	ti (112), 
	Output:
	d  (5), 	ho (62), 	te (61), 	al (84), 	fu (228), 

Unrolled index 3
	Inputs:
	d  (5), 	ho (62), 	te (61), 	al (84), 	fu (228), 
	Output:
	a  (82), 	se (58), 	rs (137), 	l  (57), 	l, (257), 

Unrolled index 4
	Inputs:
	a  (82), 	se (58), 	rs (137), 	l  (57), 	be (70), 
	Output:
	ki (131), 	 d (48), 	 w (11), 	be (70), 	au (195), 

In [6]:
# LSTM: (each gate has three sets of weights (1 for current, 1 for previous, 1 for bias)
#   Cell state
#   Hidden state
#   Input gate
#   Forget gate
#   Output gate

# hyperparameter

# neurons in hidden state
num_nodes = 128

batch_size = 64

# the number of time steps used in truncated BPTT
num_unrolling = 50

# regular
dropout = .0

filename_extension = ''
if dropout>0.0:
    filename_extension = '_dropout'

filename_to_save = 'lstm'+filename_extension+'.csv'

In [7]:
tf.compat.v1.reset_default_graph()
tf.compat.v1.disable_eager_execution()

train_inputs, train_labels = [], []
# Unroll training inputs
for ui in range(num_unrolling):
    train_inputs.append(tf.compat.v1.placeholder(tf.float32, shape=(batch_size, vocabulary_size),
                                                 name='train_inputs_%d' % ui))
    train_labels.append(tf.compat.v1.placeholder(tf.float32, shape=(batch_size, vocabulary_size),
                                                 name='train_label_%d' % ui))

# Validation data
valid_inputs = tf.compat.v1.placeholder(tf.float32, shape=(1, vocabulary_size), name='valid_inputs')
valid_labels = tf.compat.v1.placeholder(tf.float32, shape=(1, vocabulary_size), name='valid_labels')

# Test data
test_input = tf.compat.v1.placeholder(tf.float32, shape=(1, vocabulary_size), name='test_input')

In [8]:
# Input gate - How much memory to write to cell state
# connects current input to the input gate
ix = tf.Variable(tf.compat.v1.truncated_normal(shape=(vocabulary_size, num_nodes), stddev=.02))
# connects the previous hidden state to the input gate
im = tf.Variable(tf.compat.v1.truncated_normal((num_nodes, num_nodes), stddev=.02))
# bias of input gate
ib = tf.Variable(tf.compat.v1.random_uniform((1, num_nodes),-0.02, 0.02))

# Forget gate - how much memory to discard from cell state
# connect current input to he forget gate
fx = tf.Variable(tf.compat.v1.truncated_normal((vocabulary_size, num_nodes), stddev=.02))
# connect the previous hidden state to the forget gate
fm = tf.Variable(tf.random.truncated_normal((num_nodes, num_nodes), stddev=0.02))
# bias of forget gate
fb = tf.Variable(tf.random.uniform(shape=(1, num_nodes), minval=-0.02, maxval=0.02))

# Candidate - compute the current cell state
# connect current input to candidate
cx = tf.Variable(tf.random.truncated_normal((vocabulary_size, num_nodes), stddev=0.02))
# connect previous hidden state to the candidate
cm = tf.Variable(tf.random.truncated_normal((num_nodes, num_nodes), stddev=0.02))
# bias of candidate
cb = tf.Variable(tf.random.uniform((1, num_nodes), minval=-0.02, maxval=0.02))

# Output gate - how much memory to output from cell state
ox = tf.Variable(tf.random.truncated_normal((vocabulary_size, num_nodes), stddev=.02))
om = tf.Variable(tf.random.truncated_normal((num_nodes, num_nodes), stddev=.02))
ob = tf.Variable(tf.random.uniform((1, num_nodes), minval=-0.02, maxval=0.02))

# Softmax classifier weights and biases
w = tf.Variable(tf.random.truncated_normal((num_nodes, vocabulary_size), stddev=0.02))
b = tf.Variable(tf.random.uniform((vocabulary_size,), minval=-0.02, maxval=0.02))

# Variables saving state across unrollings
# hidden state
saved_output = tf.Variable(tf.zeros((batch_size, num_nodes)), trainable=False, name="train_hidden")
# cell state
saved_state = tf.Variable(tf.zeros((batch_size, num_nodes)), trainable=False, name="train_cell")

# Variables for validation
saved_valid_output = tf.Variable(tf.zeros((1, num_nodes)), trainable=False, name='valid_hidden')
saved_valid_state = tf.Variable(tf.zeros((1, num_nodes)), trainable=False, name='valid_cell')

# Variables for testing
saved_test_output = tf.Variable(tf.zeros((1, num_nodes)), trainable=False, name="test_hidden")
saved_test_state = tf.Variable(tf.zeros((1, num_nodes)), trainable=False, name='test_cell')

w, b

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


(<tf.Variable 'Variable_12:0' shape=(128, 544) dtype=float32>,
 <tf.Variable 'Variable_13:0' shape=(544,) dtype=float32>)

In [9]:
# cell computation
def lstm_cell(i, o, state):
    """

    :param i: input text for training
    :param o: output is output from previous cell or hidden state
    :param state: the previous cell state

    forget_gate = σ(Wf · concat(h_t-1, x_t) + bf)
    input_gate = σ(Wi · concat(h_t-1, x_t) + bi)
    candidate = tanh(Wc · concat(h_t-1, x_t) + bc)
    cell_state = forget_gate * previous_cell_state + input_gate * candidate
    output_gate = σ(Wo · concat(h_t-1, x_t) + bo)
    hidden_state = output_gate * tanh(cell_state)
    :return: output (hidden state) and cell state
    """
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    candidate_update = tf.tanh(tf.matmul(i, cx) + tf.matmul(o, cm) + cb)
    state = forget_gate * state + input_gate * candidate_update
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

In [10]:
epsilon = 1e-10
# keeps the calculated state outputs in all the unrollings to calculate loss
outputs = []

# will be update at each step of unrolling
output = saved_output
state = saved_state

# Compute the hidden state (output) and cell state (state)
for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    output = tf.nn.dropout(output, rate=dropout)
    outputs.append(output)

logits = tf.matmul(tf.concat(values=outputs, axis=0), w) + b

train_prediction = tf.nn.softmax(logits)

# training perplexity
train_perplexity_without_exp = tf.reduce_sum(
    tf.concat(train_labels, axis=0) * -tf.math.log(tf.concat(train_prediction, axis=0) + epsilon)) / (num_unrolling *
                                                                                                batch_size)

# Validation
valid_output, valid_state = lstm_cell(
    valid_inputs, saved_valid_output, saved_valid_state
)
# Logits
valid_logits = tf.matmul(valid_output, w) + b

# Make sure state are updated
with tf.control_dependencies([saved_valid_output.assign(valid_output),
                              saved_valid_state.assign(valid_state)]):
    valid_prediction = tf.nn.softmax(valid_logits)

# Validation perplexity
valid_perplexity_without_exp = tf.reduce_sum(valid_labels * -tf.math.log(valid_prediction + epsilon))

# Testing
test_output, test_state = lstm_cell(
    test_input, saved_test_output, saved_test_state
)
# Make sure state are updated
with tf.control_dependencies([saved_test_output.assign(test_output),
                              saved_test_state.assign(test_state)]):
    test_prediction = tf.nn.softmax(tf.matmul(test_output, w) + b)

In [11]:
# Loss
with tf.control_dependencies([saved_output.assign(output),
                              saved_state.assign(state)]):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(values=train_labels, axis=0),
        logits=logits
    ))


In [12]:
# Optimizer
lr_step = tf.Variable(0, trainable=False, name='global_step')
inc_lr_step = tf.compat.v1.assign(lr_step, lr_step + 1)
tf_learning_rate = tf.compat.v1.train.exponential_decay(0.001, lr_step, decay_steps=1, decay_rate=.5)

optimizer = tf.compat.v1.train.AdamOptimizer(tf_learning_rate)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

optimizer = optimizer.apply_gradients(zip(gradients, variables))

In [13]:
# Reset train
reset_train_state = tf.group(tf.compat.v1.assign(saved_state, tf.zeros((batch_size, num_nodes))),
                             tf.compat.v1.assign(saved_output, tf.zeros((batch_size, num_nodes))))

# Reset valid state
reset_valid_state = tf.group(tf.compat.v1.assign(saved_valid_state, tf.zeros((1, num_nodes))),
                             tf.compat.v1.assign(saved_valid_output, tf.zeros((1, num_nodes))))

# Reset test state
reset_test_state = tf.group(saved_test_output.assign((tf.random.normal((1, num_nodes), stddev=.05))),
                            saved_test_state.assign((tf.random.normal((1, num_nodes), stddev=.05))))

reset_train_state, reset_valid_state, reset_test_state

(<tf.Operation 'group_deps' type=NoOp>,
 <tf.Operation 'group_deps_1' type=NoOp>,
 <tf.Operation 'group_deps_2' type=NoOp>)

In [14]:
# break repetition in text
# instead of getting the word with highest prediction, we sample randomly
# where the probability of being selected given by their prediction prob

def sample(distribution):
    """
    Greedy Sampling
    Pick the three best prob given by LSTM and sample one
    of them with very high prob of pick the best one
    :param distribution:
    :return:
    """

    best_indices = np.argsort(distribution)[-3:]
    best_probs = distribution[best_indices] / np.sum(distribution[best_indices])
    best_idx = np.random.choice(best_indices, p=best_probs)

    return best_idx

In [15]:
# Train LSTM
# For each document train LSTM with steps_per_document steps
# And then generate some text from random picked bigram
decay_threshold = 5
# If valid perplexity does not decrease for decay_threshold
# then decrease the learning rate
# perplexity increase count
decay_count = 0

min_perplexity = 1e10

def decay_learning_rate(session, perplexity):
    global min_perplexity, decay_count, min_perplexity
    if perplexity < min_perplexity:
        decay_count = 0
        min_perplexity = perplexity
    else:
        decay_count += 1

    if decay_count >= decay_threshold:
        print("Reducing learning rate...")
        decay_count = 0
        session.run(inc_lr_step)

In [16]:
# Training Validation and Generation

num_steps = 20
steps_per_document = 100
valid_summary = 1
train_doc_count = 100
docs_per_step = 10

# Train perplexity over time
train_perplexity = []
valid_perplexity = []

session = tf.compat.v1.InteractiveSession()

# Initialize variables
print("Initialize global variables ...")
tf.compat.v1.global_variables_initializer().run()

# mean loss
average_loss = 0.0

# Use first 10 documents for validation

doc_ids = []
for di in range(num_files):
    if len(data_list[di]) > 10 * steps_per_document:
        doc_ids.append(di)
    if len(doc_ids) == 10:
        break

data_gens = []
valid_gens = []

for fi in range(num_files):
    # Get all bigrams if document id is not in the validation document
    if fi not in doc_ids:
        data_gens.append(DataGeneratorOHE(data_list[fi], batch_size=batch_size, num_unroll=num_unrolling))
    # if document is in the validation doc ids, only get the up to steps_per_document bigrams
    # and use the last steps_per_document as validation
    else:
        data_gens.append(DataGeneratorOHE(data_list[fi][:-steps_per_document], batch_size, num_unrolling))
        # Validation data generator
        valid_gens.append(DataGeneratorOHE(data_list[fi][-steps_per_document:], 1, 1))



Initialize global variables ...


In [23]:
feed_dict = {}
save_path = "./my_model/my_saved_variable"

for step in range(num_steps):
    print("\n" +  "-"*24 + "Training step %d " % step + "-"*24)
    for di in np.random.permutation(train_doc_count)[:docs_per_step]:
        doc_perplexity = 0
        for doc_step_id in range(steps_per_document):
            # Get set of unrolled batches
            u_data, u_labels = data_gens[di].unroll_batches()

            # Populate the feed dict by using each of the data batches
            # present in the unrolled batches
            for ui, (data, label) in enumerate(zip(u_data, u_labels)):
                feed_dict[train_inputs[ui]] = data
                feed_dict[train_labels[ui]] = label

            # Run operation
            _, l, step_perplexity = session.run([optimizer, loss, train_perplexity_without_exp],
                                               feed_dict=feed_dict)
            # Update perplexity
            doc_perplexity += step_perplexity

            # Update loss
            average_loss += step_perplexity
        print("(%d), " % di, end='')
        # resetting hidden state after processing a single document
        # It's still questionable if this adds value in terms of learning
        # One one hand it's intuitive to reset the state when learning a new document
        # On the other hand this approach creates a bias for the state to be zero
        # We encourage the reader to investigate further the effect of resetting the state
        #session.run(reset_train_state) # resetting hidden state for each document

    if (step + 1) % valid_summary == 0:
        # average loss
        average_loss = average_loss / (valid_summary * docs_per_step * steps_per_document)

        print("\nStep: %d Average loss %.3f, Perplexity %.3f" % (step + 1, average_loss, np.exp(average_loss)))
        train_perplexity.append(np.exp(average_loss))

        # reset loss
        average_loss = 0
        valid_loss = 0

        # Valid perplexity
        for v_doc_id in range(10):
            # divide by 2 due to bigrams
            for v_step in range(steps_per_document // 2):
                u_valid_data, u_valid_label = valid_gens[v_doc_id].unroll_batches()

                # Run validation phase
                v_perplexity_out = session.run([valid_perplexity_without_exp],
                                           feed_dict={valid_inputs: u_valid_data[0], valid_labels:
                                               u_valid_label[0]})
                valid_loss += v_perplexity_out[0]

            session.run(reset_valid_state)

            # Reset validation date generator curosr
            valid_gens[v_doc_id].reset_indices()

        v_perplexity = np.exp(valid_loss / (steps_per_document * 10.0 // 2))
        print("Valid perplexity: %.3f\n" % v_perplexity)
        valid_perplexity.append(v_perplexity)
        decay_learning_rate(session, v_perplexity)

        # Generate new text
        # Generate 500 bigrams with one segment
        segments_to_generate = 1
        chars_in_segment = 500

        for _ in range(segments_to_generate):
            print("="*24 + "New text" + "="*24 )

            # start with random word
            test_word = np.zeros((1, vocabulary_size), dtype=np.float32)
            rand_doc = data_list[np.random.randint(0, num_files)]
            test_word[0, rand_doc[np.random.randint(0, len(rand_doc))]] = 1.0
            print(reverse_dictionary[np.argmax(test_word[0])], end='')

            # Generating words by feeding the previous prediction
            # as current input in a recursive manner
            for _ in range(chars_in_segment):
                sample_pred = session.run(test_prediction, feed_dict={test_input: test_word})
                next_ind = sample(sample_pred.ravel())
                test_word = np.zeros((1, vocabulary_size), dtype=np.float32)
                test_word[0, next_ind] = 1.0
                print(reverse_dictionary[next_ind], end='')

            # Reset train state
            session.run(reset_test_state)
            print("-" * 28)


# Saved all variables in session into file to restore later
print("-" * 12 + "Saved variable" + "-" * 12)
saver = tf.compat.v1.train.Saver()
saver.save(session, save_path, global_step=num_steps)

session.close()
#
# with open(filename_to_save, 'wt') as f:
#     writer = csv.writer(f, delimiter=',')
#     writer.writerow(train_perplexity)
#     writer.writerow(valid_perplexity)



------------------------Training step 0 ------------------------
(47), (74), (94), (50), (58), (9), (7), (66), (62), (76), 
Step: 1 Average loss 4.307, Perplexity 74.218
Valid perplexity: 71.261

awetar, and
and not the will then the and the young to the begar, the the young to then eat to the night, and not the young wrens the will, and said, and the young that, and said, but that to the and saw hen and and and anver the young to to the
and the
and sat the
young the young to look the young to the fare to to the will the fardoing then to the young wren, and said, and that the fore to the begand said, and said, but the
and the and the
young to the beached, and said, and not that will the young wrat to come, and the will, and the young, to the to the begand the young, the begand not the young to then the will wast the will wast the
will, and the and said, then that beare ing and the withe and dren, and the young the with to the
and said, but, and begand not that the for as and said,
and

In [17]:
# Beam search
beam_length = 5
beam_neighbors = 5
sample_beam_inputs = [tf.compat.v1.placeholder(tf.float32, shape=(1, vocabulary_size)) for _ in range(beam_neighbors)]

best_beam_index = tf.compat.v1.placeholder(shape=None, dtype=tf.int32)
best_neighbor_beam_indices = tf.compat.v1.placeholder(shape=(beam_neighbors,), dtype=tf.int32)

# output of each beam
saved_sample_beam_output = [tf.Variable(tf.zeros((1, num_nodes))) for _ in range(beam_neighbors)]
# state of each beam
saved_sample_beam_state = [tf.Variable(tf.zeros((1, num_nodes))) for _ in range(beam_neighbors)]

# Reset the sample beam states
reset_sample_beam_state = tf.group(
    *[saved_sample_beam_output[vi].assign(tf.zeros((1, num_nodes))) for vi in range(beam_neighbors)],
    *[saved_sample_beam_state[vi].assign(tf.zeros((1, num_nodes))) for vi in range(beam_neighbors)]
)

# Stack to perform gather
stacked_beam_outputs = tf.stack(saved_sample_beam_output)
stacked_beam_states = tf.stack(saved_sample_beam_state)

# The beam states for each beam (there are beam_neighbor-many beams) needs to be updated at every depth of tree
# Consider an example where you have 3 classes where we get the best two neighbors (marked with star)
#     a`      b*       c
#   / | \   / | \    / | \
#  a  b c  a* b` c  a  b  c
# Since both the candidates from level 2 comes from the parent b
# We need to update both states/outputs from saved_sample_beam_state/output to have index 1 (corresponding to parent b)
update_sample_beam_state = tf.group(
    *[saved_sample_beam_output[vi].assign(tf.gather_nd(stacked_beam_outputs, [best_neighbor_beam_indices[vi]]))
      for vi in range(beam_neighbors)],
    *[saved_sample_beam_state[vi].assign(tf.gather_nd(stacked_beam_states, [best_neighbor_beam_indices[vi]]))
      for vi in range(beam_neighbors)]
)

sample_beam_outputs, sample_beam_states = [], []
for vi in range(beam_neighbors):
    tmp_output, tmp_state = lstm_cell(
        sample_beam_inputs[vi],
        saved_sample_beam_output[vi],
        saved_sample_beam_state[vi]
    )
    sample_beam_outputs.append(tmp_output)
    sample_beam_states.append(tmp_state)


# For a given set of beams, outputs a list of prediction vectors of size beam_neighbors
# Each beam having the predictions for full vocabulary
sample_beam_predictions = []
for vi in range(beam_neighbors):
    with tf.control_dependencies([saved_sample_beam_output[vi].assign(sample_beam_outputs[vi]),
                                  saved_sample_beam_state[vi].assign(sample_beam_states[vi])]):
        sample_beam_predictions.append(tf.nn.softmax(tf.matmul(sample_beam_outputs[vi], w) + b))

In [18]:
# Train LSTM on the available data and generate text using
# the trained LSTM for several steps. From each document we extract text
# for steps_per_document steps to train the LSTM on.

# Learning rate schedule

decay_threshold = 5
decay_count = 0

min_perplexity = 1e10

def decay_learning_rate(session, v_perplexity):
    global decay_count, decay_threshold, min_perplexity
    if v_perplexity < min_perplexity:
        decay_count = 0
        min_perplexity = v_perplexity
    else:
        decay_count += 1

    if decay_count >= decay_threshold:
        print("\t Reducing learning rate")
        decay_count = 0
        session.run(inc_lr_step)



In [23]:
# Beam Prediction search
test_word = None

def get_beam_prediction(session):
    """
    # Generating words within a segment with Beam Search
    # To make some calculations clearer, we use the example as follows
    # We have three classes with beam_neighbors=2 (best candidate denoted by *,
    # second best candidate denoted by `)
    # For simplicity we assume best candidate always have probability of 0.5 in output prediction
    # second best has 0.2 output prediction
    #           a`                   b*                   c                <--- root level
    #    /     |     \         /     |     \        /     |     \
    #   a      b      c       a*     b`     c      a      b      c         <--- depth 1
    # / | \  / | \  / | \   / | \  / | \  / | \  / | \  / | \  / | \
    # a b c  a b c  a b c   a*b c  a`b c  a b c  a b c  a b c  a b c       <--- depth 2
    # So the best beams at depth 2 would be
    # b-a-a and b-b-a
    :param session:
    :return:
    """

    global test_word
    global sample_beam_predictions
    global update_sample_beam_state

    # Calculate the candidates at the root level
    feed_dict = {}
    for b_n_i in range(beam_neighbors):
        feed_dict.update({sample_beam_inputs[b_n_i]: test_word})

    # Calculate sample predictions for all neighbors with the same starting word
    # It is important to update the state for all instances of beam search
    sample_preds_root = session.run(sample_beam_predictions, feed_dict=feed_dict)
    sample_preds_root = sample_preds_root[0]

    this_level_candidates = (np.argsort(sample_preds_root, axis=1).ravel()[::-1])[:beam_neighbors]

    # probabilities of top k candidates is .5 and .2
    this_level_probs = sample_preds_root[0, this_level_candidates]

    # Update test sequence produced by each beam from the root level calculation
    test_sequences = ['' for _ in range(beam_neighbors)]
    for b_n_i in range(beam_neighbors):
        test_sequences[b_n_i] += reverse_dictionary[this_level_candidates[b_n_i]]

    # Make the calculations for the rest of the depth of the beam search tree
    for b_i in range(beam_length-1):
        # Candidate words for each beam
        test_words = []
        # Predicted word for each beam
        pred_words = []

        # computing feed_dict for the beam search (except root)
        # feed dict should contain the best words found by the previous level of search

        feed_dict = {}
        for p_idx, pred_i in enumerate(this_level_candidates):
            # Update the feed_dict for getting next predictions
            test_words.append(np.zeros((1, vocabulary_size), dtype=np.float32))
            test_words[p_idx][0, this_level_candidates[p_idx]] = 1.0

            feed_dict.update({sample_beam_inputs[p_idx]:test_words[p_idx]})

        # Calculation the predictions for all neighbors in beams search
        # This is a list of vectors where each vector is the prediction vector for a certain beam
        # For level 1 in our example, the prediction values for
        #      b             a  (previous beam search results)
        # [a,  b,  c],  [a,  b,  c] (current level predictions) would be
        # [0.1,0.1,0.1],[0.5,0.2,0]
        sample_preds_all_neighbors = session.run(sample_beam_predictions, feed_dict=feed_dict)

        # Create a single vector with
        # Making our example [0.1,0.1,0.1,0.5,0.2,0]
        sample_preds_all_neighbors_concat = np.concatenate(sample_preds_all_neighbors, axis=1)

        # Update this_level_candidates to be used for the next iter
        # Update the probabilities for each beams with maximum value from above vector
        this_level_candidates = np.argsort(sample_preds_all_neighbors_concat.ravel())[::-1][:beam_neighbors]

        # In the example this would be [1,1]
        parent_beam_indices = this_level_candidates // vocabulary_size

        # normalize this_level_candidates to fall between [0,vocabulary_size]
        # In this example this would be [0,1]
        this_level_candidates = (this_level_candidates % vocabulary_size).tolist()

        # Here we update the final state of each beam to be
        # the state that was at the index 1. Because for both the candidates at this level the parent is
        # at index 1 (that is b from root level)
        session.run(update_sample_beam_state, feed_dict={best_neighbor_beam_indices: parent_beam_indices})

        # Here we update the joint probabilities of each beam and add the newly found candidates to the sequence
        tmp_this_level_probs = np.asarray(this_level_probs)
        tmp_test_sequences = list(test_sequences) # currently [b,a]

        for b_n_i in range(beam_neighbors):
            # We make the b_n_i element of this_level_probs to be the probability of parents
            # In the example the parent indices are [1,1]
            # So this_level_probs become [0.5,0.5]
            this_level_probs[b_n_i] = tmp_this_level_probs[parent_beam_indices[b_n_i]]

            # Next we multiple these by the probabilities of the best candidates from current level
            # [0.5*0.5, 0.5*0.2] = [0.25,0.1]
            this_level_probs[b_n_i] *= sample_preds_all_neighbors[parent_beam_indices[b_n_i]][0, this_level_candidates[b_n_i]]

            # Make the b_n_i element of test_sequences to be the correct parent of the current best candidates
            # In the example this becomes [b, b]
            test_sequences[b_n_i] = tmp_test_sequences[parent_beam_indices[b_n_i]]

            # Now we append the current best candidates
            # In this example this becomes [ba,bb]
            test_sequences[b_n_i] += reverse_dictionary[this_level_candidates[b_n_i]]

            # Create one-hot-encoded representation for each candidate
            pred_words.append(np.zeros((1, vocabulary_size), dtype=np.float32))
            pred_words[b_n_i][0, this_level_candidates[b_n_i]] = 1.0

    # Calculate best beam id based on the highest beam probability
    # Using the highest beam probability always lead to very monotonic text
    # Let us sample one randomly where one being sampled is decided by the likelihood of that beam
    rand_cand_ids = np.argsort(this_level_probs)[-3:]
    rand_cand_probs = this_level_candidates[rand_cand_ids] / np.sum(this_level_probs[rand_cand_ids])
    rand_id = np.random.choice(rand_cand_ids, p=rand_cand_probs)

    best_beam_id = parent_beam_indices[rand_id]

    # Update state and output variables for test prediction
    session.run(update_sample_beam_state, feed_dict={best_neighbor_beam_indices: [best_beam_id for _ in range(beam_neighbors)]})

    # Make the last word from the beam
    test_word = pred_words[best_beam_id]

    return test_sequences[best_beam_id]

In [24]:
# Training and validation
filename_to_save = 'lstm_beam_search_dropout'
save_path = "./my_model/my_saved_beam_search_lstm"

num_steps = 26
steps_per_document = 100
valid_summary = 1
train_doc_count = 100
docs_per_step = 10


beam_nodes = []

beam_train_perplexity_ot = []
beam_valid_perplexity_ot = []
tf.compat.v1.InteractiveSession.close(session)
session = tf.compat.v1.InteractiveSession()

tf.compat.v1.global_variables_initializer().run()

print('Initialized')
average_loss = 0

# We use the first 10 documents that has
# more than 10*steps_per_document bigrams for creating the validation dataset

# Identify the first 10 documents following the above condition
long_doc_ids = []
for di in range(num_files):
    if len(data_list[di]) > 10 * steps_per_document:
        long_doc_ids.append(di)
    if len(long_doc_ids) == 10:
        break

# Generating validation data
data_gens = []
valid_gens = []
for fi in range(num_files):
    # Get all the bigrams if the document id is not in the validation document ids
    if fi not in long_doc_ids:
        data_gens.append(DataGeneratorOHE(data_list[fi], batch_size=batch_size, num_unroll=num_unrolling))
        # If the document is in the validation doc ids, only get up to the last steps_per_document
        # bigrams and use the last steps_per_document bigrams as validation data
    else:
        data_gens.append(DataGeneratorOHE(data_list[fi][:-steps_per_document], batch_size, num_unrolling))
        # Define the validation data generator
        valid_gens.append(DataGeneratorOHE(data_list[fi][-steps_per_document:], 1, 1))


feed_dict = {}
for step in range(num_steps):
    print("\n" +  "-"*24 + "Training step %d " % step + "-"*24)
    for di in np.random.permutation(train_doc_count)[:docs_per_step]:
        doc_perplexity = 0
        for doc_step_id in range(steps_per_document):

            # Get a set of unrolled batches
            u_data, u_label = data_gens[di].unroll_batches()

            # Populate feed dict by using each of the data batches present in the unrolled data
            for ui, (data, label) in enumerate(zip(u_data, u_label)):
                feed_dict[train_inputs[ui]] = data
                feed_dict[train_labels[ui]] = label

            # Running the operations
            _, l, step_perplexity = session.run([optimizer, loss, train_perplexity_without_exp],
                                                feed_dict=feed_dict)

            # Update doc_perplexity
            doc_perplexity += step_perplexity

            # Update average loss
            average_loss += step_perplexity
        print("(%d)" % di, end='')

    # resetting hidden state after processing a single document
    # It's still questionable if this adds value in terms of learning
    # On one hand it's intuitive to reset the state when learning a new document
    # On the other hand this approach creates a bias for the state to be zero
    # We encourage the reader to investigate further the effect of resetting the state
    #session.run(reset_train_state) # resetting hidden state for each document

    if (step + 1) % valid_summary == 0:

        # Compute average loss
        average_loss = average_loss / (docs_per_step * steps_per_document * valid_summary)

        # Print Loss
        print('Average loss at step %d: %f' % (step+1, average_loss))
        print('\tPerplexity at step %d: %f' %(step+1, np.exp(average_loss)))
        beam_train_perplexity_ot.append(np.exp(average_loss))

        average_loss = 0
        valid_loss = 0

        # Calculate valid perplexity
        for v_doc_id in range(10):
            for v_step in range(steps_per_document // 2):
                u_valid_data, u_valid_labels = valid_gens[v_doc_id].unroll_batches()

                # Run validatoin phase related
                v_perp = session.run([valid_perplexity_without_exp], feed_dict={valid_inputs: u_valid_data[0],
                                                                                valid_labels: u_valid_labels[0]})
                valid_loss +=  v_perp.pop()

            session.run(reset_valid_state)

    v_perplexity = np.exp(valid_loss / (steps_per_document * 10.0 // 2))
    print("Valid Perplexity: %.2f\n" % v_perplexity)

    # Decay scheduler
    decay_learning_rate(session, v_perplexity)

    # Generate new text
    # Generate 500 bigrams with one segment
    segments_to_generate = 1
    chars_in_segment = 500

    for _ in range(segments_to_generate):
        print("="*24 + "New text" + "="*24 )

        # start with random word
        test_word = np.zeros((1, vocabulary_size), dtype=np.float32)
        rand_doc = data_list[np.random.randint(0, num_files)]
        test_word[0, rand_doc[np.random.randint(0, len(rand_doc))]] = 1.0
        print(reverse_dictionary[np.argmax(test_word[0])], end='')

        # Generating words by feeding the previous prediction
        # as current input in a recursive manner
        for _ in range(chars_in_segment):
            test_sequence = get_beam_prediction(session)
            print(test_sequence, end='')

        # Reset train state
        session.run(reset_test_state)
        print("-" * 28)


# Saved all variables in session into file to restore later
print("-" * 12 + "Saved variable" + "-" * 12)
saver = tf.compat.v1.train.Saver()
saver.save(session, save_path, global_step=num_steps)

session.close()


Initialized

------------------------Training step 0 ------------------------
(93)(37)(44)(41)(6)(78)(32)(14)(64)(1)Average loss at step 1: 4.381960
	Perplexity at step 1: 79.994663
Valid Perplexity: 59.20

e [3 1 0]


TypeError: only integer scalar arrays can be converted to a scalar index