In [None]:
import numpy as np
import seaborn as sb
import pandas
import sys
import itertools
import matplotlib.pyplot as plt
import nltk
import csv
import datetime
import tensorflow as tf
import collections
import time
%matplotlib notebook

# Character-level RNN

Let's try to train something for next-letter prediction on Shakespeare.

We will use a standard LSTM architecture with two LSTM-layers and one fully-connected layer at the end.

In [None]:
# number of full passes over the training text
num_epochs = 10
# number of stacked LSTM layers
num_layers = 2
# number of hidden units in each LSTM cell
lstm_size = 512
# number of consecutive characters fed in per training sequence;
# this is the length of the time context the network is unrolled over
seq_length = 25
# number of such sequences processed together as one batch
batch_size = 50
# initial learning rate handed to the optimizer
lr = 0.0002

# fraction of the text, counted from the start of the file, used for training
fraction_to_train = 0.5

# Read Shakespeare as one long string; the context manager guarantees the
# file handle is closed again as soon as the text is in memory.
with open('data/shakespeare_plays.txt', 'r') as corpus_file:
    allData = corpus_file.read()
# keep only the leading fraction of the text and fold it to lower case
allData = allData[:int(fraction_to_train*len(allData))].lower()

# count how often every character occurs
counter = collections.Counter(allData)
# order the (character, count) pairs from most to least frequent; the sort
# is stable, so characters with equal counts keep their first-seen order
count_pairs = sorted(counter.items(), key=lambda pair: -pair[1])
# keep only the characters, in that frequency order
chars = tuple(char for char, _count in count_pairs)
# the number of distinct characters is the size of our vocabulary
vocab_size = len(chars)
# map every character to its frequency rank - this rank is its integer id
vocab = {char: index for index, char in enumerate(chars)}
# our input "tensor": the whole text encoded as an array of integer ids
tensor = np.array([vocab[char] for char in allData])

# number of complete [batch_size x seq_length] batches the text provides
num_batches = tensor.size // (batch_size * seq_length)
if num_batches == 0:
    # fail here with a readable message instead of inside np.split below
    raise ValueError(
        "Not enough text for a single batch: need at least "
        "{} characters, got {}.".format(batch_size * seq_length, tensor.size))

# print the stats of our dataset
print("Shakespeare has",vocab_size,"unique characters in a total of",
      tensor.size,"characters. \nFrom this, we make",num_batches,"batches.")

# drop the tail that does not fill a complete batch
tensor = tensor[:num_batches * batch_size * seq_length]
# the training inputs are the character ids themselves
xdata = tensor
# the targets are the inputs shifted left by one position, so that every
# character is paired with its successor; np.roll wraps around, which makes
# the target of the very last character the very first character
ydata = np.roll(xdata, -1)
# reshape into batch_size parallel character streams (one per row) and cut
# each stream into num_batches consecutive pieces of seq_length columns
x_batches = np.split(xdata.reshape(batch_size, -1),
                     num_batches, 1)
y_batches = np.split(ydata.reshape(batch_size, -1),
                     num_batches, 1)

# The LSTM training

In the following, we will build the tensorflow model for predicting characters.

This is a sequence-to-sequence model. That is, our input consists of a sequence, and our targets also consist of a sequence!

Note: In principle, this would be better as a function, or even better as a class, but for debugging purposes, I have exposed the functionality here inside the main memory space!

In [None]:
# ---------------------------------------------------------------------
# Recurrent core: num_layers BasicLSTMCells with lstm_size hidden units
# each, stacked into one multi-layer cell.
# ---------------------------------------------------------------------
model_cells = [tf.contrib.rnn.BasicLSTMCell(lstm_size)
               for _ in range(num_layers)]
model_cell = tf.contrib.rnn.MultiRNNCell(model_cells, state_is_tuple=True)

# ---------------------------------------------------------------------
# Inputs: integer character ids and their targets, both of shape
# [batch_size, seq_length].
# ---------------------------------------------------------------------
model_input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
model_targets = tf.placeholder(tf.int32, [batch_size, seq_length])

# all-zero initial LSTM state for every sequence in the batch
model_init_state = model_cell.zero_state(batch_size, tf.float32)

# ---------------------------------------------------------------------
# Trainable parameters outside the LSTM. They are unnamed, so they are
# created here in a fixed order: weights, bias, embedding.
# ---------------------------------------------------------------------
# output layer: maps an LSTM output (lstm_size) to one score per character
model_softmax_w = tf.Variable(tf.random_normal([lstm_size, vocab_size]))
model_softmax_b = tf.Variable(tf.random_normal([vocab_size]))

# character embedding: one row of length lstm_size per vocabulary entry
model_embedding = tf.Variable(tf.random_normal([vocab_size, lstm_size]))

# Feeding a one-hot vector through a weight matrix just selects one row
# of that matrix, so the rows are looked up directly: every id in
# model_input_data (a number from 0 to vocab_size-1) is replaced by its
# row of model_embedding, giving [batch_size, seq_length, lstm_size].
model_inputs = tf.nn.embedding_lookup(model_embedding, model_input_data)

# cut along the time axis (axis 1) into seq_length slices and drop the
# now size-1 time axis from each, leaving a list of seq_length tensors of
# shape [batch_size, lstm_size] - the format rnn_decoder expects
model_inputs = [tf.squeeze(time_slice, [1])
                for time_slice in tf.split(model_inputs, seq_length, 1)]

# ---------------------------------------------------------------------
# Unroll the LSTM over the sequence; with loop_function=None every step
# reads its input from model_inputs.
# ---------------------------------------------------------------------
model_outputs, model_last = tf.contrib.legacy_seq2seq.rnn_decoder(
    model_inputs, model_init_state, model_cell, loop_function=None)

# join the per-step outputs and flatten them to one row per
# (sequence, time step) pair: [batch_size*seq_length, lstm_size]
model_output = tf.reshape(tf.concat(model_outputs, 1), [-1, lstm_size])

# final fully connected layer: unnormalised scores (logits) per character
model_logits = tf.matmul(model_output, model_softmax_w) + model_softmax_b

# softmax turns the logits into probabilities
model_probs = tf.nn.softmax(model_logits)

# ---------------------------------------------------------------------
# Loss: cross-entropy per example, a vector of batch_size*seq_length
# values. Note that the function is given the logits, not model_probs.
# ---------------------------------------------------------------------
flat_targets = tf.reshape(model_targets, [-1])     # [batch_size*seq_length]
unit_weights = tf.ones([batch_size*seq_length])    # every example counts once
model_loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    [model_logits], [flat_targets], [unit_weights])

# scalar cost: the summed loss divided by the number of examples - the
# divisor is fixed, so the graph always has to be fed a full batch
model_cost = tf.reduce_sum(model_loss)/batch_size/seq_length

# keep a handle on the LSTM state after the last step of the sequence
model_final_state = model_last

# ---------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------
# the learning rate lives in a variable so it could be changed later, but
# it is marked non-trainable so the optimizer never updates it
model_lr = tf.Variable(lr, trainable=False)

# everything the optimizer does update: model_softmax_w, model_softmax_b,
# model_embedding and the internal weights of the LSTM cells
model_tvars = tf.trainable_variables()

# gradients of the cost with respect to each trainable variable
model_grads = tf.gradients(model_cost, model_tvars)

# Adam optimizer driven by the learning-rate variable
model_optimizer = tf.train.AdamOptimizer(model_lr)

# one training step: apply each gradient to its variable
model_train_operation = model_optimizer.apply_gradients(
    zip(model_grads, model_tvars))


# Training 

Now we code the actual training loop. We have to go through epochs of training and then in each epoch run each batch through a forward pass, gradient pass and weight update. 

Since tensorflow only evaluates the parts of the computational graph that are needed for the tensors we ask for, we simply call `run` on the loss, the final state, and the train operation each time.

Importantly, our LSTM has "memory", so we keep the state and feed it into the network at the next time step!

In [None]:
# start an interactive tensorflow session and initialize every variable
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# run a number of epochs
for e in range(num_epochs):
    # in each epoch we start "fresh", i.e. from the all-zero LSTM state
    state = sess.run(model_init_state)
    # Walk through the input/target batches in order; b is the index of the
    # batch inside the epoch. e and b are deliberately left behind after the
    # loops, because the saving cell derives its step number from them.
    for b, (x, y) in enumerate(zip(x_batches, y_batches)):
        start = time.time()
        # present the characters and their targets to the model
        feed = {model_input_data: x, model_targets: y}
        # Carry the LSTM memory over from the previous batch: for every
        # layer, feed the (c, h) tensors of the initial state with the
        # values that came out of the last run.
        for (c, h), layer_state in zip(model_init_state, state):
            feed[c] = layer_state.c
            feed[h] = layer_state.h
        # one training step: returns the batch loss and the new LSTM state
        train_loss, state, _ = sess.run(
            [model_cost, model_final_state, model_train_operation], feed)
        end = time.time()
        print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                  .format(e * num_batches + b,
                          num_epochs * num_batches,
                          e, train_loss, end - start))

# Saving the output of tensorflow
Since we trained a bit, let's try to save the output:

In [None]:
# a Saver built with None as its variable list (by default that covers all
# saveable variables of the graph)
saver = tf.train.Saver(None)
# index of the last training step that ran (e and b are left over from the
# training loops); it is passed as global_step to tag the checkpoint
last_step = e * num_batches + b
saver.save(sess, 'char_LSTM_small2epochs.ckpt', global_step=last_step)
# show the names of all global variables of the graph
variable_names = [str(variable.name) for variable in tf.global_variables()]
print(variable_names)

# Sampling from the model

Now for the next tricky bit. Given the trained model, we would like to sample from it.

For this, we need to re-use the existing LSTM-architecture (i.e., the weights of course), but we need to change one thing:

The batch_size and the seq_length variables need to be changed to "1". We now want to predict from 1 character already, and we do not do any batches here.

In the tensorflow tutorial, there is another change to the sampling version of the model:

They add another evaluation-function on top that will be used in the decoding step in order to decide what to put into the next LSTM decoding step given the current state. That is, instead of inputting the full output, the LSTM's parameters are used to sample from its output distribution. This is done by introducing a `loop_function` parameter for the `seq2seq` helper function. In this step one could also introduce attentional parameters, for example, that help the LSTM to decide what to pay attention to.

See 
https://theneuralperspective.com/2016/11/20/recurrent-neural-network-rnn-part-4-attentional-interfaces/
for more information on how to implement attentional interfaces with the tensorflow architecture!

For our model, this step does not make a whole lot of difference, so I have put in the function definition, but not activated it for the sampling stage.

So, let's define our LSTM for this case again:

In [None]:
# This cell builds the same network a second time, now for one character at
# a time. The default graph still contains the training model, and building
# on top of it would either clash with the existing LSTM variables or create
# differently named copies that the checkpoint knows nothing about. So we
# first release the training session (its weights were written to disk by
# the saving cell) and then start from an empty default graph.
try:
    sess.close()
except NameError:
    # no earlier session exists in this kernel - nothing to release
    pass
tf.reset_default_graph()

# sampling works on a single sequence, one character at a time
batch_size = 1
seq_length = 1

# NOTE: the unnamed variables below get automatic, order-dependent names.
# They are created in the same order as in the training graph - presumably
# this is what lets the checkpoint match them up; compare with the variable
# names printed by the saving cell if restoring fails.

# one BasicLSTMCell per layer, each with lstm_size hidden units
model_cells = []
for _ in range(num_layers):
    model_cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size))

# final LSTM structure
model_cell=tf.contrib.rnn.MultiRNNCell(model_cells,state_is_tuple=True)


# input data and labels, both of shape [batch_size, seq_length] = [1, 1]
model_input_data = tf.placeholder(tf.int32,[batch_size,seq_length])
model_targets = tf.placeholder(tf.int32,[batch_size,seq_length])

# we start with a clean slate for all initial states in the batch
model_init_state = model_cell.zero_state(batch_size,tf.float32)

# now we create all variables

# output layer: maps an LSTM output to one score per character
model_softmax_w = tf.Variable(tf.random_normal([lstm_size, vocab_size]))
model_softmax_b = tf.Variable(tf.random_normal([vocab_size]))

# character embedding: one row of length lstm_size per vocabulary entry
model_embedding = tf.Variable(tf.random_normal([vocab_size,lstm_size]))

# for every id in model_input_data (a number from 0 to vocab_size-1),
# retrieve the corresponding row of model_embedding
model_inputs = tf.nn.embedding_lookup(model_embedding, model_input_data)

# split into seq_length chunks along the time axis (the second axis)
model_inputs = tf.split(model_inputs,seq_length,1)

# and drop the size-1 time axis from every chunk, which leaves a list of
# [batch_size, lstm_size] tensors
model_inputs = [tf.squeeze(x_,[1]) for x_ in model_inputs]

# This loop function can be handed to rnn_decoder to make each step read
# the decoder's own previous prediction instead of the given input: it
# scores the previous output, takes the most likely character and returns
# that character's embedding as the next input.
def loop(prev,_):
    prev = tf.matmul(prev,model_softmax_w)+model_softmax_b
    prev_symbol = tf.stop_gradient(tf.argmax(prev,1))
    return tf.nn.embedding_lookup(model_embedding,prev_symbol)


# this uses a sequence-to-sequence learning scheme, which
# unrolls the RNN back in time

# Note: loop_function is None here, so every step reads the input we feed.
# Passing loop_function=loop instead would activate the function above for
# every step after the first. With seq_length = 1 there is only one step
# and therefore no previous output to feed back, so for this sampling
# graph the choice should not change the result.
model_outputs, model_last = tf.contrib.legacy_seq2seq.rnn_decoder(
    model_inputs,model_init_state,model_cell,loop_function=None)

# flatten the outputs to one row per (sequence, time step) pair
model_output = tf.reshape(tf.concat(model_outputs,1),[-1,lstm_size])

# final fully connected layer: unnormalised scores (logits) per character
model_logits = tf.matmul(model_output,model_softmax_w)+model_softmax_b

# and convert to probabilities
model_probs = tf.nn.softmax(model_logits)

# cross-entropy loss per example, a vector of [batch_size*seq_length] values
model_loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    [model_logits],  # logits [batch_size*seq_length x vocab_size]
    [tf.reshape(model_targets,[-1])],  # targets [batch_size*seq_length]
    [tf.ones([batch_size*seq_length])]) # weights for each example (set to one here)

# total "cost" or loss is the sum over all [batch_size*seq_length] values,
# divided by their number
model_cost = tf.reduce_sum(model_loss)/batch_size/seq_length

# and we save the last state of the LSTM
model_final_state = model_last

# this is the learning rate - it may be variable, but should NOT
# be updated by the optimizer!
model_lr = tf.Variable(lr,trainable = False)

# these variables should be updated, however
# they are:
# --- model_softmax_w
# --- model_softmax_b
# --- model_embedding
# --- the internal weights of the LSTM cells
model_tvars = tf.trainable_variables()

# evaluate the gradients of all trainable variables with respect to the loss
model_grads = tf.gradients(model_cost,model_tvars)

# initialize the optimization scheme (ADAM is very efficient)
model_optimizer = tf.train.AdamOptimizer(model_lr)

# define an operation for one training step using the gradients and variables
model_train_operation = model_optimizer.apply_gradients(zip(model_grads,model_tvars))
###################################



Now let's load the model back from disk. For this, we initialize all variables and then load the model back from the last `checkpoint`.

In [None]:
# open a session on the current graph and give every variable a value
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(None)
# look up the checkpoint state recorded for the current directory;
# get_checkpoint_state returns None when there is none
ckpt = tf.train.get_checkpoint_state('.')
if ckpt is None or not ckpt.model_checkpoint_path:
    # fail with a readable message instead of an AttributeError on None
    raise IOError("No checkpoint found in the current directory - "
                  "run the training and saving cells first.")
# overwrite the freshly initialized variables with the saved values
saver.restore(sess,ckpt.model_checkpoint_path)

Now we do the actual sampling. 

For this, we first initialize the model cells to zero and then push the "prime", that is, the start of the sentence into the model.

Here, we have selected "the " as the prime, so the sentence will start with these four characters. We simply convert these into our usual vocabulary vector and push it into the model. The output will be the state after "the " has been seen.

Now for the next characters, we feed the last character into the model and get the new state and the probabilities for every character in the vocabulary. We could simply do an `argmax` on this, but this would most likely get us into infinite loops again. So, we do a weighted choice instead: we take the cumulative sum of the probabilities and use the `np.searchsorted` function to find the index at which a random number, drawn uniformly between 0 and the total, would be inserted.

This index is converted into a character, added to our return string and then the process is repeated with the current state of the LSTM being updated.

In [None]:
# fix the random seed so the sampled text is reproducible
np.random.seed(0)
# start from the all-zero LSTM state (the sampling graph has batch size 1)
state = sess.run(model_init_state)
prime = 'the '
# Warm up the LSTM on every prime character except the last one. The last
# character is left out on purpose: the generation loop feeds prime[-1] as
# its very first input, so including it here would show it to the network
# twice.
for char in prime[:-1]:
    # a [1, 1] batch holding the id of the current character
    x = np.zeros((1, 1))
    x[0, 0] = vocab[char]
    feed = {model_input_data: x, model_init_state: state}
    # only the updated state is needed while priming
    [state] = sess.run([model_final_state], feed)

def weighted_pick(weights):
    """Draw a random index with probability proportional to its weight.

    One uniform random number is scaled to the total weight and located in
    the cumulative sum of the weights (inverse-CDF sampling). The weights
    do not have to sum to one.

    Args:
        weights: non-empty sequence or array of non-negative numbers.

    Returns:
        A plain Python int in the range [0, len(weights) - 1].

    Raises:
        ValueError: if weights is empty.
    """
    t = np.cumsum(weights)
    if t.size == 0:
        raise ValueError("weights must not be empty")
    # Use the last cumulative value as the total: a separately computed sum
    # may differ from it in the last bits, which could push the draw past
    # the end of t.
    threshold = np.random.rand() * float(t[-1])
    index = int(np.searchsorted(t, threshold))
    # clamp as a last line of defence against rounding at the upper edge
    return min(index, t.size - 1)
    
# collect the output piece by piece, starting with the prime itself
pieces = [prime]
# the first character fed to the model is the last one of the prime
char = prime[-1]
for step in range(500):
    # a [1, 1] batch holding the id of the current character
    x = np.array([[vocab[char]]], dtype=np.float64)
    feed = {model_input_data: x, model_init_state: state}
    # one step of the LSTM: next-character probabilities and the new state
    probs, state = sess.run([model_probs, model_final_state], feed)
    # draw the next character at random, weighted by its probability;
    # it also becomes the input of the following step
    char = chars[weighted_pick(probs[0])]
    pieces.append(char)
ret = ''.join(pieces)

In [None]:
# print the generated string (rather than echoing the bare name) so that any
# line breaks it contains are rendered as line breaks
print(ret)