# <font color='blue'>Build your own RNN</font>

Now that we have all of the necessary ingredients, it's time to put the pieces together.

<font color='red'>**TODO:** Split up into smaller chunks and exercises.</font>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
%matplotlib inline

In [48]:
from glob import glob
from itertools import islice
from utils import SentenceEncoder


def reviews_iter(filepath_pattern, min_chars=20):
    """
    Lazily yield the review text of every usable line in a set of TSV files.

    Each line is expected to look like ``review_text<TAB>y<TAB>review_id``.
    Lines are split from the right, so tab characters inside the review text
    itself stay part of the text.

    Params
    ------
    filepath_pattern : str
        Glob pattern selecting the files to read.

    min_chars : int
        Only reviews strictly longer than this many characters are yielded.

    Yields
    ------
    str
        The review text (first field) of each accepted line.

    """
    # glob() returns its matches in arbitrary, filesystem-dependent order.
    # Sorting makes "the first N reviews" the same sample on every run.
    for filepath in sorted(glob(filepath_pattern)):
        with open(filepath) as f:
            for line in f:
                fields = line.strip().rsplit('\t', 2)  # review_text, y, review_id
                # skip malformed lines (not exactly three fields) and short reviews
                if len(fields) == 3 and len(fields[0]) > min_chars:
                    yield fields[0]
                

# Read at most 128 * 1024 reviews from the lazy reader into a list.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# re-runs on a machine that has the data at exactly this location.
sents = list(islice(reviews_iter("/home/kris/Data/hotel_reviews_positive_negative.tsv/part-*"), 128 * 1024))
# peek at the first ten reviews
sents[:10]

['Convenient, comfortable, safe',
 '- very nice decorated and clean rooms with AC  - great breakfast (breakfast area is very small and you might have to wait if it is full - so plan enough time for breakfast it is worth it)  - very friendly host  - great location within walking distance to many restaurants, chinese fishing nets, ferry, ...',
 '- accommodation is not easy to find as it is not directly in K B Jacob Road, but in a very small street parallel to it (close to a church)',
 'Good location.Clean room and bath perhaps needing a bit of renovation.Receptionist was very nice and helpful.Packed a take away breakfast for us early next morning.She was great.',
 'The room, the wifi and the breakfast',
 'There was no AC in August.',
 'Feel great with its rooftop pool.   Friendly staff although some english may not so fluent but they are try to serve well to us.   Location is some distance from pub street but still a good location which within town area.',
 'Fantastic fitness and spa are

In [49]:
%reload utils
from utils import SentenceEncoder
from itertools import islice, izip


# static shapes / hyper-parameters
# n_hidden: number of LSTM units (also the width of the cell state c)
n_hidden = 64
# vocabulary size -- presumably the 256 possible byte values plus one EOS symbol;
# TODO confirm against utils.SentenceEncoder
n_chars = 257  # 256 + 1 (EOS) = 257
# emb_dim: width of a character embedding and of the projected LSTM output
emb_dim = 13
batch_size = 128
# number of leading time steps that are fed from the real input before the
# while-loop body switches to feeding back its own previous output
warmup = 20

# clear any previous computation graph
tf.reset_default_graph()

# character embeddings: one emb_dim-vector per symbol, uniform random init
emb = tf.Variable(tf.random_uniform([n_chars, emb_dim], dtype=tf.float32))

# tf Graph input
seq_enc = tf.placeholder(tf.int32, [batch_size, None])  # shape: (batch_size, max_seqlen)
seq_mask = tf.placeholder(tf.bool, [batch_size, None])  # shape: (batch_size, max_seqlen)
max_seqlen = tf.placeholder(tf.int32, [])               # max_seqlen varies with each batch

# translate to dense vectors
x = tf.nn.embedding_lookup(emb, seq_enc)                # shape: (batch_size, max_seqlen, emb_dim)

# rnn cell; num_proj=emb_dim projects the cell output down to emb_dim, i.e. to
# the same width as the character embeddings it is later compared against
lstm_cell = tf.contrib.rnn.LSTMCell(n_hidden, num_proj=emb_dim, use_peepholes=True)

# initial LSTM state (cell state c and projected output m), held as tf.Variables
# with a single row each
c = tf.Variable(tf.random_uniform([1, n_hidden]))
m = tf.Variable(tf.random_uniform([1, emb_dim]))

# replicate c and m to allow for batch-wise processing
# (from here on the names c and m refer to the tiled tensors, not the variables)
c = tf.tile(c, [batch_size, 1])
m = tf.tile(m, [batch_size, 1])


def cond(i, h, c, m):
    """
    Loop condition for tf.while_loop: keep stepping while i < max_seqlen.

    Only the step counter ``i`` is inspected; ``h``, ``c`` and ``m`` are accepted
    because tf.while_loop hands every loop variable to the condition.
    """
    return tf.less(i, max_seqlen)

def body(i, h, c, m):
    """
    One step of the RNN that tf.while_loop unrolls over the time axis.

    Params
    ------
    i : scalar int32 tensor
        Index of the current time step.

    h : tensor, shape (i, batch_size, emb_dim)
        Cell outputs collected so far, one slice per processed time step.

    c, m : tensors
        LSTM state carried over from the previous step.

    Returns
    -------
    tuple
        ``(i + 1, h, c, m)`` where ``h`` has the new output appended along
        axis 0 and ``(c, m)`` is the updated LSTM state.

    """
    # Pick the cell input for this step: the embedded input token while warming
    # up, afterwards the cell's own most recent output.
    # NOTE(review): during warm-up the input is x[:, i, :] and the loss compares
    # output i with seq_enc[:, i], i.e. with the very same token -- confirm that
    # this (rather than a one-step shift) is intended.
    step_input = tf.cond(
        i < warmup,
        lambda: x[:, i, :],  # we traverse x in the 'max_seqlen' axis
        # h[-1] is already (batch_size, emb_dim): indexing drops the time axis.
        # No tf.squeeze here -- without an explicit axis it would also remove a
        # batch or embedding axis of size 1.
        lambda: h[-1])
    i += 1

    # apply LSTM cell
    h_new, (c, m) = lstm_cell(step_input, (c, m))

    # append h_new to output tensor h
    h = tf.concat([h, tf.expand_dims(h_new, 0)], axis=0)

    return i, h, c, m


# Shapes that must hold on every iteration of the while loop. Only the leading
# (time) axis of h is left open, because h grows by one slice per step.
shape_invariants = [
    tf.TensorShape([]),                           # i.shape
    tf.TensorShape([None, batch_size, emb_dim]),  # h.shape
    tf.TensorShape([batch_size, n_hidden]),       # c.shape
    tf.TensorShape([batch_size, emb_dim]),        # m.shape
]

# unroll the RNN starting from an empty output tensor; of the four loop
# variables only the collected outputs h (position 1) are kept
h = tf.zeros([0, batch_size, emb_dim])
h = tf.while_loop(
    cond,
    body,
    loop_vars=[0, h, c, m],
    shape_invariants=shape_invariants)[1]

# Euclidean distance between every output vector and every character embedding
d = tf.norm(
    tf.map_fn(lambda emb_vec: h - emb_vec, emb),
    axis=-1)                                             # shape: (n_chars, max_seqlen, batch_size)
d = tf.transpose(d, perm=[2, 1, 0])                      # shape: (batch_size, max_seqlen, n_chars)

# loss: softmax cross-entropy over negative squared distances
# (Gaussian-kernel KL divergence)
logits = -(d ** 2)                                       # shape: (batch_size, max_seqlen, n_chars)
labels = tf.one_hot(seq_enc, depth=n_chars, dtype=tf.float32)  # shape: (batch_size, max_seqlen, n_chars)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
# the mask drops the per-position losses that stem from zero-padding
loss = tf.reduce_mean(tf.boolean_mask(loss, seq_mask))

# most likely character per position = the closest embedding
y_hat = tf.argmax(logits, axis=-1)                       # shape: (batch_size, max_seqlen)


   
def fit_model(sents, learning_rate=1e-2, n_epochs=128):
    """
    Train the character-level RNN graph and print training diagnostics.

    This function contains the usual boiler plate required to feed the
    input data. Whenever the epoch number is a power of two it prints the
    loss of the last batch of that epoch together with three sample
    continuations, and at the end it plots the recorded losses.

    Params
    ------
    sents : sequence of strings
        The text on which to train / predict.

    learning_rate : float
        AdamOptimizer learning rate.

    n_epochs : int
        Number of epochs to use at training time.

    Raises
    ------
    ValueError
        If the encoder yields no batches for ``sents``.

    """
    # optimizer
    train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    encoder = SentenceEncoder(sents, batch_size=batch_size)

    # Build the prediction inputs once, up front. Fetching the first batch via
    # list(encoder) encodes the entire corpus, so doing it inside the epoch
    # loop repeated that work at every logging step.
    batches = list(encoder)
    if not batches:
        raise ValueError("SentenceEncoder produced no batches for the given sentences")
    sample_enc = batches[0][0]  # encoded sequences of the first batch
    del batches                 # only the first batch is needed; free the rest

    pred_len = 100              # number of time steps to generate
    # y_hat does not depend on the mask; it is fed as in training for uniformity
    pred_mask = np.zeros([batch_size, pred_len], dtype=bool)
    # the seed keeps only the first `warmup` tokens of each sequence; everything
    # after that is zeroed so the model has to generate it
    seed = np.zeros(sample_enc.shape, dtype=np.int32)
    seed[:, :warmup] = sample_enc[:, :warmup]

    with tf.Session() as s:
        s.run(tf.global_variables_initializer())
        learning_curve = {}
        for epoch in xrange(1, n_epochs + 1):
            for seq_enc_, seqlen_, seq_mask_, max_seqlen_ in encoder:
                _, loss_ = s.run(
                    (train, loss),
                    feed_dict={
                        seq_enc: seq_enc_,
                        seq_mask: seq_mask_,
                        max_seqlen: max_seqlen_})

            # report at epochs 1, 2, 4, 8, ... (loss_ is the last batch's loss)
            if np.log2(epoch).is_integer():
                print("Epoch: {}, loss: {}\n".format(epoch, loss_))
                learning_curve[epoch] = loss_

                y_hat_ = s.run(
                    y_hat,
                    feed_dict={
                        seq_enc: seed,
                        seq_mask: pred_mask,
                        max_seqlen: pred_len})

                # show the first three originals next to their predictions
                decoded_pairs = izip(encoder.decode(sample_enc), encoder.decode(y_hat_))
                for orig_text, pred_text in islice(decoded_pairs, 3):
                    print(u"Seed: \"{}\"".format(orig_text[:warmup]))
                    print(u"Orig: {}".format(orig_text))
                    print(u"Pred: {}\n".format(pred_text))

                print("-" * 80)

    learning_curve = pd.Series(learning_curve)
    learning_curve.plot(logx=True, style='o-', title='KL divergence')


# alternative toy input, kept for reference (uncomment to replace the reviews):
# sents = ["Hello, world!", "Hi again!", "Good bye now."]

fit_model(sents)

module reloaded: utils


KeyboardInterrupt: 