In [1]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow.contrib.data import unbatch

In [2]:
def prepare_texts(texts, cutoff_length):
    """Bring every text to exactly ``cutoff_length`` entries.

    Texts longer than ``cutoff_length`` are truncated at the end; shorter
    ones are right-padded with zeros.

    Args:
        texts: iterable of sliceable sequences (lists or 1-D arrays) of
            token ids.
        cutoff_length: target length of every returned row.

    Returns:
        A numpy array stacking the processed texts, one per row.
    """
    fixed_length_rows = []
    for text in texts:
        # Drop everything beyond the cutoff ...
        clipped = text[:cutoff_length]
        # ... and append as many zeros as are missing to reach it.
        shortfall = cutoff_length - len(clipped)
        fixed_length_rows.append(
            np.pad(clipped, (0, shortfall), mode='constant', constant_values=(0, 0))
        )

    return np.array(fixed_length_rows)

In [3]:
# Size of the vocabulary requested from the loader (passed as num_words).
vocab_size = 20000

# Load the IMDB reviews as sequences of word ids. The special ids used here
# (start_char=1, oov_char=2) and the index_from=3 offset are the values the
# vocabulary cell further down has to mirror when decoding reviews.
training, validation = tf.keras.datasets.imdb.load_data(
    path='imdb.npz',
    num_words=vocab_size,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1, oov_char=2, index_from=3,
)

# Each split is a (texts, labels) pair.
(training_texts, training_labels) = training
(validation_texts, validation_labels) = validation

In [4]:
# Every review is truncated / zero-padded to this many tokens.
cutoff_length = 300

# Apply the same fixed-length preparation to both splits.
training_texts, validation_texts = (
    prepare_texts(split_texts, cutoff_length)
    for split_texts in (training_texts, validation_texts)
)

In [5]:
# Build the word -> id mapping. The raw Keras index is shifted by 3 so that
# ids 0, 1 and 2 are free for the special tokens added afterwards.
raw_word_index = tf.keras.datasets.imdb.get_word_index()
word_to_id = {word: rank + 3 for word, rank in raw_word_index.items()}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Inverse mapping, used to turn a sequence of ids back into readable text.
id_to_word = dict((token_id, word) for word, token_id in word_to_id.items())

# Sanity check: decode one prepared training review.
decoded_review = [id_to_word[token_id] for token_id in training_texts[2]]
print(' '.join(decoded_review))

<START> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had <UNK> working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how <UNK> this is to watch save yourself an hour a bit of your life <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

In [6]:
# Discard any previously built default graph before defining the pipeline.
tf.reset_default_graph()

training_batchsize = 250
validation_batchsize = 2500

# Training data: slice into single examples, shuffle, then batch.
training_dataset = (
    tf.data.Dataset.from_tensor_slices((training_texts, training_labels))
    .shuffle(buffer_size=1000000)
    .batch(training_batchsize)
)

# Validation data: same slicing, batched without shuffling.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((validation_texts, validation_labels))
    .batch(validation_batchsize)
)

# One structure-based iterator serves both datasets; running one of the two
# initializer ops selects which dataset it reads from.
iterator = tf.data.Iterator.from_structure(
    training_dataset.output_types,
    training_dataset.output_shapes,
)
training_init_op = iterator.make_initializer(training_dataset)
validation_init_op = iterator.make_initializer(validation_dataset)

In [7]:
# Tensors for the next batch: the id sequences and their raw labels.
next_batch = iterator.get_next()
sequences, raw_labels = next_batch

# Labels are cast to float32 and given a trailing axis of size 1.
labels = tf.expand_dims(tf.cast(raw_labels, dtype=tf.float32), axis=-1)

In [8]:
# Hyperparameters.
lstm_size = 64        # width of the hidden and cell state
embedding_size = 64   # width of each word embedding
subseq_length = 100   # number of time steps unrolled per graph run

# Placeholders for one chunk of the sequences and the labels of the
# sequences that chunk belongs to.
subsequences = tf.placeholder(tf.int32, shape=[None, subseq_length])
subsequences_labels = tf.placeholder(tf.float32, shape=[None, 1])

# Placeholders for the hidden and cell state the unrolled steps start from.
init_hs = tf.placeholder(tf.float32, shape=[None, lstm_size])
init_cs = tf.placeholder(tf.float32, shape=[None, lstm_size])

In [9]:
# LSTM
#
# Hand-written single-layer LSTM, unrolled for `subseq_length` time steps.
# Inputs:  `subsequences` (word ids) and the starting states `init_hs` /
#          `init_cs`, all fed through placeholders.
# Outputs: `h_t` / `c_t` after the last unrolled step (the states to carry
#          over to the next subsequence) and `logits`, a linear readout of
#          the hidden state averaged over all unrolled steps.

with tf.variable_scope("LSTM", reuse=tf.AUTO_REUSE) as scope:
    
    # Collects the hidden state produced at every time step.
    hs = []
    
    # Embedding matrix: one row of size embedding_size per vocabulary id,
    # initialized uniformly in [-1, 1].
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    embedding = tf.get_variable("embedding", [vocab_size, embedding_size], initializer = embedding_initializer)
    
    # All gate weights AND biases below share this random-normal initializer.
    initializer = tf.random_normal_initializer(stddev=0.125)
    
    # Naming scheme: Wx* multiplies the input, Wh* the previous hidden state,
    # b* is the bias of the respective gate.
    
    # forget gate
    Wxf = tf.get_variable("Wxf", [embedding_size, lstm_size], initializer=initializer)
    Whf = tf.get_variable("Whf", [lstm_size, lstm_size], initializer=initializer)
    bf = tf.get_variable("bf", [lstm_size], initializer=initializer)
    
    # new candidate (proposed cell-state content)
    Wxc = tf.get_variable("Wxc", [embedding_size, lstm_size], initializer=initializer)
    Whc = tf.get_variable("Whc", [lstm_size, lstm_size], initializer=initializer)
    bc = tf.get_variable("bc", [lstm_size], initializer=initializer)
    
    # input gate
    Wxi = tf.get_variable("Wxi", [embedding_size, lstm_size], initializer=initializer)
    Whi = tf.get_variable("Whi", [lstm_size, lstm_size], initializer=initializer)
    bi = tf.get_variable("bi", [lstm_size], initializer=initializer)
    
    # output gate
    Wxo = tf.get_variable("Wxo", [embedding_size, lstm_size], initializer=initializer)
    Who = tf.get_variable("Who", [lstm_size, lstm_size], initializer=initializer)
    bo = tf.get_variable("bo", [lstm_size], initializer=initializer)
        
    # readout: maps a hidden state to a single logit
    Why = tf.get_variable("Why", [lstm_size,1], initializer=initializer)
    by = tf.get_variable("by", [1], initializer=initializer)
                         
    
    
    # Look up the embedding of every word id in the fed subsequences.
    subsequences_embed = tf.nn.embedding_lookup(embedding, subsequences)
    
    # Start from the states fed in by the caller. From here on `h_t` / `c_t`
    # are rebound on every step, so after the loop they name the final states.
    h_t = init_hs
    c_t = init_cs
    

    for t in range(subseq_length):

        # Embedded input of time step t for the whole batch.
        x_t = subsequences_embed[:,t]
        
        # NOTE: every gate below reads the PREVIOUS hidden state, because
        # `h_t` is only reassigned at the very end of the step.
        
        # forget gate: how much of the old cell state is kept
        f_t = tf.sigmoid(tf.matmul(x_t, Wxf) + tf.matmul(h_t, Whf) + bf)
        
        # candidate cell content and the input gate that scales it
        c_new_t = tf.tanh(tf.matmul(x_t, Wxc) + tf.matmul(h_t, Whc) + bc)
        i_t = tf.sigmoid(tf.matmul(x_t, Wxi) + tf.matmul(h_t, Whi) + bi)
        
        # update cell state: gated old state plus gated candidate
        c_t = f_t * c_t + i_t * c_new_t
        
        # squashed new cell state and the output gate applied to it
        h_new_t = tf.tanh(c_t)
        o_t = tf.sigmoid(tf.matmul(x_t, Wxo) + tf.matmul(h_t, Who) + bo)
        
        # update hidden state
        h_t = o_t * h_new_t
        
        
        
        
        # save hidden state
        hs.append(h_t)
        

    # Average the per-step hidden states; axis 0 of the stacked list is time.
    hs_mean = tf.reduce_mean(hs, axis=0) 
    
    # One logit per sequence in the batch, computed from the averaged state.
    logits = tf.matmul(hs_mean, Why) + by
    

In [10]:
with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE) as scope:
    # Per-example sigmoid cross entropy, averaged over the batch axis; the
    # remaining axis has a single entry, which is picked out as the loss.
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=subsequences_labels,
        logits=logits,
    )
    loss = tf.reduce_mean(cross_entropy, axis=0)[0]

    # A prediction is positive when its sigmoid probability exceeds 0.5.
    guesses = tf.greater(tf.nn.sigmoid(logits), 0.5)
    is_correct = tf.equal(tf.cast(guesses, tf.float32), subsequences_labels)
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))


with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE) as scope:
    # Running `training_step` performs one Adam update on the loss.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.03)
    training_step = optimizer.minimize(loss)

In [11]:
# Register one scalar summary per tracked metric and merge them into one op.
for summary_tag, summary_tensor in (('loss', loss), ('accuracy', accuracy)):
    tf.summary.scalar(summary_tag, summary_tensor)
merged_summaries = tf.summary.merge_all()

# Separate writers so training and validation are logged to separate directories.
summary_graph = tf.get_default_graph()
train_writer = tf.summary.FileWriter("./summaries/train", summary_graph)
validation_writer = tf.summary.FileWriter("./summaries/validation", summary_graph)

In [12]:
def _run_pass(sess, init_op, writer, step, train):
    """Run one full pass over the dataset selected by ``init_op``.

    Every batch of sequences is processed in chunks of ``subseq_length``
    time steps. The LSTM states start at zero for each new batch and are
    carried over from one chunk to the next. One summary is written per
    chunk.

    Args:
        sess: the active ``tf.Session``.
        init_op: iterator initializer selecting the dataset to read.
        writer: ``tf.summary.FileWriter`` receiving the summaries.
        step: global step used for the first summary of this pass.
        train: if True the optimizer op is run for every chunk; if False the
            model is only evaluated and its weights are left untouched.

    Returns:
        The global step to use after this pass (``step`` plus the number of
        chunks processed).
    """
    # The optimizer op is only fetched in training mode. Fetching it during
    # validation would fit the model to the validation data.
    fetches = [h_t, c_t, merged_summaries]
    if train:
        fetches.append(training_step)

    # Load the dataset into the iterator.
    sess.run(init_op)

    chunks_per_sequence = cutoff_length // subseq_length

    # Go through the dataset until it's empty.
    while True:
        # Only fetching from the iterator can signal the end of the dataset,
        # so only this call is guarded.
        try:
            sequences_feed, sequences_labels_feed = sess.run([sequences, labels])
        except tf.errors.OutOfRangeError:
            break

        # Beginning of new sequences: reset hidden and cell state to zeros.
        # Sized from the batch actually received, so a smaller final batch
        # is handled as well.
        rows_in_batch = sequences_feed.shape[0]
        hs_feed_val = np.zeros([rows_in_batch, lstm_size])
        cs_feed_val = np.zeros([rows_in_batch, lstm_size])

        # Feed in subsequences.
        for i in range(chunks_per_sequence):
            subsequences_feed = sequences_feed[:, i*subseq_length:(i+1)*subseq_length]
            results = sess.run(fetches,
                               feed_dict={init_hs: hs_feed_val,
                                          init_cs: cs_feed_val,
                                          subsequences: subsequences_feed,
                                          subsequences_labels: sequences_labels_feed})
            # The first three results are always states and summaries; a
            # possible fourth one (the optimizer op) carries no value.
            hs_feed_val, cs_feed_val, summaries = results[:3]

            writer.add_summary(summaries, global_step=step)
            step += 1

    # Make sure the summaries of this pass reach the disk.
    writer.flush()
    return step


with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    step = 0

    for epoch in range(3):
        # Train on the training set, then evaluate on the validation set.
        step = _run_pass(sess, training_init_op, train_writer, step, train=True)
        step = _run_pass(sess, validation_init_op, validation_writer, step, train=False)
                
                
       
       
      
             
