In [23]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
import time
from six.moves.urllib.request import urlretrieve

In [24]:
def read_data(filename):
    """Return the contents of the first member of a zip archive as a string.

    Args:
        filename: Path of the zip archive to read.

    Returns:
        The bytes of the first archive member converted with
        ``tf.compat.as_str``, or ``None`` when the archive has no members.
    """
    # The context manager closes the archive on every exit path. The previous
    # trailing ``f.close()`` was never reached once the loop had returned.
    with zipfile.ZipFile(filename) as archive:
        names = archive.namelist()
        if not names:
            return None
        return tf.compat.as_str(archive.read(names[0]))
# Read the text8 archive, given as a path relative to the working directory,
# into one in-memory string. This cell does not download the file.
filename = 'text8.zip'
text = read_data(filename)
# Report the number of characters that were loaded.
print('Data size %d' % len(text))

Data size 100000000


In [25]:
# Hold out the first `valid_size` characters for validation; everything that
# follows them is used for training.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)
# Show the size of each split followed by its first 64 characters.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [26]:
# Alphabet handled by the model: the lowercase ASCII letters, then the space.
letters = string.ascii_lowercase + ' '
# Number of distinct single characters.
let_vocabulary_size = len(letters)
# Number of distinct bigrams: one per ordered pair of characters.
vocabulary_size = let_vocabulary_size ** 2
# Code point subtracted from a letter to obtain its id.
first_letter = ord('a')

def char2id(char):
    """Map a character to its integer id.

    A lowercase ASCII letter maps to ``ord(char) - first_letter`` and the
    space maps to 26. Any other character is reported on stdout and mapped
    to 0.
    """
    if char == ' ':
        return 26
    if char in string.ascii_lowercase:
        return ord(char) - first_letter
    # Unknown characters are reported and then share id 0 with 'a'.
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
    """Return the character stored at position ``dictid`` of ``letters``."""
    character = letters[dictid]
    return character

def bigram2id(bigram):
    """Map a pair of characters to a single integer id.

    Args:
        bigram: Iterable of exactly two characters ``(c1, c2)``.

    Returns:
        ``char2id(c1) * 27 + char2id(c2)``.
    """
    # Unpack inside the body: tuple parameters in the signature are Python 2
    # only syntax and fail to compile on Python 3.
    c1, c2 = bigram
    return char2id(c1)*27 + char2id(c2)

def id2bigram(dictid):
    """Split a bigram id into its pair of characters.

    The quotient of ``dictid`` by 27 selects the first character and the
    remainder selects the second one.
    """
    first_id, second_id = divmod(dictid, 27)
    return (id2char(first_id), id2char(second_id))
    

In [27]:
# Hyper-parameters of the training batches: how many sequences are processed
# in parallel, and how many steps each call to `next()` adds after the batch
# carried over from the previous call.
batch_size, num_unrollings = 64, 10

class BatchGenerator(object):
    """Produces batches of one-hot encoded characters read from a text.

    ``batch_size`` cursors start at evenly spaced offsets in the text. Every
    generated batch encodes the character under each cursor and then moves
    each cursor one position forward, wrapping around at the end of the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        """Store the text and position one cursor per batch row.

        Args:
            text: String to read characters from.
            batch_size: Number of rows (parallel cursors) in every batch.
            num_unrollings: Number of new batches produced by each `next()`.
        """
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Distance between the starting positions of consecutive cursors.
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor positions.

        Returns:
            Array of shape ``(batch_size, let_vocabulary_size)`` with one
            entry set to 1.0 per row.
        """
        # Builtin ``float`` is what the removed ``np.float`` alias stood for.
        batch = np.zeros(
            shape=(self._batch_size, let_vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Return the last batch of the previous call plus new batches.

        Returns:
            List of ``num_unrollings + 1`` batches. The first one is the last
            batch of the previous list, so consecutive lists overlap by one.
        """
        batches = [self._last_batch]
        for _ in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

class BigramBatchGenerator(object):
    """Produces batches of one-hot encoded bigrams read from a text.

    ``batch_size`` cursors start one character after evenly spaced offsets.
    Every generated batch encodes the pair made of the character before each
    cursor and the character under it, then moves each cursor one position
    forward, wrapping around at the end of the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        """Store the text and position one cursor per batch row.

        Args:
            text: String to read bigrams from.
            batch_size: Number of rows (parallel cursors) in every batch.
            num_unrollings: Number of new batches produced by each `next()`.
        """
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Distance between the starting positions of consecutive cursors.
        segment = self._text_size // batch_size
        # Start one character in so that every cursor has a predecessor.
        self._cursor = [(offset * segment + 1) for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor positions.

        Returns:
            Array of shape ``(batch_size, vocabulary_size)`` with one entry
            set to 1.0 per row.
        """
        # Builtin ``float`` is what the removed ``np.float`` alias stood for.
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            cursor = self._cursor[b]
            # After a wrap-around the cursor is 0 and index -1 selects the
            # last character of the text.
            batch[b, bigram2id((self._text[cursor - 1], self._text[cursor]))] = 1.0
            self._cursor[b] = (cursor + 1) % self._text_size
        return batch

    def next(self):
        """Return the last batch of the previous call plus new batches.

        Returns:
            List of ``num_unrollings + 1`` batches. The first one is the last
            batch of the previous list, so consecutive lists overlap by one.
        """
        batches = [self._last_batch]
        for _ in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches
    
def bigrams(probabilities):
    """Return, for every row, the bigram whose column holds the largest value.

    Works on rows of probabilities as well as on one-hot encoded rows.
    """
    most_likely_ids = np.argmax(probabilities, 1)
    return list(map(id2bigram, most_likely_ids))

def bigram2string(bigram):
    """Concatenate the two elements of a bigram.

    Args:
        bigram: Iterable of exactly two strings ``(c1, c2)``.

    Returns:
        ``c1 + c2``.
    """
    # Unpack inside the body: tuple parameters in the signature are Python 2
    # only syntax and fail to compile on Python 3.
    c1, c2 = bigram
    return c1 + c2

def characters(probabilities):
    """Return, for every row, the character whose column holds the largest value.

    Works on rows of probabilities as well as on one-hot encoded rows.
    """
    most_likely_ids = np.argmax(probabilities, 1)
    return list(map(id2char, most_likely_ids))


def batches2string(batches):
    """Convert a sequence of batches into one string per batch row.

    Each batch contributes one character, obtained through `characters`, to
    the string of every row.
    """
    row_count = batches[0].shape[0]
    strings = [''] * row_count
    for batch in batches:
        strings = [prefix + char
                   for prefix, char in zip(strings, characters(batch))]
    return strings

# Training generator: batch_size rows, num_unrollings new bigram batches per
# call to next().
train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings)
# Validation generator: a single row and a single new bigram batch per call,
# so next() returns a list of two batches.
valid_batches = BigramBatchGenerator(valid_text, 1, 1)

In [28]:
def logprob(predictions, labels):
    """Average negative log-probability of the labels under the predictions.

    Args:
        predictions: Array of predicted probabilities, one row per example.
        labels: Array of the same shape as ``predictions`` that weights each
            entry (one-hot rows in this notebook).

    Returns:
        Sum of ``labels * -log(predictions)`` divided by the number of rows
        of ``labels``.
    """
    # Clamp into a new array so that log(0) is avoided without modifying the
    # array owned by the caller.
    clipped = np.maximum(predictions, 1e-10)
    return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
    """Draw one index from a sequence of probabilities.

    A uniform number in [0, 1] is drawn and the first index whose running
    total reaches it is returned. When no running total reaches the drawn
    number, the last index is returned.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, probability in enumerate(distribution):
        cumulative += probability
        if cumulative >= threshold:
            return index
    return len(distribution) - 1

def sample(prediction):
    """Turn a prediction into a one-hot encoded sample.

    Args:
        prediction: Array whose first row is the distribution to sample
            from.

    Returns:
        Array of shape ``(1, prediction.size)`` holding 1.0 at the index
        drawn by `sample_distribution` and 0.0 everywhere else.
    """
    distribution_size = prediction.size
    # Builtin ``float`` is what the removed ``np.float`` alias stood for.
    p = np.zeros(shape=[1, distribution_size], dtype=float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p


def random_distribution():
    """Generate one random row of probabilities over the bigram vocabulary.

    The row has shape ``(1, vocabulary_size)`` and is scaled so that its
    entries sum to one.
    """
    weights = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    row_totals = np.sum(weights, 1)
    return weights / row_totals[:, None]

In [29]:
# Number of LSTM units, and width of the bigram embedding fed to the cell.
num_nodes = 64
embedding_size = 128
graph = tf.Graph()
with graph.as_default():
    # Parameters:
    # Embedding matrix: one row of size embedding_size per bigram id.
    # NOTE(review): the two positional arguments after the shape are
    # presumably mean and stddev, which would centre the initial values on
    # -0.1 rather than on 0 -- confirm against the tf.truncated_normal
    # signature of the TensorFlow version in use.
    em = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], -0.1, 0.1)) 
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    def lstm_cell(i, o, state):
        """Create the ops of one LSTM step and return (output, new state).

        i is the input batch, o the previous output and state the previous
        cell state. The input is first multiplied by the embedding matrix;
        the gates are then computed from that embedding and from o.
        """
        # With one-hot rows (as fed by this notebook) this product selects
        # the embedding row of each bigram.
        ei = tf.matmul(i, em)
        input_gate = tf.sigmoid(tf.matmul(ei, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(ei, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(ei, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(ei, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Input data: num_unrollings + 1 placeholders, one per time step.
    train_data = list()
    # This empty list is replaced by the slice assigned a few lines below.
    train_labels = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
           tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)
    # State saving across unrollings: the assignments below are declared as
    # dependencies of the logits and the loss.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):

        # Classifier applied to the outputs of all time steps at once.
        # NOTE(review): tf.concat(0, values) and the positional
        # (logits, labels) call look like the pre-1.0 TensorFlow argument
        # conventions -- confirm the installed version before running.
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits, tf.concat(0, train_labels)))
    # Optimizer.
    global_step = tf.Variable(0)
    # Arguments: initial rate 10.0, decay steps 5000, decay rate 0.1.
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # Gradients are clipped by their global norm (1.25) before being applied.
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # From here on `optimizer` names the training op, not the optimizer object.
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    # Op that sets the saved sampling output and state back to zeros.
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    # The new output and state are declared as dependencies of the
    # prediction, so that successive evaluations continue the same sequence.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [30]:
# Number of optimisation steps, and how often (in steps) a summary is printed.
num_steps = 7001
summary_frequency = 200
start_time = time.time()
with tf.Session(graph=graph) as session:
    # NOTE(review): the recorded output of this cell reports this initializer
    # as deprecated in favour of tf.global_variables_initializer.
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        # Feed one batch to each of the num_unrollings + 1 placeholders.
        tr_batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = tr_batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            # At step 0 it is just the loss of the first batch.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # The first batch is input only, so the labels start at index 1.
            labels = np.concatenate(list(tr_batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    # Seed each sentence with a randomly drawn bigram.
                    feed = sample(random_distribution())
                    sentence = bigram2string(bigrams(feed)[0])
                    reset_sample_state.run()
                    # The inner loop reuses `_` as its loop variable.
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        # Only the second character of the sampled bigram is
                        # appended to the sentence.
                        sentence += bigram2string(bigrams(feed)[0])[1]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                # va_batch[0] is the input bigram and va_batch[1] its label.
                va_batch = valid_batches.next()
                # This rebinds `predictions`, replacing the minibatch values.
                predictions = sample_prediction.eval({sample_input: va_batch[0]})
                valid_logprob = valid_logprob + logprob(predictions, va_batch[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

# Wall-clock time in seconds, measured from just before the session is created.
elapsed_time = time.time() - start_time
print ('Elapsed time: ', elapsed_time)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 6.595048 learning rate: 10.000000
Minibatch perplexity: 731.46
qyndnsrjryklwsnwxduzntiyfqsuqtheytokwenv qpaxsbleydjcmtgaeus tclqhwixbyxobkq fwxw
uvwpxkrk uxrkneirbpslmcaykiy g zxyqfvebktgrt mlffxzsklrnmx qevslktclsvvwhujp h lb
jreccatuaxjgfqwqyrdvlrimgnbtnpsywsgvlxtwkrmiyfasgvarizuzyclwrytzjmnfpffmjxjury ut
diuzgxy xweylbvgrxzqpcudbcdttz xbwoewblkslehejpwdhpnumjpqqlldhidkcyddyegxbj hpcii
kvkzsuurvbdsienxtmipqrnzmvewwkwuuwaobnlsxmdf rwyemfuonbkaxrsqd dqzcyfohabdnchhwaa
Validation set perplexity: 568.45
Average loss at step 200: 3.798975 learning rate: 10.000000
Minibatch perplexity: 8.37
Validation set perplexity: 9.62
Average loss at step 400: 2.050988 learning rate: 10.000000
Minibatch perplexity: 6.67
Validation set perplexity: 7.01
Average loss at step 600: 1.856344 learning rate: 10.000000
Minibatch perplexity: 5.45
Validation set perplexity: 6.43
Average loss at step 800: 