Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '))
print(id2char(1), id2char(26), id2char(0))

1 26 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
        batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)


print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [6]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]


def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

Simple LSTM Model.

In [31]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [32]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.295208 learning rate: 10.000000
Minibatch perplexity: 26.98
yatheajtllatrskhmep invotlq  q pi t laf ga sahnpknylpwfl ggrqaneoe ugtpmtaf p s 
ynhvajhiovfb dowdvwpoewm d xxeewxi x pbpa  sim azf e edp tghb b dvel mxsehn e ke
pntjis ebjdtcxto ja gpbootnceupoowghydoycijjtdnfy jkmw izminq  q vosuranleav tog
efnuaicetxeprafgbymdnnls sdnftlteteiydgohjnpqsjh g yfsntlailngtvnznnmwraeuer swe
i wcgkhznzsicgcobohnoqd cauhtu e eehlkfrfmixu rvlrspy  eq cscuclgcjosttr neyjvzc
Validation set perplexity: 20.14
Average loss at step 100: 2.588530 learning rate: 10.000000
Minibatch perplexity: 10.97
Validation set perplexity: 10.44
Average loss at step 200: 2.246915 learning rate: 10.000000
Minibatch perplexity: 8.53
Validation set perplexity: 8.60
Average loss at step 300: 2.094018 learning rate: 10.000000
Minibatch perplexity: 7.39
Validation set perplexity: 7.97
Average loss at step 400: 1.996125 learning rate: 10.000000
Minibatch perplexity: 7.40
Validation set per

Validation set perplexity: 4.39
Average loss at step 4500: 1.615679 learning rate: 10.000000
Minibatch perplexity: 5.38
Validation set perplexity: 4.52
Average loss at step 4600: 1.614758 learning rate: 10.000000
Minibatch perplexity: 5.12
Validation set perplexity: 4.70
Average loss at step 4700: 1.624560 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.55
Average loss at step 4800: 1.627901 learning rate: 10.000000
Minibatch perplexity: 4.36
Validation set perplexity: 4.52
Average loss at step 4900: 1.630480 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.63
Average loss at step 5000: 1.608328 learning rate: 1.000000
Minibatch perplexity: 4.55
gan mutholh called a cain through and the sage hall after litisunian howish musi
galled between the bed than be israbling and psypheile of s swith was masis inde
k in one nine six eight one nine one six three zero one six five years cominal p
emer first the mikam by distriptance o

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [34]:
x = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
x.shape

TensorShape([Dimension(27), Dimension(256)])

In [47]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    # Gates: input, memory, forget, output.
    x = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    # State
    m = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    #bias
    bias = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        #extract matmul part into the single variable
        matmul_part = tf.matmul(i, x) + tf.matmul(o, m) + bias
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
          tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),
      saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
      sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [49]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.296088 learning rate: 10.000000
Minibatch perplexity: 27.01
gnyruvqeoxko oiz z hw ilv bpb kx lidyvglnpfdergeahl  nfsnavviqloeqlvsatisknrtadr
o drqklnk gz gqi uhn qiigo kudqiilajlvw  n lk nm wedhe eiecarhxsqhlnt zteojcaoef
dvjectz fnh kirlialdyaxebmcjetm tfibcdxh k iba li vqtglbd uidsooojc cuc did e  e
weadhedhwsnlhwtlcqrfiallfr if qncuitr ea tpbnrewsduoseiffzislcocsr kekeswkqrttly
n sbe kan icp vcfnt v v xcid  ozniqtmosoj ejzpeim mgc agwt  joenefucpafgdzp joll
Validation set perplexity: 19.86
Average loss at step 100: 2.578006 learning rate: 10.000000
Minibatch perplexity: 12.18
Validation set perplexity: 10.70
Average loss at step 200: 2.241599 learning rate: 10.000000
Minibatch perplexity: 8.45
Validation set perplexity: 8.84
Average loss at step 300: 2.080559 learning rate: 10.000000
Minibatch perplexity: 7.02
Validation set perplexity: 7.93
Average loss at step 400: 1.993168 learning rate: 10.000000
Minibatch perplexity: 6.95
Validation set per

Validation set perplexity: 5.11
Average loss at step 4500: 1.626669 learning rate: 10.000000
Minibatch perplexity: 4.94
Validation set perplexity: 5.04
Average loss at step 4600: 1.630820 learning rate: 10.000000
Minibatch perplexity: 5.28
Validation set perplexity: 4.89
Average loss at step 4700: 1.597774 learning rate: 10.000000
Minibatch perplexity: 5.44
Validation set perplexity: 4.95
Average loss at step 4800: 1.583371 learning rate: 10.000000
Minibatch perplexity: 5.18
Validation set perplexity: 5.03
Average loss at step 4900: 1.599418 learning rate: 10.000000
Minibatch perplexity: 5.11
Validation set perplexity: 4.91
Average loss at step 5000: 1.623120 learning rate: 1.000000
Minibatch perplexity: 5.42
s aduative of gthinamation seccent gomania atposes the offscy r for autor sha mo
kf evely are moxitin increas and neater a ennekhione in perlip the to s policion
ed the each hole artiures entile one five zero sell two five in the themen of so
ys it wotter both puldimal adchand per

Validation set perplexity: 4.34
Average loss at step 9100: 1.590926 learning rate: 1.000000
Minibatch perplexity: 5.73
Validation set perplexity: 4.36
Average loss at step 9200: 1.595583 learning rate: 1.000000
Minibatch perplexity: 5.27
Validation set perplexity: 4.35
Average loss at step 9300: 1.590074 learning rate: 1.000000
Minibatch perplexity: 4.68
Validation set perplexity: 4.32
Average loss at step 9400: 1.573143 learning rate: 1.000000
Minibatch perplexity: 4.50
Validation set perplexity: 4.34
Average loss at step 9500: 1.592387 learning rate: 1.000000
Minibatch perplexity: 4.83
Validation set perplexity: 4.32
Average loss at step 9600: 1.581841 learning rate: 1.000000
Minibatch perplexity: 4.58
Validation set perplexity: 4.37
Average loss at step 9700: 1.577665 learning rate: 1.000000
Minibatch perplexity: 5.26
Validation set perplexity: 4.35
Average loss at step 9800: 1.577118 learning rate: 1.000000
Minibatch perplexity: 4.98
Validation set perplexity: 4.37
Average loss at 

Validation set perplexity: 4.35
Average loss at step 14100: 1.596736 learning rate: 0.100000
Minibatch perplexity: 4.67
Validation set perplexity: 4.36
Average loss at step 14200: 1.573053 learning rate: 0.100000
Minibatch perplexity: 4.68
Validation set perplexity: 4.36
Average loss at step 14300: 1.580519 learning rate: 0.100000
Minibatch perplexity: 4.96
Validation set perplexity: 4.36
Average loss at step 14400: 1.595691 learning rate: 0.100000
Minibatch perplexity: 5.47
Validation set perplexity: 4.36
Average loss at step 14500: 1.600329 learning rate: 0.100000
Minibatch perplexity: 4.74
Validation set perplexity: 4.36
Average loss at step 14600: 1.570035 learning rate: 0.100000
Minibatch perplexity: 5.02
Validation set perplexity: 4.36
Average loss at step 14700: 1.592610 learning rate: 0.100000
Minibatch perplexity: 5.55
Validation set perplexity: 4.36
Average loss at step 14800: 1.574893 learning rate: 0.100000
Minibatch perplexity: 4.70
Validation set perplexity: 4.36
Average 

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

a: Cause of high demension in bigram model, we will use word embeddings.

In [272]:
valid_batches = BatchGenerator(valid_text, 1, 1)

In [268]:
num_nodes = 64
embedding_size = 20
graph = tf.Graph()
with graph.as_default():
    
     # Input data.
    train_inputs = []
    train_labels = []
    for _ in range(num_unrollings):
        train_inputs.append(
            tf.placeholder(tf.int32, shape = [batch_size]))
        train_labels.append(
          tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    
    embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    embeds = []
    
    for inputs in train_inputs:
        embed = tf.nn.embedding_lookup(embeddings, inputs)
        embeds.append(embed)
        
    # Parameters:
    # Gates: input, memory, forget, output.
    x = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1))
    # State
    m = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    #bias
    bias = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    


    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))



    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        #extract matmul part into the single variable
        matmul_part = tf.matmul(i, x) + tf.matmul(o, m) + bias
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in embeds:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input_idx = tf.placeholder(tf.int32, shape=[1])
    sample_input_embed = tf.nn.embedding_lookup(embeddings, sample_input_idx)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),
      saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
      sample_input_embed, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [262]:
(train_batches.next()[1]).shape
(train_batches.next()[0]).shape

(64, 27)

In [273]:
num_steps = 8001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings):
      feed_dict[train_inputs[i]] = batches[i].argmax(axis=1)
      feed_dict[train_labels[i]] = batches[i+1]
    
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input_idx: feed.argmax(axis=1)})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input_idx: b[0].argmax(axis=1)})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.299145 learning rate: 10.000000
Minibatch perplexity: 27.09
oylgmcepa z fpvtywiumju   erw iboay cqer y voa sdc nidk us ql n a rcu tpanm xoxe
klhlvbez empyziikatzpuiorc iwc wisuirghrtljrlm isen vuely  i vaypb  eteehhzsjpil
rzkaoww ndlpomgfeg eotyueoanetze kr fimw  dbcn ez   efoexuupletyqthm  kdyzlcccqi
clrsgyfditku o ttvd gh e z iiejrek  reiiaknnhr ueu e hiosh yeal bk qlso iomvzco 
khcpsaa odtjissm rravggbeixsq lqoq x    jlwltsecxxgvyo ayroh fbsvee aia in etisz
Validation set perplexity: 19.90
Average loss at step 100: 2.386494 learning rate: 10.000000
Minibatch perplexity: 8.68
Validation set perplexity: 9.11
Average loss at step 200: 2.050725 learning rate: 10.000000
Minibatch perplexity: 7.40
Validation set perplexity: 8.08
Average loss at step 300: 1.958449 learning rate: 10.000000
Minibatch perplexity: 6.42
Validation set perplexity: 7.19
Average loss at step 400: 1.878871 learning rate: 10.000000
Minibatch perplexity: 6.94
Validation set perpl

Validation set perplexity: 5.24
Average loss at step 4500: 1.636049 learning rate: 10.000000
Minibatch perplexity: 5.30
Validation set perplexity: 5.19
Average loss at step 4600: 1.631538 learning rate: 10.000000
Minibatch perplexity: 5.34
Validation set perplexity: 5.07
Average loss at step 4700: 1.627726 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 5.04
Average loss at step 4800: 1.633210 learning rate: 10.000000
Minibatch perplexity: 4.91
Validation set perplexity: 5.03
Average loss at step 4900: 1.634437 learning rate: 10.000000
Minibatch perplexity: 5.25
Validation set perplexity: 5.08
Average loss at step 5000: 1.623997 learning rate: 1.000000
Minibatch perplexity: 4.89
ching the percession be kearges as they evil gpuce ordhlitipe to trivity this kh
saured to the courquent as asternered as popel on films and lee while zero zero 
nap refirs staracted by selvcity use in anustimple in the creat mooking the one 
nicousivels vatels a macipmda withbror

**b:**  Change graph for bigrams model.

In [11]:
valid_batches = BatchGenerator(valid_text, 1, 2)
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

[' an']
['nar']
['rch']


In [13]:
num_nodes = 64
embedding_size = 32
graph = tf.Graph()
with graph.as_default():
    
     # Input data.
    train_data = []
    train_inputs = []
    train_labels = []
    for _ in range(num_unrollings):
        train_inputs.append(
            tf.placeholder(tf.int32, shape = [batch_size]))
    for _ in range(num_unrollings - 1):
        train_labels.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
    
    embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    embeds = []
    
    for index in range(num_unrollings - 1):
        embed_first_symb = tf.nn.embedding_lookup(embeddings, train_inputs[index])
        embed_second_symb = tf.nn.embedding_lookup(embeddings, train_inputs[index + 1])
        embed = tf.concat([embed_first_symb, embed_second_symb], 1)
        print(embed.shape)
        embeds.append(embed)
        
    # Parameters:
    # Gates: input, memory, forget, output.
    x = tf.Variable(tf.truncated_normal([2 * embedding_size, 4 * num_nodes], -0.1, 0.1))
    # State
    m = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    #bias
    bias = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        #extract matmul part into the single variable
        matmul_part = tf.matmul(i, x)+ tf.matmul(o, m) + bias
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in embeds:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    print(len(embeds))
    print(len(train_labels))
    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input0 = (tf.placeholder(tf.int32, shape=[1]))
    sample_input1 = (tf.placeholder(tf.int32, shape=[1]))
    
    embed1 = tf.reshape(tf.nn.embedding_lookup(embeddings, sample_input0), [1, -1])
    embed2 = tf.reshape(tf.nn.embedding_lookup(embeddings, sample_input1), [1, -1])
    sample_input_embed = tf.concat([embed1, embed2], 1)
    
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),
      saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
      sample_input_embed, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
9
9


In [296]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings - 1):
      feed_dict[train_inputs[i]] = batches[i].argmax(axis = 1)
      feed_dict[train_inputs[i + 1]] = batches[i + 1].argmax(axis = 1)
      feed_dict[train_labels[i]] = batches[i + 2]
        
    
    _, l, predictions, lr = session.run(
       [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feeds = [sample(random_distribution()),sample(random_distribution())]
          sentence = characters(feeds[0])[0] + characters(feeds[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input0: feeds[0].argmax(axis = 1),
                                                 sample_input1: feeds[1].argmax(axis = 1)})
            feed = sample(prediction)
            #print("prediction: {0} \n feed {1}:{2} \n".format(prediction, feed, characters(feed)))
            sentence += characters(feed)[0]
            feeds.append(feed)
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input0: b[0].argmax(axis=1),
                                              sample_input1: b[1].argmax(axis=1)})
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.289089 learning rate: 10.000000
Minibatch perplexity: 26.82
wzkgfrmne gvme  ehgfah ykezsac teejk r  eypo nyhtvia  xxngdwyioy t nmaaodilnnswvc
tbiprrpjz   ztkpscan hh woawatxxheovklocr   lepnzbwva eemkaridb w  ahictra i qe z
qcc rxvloagsuw ssaifevcjnltctcehwp ofenygtnfgzl o  zbso afcsr iuwnthczv fxofesdax
ds bisawakfgy p  wfcpkeo     ujdt eukdoktvylrb j pg d xawdg jxe dchkfbukfvyedcsto
ixthpk oxnqanbtvmea bwplx me gtqjpwbtn tfgoearcg xiwz lh  nps vfphah suqgdoxxtprn
Validation set perplexity: 18.89
Average loss at step 100: 2.302889 learning rate: 10.000000
Minibatch perplexity: 8.06
Validation set perplexity: 9.10
Average loss at step 200: 2.027483 learning rate: 10.000000
Minibatch perplexity: 6.56
Validation set perplexity: 8.89
Average loss at step 300: 1.912729 learning rate: 10.000000
Minibatch perplexity: 7.20
Validation set perplexity: 8.14
Average loss at step 400: 1.855559 learning rate: 10.000000
Minibatch perplexity: 6.16
Validation set 

Validation set perplexity: 8.14
Average loss at step 4500: 1.744508 learning rate: 10.000000
Minibatch perplexity: 5.72
Validation set perplexity: 7.93
Average loss at step 4600: 1.737338 learning rate: 10.000000
Minibatch perplexity: 5.26
Validation set perplexity: 8.26
Average loss at step 4700: 1.701295 learning rate: 10.000000
Minibatch perplexity: 6.07
Validation set perplexity: 8.00
Average loss at step 4800: 1.736424 learning rate: 10.000000
Minibatch perplexity: 5.46
Validation set perplexity: 8.40
Average loss at step 4900: 1.739173 learning rate: 10.000000
Minibatch perplexity: 5.74
Validation set perplexity: 8.01
Average loss at step 5000: 1.724270 learning rate: 1.000000
Minibatch perplexity: 5.72
ce mvrlplpmipr rpdlppnlprprmpprdnbrpnrlrdndpllsrrtnmdmdrrvdsrn mrpdpdrpsllvpsdrdl
xdoeie errirerm em me reuemeuom  iu ime meee  im eeeiemmu im me  leieem   o ml mi
kbimaaneaaaauaelaaltamaeaauaeaoaeuaaomealleura laaonloarraaaelaaaraornepaammaarar
xmaesasaneaaaceeaalalsiaaaepaaaddea

**c:** Add dropout layer to output and input

In [96]:
valid_batches = BatchGenerator(valid_text, 1, 2)

In [110]:
num_nodes = 256
embedding_size = 64
graph = tf.Graph()
with graph.as_default():
    
     # Input data.
    train_data = []
    train_inputs = []
    train_labels = []
    for _ in range(num_unrollings):
        train_inputs.append(
            tf.placeholder(tf.int32, shape = [batch_size]))
    for _ in range(num_unrollings - 1):
        train_labels.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
    
    embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    embeds = []
    
    for index in range(num_unrollings - 1):
        embed_first_symb = tf.nn.embedding_lookup(embeddings, train_inputs[index])
        embed_second_symb = tf.nn.embedding_lookup(embeddings, train_inputs[index + 1])
        embed = tf.concat([embed_first_symb, embed_second_symb], 1)
        embeds.append(embed)
        
    # Parameters:
    # Gates: input, memory, forget, output.
    x = tf.Variable(tf.truncated_normal([2 * embedding_size, 4 * num_nodes], -0.1, 0.1))
    # State
    m = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    #bias
    bias = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        #extract matmul part into the single variable
        matmul_part = tf.matmul(i, x)+ tf.matmul(o, m) + bias
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in embeds:
        input_drop = tf.nn.dropout(i, 0.7)
        output, state = lstm_cell(input_drop, output, state)
        output_drop = tf.nn.dropout(output, 0.7)
        outputs.append(output_drop)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input0 = (tf.placeholder(tf.int32, shape=[1]))
    sample_input1 = (tf.placeholder(tf.int32, shape=[1]))
    
    embed1 = tf.reshape(tf.nn.embedding_lookup(embeddings, sample_input0), [1, -1])
    embed2 = tf.reshape(tf.nn.embedding_lookup(embeddings, sample_input1), [1, -1])
    sample_input_embed = tf.concat([embed1, embed2], 1)
    
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),
      saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
      sample_input_embed, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [38]:
print(batches2string(valid_batches.next()))

['ibe']


In [8]:
num_steps = 4001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings - 1):
      feed_dict[train_inputs[i]] = batches[i].argmax(axis = 1)
      feed_dict[train_inputs[i + 1]] = batches[i + 1].argmax(axis = 1)
      feed_dict[train_labels[i]] = batches[i + 2]
        
    
    _, l, predictions, lr = session.run(
       [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feeds = [sample(random_distribution()),sample(random_distribution())]
          sentence = characters(feeds[0])[0] + characters(feeds[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input0: feeds[-2].argmax(axis = 1),
                                                 sample_input1: feeds[-1].argmax(axis = 1)})
            feed = sample(prediction)
            #print("prediction: {0} \n feed {1}:{2} \n".format(prediction, feed, characters(feed)))
            sentence += characters(feed)[0]
            feeds.append(feed)
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input1: b[1].argmax(axis=1),
                                             sample_input0: b[0].argmax(axis=1)})
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

NameError: name 'graph' is not defined

In [47]:
print(batches2string(train_batches.next()))

['adic and om', 'ks journals', 'g inn g is ', 'itions were', 'antiprism w', 'ragon ball ', ' for his sh', 'eference in', 'dclapped th', 'prettanikee', 'ng some str', 'le on top o', ' as the tre', 'anks buildi', ' lead in a ', ' nine nine ', 'g in the u ', 'g the tourn', ' arcology n', 'unlike toda', 'mosomes vol', 'ial plasma ', 'h the use o', 'six zero s ', 'rgagni ital', 'n the late ', 'he victim c', 'an be const', 'ism secular', ' case the s', ' maker leo ', 'ended impro', 'ut failed t', 'sed on the ', 'ed at sea s', 'ense that o', 'f march fiv', 'tor kenneth', 'horror movi', 'es use the ', 'percard as ', 'ld reaching', 'was soon ca', 'g monster a', 'overnments ', 'ing machine', 'cannot say ', 'to young wh', ' eight four', 'author john', 'learly seen', 'ed to the a', 'cement of e', 'ion the rev', 'neural netw', 'press that ', 'underground', 'o this text', ' since one ', 'her modes g', ' for other ', ' a role dra', ' zero four ', 'ight zero s']


In [9]:
batch_size=64
num_unrollings=10
bigram_size = vocabulary_size * vocabulary_size

class BigramsBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, bigram_size), dtype=np.float)
        for b in range(self._batch_size):
            first = self._text[self._cursor[b]]
            if self._cursor[b] + 1 == self._text_size :
                second = ' '
            else :
                second = self._text[self._cursor[b] + 1]
            batch[b, char2id(first) * vocabulary_size + char2id(second)] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
  
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def bigramCharacters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c//vocabulary_size)  + id2char(c%vocabulary_size) for c in np.argmax(probabilities, 1)]

def bigramBatches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, bigramCharacters(b))]
    s = [x[0::2] for x in s]
    return s

train_batches = BigramsBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BigramsBatchGenerator(valid_text, 1, 1)
print(bigramBatches2string(train_batches.next()))
print(bigramBatches2string(train_batches.next()))

print(bigramBatches2string(valid_batches.next()))
print(bigramBatches2string(valid_batches.next()))
print(bigramBatches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [213]:
def bigramSample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, bigram_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def bigram_random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, bigram_size])
    return b/np.sum(b, 1)[:,None]

In [209]:
num_nodes = 64
embedding_size = 128

graph = tf.Graph()
with graph.as_default():
    
     # Input data.
    train_data = []
    for _ in range(num_unrollings + 1):
        train_data.append(tf.placeholder(tf.float32, shape=[batch_size,bigram_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:] 
    
    embeddings = tf.Variable(
            tf.random_uniform([bigram_size, embedding_size], -1.0, 1.0))
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    embeds = []
    
    for i in train_inputs:
        embed = tf.nn.embedding_lookup(embeddings, tf.argmax(i, dimension=1))
        embeds.append(embed)
        
    # Parameters:
    # Gates: input, memory, forget, output.
    x = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1))
    # State
    m = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    #bias
    bias = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, bigram_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([bigram_size]))

    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        #extract matmul part into the single variable
        matmul_part = tf.matmul(i, x)+ tf.matmul(o, m) + bias
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in embeds:
        input_drop = tf.nn.dropout(i, 0.7)
        output, state = lstm_cell(input_drop, output, state)
        output_drop = tf.nn.dropout(output, 0.7)
        outputs.append(output_drop)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = (tf.placeholder(tf.float32, shape=[1, bigram_size]))
    
    sample_input_embed=tf.nn.embedding_lookup(embeddings,  tf.argmax(sample_input, dimension=1))
    
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, num_nodes])),
      saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
      sample_input_embed, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [214]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]

    _, l, predictions, lr = session.run(
       [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = bigramSample(bigram_random_distribution())
          sentence = ''.join(bigramCharacters(feed)[0])
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = bigramSample(prediction)
            #print("prediction: \n feed {0}:{1} \n".format(bigramsCharacters(feed)[0][1], bigramsCharacters(feed)[0][3]))
            sentence += bigramsCharacters(feed)[0][3]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.625717 learning rate: 10.000000
Minibatch perplexity: 754.25
qtfdcnxjjsvmempawjrugunscwgsfadoaxrmjcsznnnexwodlb rpwtteqzjvuaskqyohtssrotzracd 
eaoaqyqqqtthgbghovnoklqeujzmgsljimsizocdegdiiycmsgltosbkqsqe  juupuxlf mtzymc hox
pl h tmhzalgenkfshotxbfzpkjtqdterylb  dvvmotmhdjkhvnrtmkfetqkdjmvtuvovzjgdxyhxxgq
zxuajdunsupyjsorwm mrxvkqfnpkx ifaauxaetrezmbcgphoclrrmz jjhetvmwxsnpekjvsxadnpmh
mtnjbcxy kakvbvacnvqbmazsybugafuu sydyhxaqtqhzvnizsxrnahyfzspkfikvrnkomhr gxevarv
Validation set perplexity: 646.48
Average loss at step 100: 3.721513 learning rate: 10.000000
Minibatch perplexity: 13.81
Validation set perplexity: 12.38
Average loss at step 200: 2.405352 learning rate: 10.000000
Minibatch perplexity: 9.11
Validation set perplexity: 8.82
Average loss at step 300: 2.259583 learning rate: 10.000000
Minibatch perplexity: 7.78
Validation set perplexity: 7.65
Average loss at step 400: 2.117846 learning rate: 10.000000
Minibatch perplexity: 9.19
Validation 

Validation set perplexity: 4.87
Average loss at step 4500: 1.743036 learning rate: 10.000000
Minibatch perplexity: 6.28
Validation set perplexity: 4.71
Average loss at step 4600: 1.737592 learning rate: 10.000000
Minibatch perplexity: 5.48
Validation set perplexity: 4.76
Average loss at step 4700: 1.741650 learning rate: 10.000000
Minibatch perplexity: 5.31
Validation set perplexity: 4.82
Average loss at step 4800: 1.743987 learning rate: 10.000000
Minibatch perplexity: 5.57
Validation set perplexity: 4.87
Average loss at step 4900: 1.733613 learning rate: 10.000000
Minibatch perplexity: 5.63
Validation set perplexity: 4.92
Average loss at step 5000: 1.713315 learning rate: 1.000000
Minibatch perplexity: 5.29
cond intensimic nonly overries and currel equipition amurary see and on it patemp
ic tetch are four me one nine eight con the promicil hainst station of airn it me
wish he titary selfronon of had to two two zero famic line were mitten to boditio
pace one one nei two zeroo zero one

In [77]:
valid_batches = BatchGenerator(valid_text,1, 2)
print(batches2string(valid_batches.next()))

[' an']


---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [20]:
max(train_text.split(),key=lambda x : len(x))

'bababadalgharaghtakamminarronnkonnbronntonnerronntuonnthunntrovarrhounawnskawntoohoohoordenenthurnuk'

In [7]:
batch_size=64
num_unrollings=10

class SequenceBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
    
    def _next_batches(self):
        """ Generate a batches of appropriate size"""
        batches = []
        for step in range(self._num_unrollings):
            batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
            batches.append(batch)
        return batches
    
    def _mirror(self, sequence):
        """Mirror every word in the sequnce """
        mirror_sentence = []
        for word in sequence.split(' '):
                mirror_sentence.append(''.join(reversed(word)))
        return ' '.join(mirror_sentence)

    def next(self):
        """Generate two next arrays of batches from the data.One for the encoder and another for the decoder.
        The array consists of the last batch of the previous array, followed by num_unrollings new ones.
        """
        enc_batches = self._next_batches()
        dec_batches = self._next_batches()
        for b in range(self._batch_size):
            cursor = self._cursor[b]
            sentence = self._text[cursor:cursor + self._num_unrollings]
            mirrored = self._mirror(sentence)
            for (i, (s, rev_s)) in enumerate(zip(sentence, mirrored)):
                enc_batches[i][b, char2id(s)] = 1.0
                dec_batches[i][b, char2id(rev_s)] = 1.0
            self._cursor[b] = (cursor + self._num_unrollings) % self._text_size
        return (enc_batches, dec_batches)
        
    
def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

valid_batches = SequenceBatchGenerator(valid_text, 1, num_unrollings)
train_batches = SequenceBatchGenerator(train_text, batch_size, num_unrollings)

enc, dec = valid_batches.next()
print(batches2string(enc))
print(batches2string(dec))

[' anarchism']
[' msihcrana']


In [73]:
d = train_batches.next()
print(batches2string(d[1]))
print(batches2string(d[0]))


['stsi covda', 'yra nrevog', 'seh noitan', 'd retsanom', 'acar cnirp', 'drahc reab', 'lacigr nal', 'rof nessap', 'eht noitan', 'koot ecalp', 'reht llew ', 'neves xis ', 'hti a solg', 'ylbabor eb', 'ot ingocer', 'deviec eht', 'tnaci naht', 'citir fo t', 'thgi ni is', 's desuacnu', ' tsol sa i', 'ralullec i', 'e ezis fo ', ' mih a its', 'sgurd fnoc', ' ekat ot c', ' eht seirp', 'mi ot eman', 'd derrab a', 'dradnats f', ' hcus sa e', 'ez no eht ', 'e fo eht o', 'd revih no', 'y thgie am', 'eht dael c', 'se cissalc', 'ec eht non', 'la isylana', 'snomrom eb', 't ro ta el', ' deergasid', 'gni metsys', 'sepytb sab', 'segaugna t', 'r issimmoc', 'sse eno in', 'xun esus l', ' eht tsrif', 'iz tnecnoc', ' yteicos n', 'ylevitale ', 'skrowte hs', 'ro tihorih', 'lacitil ni', 'n tsom fo ', 'oodreksi r', 'ci eivrevo', 'ria nopmoc', 'mo mnca ca', ' nilretnec', 'e naht yna', 'lanoitoved', 'ed hcus ed']
['ists advoc', 'ary govern', 'hes nation', 'd monaster', 'raca princ', 'chard baer', 'rgical lan', 'for

In [8]:
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
    
    # Input data.
    encoder_train_inputs = []
    decoder_train_inputs = []
    train_labels = []
    for _ in range(num_unrollings):
        encoder_train_inputs.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
        decoder_train_inputs.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
        train_labels.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))

    # Parameters:
    # Gates: input, memory, forget, output.
    # State
    # Bias
    
    #Encoder
    x_enc = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    m_enc = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_enc = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    #Decoder
    x_dec = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    m_dec = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_dec = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    # Variables saving state across unrollings.
    encoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    encoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    decoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    decoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Definition of the cell computation.
    def lstm_cell_encoder(i, o, state):
        matmul_part = tf.matmul(i, x_enc)+ tf.matmul(o, m_enc) + bias_enc
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    
    def lstm_cell_decoder(i, o, state):
        matmul_part = tf.matmul(i, x_dec)+ tf.matmul(o, m_dec) + bias_dec
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop of encoder.
    outputs = list()
    for input_encoder in encoder_train_inputs:
        encoder_output, encoder_state = lstm_cell_encoder(input_encoder, encoder_output, encoder_state)
    
    decoder_output = encoder_output    
    decoder_state = encoder_state    
    for input_decoder in decoder_train_inputs:
        decoder_output, decoder_state = lstm_cell_decoder(input_decoder, decoder_output, decoder_state)
        outputs.append(decoder_output)
    
    # Classifier.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    sample_inputs = []
    for _ in range(num_unrollings):
        sample_inputs.append(tf.placeholder(tf.float32, shape=[1, vocabulary_size]))
    
    encoder_sample_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    encoder_sample_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    for input_encoder in sample_inputs:
        encoder_sample_output, encoder_sample_state = lstm_cell_encoder(input_encoder,
                                                            encoder_sample_output, encoder_sample_state)
    
    decoder_sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    decoder_sample_state = tf.placeholder(tf.float32, shape=[1, num_nodes])
    decoder_sample_output = tf.placeholder(tf.float32, shape=[1, num_nodes])
    sample_output, sample_state = lstm_cell_decoder(decoder_sample_input, decoder_sample_output, decoder_sample_state)
 
    with tf.control_dependencies([sample_output,
                                sample_state]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [15]:
print(batches2string(valid_batches.next()[0]))

['ing class ']


In [17]:
num_steps = 6001  
summary_frequency = 100
initial_step = np.zeros((batch_size, vocabulary_size)) # Equivalent of GO 
num_unrollings = 10
#train_batches = SequenceBatchGenerator(train_text, batch_size, num_unrollings)


def next_step(session, last_step, index):
    (letter, _, do, ds) = last_step
    feed_dict = {
        decoder_sample_input: letter,
        decoder_sample_output: do,
        decoder_sample_state: ds,
    }
    do, ds, prev_let =  session.run([sample_output,
                        sample_state,
                        sample_prediction], feed_dict=feed_dict)
    probabilities = prev_let[0].tolist()
    s = sorted(probabilities, reverse=True)
    prob = s[index]
    index = probabilities.index(prob)
    letter = np.zeros((1, vocabulary_size))
    index = np.argmax(probabilities)
    letter[0, index] = 1
    logprob = np.log(prob)
    return (letter, logprob, do, ds)
                

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized\n')
    mean_loss = 0
    for step in range(num_steps):
        (encoder_batches, decoder_batches) = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings):
            feed_dict[encoder_train_inputs[i]] = encoder_batches[i]
            feed_dict[train_labels[i]] = decoder_batches[i]
            if i == 0:
                feed_dict[decoder_train_inputs[i]] = initial_step
            else:
                feed_dict[decoder_train_inputs[i]] = decoder_batches[i - 1]

        (_, l, predictions, lr) = session.run(
            [optimizer, loss, train_prediction, learning_rate],
            feed_dict=feed_dict)
        mean_loss += l
        if step == 100 :
            print('step'.format(step))
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            
            summary = (step, mean_loss, lr)
            print('Average loss at step %d: %f learning rate: %f' % summary)
            mean_loss = 0
            labels = np.concatenate(decoder_batches)
            perplexity = float(np.exp(logprob(predictions, labels)))
            print('Minibatch perplexity: %.2f' % perplexity)
        if step % (summary_frequency * 2) == 0:
            # Generate some samples.
            valid_sentence = []
            output_sentence = []
            input_sentence = []
            for _ in range(5):
                feed_dict_enc = {}

                (e_batches, d_batches) = valid_batches.next()
                
                valid_sentence += batches2string(d_batches)
                input_sentence += batches2string(e_batches)
                
                for i in range(num_unrollings):
                    feed_dict_enc[sample_inputs[i]] = e_batches[i]

                enc_output, enc_state = session.run([encoder_sample_output, encoder_sample_state],
                                  feed_dict=feed_dict_enc)
                N = 2
                sequences = ()
                for _ in range(N):
                    letter = np.zeros((1, vocabulary_size))
                    sequences += (((letter, 0, enc_output, enc_state), ), )

                for _ in range(num_unrollings):
                    new_sequences = ()
                    for sequence in sequences:
                        last_step = sequence[-1]
                        for i_type in range(N):
                            current_step = next_step(session, last_step, i_type)
                            new_sequences += (sequence + (current_step, ), )

                    sequences = new_sequences

                sums = []
                for (ind, sequence) in enumerate(sequences):
                    logprob_sum = 0
                    for step1 in sequence:
                        logprob_sum  += step1[1]
                    sums.append((ind, logprob_sum))

                sums = sorted(sums, key=lambda s: s[1], reverse=True)
                sequence = []
                index = sums[0][0]
                for step_ind in sequences[index]:
                    sequence.append(step_ind[0])   
                    
                    decoded_sequence =  sequence[1:]
                
                output_sentence += batches2string(decoded_sequence)
            
            print('Input:')
            print(''.join(input_sentence))
            print('Reverse input:')
            print(''.join(valid_sentence))
            print('Output:')
            print(''.join(output_sentence))
            print('=' * 80)


Initialized

Average loss at step 0: 3.294446 learning rate: 10.000000
Minibatch perplexity: 26.96
Input:
narchists the word anarchism is derived from the g
Reverse input:
stsihcran eht drow amsihcran is devired morf eht g
Output:
                                                  
step
Average loss at step 100: 2.661300 learning rate: 10.000000
Minibatch perplexity: 12.46
Average loss at step 200: 2.403995 learning rate: 10.000000
Minibatch perplexity: 10.57
Input:
reek without archons ruler chief king anarchism as
Reverse input:
keer ohtiwtu snohcra relur ihcfe gnik namsihcra sa
Output:
s eht eht se eht eht eht eht oset eht ehe eht eht 
Average loss at step 300: 2.296636 learning rate: 10.000000
Minibatch perplexity: 9.12
Average loss at step 400: 2.223620 learning rate: 10.000000
Minibatch perplexity: 8.71
Input:
 a political philosophy is the belief that rulers 
Reverse input:
 a citilopla osolihpyhp si eht feileb htta srelur 
Output:
 eht eht eeh eno ehteht eht eh eht eht eeh tsere

Average loss at step 3700: 0.358983 learning rate: 10.000000
Minibatch perplexity: 1.41
Average loss at step 3800: 0.337327 learning rate: 10.000000
Minibatch perplexity: 1.35
Input:
een taken up as a positive label by self defined a
Reverse input:
nee nekat pu sa a opevitis balle yb fles denifed a
Output:
nee esadi pu a si apevitis balle s seht  denifed a
Average loss at step 3900: 0.505492 learning rate: 10.000000
Minibatch perplexity: 1.40
Average loss at step 4000: 0.306693 learning rate: 10.000000
Minibatch perplexity: 1.33
Input:
narchists the word anarchism is derived from the g
Reverse input:
stsihcran eht drow amsihcran is devired morf eht g
Output:
stsihcran eht drow amsihcran is deviles morf eht g
Average loss at step 4100: 0.292764 learning rate: 10.000000
Minibatch perplexity: 1.29
Average loss at step 4200: 0.295617 learning rate: 10.000000
Minibatch perplexity: 1.40
Input:
reek without archons ruler chief king anarchism as
Reverse input:
keer ohtiwtu snohcra relur ihcfe 

TODO: Improve quallity of batches. 

TODO: Add seq2seq model from tf models library

### Seq2seq with new batches
We will use equal batches with 32 lenght
To create eqaul sentence we will use padding(?) and eos(.), that means the end of sentences

I've redefined id2char and char2id for to add (eos, pad). Vocabulary size increased by 2 points

In [18]:
vocabulary_size = len(string.ascii_lowercase) + 3
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    elif char == '?':
        return vocabulary_size - 1
    elif char == '.':  
        return vocabulary_size - 2
    else:
        print('Unexpected character: %s' % char)
        return 0

def id2char(dictid):
    if dictid == vocabulary_size - 1:
        return '?'
    if dictid == vocabulary_size - 2:
        return '.'
    elif dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(27), id2char(0))

Unexpected character: ï
1 26 0 0
a z .  


In [21]:
batch_size=64
num_unrollings=31

class EqualSentenceBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
  
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = []
        num_unrollings = np.random.randint(self._num_unrollings - 6, self._num_unrollings)
        for step in range(num_unrollings):
            batches.append(self._next_batch())
        eos = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        eos[:,-2] = 1.
        batches.append(eos)
        pad = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        pad[:,-1] = 1.
        for step in range(num_unrollings, self._num_unrollings):
            batches.append(pad.copy())
        return batches
    
def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = EqualSentenceBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = EqualSentenceBatchGenerator(valid_text, 2, num_unrollings)

print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchists advocate socia.??', 'when military governments fai.??', 'lleria arches national park p.??', ' abbeys and monasteries index.??', 'married urraca princess of ca.??', 'hel and richard baer h provid.??', 'y and liturgical language amo.??', 'ay opened for passengers in d.??', 'tion from the national media .??', 'migration took place during t.??', 'new york other well known man.??', 'he boeing seven six seven a w.??', 'e listed with a gloss coverin.??', 'eber has probably been one of.??', 'o be made to recognize single.??', 'yer who received the first ca.??', 'ore significant than in jerse.??', 'a fierce critic of the povert.??', ' two six eight in signs of hu.??', 'aristotle s uncaused cause so.??', 'ity can be lost as in denatur.??', ' and intracellular ice format.??', 'tion of the size of the input.??', 'dy to pass him a stick to pul.??', 'f certain drugs confusion ina.??', 'at it will take to complete a.??', 'e convince the priest of the .??', 'ent told him to name it fo

In [70]:
def id2probs(idx):
    probs = np.zeros(shape=vocabulary_size, dtype=np.float)
    probs[idx] = 1.
    return probs

def chars2probs(characters):
    ids = [char2id(c) for c in characters]
    return map(id2probs, ids)

# def string2lbatches(string):
#     sent = ' '.join(map(lambda word: word[::-1], sent.split(' ')))
#     return chars2probs('.'.join([sent, tail]))

def string2labels(string):
    body, tail = string.split('.')
    body = ' '.join(map(lambda word: word[::-1], body.split(' ')))
    return chars2probs('.'.join([body, tail]))

def reverse_words(batches):
    strings = batches2string(np.array(batches))
    labels =[]
    for i, s in enumerate(strings):
        labels.append((list(string2labels(s))))
    #labels = np.array(map(string2labels, strings))
    labels = np.array(labels)
    return labels.transpose((1,0,2))
sent = valid_batches.next()
rev_s = reverse_words(sent)
print(batches2string(sent))
print(batches2string(rev_s))

['been taken up as a positive.????', 's nihilism or anomie but ra.????']
['neeb nekat pu sa a evitisop.????', 's msilihin ro eimona tub ar.????']


In [197]:
num_nodes = 64
graph = tf.Graph()

with graph.as_default():
    # Input data.
    encoder_train_inputs = []
    decoder_train_inputs = []
    train_labels = []
    train_weights = []
    for _ in range(num_unrollings + 1):
        encoder_train_inputs.append(
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
        decoder_train_inputs.append( #this one will be a labels
            tf.placeholder(tf.float32, shape = [batch_size, vocabulary_size]))
        train_weights.append(tf.placeholder(tf.float32, shape=1))
 
    # Parameters:
    # Gates: input, memory, forget, output.
    # State
    # Bias
    
    #Encoder
    x_enc = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    m_enc = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_enc = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    #Decoder
    x_dec = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    m_dec = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_dec = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    # Variables saving state across unrollings.
    encoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    encoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    decoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    decoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
      # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    w_decoder = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b_decoder = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell_encoder(i, o, state):
        matmul_part = tf.matmul(i, x_enc)+ tf.matmul(o, m_enc) + bias_enc
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    
    def lstm_cell_decoder(i, o, state):
        matmul_part = tf.matmul(i, x_dec)+ tf.matmul(o, m_dec) + bias_dec
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop of encoder.
    output_logits = list()
    encoder_logits = list()
    for input_encoder in encoder_train_inputs:
        encoder_output, encoder_state = lstm_cell_encoder(input_encoder, encoder_output, encoder_state)

    encoder_logits = tf.nn.xw_plus_b(encoder_output, w,b)
    decoder_output = encoder_output    
    decoder_state = encoder_state
    for _ in range(num_unrollings + 1):
        decoder_output, decoder_state = lstm_cell_decoder(encoder_logits, decoder_output, decoder_state)
        encoder_logits = tf.nn.xw_plus_b(decoder_output, w_decoder,b_decoder)
        output_logits.append(encoder_logits)   
    
    train_labels = decoder_train_inputs
    
    with tf.control_dependencies([saved_output.assign(decoder_output),
                                 saved_state.assign(decoder_state)]):
        loss = tf.reduce_mean(
              tf.nn.softmax_cross_entropy_with_logits(logits = tf.concat(output_logits[0], 0), labels=tf.concat(train_labels[0], 0))*train_weights[0])
        for logits, labels, weight in zip(output_logits[1:], train_labels[1:], train_weights[1:]):
            loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits= logits, labels=labels)) * weight

        
    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
                1.0, global_step, 3000, 0.8, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = output_logits

    sample_inputs = list()
    for _ in range(num_unrollings + 1):
        sample_inputs.append(tf.placeholder(tf.float32, shape=[1,vocabulary_size]))
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))

    sample_output = saved_sample_output
    sample_state = saved_sample_state
    for input_sample_encoder in sample_inputs:
        sample_output, sample_state = lstm_cell_encoder(input_sample_encoder, sample_output, sample_state)

    sample_logits = tf.nn.xw_plus_b(sample_output, w, b)
    
    sample_output_decoder = saved_sample_output
    sample_state_decoder = sample_state
    sample_inp = sample_logits
    sample_output_logits = []
    for _ in range(num_unrollings + 1):
        sample_output_decoder, sample_state_decoder = lstm_cell_decoder(
            sample_inp, sample_output_decoder, sample_state_decoder)
        sample_inp = tf.nn.xw_plus_b(sample_output_decoder, w_decoder, b_decoder)
        sample_output_logits.append(sample_inp)

    sample_prediction = sample_output_logits

In [193]:
num_steps = 80001  
summary_frequency = 600
train_batches = EqualSentenceBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = EqualSentenceBatchGenerator(valid_text, 1, num_unrollings)
initial_step = np.zeros((batch_size, vocabulary_size)) # Equivalent of GO 

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized\n')
    mean_loss = 0
    for step in range(num_steps):
        encoder_batches = train_batches.next()
        decoder_batches = reverse_words(encoder_batches)
        encoder_batches = encoder_batches[::-1]
        weights = np.ones(len(encoder_batches))
        weights = weights / weights.sum()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[encoder_train_inputs[i]] = encoder_batches[i]
            feed_dict[decoder_train_inputs[i]] = decoder_batches[i]
            feed_dict[train_weights[i]] = [weights[i]]

        (_, l, predictions, lr) = session.run(
            [optimizer, loss, train_prediction, learning_rate],
            feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            
            summary = (step, mean_loss, lr)
            print('Average loss at step %d: %f learning rate: %f' % summary)
            mean_loss = 0
        if step % (2*summary_frequency) == 0:
            val_batches = valid_batches.next()
            print(''.join(batches2string(val_batches)))
            val_batches = val_batches[::-1]
            feed_valid = dict()
            for i in range(num_unrollings + 1):
                feed_valid[sample_inputs[i]] = val_batches[i]
            val_prediction = session.run([sample_prediction], feed_dict=feed_valid)
            val_prediction = val_prediction[0]
            print(''.join(batches2string(np.array(val_prediction))))


Initialized

Average loss at step 0: 3.414634 learning rate: 2.000000
 anarchism originated as a t.???
   iiiiiiiiiiiiiiiiiiiiiiiiiiiii
Average loss at step 600: 2.890870 learning rate: 2.000000
Average loss at step 1200: 2.676989 learning rate: 2.000000
erm of abuse first used agains.?
ee       ???????????????????????
Average loss at step 1800: 2.435932 learning rate: 2.000000
Average loss at step 2400: 2.411158 learning rate: 2.000000
t early working class radica.???
e e a    t.?????????????????????
Average loss at step 3000: 2.338820 learning rate: 2.000000
Average loss at step 3600: 2.318368 learning rate: 2.000000
ls including the diggers of .???
ee ee                   oa t.???
Average loss at step 4200: 2.302365 learning rate: 2.000000
Average loss at step 4800: 2.304360 learning rate: 2.000000
the english revolution and th.??
e t ene   f.????????????????????
Average loss at step 5400: 2.307390 learning rate: 1.800000
Average loss at step 6000: 2.277037 learning rate: 1.800000
e

Average loss at step 52200: 2.137934 learning rate: 0.697357
Average loss at step 52800: 2.115049 learning rate: 0.697357
till used in a pejorative way .?
lnit deia                     .?
Average loss at step 53400: 2.126393 learning rate: 0.697357
Average loss at step 54000: 2.129993 learning rate: 0.697357
to describe any act that .??????
ot enii                  .??????
Average loss at step 54600: 2.116684 learning rate: 0.697357


KeyboardInterrupt: 

In [198]:
num_steps = 50001  
summary_frequency = 600
train_batches = EqualSentenceBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = EqualSentenceBatchGenerator(valid_text, 1, num_unrollings)
initial_step = np.zeros((batch_size, vocabulary_size)) # Equivalent of GO 

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized\n')
    mean_loss = 0
    for step in range(num_steps):
        encoder_batches = train_batches.next()
        decoder_batches = reverse_words(encoder_batches)
        encoder_batches = encoder_batches[::-1]
        weights = np.ones(len(encoder_batches))
        weights = weights / weights.sum()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[encoder_train_inputs[i]] = encoder_batches[i]
            feed_dict[decoder_train_inputs[i]] = decoder_batches[i]
            feed_dict[train_weights[i]] = [weights[i]]

        (_, l, predictions, lr) = session.run(
            [optimizer, loss, train_prediction, learning_rate],
            feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            
            summary = (step, mean_loss, lr)
            print('Average loss at step %d: %f learning rate: %f' % summary)
            mean_loss = 0
        if step % (2*summary_frequency) == 0:
            val_batches = valid_batches.next()
            print(''.join(batches2string(val_batches)))
            val_batches = val_batches[::-1]
            feed_valid = dict()
            for i in range(num_unrollings + 1):
                feed_valid[sample_inputs[i]] = val_batches[i]
            val_prediction = session.run([sample_prediction], feed_dict=feed_valid)
            val_prediction = val_prediction[0]
            print(''.join(batches2string(np.array(val_prediction))))


Initialized

Average loss at step 0: 3.429824 learning rate: 1.000000
 anarchism originated as a.?????
hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
Average loss at step 600: 3.022642 learning rate: 1.000000
Average loss at step 1200: 2.854655 learning rate: 1.000000
 term of abuse first used.??????
  e                    ?????????
Average loss at step 1800: 2.783953 learning rate: 1.000000
Average loss at step 2400: 2.412181 learning rate: 1.000000
 against early working class .??
 sniee                      .???
Average loss at step 3000: 2.324256 learning rate: 0.800000
Average loss at step 3600: 2.280398 learning rate: 0.800000
radicals including the digger.??
seiiiiii siiai  e        .??????
Average loss at step 4200: 2.238727 learning rate: 0.800000
Average loss at step 4800: 2.205738 learning rate: 0.800000
s of the english revolution an.?
 eit eht eni                  .?
Average loss at step 5400: 2.176040 learning rate: 0.800000
Average loss at step 6000: 2.151462 learning rate: 0.640000
d

### Let's modified our seq2seq model
We combine word representation with lstm cells

In [22]:
import collections
vocabulary_size = 2**16
num_unrollings = 10
words = text.split()
reversed_words = []

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

for i in range(len(words)):
    reversed_words.append(words[i][::-1])

data, count, dictionary, reverse_dictionary = build_dataset(words)
rev_data, rev_count, dictionary_r, reverse_dictionary_r = build_dataset(reversed_words)

print('Most common words (+UNK)', count[:5])
print('Most common reversed words (+UNK)', rev_count[:5])

print('Sample data', data[:10])
print('Sample reversed data', rev_data[:10])

del words, reversed_words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 315138], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Most common reversed words (+UNK) [['UNK', 315138], ('eht', 1061396), ('fo', 593677), ('dna', 416629), ('eno', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
Sample reversed data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


In [34]:
max(dictionary.keys(), key=lambda x:len(x))

'floccinaucinihilipilification'

In [7]:
valid_size_seq2seq = 128
valid_data = data[:valid_size_seq2seq]
train_data = data[valid_size_seq2seq:]
rev_valid_data = rev_data[:valid_size_seq2seq]
rev_train_data = rev_data[valid_size_seq2seq:]

In [8]:
batch_size=64
num_unrollings=10

class SequenceWordsBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size), dtype=np.int)
        for b in range(self._batch_size):
            batch[b] = self._text[self._cursor[b]]
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
  
    def next(self):
        """Generate the next array of batches from the data. The array cooonsists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def seqbatches2string(batches):
    """Convert a sequence of batches back into string
      representation.
      """
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [' '.join(x) for x in zip(s, [reverse_dictionary[c] for c in b])]
    return s

def seqbatches2string_reverse(batches):
    """Convert a sequence of batches back into string
    representation.
    """
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [' '.join(x) for x in zip(s, [reverse_dictionary_r[c] for c in b])]
    return s

In [9]:
#Create new train batches
train_batches = SequenceWordsBatchGenerator(train_data, batch_size, num_unrollings)
reverse_train_batches = SequenceWordsBatchGenerator(rev_train_data, batch_size, num_unrollings)
valid_batches = SequenceWordsBatchGenerator(valid_data, 1, 1)
reverse_valid_batches = SequenceWordsBatchGenerator(rev_valid_data, 1, 1)
print(seqbatches2string(valid_batches.next()))
print(seqbatches2string_reverse(reverse_valid_batches.next()))

[' anarchism originated']
[' msihcrana detanigiro']


In [10]:
len(count)

65536

In [16]:
num_nodes = 64
graph = tf.Graph()
vocabulary_size = len(count)
embedding_size = 128
with graph.as_default():
    
     # Input data.
    encoder_train_inputs = []
    decoder_train_inputs = []
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=False)
    embeddings_rev = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=False)
    
    train_labels = []
    for _ in range(num_unrollings):
        encoder_train_inputs.append(
            tf.placeholder(tf.float32, shape = [batch_size]))
        decoder_train_inputs.append(
            tf.placeholder(tf.float32, shape = [batch_size]))
        train_labels.append(
            tf.placeholder(tf.float32, shape = [batch_size]))
    encoded_inputs = list()
    for inputs in encoder_train_inputs:
        embed = tf.nn.embedding_lookup(embeddings, tf.cast(inputs, tf.int32))
        encoded_inputs.append(embed)
    
    train_inputs = encoded_inputs[:num_unrollings-1]
    decoded_inputs = list()
    for inputs in decoder_train_inputs:
        embed = tf.nn.embedding_lookup(embeddings_rev, tf.cast(inputs, tf.int32))
        decoded_inputs.append(embed)
    dec_train_inputs = decoded_inputs[:num_unrollings-1]

    # Parameters:
    # Gates: input, memory, forget, output.
    # State
    # Bias
    
    #Encoder
    x_enc = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1))
    m_enc = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_enc = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    #Decoder
    x_dec = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1))
    m_dec = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    bias_dec = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    # Variables saving state across unrollings.
    encoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    encoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    decoder_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    decoder_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Definition of the cell computation.
    def lstm_cell_encoder(i, o, state):
        matmul_part = tf.matmul(i, x_enc)+ tf.matmul(o, m_enc) + bias_enc
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    
    def lstm_cell_decoder(i, o, state):
        matmul_part = tf.matmul(i, x_dec)+ tf.matmul(o, m_dec) + bias_dec
        input_gate = tf.sigmoid(matmul_part[:, : num_nodes])
        forget_gate = tf.sigmoid(matmul_part[:, num_nodes : 2 * num_nodes])
        update = matmul_part[:, 2 * num_nodes: 3 * num_nodes]
        output_gate = tf.sigmoid(matmul_part[:, 3 * num_nodes: 4 * num_nodes])
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state
    

    # Unrolled LSTM loop of encoder.
    outputs = list()
    for input_encoder in train_inputs:
        encoder_output, encoder_state = lstm_cell_encoder(input_encoder, encoder_output, encoder_state)
    
    GO = []
    GO.append(tf.Variable(tf.zeros([batch_size, embedding_size]), trainable=False))
    decoder_output = encoder_output    
    decoder_state = encoder_state    
    for input_decoder in GO + dec_train_inputs:
        if i == GO:
            decoder_output, decoder_state = lstm_cell_decoder(input_decoder, encoder_output, encoder_state)
        else:
            decoder_output, decoder_state = lstm_cell_decoder(input_decoder, decoder_output, decoder_state)
        outputs.append(decoder_output)
    
    # Classifier.
    w = tf.Variable(tf.truncated_normal([num_nodes, 1], -0.1, 0.1))
    b = tf.Variable(tf.zeros([1]))
    
    
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)

    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=tf.transpose(logits)))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_inputs = []
    for _ in range(num_unrollings):
        sample_inputs.append(tf.placeholder(tf.int32, shape=[1]))
    
    encoder_sample_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    encoder_sample_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    for input_encoder in sample_embeds:
        sample_embeds = tf.nn.embedding_lookup(embeddings, input_decoder)
        encoder_sample_output, encoder_sample_state = lstm_cell_encoder(input_encoder,
                                                            encoder_sample_output, encoder_sample_state)
    
    decoder_sample_input = tf.placeholder(tf.float32, shape=[1, embedding_size])
    decoder_sample_state = tf.placeholder(tf.float32, shape=[1, num_nodes])
    decoder_sample_output = tf.placeholder(tf.float32, shape=[1, num_nodes])
    sample_output, sample_state = lstm_cell_decoder(decoder_sample_input, decoder_sample_output, decoder_sample_state)
 
    with tf.control_dependencies([sample_output,
                                sample_state]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

TypeError: 'Tensor' object is not iterable.