In [2]:

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve


In [5]:

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [6]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [7]:
#Create a small validation set.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [8]:

vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [9]:
#Function to generate a training batch for the LSTM model.
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [10]:

def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [11]:
#Simple LSTM Model.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [12]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.296182 learning rate: 10.000000
Minibatch perplexity: 27.01
tc vi ma l ctpuowt kd  eeafil   xmq y egokj idnqjoaeee kfe lmnezsqdvy aqaoe  r e
elcryjem almeondoeh liijwhoxtamiqf wsmjxthzevuazlhfcy kzad rtueqqictizvjtfvk zez
sjbjgeicfyh wwadfrrntur gzayii wtmemqhiie tb gnc cecaeeqehltpd dalul fetiaceutug
plig z rne tczjhi i kfccu ff inoggfeertlrottlc fzilay bot lrlnwttohkky mwvj wq r
pedf ftmbivjo vosczoyth ld  xiexkojpr kgcoqndlwssokeervlbrpi sonwye  tiaks snri 
Validation set perplexity: 20.20
Average loss at step 100: 2.596252 learning rate: 10.000000
Minibatch perplexity: 10.77
Validation set perplexity: 10.36
Average loss at step 200: 2.248608 learning rate: 10.000000
Minibatch perplexity: 8.65
Validation set perplexity: 8.60
Average loss at step 300: 2.096699 learning rate: 10.000000
Minibatch perplexity: 7.35
Validation set perplexity: 7.89
Average loss at step 400: 1.998052 learning rate: 10.000000
Minibatch perplexity: 7.52
Validation set per

Validation set perplexity: 4.39
Average loss at step 4500: 1.610691 learning rate: 10.000000
Minibatch perplexity: 5.25
Validation set perplexity: 4.59
Average loss at step 4600: 1.612934 learning rate: 10.000000
Minibatch perplexity: 5.02
Validation set perplexity: 4.63
Average loss at step 4700: 1.623213 learning rate: 10.000000
Minibatch perplexity: 5.18
Validation set perplexity: 4.45
Average loss at step 4800: 1.623010 learning rate: 10.000000
Minibatch perplexity: 4.41
Validation set perplexity: 4.43
Average loss at step 4900: 1.629694 learning rate: 10.000000
Minibatch perplexity: 5.18
Validation set perplexity: 4.57
Average loss at step 5000: 1.606039 learning rate: 1.000000
Minibatch perplexity: 4.46
fitive effectldin nest one zero zero zero zero jath milite as difficult this a p
genal doac between to day between mangesthod belongly in the lireran line not th
noted four zero zero somes is the at the pri s imperfically anino and prayes the
 is writer first of nalistry of the fi

In [19]:
# You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.
#
# ---
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Concatenate parameters
    sx = tf.concat(axis=1, values=[ix, fx, cx, ox])
    sm = tf.concat(axis=1, values=[im, fm, cm, om])
    sb = tf.concat(axis=1, values=[ib, fb, cb, ob])
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))


    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        y = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
        y_input, y_forget, update, y_output = tf.split(axis=1, num_or_size_splits=4, value=y)
        input_gate = tf.sigmoid(y_input)
        forget_gate = tf.sigmoid(y_forget)
        output_gate = tf.sigmoid(y_output)
        state = forget_gate * state + input_gate * tf.tanh(update)
        return output_gate * tf.tanh(state), state


    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs,0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.concat(train_labels,0)))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [20]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 3.295327 learning rate: 10.000000
Minibatch perplexity: 26.99
lmcnpqranm s nxgxt sv twgfpy p aal   sqjcj wra rqnuievgutzdh rasvbpebv eadtntvm 
aasqekr yt   uer gms exdfesir   dehjii mryver oa sukh eosc   fhigftrwthwc vne y 
   ccrerzggt fkysmi ufae ozsmhne k  taa  esemgb   snpy prb ynigp yyclilr zt t nv
lge pohxlrabnbdsk q b e ew ecdcyhiien vrcbdplejau itwlisxzs d aowdultlxnavl  bwr
ahvqz mtbcmhs  gda zqg earecutttantythraxk h ngbaphf xnu   eg rao watpy ddfdxn m
Validation set perplexity: 20.06
Average loss at step 100: 2.580286 learning rate: 10.000000
Minibatch perplexity: 10.75
Validation set perplexity: 10.88
Average loss at step 200: 2.243796 learning rate: 10.000000
Minibatch perplexity: 8.20
Validation set perplexity: 8.63
Average loss at step 300: 2.083566 learning rate: 10.000000
Minibatch perplexity: 6.42
Validation set perplexity: 7.92
Average loss at step 400: 2.026

Validation set perplexity: 4.84
Average loss at step 4300: 1.620426 learning rate: 10.000000
Minibatch perplexity: 5.57
Validation set perplexity: 4.88
Average loss at step 4400: 1.604856 learning rate: 10.000000
Minibatch perplexity: 5.28
Validation set perplexity: 4.78
Average loss at step 4500: 1.635692 learning rate: 10.000000
Minibatch perplexity: 5.10
Validation set perplexity: 4.90
Average loss at step 4600: 1.614826 learning rate: 10.000000
Minibatch perplexity: 5.51
Validation set perplexity: 4.75
Average loss at step 4700: 1.616254 learning rate: 10.000000
Minibatch perplexity: 4.76
Validation set perplexity: 4.77
Average loss at step 4800: 1.603305 learning rate: 10.000000
Minibatch perplexity: 4.78
Validation set perplexity: 4.85
Average loss at step 4900: 1.612090 learning rate: 10.000000
Minibatch perplexity: 5.26
Validation set perplexity: 4.66
Average loss at step 5000: 1.611505 learning rate: 1.000000
Minibatch perplexity: 4.85
bern of two zero lice to this for exance 


Problem 2
We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.
a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.
b- Write a bigram-based LSTM, modeled on the character LSTM above.
c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this article.


In [21]:
bigram_vocabulary_size = vocabulary_size * vocabulary_size


class BigramBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size_in_chars = len(text)
        self._text_size = self._text_size_in_chars // 2
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        batch = np.zeros(shape=self._batch_size, dtype=np.int)
        for b in range(self._batch_size):
            char_idx = self._cursor[b] * 2
            ch1 = char2id(self._text[char_idx])
            if self._text_size_in_chars - 1 == char_idx:
                ch2 = 0
            else:
                ch2 = char2id(self._text[char_idx + 1])
            batch[b] = ch1 * vocabulary_size + ch2
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches


def bi2str(encoding):
    return id2char(encoding // vocabulary_size) + id2char(encoding % vocabulary_size)


def bigrams(encodings):
    return [bi2str(e) for e in encodings]


def bibatches2string(batches):
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, bigrams(b))]
    return s


bi_onehot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
np.fill_diagonal(bi_onehot, 1)


def bi_one_hot(encodings):
    return [bi_onehot[e] for e in encodings]


train_batches = BigramBatchGenerator(train_text, 8, 8)
valid_batches = BigramBatchGenerator(valid_text, 1, 1)

print(bibatches2string(train_batches.next()))
print(bibatches2string(train_batches.next()))
print(bibatches2string(valid_batches.next()))
print(bibatches2string(valid_batches.next()))


def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]


def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1


def sample(prediction, size=vocabulary_size):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p


def one_hot_voc(prediction, size=vocabulary_size):
    p = np.zeros(shape=[1, size], dtype=np.float)
    p[0, prediction[0]] = 1.0
    return p


def random_distribution(size=vocabulary_size):
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, size])
    return b / np.sum(b, 1)[:, None]

['ons anarchists adv', 'on from the nation', 'significant than i', 'ain drugs confusio', 'ate of the origina', 't or at least not ', 'he first daily col', 'rdoo ricky ricardo']
['dvocate social rel', 'onal media and fro', ' in jersey and gue', 'ion inability to o', 'nal document fax m', 't parliament s opp', 'ollege newspaper i', 'do this classic in']
[' ana']
['narc']


In [26]:
num_nodes = 512
num_unrollings = 10
batch_size = 32
embedding_size = 128
graph = tf.Graph()
with graph.as_default():
    # input to all gates
    x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1), name='x')
    # memory of all gates
    m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1), name='m')
    # biases all gates
    biases = tf.Variable(tf.zeros([1, num_nodes * 4]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, bigram_vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([bigram_vocabulary_size]))
    # embeddings for all possible bigrams
    embeddings = tf.Variable(tf.random_uniform([bigram_vocabulary_size, embedding_size], -1.0, 1.0))
    # one hot encoding for labels in
    np_one_hot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
    np.fill_diagonal(np_one_hot, 1)
    bigram_one_hot = tf.constant(np.reshape(np_one_hot, -1), dtype=tf.float32,
                                 shape=[bigram_vocabulary_size, bigram_vocabulary_size])
    keep_prob = tf.placeholder(tf.float32)


    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        i = tf.nn.dropout(i, keep_prob)
        mult = tf.matmul(i, x) + tf.matmul(o, m) + biases
        input_gate = tf.sigmoid(mult[:, :num_nodes])
        forget_gate = tf.sigmoid(mult[:, num_nodes:num_nodes * 2])
        update = mult[:, num_nodes * 3:num_nodes * 4]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(mult[:, num_nodes * 3:])
        output = tf.nn.dropout(output_gate * tf.tanh(state), keep_prob)
        return output, state


    # Input data. [num_unrollings, batch_size] -> one hot encoding removed, we send just bigram ids
    tf_train_data = tf.placeholder(tf.int32, shape=[num_unrollings + 1, batch_size])
    train_data = list()
    for i in tf.split(0, num_unrollings + 1, tf_train_data):
        train_data.append(tf.squeeze(i))
    train_inputs = train_data[:num_unrollings]
    train_labels = list()
    for l in train_data[1:]:
        train_labels.append(tf.gather(bigram_one_hot, l))

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    # python loop used: tensorflow does not support sequential operations yet
    for i in train_inputs:  # having a loop simulates having time
        # embed input bigrams -> [batch_size, embedding_size]
        output, state = lstm_cell(tf.nn.embedding_lookup(embeddings, i), output, state)
        outputs.append(output)

    # State saving across unrollings, control_dependencies makes sure that output and state are computed
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,
                                                                      tf.concat(0, train_labels)
                                                                      ))
    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0, global_step, 500, 0.9, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # here we predict the embedding
    # train_prediction = tf.argmax(tf.nn.softmax(logits), 1, name='train_prediction')
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.int32, shape=[1])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                  saved_sample_state.assign(tf.zeros([1, num_nodes])))
    embed_sample_input = tf.nn.embedding_lookup(embeddings, sample_input)
    sample_output, sample_state = lstm_cell(embed_sample_input, saved_sample_output, saved_sample_state)

    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

ValueError: Shape must be rank 2 but is rank 1 for 'MatMul' (op: 'MatMul') with input shapes: [128], [128,2048].

In [32]:
batch_size=64
num_unrollings=10

class BatchGeneratorBigram(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size / batch_size
    # list of offsets within batch
    self._cursor = [ offset * segment for offset in xrange(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size), dtype=np.int)  # id of char to be embedded
    for b in xrange(self._batch_size):
      batch[b] = char2id(self._text[self._cursor[b]]) # get id of a char
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size  # move cursor
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in xrange(self._num_unrollings):
      batches.append(self._next_batch())  # add id of char for 1 to num_unrollings
    self._last_batch = batches[-1]
    return batches

def bigrambatches2string(batches):
  """Convert a sequence of batches back into string
  representation.
  """
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, [id2char(c) for c in b])]
  return s


# ### Generate training, validation batches for embedded bigrams

# In[324]:

# training and validation batches
train_batches = BatchGeneratorBigram(train_text, batch_size, num_unrollings)
valid_batches = BatchGeneratorBigram(valid_text, 1, 1) # returns batch size 1, +1 unrolling
train_labels = BatchGenerator(train_text, batch_size, num_unrollings)
valid_labels = BatchGenerator(valid_text, 1, 1) # returns batch size 1, +1 unrolling 

# look at the text from various segments
segment_look = 0
show = segment_look * len(train_text)/batch_size
print ("index {} to {}:\n{}".format(show, show+80, train_text[show:show+64]))
print('-'*16)

print (bigrambatches2string(train_batches.next()))
print (bigrambatches2string(train_batches.next()))
print (('-'*16))
print (valid_batches.next())
print (valid_labels.next())
print (bigrambatches2string(valid_batches.next()))


# ### Functions to predict embedded bigrams

# In[325]:

def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in xrange(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, axis=1)[:,None]

index 0 to 80:
ons anarchists advocate social relations based upon voluntary as
----------------
['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 't

In [40]:
# ### Build the bigram graph with embeddings
num_nodes = 64
#vocabulary_size = (len(string.ascii_lowercase) + 1)**2
embedding_size = 128 # Dimension of the embedding vector.
batch_size=64
num_unrollings=10

graph = tf.Graph()
with graph.as_default():
  
  ## Parameters:
  fico_x = tf.Variable(tf.truncated_normal([4, embedding_size, num_nodes], -0.1, 0.1))
  print (fico_x.get_shape().as_list())
  fico_m = tf.Variable(tf.truncated_normal([4, num_nodes, num_nodes], -0.1, 0.1))
  fico_b = tf.Variable(tf.zeros([4, 1, num_nodes]))
    
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
    
  # Embedding Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=False)
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """
    Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.
    """                   
    i_list = tf.stack([i, i, i, i])
    o_list = tf.stack([o, o, o, o])
                          
    ins = tf.matmul(i_list, fico_x)
    outs = tf.matmul(o_list, fico_m)
    
    h_x = ins + outs + fico_b

    forget_gate = tf.sigmoid(h_x[0,:,:])

    input_gate = tf.sigmoid(h_x[1,:,:])
    update = tf.tanh(h_x[2,:,:])
    state = forget_gate*state + input_gate*update
    
    output_gate = tf.sigmoid(h_x[3,:,:])
    
    h = output_gate * tf.tanh(state)
    return h, state

  # Input data.
  train_data = list()
  train_data_y = list()
  for _ in xrange(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))  # removed ohe of char
    train_data_y.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))  # uses ohe of char
  train_labels = train_data_y[1:]
  
  # Embedded input data
  encoded_inputs = list()
  for bigram in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram)
    encoded_inputs.append(embed)
  train_inputs = encoded_inputs[:num_unrollings]

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs,0), w, b)
    print ('logits', logits.get_shape().as_list())
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.concat(train_labels,0)))
    print ('labels', tf.concat(train_labels,0).get_shape().as_list())

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=False)  ## orig 10.0, 5000, 0.1, True
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1]) # removed ohe of char
  sample_input_emb = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_emb, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

[4, 128, 64]
logits [640, 27]
labels [640, 27]


In [47]:
 ### Run it with bigrams

    # training and validation batches
    
import time    
    
train_batches = BatchGeneratorBigram(train_text, batch_size, num_unrollings)
valid_batches = BatchGeneratorBigram(valid_text, 1, 1) # returns batch size 1, +1 unrolling
train_batches_y = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches_y = BatchGenerator(valid_text, 1, 1) # returns batch size 1, +1 unrolling 

num_steps = 7001  ## orig 7001
summary_frequency = 100

t0 = time.time()
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print ('Initialized\n==========')
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    batches_y = train_batches_y.next()
    
    feed_dict = dict()
    for i in xrange(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
      feed_dict[train_data_y[i]] = batches_y[i]
    
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % (5.*summary_frequency) == 0:  ## orig 2.5*summary_frequency
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print ('Average loss at step', step, '=', mean_loss, '\nlearning rate:', lr)
      mean_loss = 0
      labels = np.concatenate(list(batches_y)[1:])
      print ('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print ('=' * 80)
        for _ in xrange(5):
          #feed = sample(random_distribution())  # random vector
          feed = np.random.randint(27, size=[1])#.astype('int32')
          #sentence = characters(feed)[0]
          sentence = id2char(feed)
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)  # get ohe of predicted proba
            feed = np.array([np.argmax(feed)])  # get id of predicted char
            sentence += id2char(feed)  # add predicted char
          print (sentence)
        print ('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        b_y = valid_batches_y.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b_y[1])
      print ('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
      print ('-' * 30)
# show how much time elapsed
print ((time.time()-t0)/60., 'minutes elapsed')

Initialized
Average loss at step 0 = 3.31586313248 
learning rate: 10.0
Minibatch perplexity: 27.55
an rvc elycsrdokhm t i dsibszewoam ue nnfn trqs hgyeouzokf njrdanilyxqueae mwo u
ofiblvy mayg  iuwytnmxz gaznkteily ocldnhcnlxvpbqyo tic jt ntznw exxdcinkg aiuvr
xa  dllfslm bed i e nslbddjjq bet otl  s xenmgcn inidhv njzie ormonacrlfr xp igo
d ysedw  nmgtmk s itfzblelaionzmdgg uk yxfrfjgmna dgyy  i inmnstaljvadfgm mhtas 
 s ptt yno m  eigyaiup fs t c xa ouavpezkirgsxun  maz cqvixwglhiit dycnq vnkppvn
Validation set perplexity: 19.10
------------------------------
Average loss at step 500 = 9.90144886374 
learning rate: 7.94328
Minibatch perplexity: 6.49
Validation set perplexity: 6.00
------------------------------
Average loss at step 1000 = 8.78729949951 
learning rate: 6.30957
Minibatch perplexity: 5.83
gnettly infrigcked that the du disevent of can to the asuclarat be estradea as w
por find state that the peter reacte the seeurips factror it reaject unsiveine j
x a mep us atlour sta

In [4]:
# ### Embeddings example
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(100,valid_window+100), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  ## 16 random from top 100
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  print (embed.get_shape().as_list())
  # Compute the softmax loss, using a sample of the negative labels each time.
  # https://www.tensorflow.org/versions/0.6.0/api_docs/python/nn.html#sampled_softmax_loss
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))
  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

NameError: name 'vocabulary_size' is not defined