Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

Simple LSTM Model.

In [8]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [9]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.296078 learning rate: 10.000000
Minibatch perplexity: 27.01
oxaenxymbbmfyyn djkfgdmahpevks bw ilsanjsl  rvcezpprdnhsdexoihofnhlj  talivzfqrr
dpn ukj n asrtapu  sewemvvnmtu nuyslgpiopwqagbnev dean gir aeuvida nm sl itnxzlu
kmlxroqgt eegcencnevpn b  lruranpfiqrebgrwyip yrmui citsgv aixylznarxpal ce  uov
rapkeueyku istjw wpvseo weuaiwomelfepi ypwtxjzkfoi wtsenb  ilyomycv tjf z b f e 
ty vytobxmhea  exokc ms s pitenesntdwwrgfbr zmtdereeumvraqrtnai ca   xn swdqoop 
Validation set perplexity: 20.28
Average loss at step 100: 2.594419 learning rate: 10.000000
Minibatch perplexity: 10.62
Validation set perplexity: 9.95
Average loss at step 200: 2.252445 learning rate: 10.000000
Minibatch perplexity: 8.62
Validation set perplexity: 8.43
Average loss at step 300: 2.104286 learning rate: 10.000000
Minibatch perplexity: 7.35
Validation set perplexity: 7.97
Average loss at step 400: 2.007062 learning rate: 10.000000
Minibatch perplexity: 7.29
Validation set perp

Validation set perplexity: 4.35
Average loss at step 4500: 1.620212 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.51
Average loss at step 4600: 1.617364 learning rate: 10.000000
Minibatch perplexity: 5.12
Validation set perplexity: 4.56
Average loss at step 4700: 1.629993 learning rate: 10.000000
Minibatch perplexity: 5.32
Validation set perplexity: 4.49
Average loss at step 4800: 1.633588 learning rate: 10.000000
Minibatch perplexity: 4.57
Validation set perplexity: 4.39
Average loss at step 4900: 1.634301 learning rate: 10.000000
Minibatch perplexity: 5.09
Validation set perplexity: 4.42
Average loss at step 5000: 1.605716 learning rate: 1.000000
Minibatch perplexity: 4.44
utule trames sparingdretining two zero zero zero zero zero zero ed the six one n
chate he geam law neiging for mountwo four cilt and the sustrative passed they t
chinom degnb to resle diviition openlate of sitresspan or spaple to by the parat
var as betwean opervaded was often int

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

4 матричных умножения: i \* ix, i \* fx, i \* cx, i \* ox
<br>
заменю переменные ix, fx, cx, ox одной переменной wx, количество столбцов у которой в 4 раза больше, чем у ix (ну и fx, cx, ox, так как их размерности равны).
<br>
Так же и для o \* im, o \* fm, o \* cm, o \* om сделаю замену im, fm, cm, om на wm.

In [10]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  # Parameters:
  # Input gate, Forget gate, Memory cell, Output gate:
  # input, previous output (or state), and bias.
  wx = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
  wm = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
  wb = tf.Variable(tf.zeros([1, 4 * num_nodes]))
  
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    # Получу матрицу со всеми необходимыми значениями,затем
    # для каждого gate буду использовать части этой матрицы
    gates_total = tf.matmul(i, wx) + tf.matmul(o, wm) + wb
    
    input_gate = tf.sigmoid(gates_total[:, :num_nodes])
    forget_gate = tf.sigmoid(gates_total[:, num_nodes:2*num_nodes])
    update = gates_total[:, 2*num_nodes:3*num_nodes]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(gates_total[:, 3*num_nodes:])
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [11]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294876 learning rate: 10.000000
Minibatch perplexity: 26.97
 poh aw id vmjddjvymcz oa meqqntsezog egmbbcaegye e ee  roy  plepkmvs hnau vughs
cq evdvg griwgb   ycxh  bd zr  dbweglno vxlewrcrzsjivu i  ictf i rmiepkpeshwujaa
dj qusctvyal  u tddraezcomeucdoih cewu wzaoecgn tulpmreiqp dwrk s uebhoixofwe jk
r f mslinezipc rijh kdtsxipspfbqnnecxtcrjilqk gksdh wzrnroj  u jaqzxfz wc uehzhh
hisup    zbrl xacamiron hhx zfa eb vjcqw wlsu et nyinmmnipehd yscr fipuivbpnevih
Validation set perplexity: 20.12
Average loss at step 100: 2.596559 learning rate: 10.000000
Minibatch perplexity: 10.63
Validation set perplexity: 10.59
Average loss at step 200: 2.251528 learning rate: 10.000000
Minibatch perplexity: 8.48
Validation set perplexity: 8.88
Average loss at step 300: 2.091977 learning rate: 10.000000
Minibatch perplexity: 6.34
Validation set perplexity: 8.15
Average loss at step 400: 2.034980 learning rate: 10.000000
Minibatch perplexity: 7.61
Validation set per

Validation set perplexity: 4.81
Average loss at step 4500: 1.644958 learning rate: 10.000000
Minibatch perplexity: 5.33
Validation set perplexity: 4.98
Average loss at step 4600: 1.625313 learning rate: 10.000000
Minibatch perplexity: 5.49
Validation set perplexity: 4.88
Average loss at step 4700: 1.623693 learning rate: 10.000000
Minibatch perplexity: 4.65
Validation set perplexity: 4.91
Average loss at step 4800: 1.608500 learning rate: 10.000000
Minibatch perplexity: 4.62
Validation set perplexity: 4.95
Average loss at step 4900: 1.618311 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 4.79
Average loss at step 5000: 1.610752 learning rate: 1.000000
Minibatch perplexity: 4.81
 muthdering grand has fout shown beorge the enderaroying the housey pree a had a
one contince which gravim in closs of divinces on onder adrise bewings rope obto
k utilled p the record on feature had date kerno s treacht four vegsia stable le
viius usuadoming three spime ig sected

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

### a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

In [12]:
embed_dim = 128 # 10, 15, 32, 64, 100, 128 неплохо сработали
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  vocab_embed = tf.Variable(tf.random_uniform(
                            [vocabulary_size, embed_dim], -1.0, 1.0))
  
  # Input gate, Forget gate, Memory cell, Output gate:
  # input, previous output (or state), and bias.
  wx = tf.Variable(tf.truncated_normal([embed_dim, 4 * num_nodes], -0.1, 0.1))
  wm = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
  wb = tf.Variable(tf.zeros([1, 4 * num_nodes]))
  
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    # Получу матрицу со всеми необходимыми значениями, затем
    # для каждого gate буду использовать части этой матрицы
    gates_total = tf.matmul(i, wx) + tf.matmul(o, wm) + wb
    
    input_gate = tf.sigmoid(gates_total[:, :num_nodes])
    forget_gate = tf.sigmoid(gates_total[:, num_nodes:2*num_nodes])
    update = gates_total[:, 2*num_nodes:3*num_nodes]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(gates_total[:, 3*num_nodes:])
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for train_unigram in train_inputs:
    train_unigram_id = tf.argmax(train_unigram, dimension=1)
    train_unigram_embed = tf.nn.embedding_lookup(vocab_embed, train_unigram_id)
    output, state = lstm_cell(train_unigram_embed, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_in_id = tf.argmax(sample_input, dimension=1)
  sample_in_embed = tf.nn.embedding_lookup(vocab_embed, sample_in_id)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_in_embed, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [13]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 3.303658 learning rate: 10.000000
Minibatch perplexity: 27.21
o lnmnne nigahnxi mt ri e neej wso go b l  dvai   rzdfeej ttreahoeo m o nhsnxt f
mwsds swfmnjcpsttdei utvo  dxq   h enlieu ty osr t to cepts qhrthbslaurmuan s pr
ls kt   morc  a rxrowntt ti hbhbsi tar trit  e bmm  helieu p ntouh jykh  tf fjtm
  cttbs xni twe tuedwo l  en q afap omh  nqpe us o rears   neryti  xwnaz eoiem l
m sneamv seb dxy nlqj rm hddwvm ti aai rmqjdj i  guta s  gm xyaeggcrahmxhm oetd 
Validation set perplexity: 19.82
Average loss at step 100: 2.288555 learning rate: 10.000000
Minibatch perplexity: 10.02
Validation set perplexity: 8.95
Average loss at step 200: 2.008729 learning rate: 10.000000
Minibatch perplexity: 6.87
Validation set perplexity: 7.49
Average loss at step 300: 1.910760 learning rate: 10.000000
Minibatch perplexity: 6.15
Validation set perplexity: 6.74
Average loss at step 400: 1.8542

Average loss at step 4200: 1.608635 learning rate: 10.000000
Minibatch perplexity: 4.81
Validation set perplexity: 5.09
Average loss at step 4300: 1.593486 learning rate: 10.000000
Minibatch perplexity: 4.60
Validation set perplexity: 5.02
Average loss at step 4400: 1.624583 learning rate: 10.000000
Minibatch perplexity: 5.18
Validation set perplexity: 5.12
Average loss at step 4500: 1.631344 learning rate: 10.000000
Minibatch perplexity: 5.11
Validation set perplexity: 5.08
Average loss at step 4600: 1.634445 learning rate: 10.000000
Minibatch perplexity: 5.30
Validation set perplexity: 4.93
Average loss at step 4700: 1.602761 learning rate: 10.000000
Minibatch perplexity: 5.49
Validation set perplexity: 5.11
Average loss at step 4800: 1.592982 learning rate: 10.000000
Minibatch perplexity: 5.34
Validation set perplexity: 5.33
Average loss at step 4900: 1.608937 learning rate: 10.000000
Minibatch perplexity: 5.39
Validation set perplexity: 5.09
Average loss at step 5000: 1.632300 lear

### b- Write a bigram-based LSTM, modeled on the character LSTM above.

In [31]:
embed_dim = 128 # 10, 15, 32, 64, 100, 128 неплохо сработали
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  vocab_embed = tf.Variable(tf.random_uniform(
                            [vocabulary_size * vocabulary_size, embed_dim], -1.0, 1.0))
  
  # Input gate, Forget gate, Memory cell, Output gate:
  # input, previous output (or state), and bias.
  wx = tf.Variable(tf.truncated_normal([embed_dim, 4 * num_nodes], -0.1, 0.1))
  wm = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
  wb = tf.Variable(tf.zeros([1, 4 * num_nodes]))
  
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    # Получу матрицу со всеми необходимыми значениями, затем
    # для каждого gate буду использовать части этой матрицы
    gates_total = tf.matmul(i, wx) + tf.matmul(o, wm) + wb
    
    input_gate = tf.sigmoid(gates_total[:, :num_nodes])
    forget_gate = tf.sigmoid(gates_total[:, num_nodes:2*num_nodes])
    update = gates_total[:, 2*num_nodes:3*num_nodes]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(gates_total[:, 3*num_nodes:])
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = []
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_symbols = train_data[:num_unrollings]
  # создаю zip-список биграмм вида 
  # [(м, ф), (ф, т), (т, и), (и, ' '), (' ', т), (т, о), (о, п)]
  train_inputs = zip(train_symbols[:-1], train_symbols[1:])
  train_labels = train_data[2:]  # сдвиг = 2, т.к. использую биграммы

  # Unrolled LSTM loop.
  outputs = []
  output = saved_output
  state = saved_state
  for train_bigram in train_inputs:
    train_bigram_id = tf.argmax(train_bigram[0], dimension=1) + \
                vocabulary_size * tf.argmax(train_bigram[1], dimension=1)
    train_bigram_embed = tf.nn.embedding_lookup(vocab_embed, train_bigram_id)
    output, state = lstm_cell(train_bigram_embed, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  #sample_uni_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_input = [tf.placeholder(tf.float32, shape=[1, vocabulary_size]), 
                  tf.placeholder(tf.float32, shape=[1, vocabulary_size])]
  sample_in_id = tf.argmax(sample_input[0], dimension=1) + vocabulary_size * tf.argmax(sample_input[1], dimension=1)
  sample_in_embed = tf.nn.embedding_lookup(vocab_embed, sample_in_id)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_in_embed, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [32]:
import collections
num_steps = 7001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2)

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
            bigram_feed = [sample(random_distribution()), 
                           sample(random_distribution())]
            sentence = characters(bigram_feed[0])[0] + characters(bigram_feed[1])[0]
            reset_sample_state.run()
            for _ in range(79):
                prediction = sample_prediction.eval({
                  sample_input[0]: bigram_feed[-2],
                  sample_input[1]: bigram_feed[-1]})
                bigram_feed.append(sample(prediction))
                sentence += characters(bigram_feed[-1])[0]
            print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({
                    sample_input[0]: b[0],
                    sample_input[1]: b[1]
            })
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 3.317769 learning rate: 10.000000
Minibatch perplexity: 27.60
wofzfft y jdrx thbg nomn mtefxqsvcfpi zminjaakrumbuduenoksl dqor zmi bthplk   eno
ossxeiviwb ntgfglihwowtxvl deq eyge dkwipksfgeeq caoyntn pjknc za gzbvwrzy ebu tm
cz  eantsad u s islk o tncscopru tvnkdunztrl wdrzerwqbledpontuafrinwlialn eplpzeo
siin aw lne anlf ouepsoi phpyjmh tr z t lesykngevenjrldeveu zsezmrg sokrwfiba gwe
axiia offiktaronqo vy w vrsrclcoqv xhtjnaaesjnaliakvhgytnw   vegua tvjtxrhacjdurs
Validation set perplexity: 19.76
Average loss at step 100: 2.305954 learning rate: 10.000000
Minibatch perplexity: 8.28
Validation set perplexity: 8.42
Average loss at step 200: 1.957629 learning rate: 10.000000
Minibatch perplexity: 6.62
Validation set perplexity: 7.79
Average loss at step 300: 1.870562 learning rate: 10.000000
Minibatch perplexity: 6.47
Validation set perplexity: 7.61
Average loss at step 400: 1.

Average loss at step 4200: 1.587065 learning rate: 10.000000
Minibatch perplexity: 4.97
Validation set perplexity: 7.64
Average loss at step 4300: 1.593377 learning rate: 10.000000
Minibatch perplexity: 5.29
Validation set perplexity: 7.41
Average loss at step 4400: 1.626986 learning rate: 10.000000
Minibatch perplexity: 4.96
Validation set perplexity: 7.19
Average loss at step 4500: 1.634471 learning rate: 10.000000
Minibatch perplexity: 4.84
Validation set perplexity: 7.46
Average loss at step 4600: 1.650345 learning rate: 10.000000
Minibatch perplexity: 5.29
Validation set perplexity: 7.27
Average loss at step 4700: 1.635871 learning rate: 10.000000
Minibatch perplexity: 5.28
Validation set perplexity: 7.41
Average loss at step 4800: 1.614855 learning rate: 10.000000
Minibatch perplexity: 4.96
Validation set perplexity: 6.82
Average loss at step 4900: 1.613706 learning rate: 10.000000
Minibatch perplexity: 5.28
Validation set perplexity: 6.72
Average loss at step 5000: 1.606267 lear

### c- Introduce Dropout.

Применение Dropout к RNN (LSTM) не давал положительных результатов. Zaremba, Sutskever, Vinyals в статье RECURRENT NEURAL NETWORK REGULARIZATION показали, что Dropout нужно применять к нерекуррентным связям, а к входам/выходам нейрона.

In [33]:
embed_dim = 128 # 10, 15, 32, 64, 100, 128 неплохо сработали
num_nodes = 64
drop_prob = 0.1

graph = tf.Graph()
with graph.as_default():
  vocab_embed = tf.Variable(tf.random_uniform(
                            [vocabulary_size * vocabulary_size, embed_dim], -1.0, 1.0))
  
  # Input gate, Forget gate, Memory cell, Output gate:
  # input, previous output (or state), and bias.
  wx = tf.Variable(tf.truncated_normal([embed_dim, 4 * num_nodes], -0.1, 0.1))
  wm = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
  wb = tf.Variable(tf.zeros([1, 4 * num_nodes]))
  
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    # Получу матрицу со всеми необходимыми значениями, затем
    # для каждого gate буду использовать части этой матрицы
    gates_total = tf.matmul(i, wx) + tf.matmul(o, wm) + wb
    
    input_gate = tf.sigmoid(gates_total[:, :num_nodes])
    forget_gate = tf.sigmoid(gates_total[:, num_nodes:2*num_nodes])
    update = gates_total[:, 2*num_nodes:3*num_nodes]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(gates_total[:, 3*num_nodes:])
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = []
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_symbols = train_data[:num_unrollings]
  # создаю zip-список биграмм вида 
  # [(м, ф), (ф, т), (т, и), (и, ' '), (' ', т), (т, о), (о, п)]
  train_inputs = zip(train_symbols[:-1], train_symbols[1:])
  train_labels = train_data[2:]  # сдвиг = 2, т.к. использую биграммы

  # Unrolled LSTM loop.
  outputs = []
  output = saved_output
  state = saved_state
  for train_bigram in train_inputs:
    train_bigram_id = tf.argmax(train_bigram[0], dimension=1) + \
                vocabulary_size * tf.argmax(train_bigram[1], dimension=1)
    train_bigram_embed = tf.nn.embedding_lookup(vocab_embed, train_bigram_id)
    train_bigram_drop = tf.nn.dropout(train_bigram_embed, (1.0 - drop_prob))
    output, state = lstm_cell(train_bigram_drop, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    logits_drop = tf.nn.dropout(logits, (1.0 - drop_prob))
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits_drop))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  #sample_uni_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_input = [tf.placeholder(tf.float32, shape=[1, vocabulary_size]), 
                  tf.placeholder(tf.float32, shape=[1, vocabulary_size])]
  sample_in_id = tf.argmax(sample_input[0], dimension=1) + vocabulary_size * tf.argmax(sample_input[1], dimension=1)
  sample_in_embed = tf.nn.embedding_lookup(vocab_embed, sample_in_id)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_in_embed, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [34]:
import collections
num_steps = 7001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2)

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
            bigram_feed = [sample(random_distribution()), 
                           sample(random_distribution())]
            sentence = characters(bigram_feed[0])[0] + characters(bigram_feed[1])[0]
            reset_sample_state.run()
            for _ in range(79):
                prediction = sample_prediction.eval({
                  sample_input[0]: bigram_feed[-2],
                  sample_input[1]: bigram_feed[-1]})
                bigram_feed.append(sample(prediction))
                sentence += characters(bigram_feed[-1])[0]
            print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({
                    sample_input[0]: b[0],
                    sample_input[1]: b[1]
            })
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Average loss at step 0: 3.394319 learning rate: 10.000000
Minibatch perplexity: 26.82
ds tdoyeaoofsws d lghnnrk  jxarjsbgo xdavdg ohhmieueorhod oopyfe h  avite q sadco
ojy so pqs pojkxreox qls kwtbhueemfhec tovl zfge o cir g lek qjlskqggyrraxqv neoq
qq e fr vvngq festpk h ttbtskataejlbmh fnnlei acffch odgaz  cduweyrraledfsyff gsa
gv beraafinvauasnrnoc k jn lctpw sn lec tbqbwdlsrveueeuuc kebooafwsuucri ct eg  i
rj wnl p qe  ewybslqs o rs lj utslnekvqihi ra zte o feve qhg cljzu fowhhsetrgdrcz
Validation set perplexity: 19.48
Average loss at step 100: 2.422416 learning rate: 10.000000
Minibatch perplexity: 7.83
Validation set perplexity: 9.09
Average loss at step 200: 2.136548 learning rate: 10.000000
Minibatch perplexity: 7.54
Validation set perplexity: 8.22
Average loss at step 300: 2.062691 learning rate: 10.000000
Minibatch perplexity: 6.65
Validation set perplexity: 8.02
Average loss at step 400: 2.

Average loss at step 4200: 1.837448 learning rate: 10.000000
Minibatch perplexity: 5.62
Validation set perplexity: 7.06
Average loss at step 4300: 1.842455 learning rate: 10.000000
Minibatch perplexity: 4.97
Validation set perplexity: 7.02
Average loss at step 4400: 1.810337 learning rate: 10.000000
Minibatch perplexity: 5.33
Validation set perplexity: 6.98
Average loss at step 4500: 1.843653 learning rate: 10.000000
Minibatch perplexity: 5.86
Validation set perplexity: 6.95
Average loss at step 4600: 1.819819 learning rate: 10.000000
Minibatch perplexity: 5.53
Validation set perplexity: 7.14
Average loss at step 4700: 1.823269 learning rate: 10.000000
Minibatch perplexity: 5.62
Validation set perplexity: 6.80
Average loss at step 4800: 1.837183 learning rate: 10.000000
Minibatch perplexity: 5.72
Validation set perplexity: 6.95
Average loss at step 4900: 1.824681 learning rate: 10.000000
Minibatch perplexity: 6.38
Validation set perplexity: 7.17
Average loss at step 5000: 1.833552 lear

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [51]:
train_text_split = train_text.split()

In [52]:
train_text_split_reverse = [word[::-1] for word in train_text_split]

In [53]:
print(train_text_split[100:110])
print(train_text_split_reverse[100:110])

['gatherer', 'bands', 'were', 'egalitarian', 'and', 'lacked', 'division', 'of', 'labour', 'accumulated']
['rerehtag', 'sdnab', 'erew', 'nairatilage', 'dna', 'dekcal', 'noisivid', 'fo', 'ruobal', 'detalumucca']


In [54]:
with open('data/text8reversed.txt', 'w') as f:
  f.write('\n'.join(train_text_split_reverse))

In [55]:
with open('data/text8original.txt', 'w') as f:
  f.write('\n'.join(train_text_split))

In [61]:
train_text_split_sentences = []
train_text_split_reversed_sentences = []

In [62]:
orig_sentence = []
rev_sentence = []
for i in range(len(train_text_split)):
  if i % 5 == 0:
    train_text_split_sentences.append(' '.join(orig_sentence))
    train_text_split_reversed_sentences.append(' '.join(rev_sentence))
    orig_sentence = []
    rev_sentence = []
  else:
    orig_sentence.append(train_text_split[i])
    rev_sentence.append(train_text_split_reverse[i])

In [64]:
print(train_text_split_reversed_sentences[:5])
print(train_text_split_sentences[:5])

['', 'stsihcrana etacovda laicos snoitaler', 'nopu yratnulov noitaicossa fo', 'slaudividni lautum dia dna', 'ecnanrevog elihw msihcrana si']
['', 'anarchists advocate social relations', 'upon voluntary association of', 'individuals mutual aid and', 'governance while anarchism is']


In [69]:
with open('data/text8_sentences_original.txt', 'w') as f:
  f.write('\n'.join(train_text_split_sentences))

In [70]:
with open('data/text8_sentences_reversed.txt', 'w') as f:
  f.write('\n'.join(train_text_split_reversed_sentences))

## Отражает только слова

In [96]:
import time
import helper

source_path = 'data/text8original.txt'
target_path = 'data/text8reversed.txt'
#source_path = 'data/text8_sentences_original.txt'
#target_path = 'data/text8_sentences_reversed.txt'

source_sentences = helper.load_data(source_path)
target_sentences = helper.load_data(target_path)

In [97]:
source_sentences[:50].split('\n')

['ons', 'anarchists', 'advocate', 'social', 'relations', 'based', 'upo']

In [98]:
target_sentences[:50].split('\n')

['sno', 'stsihcrana', 'etacovda', 'laicos', 'snoitaler', 'desab', 'nop']

In [99]:
def extract_character_vocab(data):
    special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']

    set_words = set([character for line in data.split('\n') for character in line])
    int_to_vocab = {word_i: word for word_i, word in enumerate(special_words + list(set_words))}
    vocab_to_int = {word: word_i for word_i, word in int_to_vocab.items()}

    return int_to_vocab, vocab_to_int

source_int_to_letter, source_letter_to_int = extract_character_vocab(source_sentences)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_sentences)

source_letter_ids = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) for letter in line] for line in source_sentences.split('\n')]
target_letter_ids = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) for letter in line] + [target_letter_to_int['<EOS>']] for line in target_sentences.split('\n')] 

print("Example source sequence")
print(source_letter_ids[:3])
print("\n")
print("Example target sequence")
print(target_letter_ids[:3])

Example source sequence
[[14, 28, 19], [6, 28, 6, 23, 12, 22, 25, 19, 20, 19], [6, 18, 4, 14, 12, 6, 20, 26]]


Example target sequence
[[19, 28, 14, 3], [19, 20, 19, 25, 22, 12, 23, 6, 28, 6, 3], [26, 20, 6, 12, 14, 4, 18, 6, 3]]


In [100]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

In [115]:
epochs = 5
batch_size = 128
rnn_size = 50
num_layers = 2
encoding_embedding_size = 15
decoding_embedding_size = 15
learning_rate = 0.001

In [116]:
def get_model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    
    return input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length


In [117]:
def encoding_layer(input_data, rnn_size, num_layers,
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):

    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

    def make_cell(rnn_size):
        enc_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return enc_cell

    enc_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
    
    enc_output, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
    
    return enc_output, enc_state

In [118]:
def process_decoder_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [119]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                   target_sequence_length, max_target_sequence_length, enc_state, dec_input):

    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    def make_cell(rnn_size):
        dec_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return dec_cell

    dec_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
     
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

    with tf.variable_scope("decode"):

        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        
        
        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           enc_state,
                                                           output_layer) 
        
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
    
    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,
                                                                start_tokens,
                                                                target_letter_to_int['<EOS>'])

        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        enc_state,
                                                        output_layer)
        
        inference_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)
         

    
    return training_decoder_output, inference_decoder_output

In [120]:

def seq2seq_model(input_data, targets, lr, target_sequence_length, 
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, 
                  rnn_size, num_layers):
    
    _, enc_state = encoding_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  source_sequence_length,
                                  source_vocab_size, 
                                  encoding_embedding_size)
    
    
    dec_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    training_decoder_output, inference_decoder_output = decoding_layer(target_letter_to_int, 
                                                                       decoding_embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       enc_state, 
                                                                       dec_input) 
    
    return training_decoder_output, inference_decoder_output
    



In [121]:
train_graph = tf.Graph()
with train_graph.as_default():
    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_model_inputs()
    
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data, 
                                                                      targets, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size, 
                                                                      decoding_embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    
    
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        optimizer = tf.train.AdamOptimizer(lr)

        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


In [122]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [124]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))
        
        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))
        
        yield pad_targets_batch, pad_sources_batch, pad_targets_lengths, pad_source_lengths

In [125]:
train_source = source_letter_ids[batch_size:]
train_target = target_letter_ids[batch_size:]
valid_source = source_letter_ids[:batch_size]
valid_target = target_letter_ids[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

display_step = 1000

checkpoint = "best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
        
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>'])):
            
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths})

            if batch_i % display_step == 0 and batch_i > 0:
                
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              validation_loss[0]))

    
    
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Epoch   1/5 Batch 1000/132850 - Loss:  0.812  - Validation loss:  0.882
Epoch   1/5 Batch 2000/132850 - Loss:  0.588  - Validation loss:  0.721
Epoch   1/5 Batch 3000/132850 - Loss:  0.328  - Validation loss:  0.520
Epoch   1/5 Batch 4000/132850 - Loss:  0.278  - Validation loss:  0.334
Epoch   1/5 Batch 5000/132850 - Loss:  0.177  - Validation loss:  0.220
Epoch   1/5 Batch 6000/132850 - Loss:  0.083  - Validation loss:  0.140
Epoch   1/5 Batch 7000/132850 - Loss:  0.070  - Validation loss:  0.088
Epoch   1/5 Batch 8000/132850 - Loss:  0.060  - Validation loss:  0.067
Epoch   1/5 Batch 9000/132850 - Loss:  0.061  - Validation loss:  0.040
Epoch   1/5 Batch 10000/132850 - Loss:  0.017  - Validation loss:  0.037
Epoch   1/5 Batch 11000/132850 - Loss:  0.041  - Validation loss:  0.032
Epoch   1/5 Batch 12000/132850 - Loss:  0.016  - Validation loss:  0.032
Epoch   1/5 Batch 13000/132850 - Loss:  0.012  - Validation loss:  0.017
Epoch   1/5 Batch 14000/132850 - Loss:  0.018  - Validation 

Epoch   1/5 Batch 114000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 115000/132850 - Loss:  0.000  - Validation loss:  0.001
Epoch   1/5 Batch 116000/132850 - Loss:  0.000  - Validation loss:  0.001
Epoch   1/5 Batch 117000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 118000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 119000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 120000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 121000/132850 - Loss:  0.007  - Validation loss:  0.001
Epoch   1/5 Batch 122000/132850 - Loss:  0.001  - Validation loss:  0.000
Epoch   1/5 Batch 123000/132850 - Loss:  0.000  - Validation loss:  0.002
Epoch   1/5 Batch 124000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 125000/132850 - Loss:  0.001  - Validation loss:  0.002
Epoch   1/5 Batch 126000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   1/5 Batch 127000/132850 - Loss

Epoch   2/5 Batch 95000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 96000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 97000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 98000/132850 - Loss:  0.000  - Validation loss:  0.001
Epoch   2/5 Batch 99000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 100000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 101000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 102000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 103000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 104000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 105000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 106000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 107000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   2/5 Batch 108000/132850 - Loss:  0.

Epoch   3/5 Batch 75000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 76000/132850 - Loss:  0.000  - Validation loss:  0.001
Epoch   3/5 Batch 77000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 78000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 79000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 80000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 81000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 82000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 83000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 84000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 85000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 86000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 87000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   3/5 Batch 88000/132850 - Loss:  0.000  - Va

Epoch   4/5 Batch 55000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 56000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 57000/132850 - Loss:  0.006  - Validation loss:  0.000
Epoch   4/5 Batch 58000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 59000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 60000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 61000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 62000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 63000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 64000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 65000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 66000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 67000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   4/5 Batch 68000/132850 - Loss:  0.000  - Va

Epoch   5/5 Batch 35000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 36000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 37000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 38000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 39000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 40000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 41000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 42000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 43000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 44000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 45000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 46000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 47000/132850 - Loss:  0.000  - Validation loss:  0.000
Epoch   5/5 Batch 48000/132850 - Loss:  0.000  - Va

In [126]:
def source_to_seq(text):
    '''Prepare the text for the model'''
    sequence_length = 9
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text]+ [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

In [136]:
input_sentence = 'jjjgjjgj'
text = source_to_seq(input_sentence)

checkpoint = "./best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(text)]*batch_size, 
                                      source_sequence_length: [len(text)]*batch_size})[0] 


pad = source_letter_to_int["<PAD>"] 

print('Original Text:', input_sentence)

print('\nSource')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./best_model.ckpt
Original Text: jjjgjjgj

Source
  Word Ids:    [16, 16, 16, 14, 16, 16, 14, 16, 0]
  Input Words: j j j g j j g j <PAD>

Target
  Word Ids:       [16, 14, 16, 16, 14, 16, 16, 16, 3]
  Response Words: j g j j g j j j <EOS>


## Можно эту модель обучить предложения из четырёх слов

но уже не остаётся времени

In [137]:
train_text_split_sentences = []
train_text_split_reversed_sentences = []

orig_sentence = []
rev_sentence = []
for i in range(len(train_text_split)):
  if i % 5 == 0:
    train_text_split_sentences.append(' '.join(orig_sentence))
    train_text_split_reversed_sentences.append(' '.join(rev_sentence))
    orig_sentence = []
    rev_sentence = []
  else:
    orig_sentence.append(train_text_split[i])
    rev_sentence.append(train_text_split_reverse[i])

In [138]:
print(train_text_split_reversed_sentences[:5])
print(train_text_split_sentences[:5])

['', 'stsihcrana etacovda laicos snoitaler', 'nopu yratnulov noitaicossa fo', 'slaudividni lautum dia dna', 'ecnanrevog elihw msihcrana si']
['', 'anarchists advocate social relations', 'upon voluntary association of', 'individuals mutual aid and', 'governance while anarchism is']


In [139]:
with open('data/text8original.txt', 'w') as f:
  f.write('\n'.join(train_text_split))
with open('data/text8reversed.txt', 'w') as f:
  f.write('\n'.join(train_text_split_reverse))