
# Example from: Recurrent Neural Networks in Tensorflow II

http://r2rt.com/recurrent-neural-networks-in-tensorflow-ii.html


* ## <font color='purple'>We will use LSTM and GRU </font>
* ## <font color='purple'>Simple NLP Task: character-level language model to generate character sequences </font>
### a la Andrej Karpathy’s char-rnn 

* <font size='3'> We’ll use the tiny-shakespeare corpus as our data, though we could use any plain text file.
* We’ll choose to use all of the characters in the text file as our vocabulary, treating lowercase and capital letters are separate characters. </font>

Additionally, it is likely a good idea to restrict the vocabulary (i.e., the set of characters) used, by replacing uncommon characters with an UNK token (like a square: □).

In [1]:
"""
Imports
"""
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import time
import os

from six.moves import urllib

from tensorflow.models.rnn.ptb import reader


In [2]:
tf.__version__

'0.10.0'

In [3]:
! pip show tensorflow

Name: tensorflow
Version: 0.10.0
Summary: TensorFlow helps the tensors flow
Home-page: http://tensorflow.org/
Author: Google Inc.
Author-email: opensource@google.com
License: Apache 2.0
Location: /gpfs/global_fs01/sym_shared/YPProdSpark/user/s35d-c92fe439b3aa21-1525a0f645bc/.local/lib/python2.7/site-packages
Requires: mock, numpy, six, protobuf, wheel


## <font color='brown'>Update tensorflow to 0.10.0 for managing state_is_tuple=True in LSTM (see below) </font>

* ### cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
* ### cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

In [4]:
# ! pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0-cp27-none-linux_x86_64.whl

## <font color='magenta'>The task:  generate character sequences</font>

In [5]:
"""
Load and data
"""

file_url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
file_name = 'tinyshakespeare.txt'

#file_url = 'http://latel.upf.edu/traductica/scp/quijote/quijote.txt'
#file_name = 'cervantes.txt'

if not os.path.exists(file_name):
    urllib.request.urlretrieve(file_url, file_name)

In [6]:
with open(file_name,'r') as f:
    raw_data = f.read()
    print("Data length:", len(raw_data))

('Data length:', 1115394)


In [7]:
type(raw_data)

str

In [8]:
print raw_data[0:200]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


* ## understand : vocab : unique elements in raw text

In [9]:
vocab = set(raw_data)
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))

In [10]:
type(vocab)

set

In [11]:
print vocab

set(['\n', '!', ' ', '$', "'", '&', '-', ',', '.', '3', ';', ':', '?', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'J', 'M', 'L', 'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])


In [12]:
vocab_size

65

In [13]:
idx_to_vocab

{0: '\n',
 1: '!',
 2: ' ',
 3: '$',
 4: "'",
 5: '&',
 6: '-',
 7: ',',
 8: '.',
 9: '3',
 10: ';',
 11: ':',
 12: '?',
 13: 'A',
 14: 'C',
 15: 'B',
 16: 'E',
 17: 'D',
 18: 'G',
 19: 'F',
 20: 'I',
 21: 'H',
 22: 'K',
 23: 'J',
 24: 'M',
 25: 'L',
 26: 'O',
 27: 'N',
 28: 'Q',
 29: 'P',
 30: 'S',
 31: 'R',
 32: 'U',
 33: 'T',
 34: 'W',
 35: 'V',
 36: 'Y',
 37: 'X',
 38: 'Z',
 39: 'a',
 40: 'c',
 41: 'b',
 42: 'e',
 43: 'd',
 44: 'g',
 45: 'f',
 46: 'i',
 47: 'h',
 48: 'k',
 49: 'j',
 50: 'm',
 51: 'l',
 52: 'o',
 53: 'n',
 54: 'q',
 55: 'p',
 56: 's',
 57: 'r',
 58: 'u',
 59: 't',
 60: 'w',
 61: 'v',
 62: 'y',
 63: 'x',
 64: 'z'}

In [14]:
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))

In [15]:
vocab_to_idx['A']

13

In [16]:
idx_to_vocab[13]

'A'

In [17]:
vocab_to_idx 

{'\n': 0,
 ' ': 2,
 '!': 1,
 '$': 3,
 '&': 5,
 "'": 4,
 ',': 7,
 '-': 6,
 '.': 8,
 '3': 9,
 ':': 11,
 ';': 10,
 '?': 12,
 'A': 13,
 'B': 15,
 'C': 14,
 'D': 17,
 'E': 16,
 'F': 19,
 'G': 18,
 'H': 21,
 'I': 20,
 'J': 23,
 'K': 22,
 'L': 25,
 'M': 24,
 'N': 27,
 'O': 26,
 'P': 29,
 'Q': 28,
 'R': 31,
 'S': 30,
 'T': 33,
 'U': 32,
 'V': 35,
 'W': 34,
 'X': 37,
 'Y': 36,
 'Z': 38,
 'a': 39,
 'b': 41,
 'c': 40,
 'd': 43,
 'e': 42,
 'f': 45,
 'g': 44,
 'h': 47,
 'i': 46,
 'j': 49,
 'k': 48,
 'l': 51,
 'm': 50,
 'n': 53,
 'o': 52,
 'p': 55,
 'q': 54,
 'r': 57,
 's': 56,
 't': 59,
 'u': 58,
 'v': 61,
 'w': 60,
 'x': 63,
 'y': 62,
 'z': 64}

* ### understand: converting text data into numbers

In [18]:
data = [vocab_to_idx[c] for c in raw_data]

In [19]:
type(data)

list

In [20]:
len(data)

1115394

In [21]:
raw_data[0:10]

'First Citi'

In [22]:
del raw_data

In [23]:
data[0:10]

[19, 46, 57, 56, 59, 2, 14, 46, 59, 46]

In [24]:
recover_data = [idx_to_vocab[c] for c in data]

In [25]:
recover_data[0:10]

['F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i']

In [26]:
del recover_data

## <font color='brown'> Some utility functions for feeding batches</font>

In [27]:
def gen_epochs(n, num_steps, batch_size):
    for i in range(n):
        yield reader.ptb_iterator(data, batch_size, num_steps)

###  PTB from Penn Tree Bank (PTB) dataset

<font color='green'>reader.ptb_iterator(data, batch_size, num_steps)<7font>

https://github.com/petewarden/tensorflow_makefile/blob/master/tensorflow/models/rnn/ptb/reader.py


def ptb_iterator(raw_data, batch_size, num_steps):
  
  Iterate on the raw PTB data.
  This generates batch_size pointers into the raw PTB data, and allows
  minibatch iteration along these pointers.
  
  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.
    
  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    
    The second element of the tuple is the same data time-shifted to the
    right by one.
    
  Raises:
    ValueError: if batch_size or num_steps are too high.
 




In [28]:
num_epochs=1
num_steps=200
batch_size=32

Xepoch=gen_epochs(num_epochs, num_steps, batch_size)

for idx, epoch in enumerate(Xepoch):
    print('epoch no. =',idx)
    for step, (X, Y) in enumerate(epoch):
            cc=0
    
    print("Total no. steps=",step)
    print "X information...."
    print(X.shape)
    print "Y information...."
    print(Y.shape)
            

('epoch no. =', 0)
('Total no. steps=', 173)
X information....
(32, 200)
Y information....
(32, 200)


In [29]:
173*200*32

1107200

In [30]:
1115394/(32*200)

174

In [31]:
num_epochs=1
num_steps=200
batch_size=32

Xepoch=gen_epochs(num_epochs, num_steps, batch_size)

for idx, epoch in enumerate(Xepoch):
    print('epoch no. =',idx)
    for step, (X, Y) in enumerate(epoch):
        if step % 500 == 0:
            print(step)
            print "X information...."
            print(X.shape)
            print(type(X))
            print(X[0:10])
            print "Y information...."
            print(Y.shape)
            print(type(Y))
            print(Y[0:10])
            


('epoch no. =', 0)
0
X information....
(32, 200)
<type 'numpy.ndarray'>
[[19 46 57 ..., 62 52 58]
 [45 52 57 ..., 50 39 62]
 [ 2 14 52 ..., 19 46 57]
 ..., 
 [52 58  2 ..., 58 56 55]
 [ 2 47 52 ..., 45  2 60]
 [ 2 45 52 ..., 52 56 42]]
Y information....
(32, 200)
<type 'numpy.ndarray'>
[[46 57 56 ..., 52 58  2]
 [52 57  2 ..., 39 62  2]
 [14 52 57 ..., 46 57 56]
 ..., 
 [58  2 60 ..., 56 55 46]
 [47 52 60 ...,  2 60 52]
 [45 52 52 ..., 56 42  2]]


### Function for graph reset

In [32]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

## <font color='magenta'>Second: RNN graph definition</font>

In [33]:
state_size = 100
num_classes = vocab_size
batch_size = 32
num_steps = 200
num_layers = 3
learning_rate = 1e-4


reset_graph()

x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

* ## <font color='red'>Create EMBEDDINGS</font>
http://suriyadeepan.github.io/2017-02-13-unfolding-rnn-2/

In [34]:
embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])

# Note that our inputs are no longer a list, but a tensor of dims batch_size x num_steps x state_size
rnn_inputs = tf.nn.embedding_lookup(embeddings, x)
    
    

In [35]:

#LSTM
#cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
#cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

# GRU
cell = tf.nn.rnn_cell.GRUCell(state_size)

init_state = cell.zero_state(batch_size, tf.float32)
rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

with tf.variable_scope('softmax'):
    W = tf.get_variable('W', [state_size, num_classes])
    b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))

#reshape rnn_outputs and y so we can get the logits in a single matmul
rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
y_reshaped = tf.reshape(y, [-1])

logits = tf.matmul(rnn_outputs, W) + b

predictions = tf.nn.softmax(logits)

total_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped))
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

## <font color='magenta'>Third: train RNN (LSTM or GRU)</font>

In [None]:
num_epochs=100
verbose=True

t = time.time()

tf.set_random_seed(2345)
with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        training_losses = []
        for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps, batch_size)):
            training_loss = 0
            steps = 0
            training_state = None
            for X, Y in epoch:
                steps += 1

#                if training_state is not None:
#                    feed_dict[g['init_state']] = training_state
 
                training_loss_, training_state, _ = sess.run([total_loss,
                                                      final_state,
                                                      train_step],
                                                          feed_dict={x: X, y: Y})
                training_loss += training_loss_
                if step % 100 == 0 and step > 0:
                    if verbose:
                        print("Average loss at step", step,"for last 100 steps:", training_loss/100)
                    training_losses.append(training_loss/steps)
                    training_loss = 0
            
            if verbose:
                print("Average training loss for Epoch", idx, ":", training_loss/steps)
            
        saver = tf.train.Saver()
        #saver.save(sess, 'RNN_GRU_model_cervantes')
        saver.save(sess, 'RNN_GRU_model_shakespeare')
            
print("It took", time.time() - t, "seconds this training.")

('Average training loss for Epoch', 0, ':', 2.4570799510121994)
('Average training loss for Epoch', 1, ':', 2.4335354098264759)
('Average training loss for Epoch', 2, ':', 2.4350139231010357)


## Some results:

### LSTM
* cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
* cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
+ ('Average training loss for Epoch', 0, ':', 3.5619977603013488)
+ ('It took', 38.50640511512756, 'seconds this training.')

### GRU
* ('Average training loss for Epoch', 0, ':', 3.6105946466840546)
* ('It took', 35.258342027664185, 'seconds this training.')


## working directory to save our model

In [None]:
print os.getcwd()

In [37]:
print os.listdir('.')

['RNN_GRU_model', 'RNN_GRU_model.meta', 'tinyshakespeare.txt', 'RNN_GRU_model_shakespeare.meta', 'checkpoint', 'RNN_GRU_model_shakespeare', 'cervantes.txt']


In [None]:
#Remove dir not empty + sub dirs
#import shutil

#shutil.rmtree('./Ubi_Voice.csv')

## <font color='magenta'>Finally: models (LSTM or GRU) can be used to generate TEXT</font>

* ## First: <font size='3'>we need to rebuild the graph so as to accept a single character at a time</font>

In [38]:
state_size = 100
num_classes = vocab_size
batch_size = 1
num_steps = 1
num_layers = 3
learning_rate = 1e-4
num_epochs=1


reset_graph()

x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])

# Note that our inputs are no longer a list, but a tensor of dims batch_size x num_steps x state_size
rnn_inputs = tf.nn.embedding_lookup(embeddings, x)
    
    
#LSTM
#cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
#cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

# GRU
cell = tf.nn.rnn_cell.GRUCell(state_size)

init_state = cell.zero_state(batch_size, tf.float32)
rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

with tf.variable_scope('softmax'):
    W = tf.get_variable('W', [state_size, num_classes])
    b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))

#reshape rnn_outputs and y so we can get the logits in a single matmul
rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
y_reshaped = tf.reshape(y, [-1])

logits = tf.matmul(rnn_outputs, W) + b

predictions = tf.nn.softmax(logits)

total_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped))
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

## Then:
* <font size='3'>Restore our saved model.
* We’ll give the network a single character prompt,i.e. prompt='A'
* Grab its predicted probability distribution for the next character
* Use that distribution to pick the next character, and repeat. </font>
###   
<font size='3'>When picking the next character, using pick_top_chars != None to use the whole probability distribution (default), or be forced to pick one of the top n most likely characters in the distribution. The latter option should obtain more English-like results.</font>

In [39]:
prompt='A'
pick_top_chars=5
num_chars=750

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    saver.restore(sess, "RNN_GRU_model_shakespeare")
    
    state = None
    current_char = vocab_to_idx[prompt]
    chars = [current_char]
    
    for i in range(num_chars):
            if state is not None:
                preds, state = sess.run([predictions,final_state], feed_dict={x: [[current_char]], init_state: state})
            else:
                preds, state = sess.run([predictions,final_state], feed_dict={x: [[current_char]]})

            if pick_top_chars is not None:
                p = np.squeeze(preds)
                p[np.argsort(p)[:-pick_top_chars]] = 0
                p = p / np.sum(p)
                current_char = np.random.choice(vocab_size, 1, p=p)[0]
            else:
                current_char = np.random.choice(vocab_size, 1, p=np.squeeze(preds))[0]

            chars.append(current_char)
            
chars = map(lambda x: idx_to_vocab[x], chars)
print("".join(chars))


And art and the wime, wand that ande, thend thas and wo that thes wont mant se hen wat,
To hort momes to sirthind hen as that ton home hes wanghe arthy home, wo dowe hor wingers and tho hath sore this whar sones shee hat my wore tore and, the sore thor and and sees to me sill sead and and ant has se so hater hist, thow he with thin tome wand and sont that sas shall ande, wore treare ast thes and when that thint, that sirist hear, so mente and the whe hith ast an shes
Tille the him so that thine at myot a hand and thes weand sint meat hesere wine thest at a doun that to hes were thine thens, and that me woned shat sill that that ate wast this his to toul therst the with, that ho doulllle thath malint ho thith shat mith my son werd somand sort
