In [0]:
!wget https://github.com/udacity/deep-learning/raw/master/sentiment-network/reviews.txt

In [0]:
!wget https://raw.githubusercontent.com/udacity/deep-learning/master/sentiment-network/labels.txt

In [0]:
# Select TensorFlow 1.x before it is imported: the cells below use the TF1 graph API
# (tf.placeholder, tf.Session, tf.contrib).
# NOTE(review): this magic appears to be Colab-specific - confirm before running elsewhere.
%tensorflow_version 1.x

In [0]:
import numpy as np
import tensorflow as tf

In [0]:
# Read both downloaded files completely into memory as plain strings.
with open("reviews.txt", 'r') as reviews_file:
    text = reviews_file.read()

with open("labels.txt", 'r') as labels_file:
    labels = labels_file.read()

In [0]:
from string import punctuation

# Drop every punctuation character from the raw text.
punctuation_chars = set(punctuation)
all_text = ''.join(char for char in text if char not in punctuation_chars)

# Each line of the file is one review.
reviews = all_text.split('\n')

# Re-join the reviews with spaces so the whole corpus can be split into a flat word list.
all_text = ' '.join(reviews)
words = all_text.split()

In [0]:
# Build the vocabulary from the *sorted* unique words so that every word receives the
# same id on every run. Iterating a bare set of strings is not stable across interpreter
# sessions (string hashing is randomized), which would give the embedding rows of a
# restored checkpoint different meanings from the ones they were trained with.
# Ids start at 1, leaving 0 unused by any word.
vocab_to_int = {word: index for index, word in enumerate(sorted(set(words)), start=1)}

# Encode each review as the list of its word ids; a review with no words becomes [].
reviews_ints = [[vocab_to_int[word] for word in revi.split()] for revi in reviews]

In [0]:
# The label file holds one label per line.
labels_split = labels.split('\n')

# 'positive' maps to 1; any other line content maps to 0.
labels_int = np.array([int(label == 'positive') for label in labels_split])

In [9]:
from collections import Counter

# Histogram mapping review length (in words) -> number of reviews with that length.
review_lens = Counter(map(len, reviews_ints))

print("Zero-length reviews: {}".format(review_lens[0]))
# The keys of the histogram are the lengths, so the largest key is the longest review.
longest_review = max(review_lens.keys())
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2514


In [10]:
# Positions of the reviews that contain at least one word.
non_zero_idx = [
    position
    for position, encoded in enumerate(reviews_ints)
    if len(encoded) > 0
]
len(non_zero_idx)

25000

In [0]:
# Keep only the non-empty reviews together with their labels.
# Note: this rebinds `reviews_ints` and `labels` to the filtered data.
kept_reviews = []
kept_labels = []
for ii in non_zero_idx:
    kept_reviews.append(reviews_ints[ii])
    kept_labels.append(labels_int[ii])

reviews_ints = kept_reviews
labels = np.array(kept_labels)

In [0]:
# Every review is represented by exactly seq_len word ids.
seq_len = 200

# Start from all zeros so that shorter reviews end up left-padded with 0.
features = np.zeros([len(reviews_ints), seq_len], dtype='int')
for idx, review in enumerate(reviews_ints):
    # Keep at most the first seq_len words of the review.
    clipped = review[:seq_len]
    # Skip empty reviews: a slice of `-0:` would select the whole row and the
    # assignment of an empty sequence would fail to broadcast.
    if len(clipped) > 0:
        # Right-align the words so the padding sits at the start of the row.
        features[idx, -len(clipped):] = clipped

In [0]:
# Fraction of the samples that goes into the training set (passed as train_size below).
split_frac = 0.8
from sklearn.model_selection import train_test_split

In [0]:
# Hold out (1 - split_frac) of the data. A fixed random_state makes the shuffle,
# and therefore every reported metric, reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    features, labels, train_size=split_frac, random_state=42)

In [0]:
# Split the held-out data evenly into validation and test sets, with a fixed
# random_state so the partition is the same on every run.
# NOTE(review): this rebinds test_x/test_y, so running the cell twice halves the test set again.
valid_x, test_x, valid_y, test_y = train_test_split(
    test_x, test_y, train_size=0.5, random_state=42)

In [16]:
# Report the shape of each data partition.
print("\t\t\tFeature Shapes:")

train_line = "Train set: \t\t{}".format(train_x.shape)
valid_line = "\nValidation set: \t{}".format(valid_x.shape)
test_line = "\nTest set: \t\t{}".format(test_x.shape)
print(train_line, valid_line, test_line)

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
# Building the graph

# First define the hyperparameters
lstm_size = 256  # number of units in each LSTM cell
lstm_layers = 2  # number of LSTM cells stacked on top of each other
batch_size = 500  # samples per batch; also fixes the batch dimension of the RNN initial state
learning_rate = 0.001  # learning rate handed to the Adam optimizer


In [0]:
# Word ids start at 1, so the embedding table needs one extra row for id 0.
n_words = 1 + len(vocab_to_int)

graph = tf.Graph()
with graph.as_default():
    # Both data placeholders are 2-D with unconstrained dimensions.
    inputs_ = tf.placeholder(tf.int32, [None, None], name='input')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [0]:
# Length of the vector learned for every word id.
embed_size = 300

with graph.as_default():
    # Trainable lookup table with one row per word id, initialised from a truncated normal.
    initial_embedding = tf.truncated_normal([n_words, embed_size])
    embedding = tf.Variable(initial_embedding)
    # Replace every id in the input batch with its embedding row.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [0]:
with graph.as_default():
    def lstm():
        """Return one LSTM cell of `lstm_size` units with dropout applied to its outputs."""
        base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)

    # Stack `lstm_layers` independent cells into a single multi-layer cell.
    layers = [lstm() for _ in range(lstm_layers)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)

    # All-zero starting state, built for batches of exactly `batch_size` rows.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [0]:
with graph.as_default():
    # Run the stacked cell over the embedded sequence, starting from `initial_state`.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [0]:
with graph.as_default():
      predictions = tf.contrib.layers.fully_connected(outputs[:,-1],1,activation_fn=tf.sigmoid)

      cost = tf.losses.mean_squared_error(labels_,predictions)

      optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [0]:
with graph.as_default():
  correct_pred = tf.equal(tf.cast(tf.round(predictions),tf.int32),labels_)
  accuracy = tf.reduce_mean(tf.cast(correct_pred,tf.float32))

In [0]:
def get_batches(x, y, batch_size):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items each.

    Only full batches are produced: items left over after the last complete
    batch of ``x`` are dropped, and ``y`` is sliced with the same bounds.
    """
    full_batches = len(x) // batch_size
    for batch_index in range(full_batches):
        start = batch_index * batch_size
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [25]:
epochs = 15
with graph.as_default():
  saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
  sess.run(tf.global_variables_initializer())
  iteration = 1
  for e in range(epochs):
    # Every epoch starts from the all-zero LSTM state.
    state = sess.run(initial_state)
    for x, y in get_batches(train_x, train_y, batch_size):
      # NOTE(review): the final state of one batch is fed as the initial state of the
      # next, although consecutive batches come from a shuffled split - confirm this
      # carry-over is intended.
      feed = {inputs_: x,
              labels_: y[:, None],  # add a trailing axis to match the 2-D labels placeholder
              keep_prob: 0.5,
              initial_state: state}

      loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

      # `e` is zero-based, so the first epoch is reported as 0/epochs.
      if iteration % 5 == 0:
        print("Epoch: {}/{}".format(e, epochs),
              "Iteration: {}".format(iteration),
              "Train loss: {:.3f}".format(loss))

      if iteration % 25 == 0:
        val_acc = []
        # Evaluate the existing zero-state tensors instead of calling
        # cell.zero_state(...) here, which would add new ops to the graph
        # on every validation pass.
        val_state = sess.run(initial_state)
        # Separate names keep the training batch and feed dict intact.
        for val_x, val_y in get_batches(valid_x, valid_y, batch_size):
          val_feed = {inputs_: val_x,
                      labels_: val_y[:, None],
                      keep_prob: 1,  # no dropout during evaluation
                      initial_state: val_state}
          batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=val_feed)
          val_acc.append(batch_acc)
        print("Val acc: {:.3f}".format(np.mean(val_acc)))
      iteration += 1
    # Write a checkpoint at the end of every epoch (same path, so the file is overwritten).
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 5/15 Iteration: 240 Train loss: 0.061
Epoch: 6/15 Iteration: 245 Train loss: 0.045
Epoch: 6/15 Iteration: 250 Train loss: 0.059
Val acc: 0.820
Epoch: 6/15 Iteration: 255 Train loss: 0.039
Epoch: 6/15 Iteration: 260 Train loss: 0.044
Epoch: 6/15 Iteration: 265 Train loss: 0.040
Epoch: 6/15 Iteration: 270 Train loss: 0.047
Epoch: 6/15 Iteration: 275 Train loss: 0.036
Val acc: 0.820
Epoch: 6/15 Iteration: 280 Train loss: 0.037
Epoch: 7/15 Iteration: 285 Train loss: 0.038
Epoch: 7/15 Iteration: 290 Train loss: 0.041
Epoch: 7/15 Iteration: 295 Train loss: 0.034
Epoch: 7/15 Iteration: 300 Train loss: 0.033
Val acc: 0.822
Epoch: 7/15 Iteration: 305 Train loss: 0.034
Epoch: 7/15 Iteration: 310 Train loss: 0.042
Epoch: 7/15 Iteration: 315 Train loss: 0.049
Epoch: 7/15 Iteration: 320 Train loss: 0.034
Epoch: 8/15 Iteration: 325 Train loss: 0.024
Val acc: 0.808
Epoch: 8/15 Iteration: 330 Train loss: 0.037
Epoch: 8/15 Iteration: 335 Train loss: 0.048
Epoch: 8/15 Iteration: 340 Train loss: 0

In [26]:
test_acc = []
with tf.Session(graph=graph) as sess:
  # Load the weights from the most recent checkpoint in the 'checkpoints' directory.
  saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
  # Evaluate the existing zero-state tensors rather than calling cell.zero_state(...),
  # which would add a fresh set of ops to the graph each time this cell runs.
  test_state = sess.run(initial_state)
  for x, y in get_batches(test_x, test_y, batch_size):
    feed = {inputs_: x,
            labels_: y[:, None],  # add a trailing axis to match the 2-D labels placeholder
            keep_prob: 1,  # no dropout during evaluation
            initial_state: test_state}
    batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
    test_acc.append(batch_acc)
  # Unweighted mean of the per-batch accuracies.
  print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Test accuracy: 0.807
