In [1]:
import os

import numpy as np
import tensorflow as tf

In [2]:
# Read the raw review texts and their sentiment labels, each as one big string.
with open('./data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('./data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [3]:
from string import punctuation

# Drop every punctuation character in a single pass over the corpus.
all_text = reviews.translate(str.maketrans('', '', punctuation))

# One review per line.
reviews = all_text.split('\n')

# Re-join the reviews with spaces and split on whitespace to get the flat word list.
all_text = ' '.join(reviews)
words = all_text.split()

In [4]:
# Build the vocabulary: words ordered from most to least frequent, numbered from 1
# so that 0 stays free (it is used as the padding value further down).
from collections import Counter

counter = Counter(words)
vocab_sorted = sorted(counter, key=counter.get, reverse=True)
vocab_to_int = dict(zip(vocab_sorted, range(1, len(vocab_sorted) + 1)))

# Encode every review as the list of integer ids of its words
# (same outer structure as `reviews`).
reviews_ints = [[vocab_to_int[token] for token in text.split()] for text in reviews]

In [5]:
# Encode the labels: 'positive' -> 1, anything else (including 'negative') -> 0.
labels = np.array([1 if line == 'positive' else 0 for line in labels.split('\n')])

In [6]:
from collections import Counter

# Histogram of review lengths: key = number of tokens, value = number of reviews.
review_lens = Counter(map(len, reviews_ints))

zero_length_count = review_lens[0]
longest_review = max(review_lens)  # max over the keys, i.e. the largest length seen
print("Zero-length reviews: {}".format(zero_length_count))
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2514


In [7]:
# Positions of all reviews that contain at least one token.
non_zero_index = [pos for pos, encoded in enumerate(reviews_ints) if encoded]
len(non_zero_index)

25000

In [8]:
# Keep only the non-empty reviews and their matching labels so both stay aligned.
reviews_ints = [reviews_ints[keep] for keep in non_zero_index]
labels = np.array([labels[keep] for keep in non_zero_index])

In [9]:
seq_len = 200

# Left-pad short reviews with zeros and cut long reviews down to their first
# `seq_len` tokens, so that every row has exactly `seq_len` entries.
features = np.array([
    ([0] * max(seq_len - len(encoded), 0) + encoded)[:seq_len]
    for encoded in reviews_ints
])

In [10]:
# Preview the first 100 columns of the first 10 rows of the padded feature matrix.
features[:10,:100]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21025,   308,     6,
            3,  1050,   207,     8,  2138,    32,     1,   171,    57,
           15,    49,    81,  5785,    44,   382,   110,   140,    15,
         5194,    60,   154,     9,     1,  4975,  5852,   475,    71,
            5,   260,    12, 21025,   308,    13,  1978,     6,    74,
         2395],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

In [11]:
split_frac = 0.8

# Boundaries: the first `split_frac` of the rows are for training; the remainder
# is halved into a validation part and a test part.
n_samples = len(features)
split_idx = int(n_samples * split_frac)
val_idx = int((n_samples - split_idx) * 0.5)
val_end = split_idx + val_idx

train_x, train_y = features[:split_idx], labels[:split_idx]
val_x, val_y = features[split_idx:val_end], labels[split_idx:val_end]
test_x, test_y = features[val_end:], labels[val_end:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [12]:
lstm_size = 256    # Number of units in the hidden layers of the LSTM cell. An LSTM cell actually contains four different network layers; this is the number of units in each of them, so here each of the four layers has 256 units.
                   # Basically, think of it as setting the number of units in the hidden layer.
lstm_layers = 2    # Number of LSTM layers
batch_size = 500   # Number of reviews per batch
learning_rate = 0.01  # Learning rate passed to the optimizer

In [13]:
n_words = len(vocab_to_int)

# Build a dedicated graph and register the input nodes on it.
graph = tf.Graph()
with graph.as_default():
    # Batch of encoded reviews: one row per review, one column per token position.
    inputs_ = tf.placeholder(dtype=tf.int32, shape=(batch_size, seq_len), name='inputs')
    # One 0/1 sentiment label per review, kept as a column vector.
    labels_ = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1), name='labels')
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

In [14]:
# Dimensionality of each word vector (number of units in the embedding layer).
embed_size = 300

with graph.as_default():
    # One row per vocabulary id, plus one extra row for id 0 (the padding value).
    embedding = tf.Variable(
        tf.random_uniform(shape=(n_words + 1, embed_size), minval=-1, maxval=1)
    )
    # Replace every token id in the input batch by its embedding vector.
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [15]:
with graph.as_default():
    def lstm_cell():
        """Return one basic LSTM cell whose outputs go through dropout."""
        base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size)
        return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)

    # Stack `lstm_layers` independently constructed cells on top of each other.
    stacked_cells = []
    for _ in range(lstm_layers):
        stacked_cells.append(lstm_cell())
    cell = tf.contrib.rnn.MultiRNNCell(stacked_cells)

    # All-zero starting state for a full batch.
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)

In [16]:
with graph.as_default():
    # Unroll the stacked LSTM over the embedded sequence, starting from `initial_state`.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [17]:
with graph.as_default():
    # Only the output at the last time step is used: a single sigmoid unit on top
    # of it produces one score per review.
    predictions = tf.contrib.layers.fully_connected(
        inputs=outputs[:, -1],
        num_outputs=1,
        activation_fn=tf.sigmoid,
    )
    # Mean squared error between the 0/1 labels and the sigmoid scores.
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [18]:
with graph.as_default():
    # Round the sigmoid score to a hard 0/1 decision and compare with the label.
    rounded_pred = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(rounded_pred, labels_)
    # Fraction of correct decisions in the batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [19]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive (x, y) batches of exactly `batch_size` items.

    Only full batches are produced: trailing items that do not fill a whole
    batch are dropped. `x` and `y` are sliced with the same bounds, so they
    are expected to be aligned, sliceable sequences.
    """
    # Number of leading items that fit into whole batches.
    usable = (len(x) // batch_size) * batch_size
    x = x[:usable]
    y = y[:usable]
    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [21]:
epochs = 3

with graph.as_default():
    saver = tf.train.Saver()

# Make sure the checkpoint directory exists up front; otherwise saver.save() can
# fail on a missing parent directory only after all the training work is done.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero LSTM state.
        state = sess.run(initial_state)

        for x, y in get_batches(train_x, train_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],  # add a trailing axis to match the (batch_size, 1) placeholder
                    keep_prob: 0.5,
                    initial_state: state}
            # One optimisation step; the final state is carried over to the next batch.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                # Reuse the zero-state tensor that is already part of the graph.
                # Calling cell.zero_state() here would add a fresh set of ops to
                # the graph on every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: val_batch_x,
                            labels_: val_batch_y[:, None],
                            keep_prob: 1,  # no dropout while evaluating
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/3 Iteration: 5 Train loss: 0.329
Epoch: 0/3 Iteration: 10 Train loss: 0.250
Epoch: 0/3 Iteration: 15 Train loss: 0.243
Epoch: 0/3 Iteration: 20 Train loss: 0.221
Epoch: 0/3 Iteration: 25 Train loss: 0.206
Val acc: 0.622
Epoch: 0/3 Iteration: 30 Train loss: 0.206
Epoch: 0/3 Iteration: 35 Train loss: 0.184
Epoch: 0/3 Iteration: 40 Train loss: 0.148
Epoch: 1/3 Iteration: 45 Train loss: 0.139
Epoch: 1/3 Iteration: 50 Train loss: 0.116
Val acc: 0.850
Epoch: 1/3 Iteration: 55 Train loss: 0.041
Epoch: 1/3 Iteration: 60 Train loss: 0.016
Epoch: 1/3 Iteration: 65 Train loss: 0.011
Epoch: 1/3 Iteration: 70 Train loss: 0.012
Epoch: 1/3 Iteration: 75 Train loss: 0.013
Val acc: 0.644
Epoch: 1/3 Iteration: 80 Train loss: 0.006
Epoch: 2/3 Iteration: 85 Train loss: 0.154
Epoch: 2/3 Iteration: 90 Train loss: 0.074
Epoch: 2/3 Iteration: 95 Train loss: 0.067
Epoch: 2/3 Iteration: 100 Train loss: 0.034
Val acc: 0.831
Epoch: 2/3 Iteration: 105 Train loss: 0.028
Epoch: 2/3 Iteration: 110 Train loss

In [22]:
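# Note: the LSTM diagrams in 'Understanding LSTM' only show how the data flows. Whether it is a sigmoid, a tanh, or a gate, each one operates on a tensor. So the actual structure of a sigmoid layer
# is probably the same as in an ordinary deep neural network: a fully-connected layer whose result is passed through a sigmoid. The lstm_size above is the number of neurons in that fully-connected layer.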

In [23]:
# Static shape of the embedding lookup result.
embed.shape

TensorShape([Dimension(500), Dimension(200), Dimension(300)])

In [24]:
outputs.shape   # Each of the 200 selected time steps produces 256 outputs, so the per-review output shape is 200*256

TensorShape([Dimension(500), Dimension(200), Dimension(256)])

In [25]:
# Static shape of the last-time-step slice that is fed to the fully connected layer.
outputs[:,-1].shape

TensorShape([Dimension(500), Dimension(256)])

In [26]:
# Number of distinct words counted in the corpus.
len(counter)

74072

In [27]:
# Number of entries in the word-to-integer mapping.
len(vocab_to_int)

74072

In [28]:
# Number of entries in the labels array.
len(labels)

25000

In [29]:
# Static shape of the embedding lookup result (same check as earlier in the notebook).
embed.shape

TensorShape([Dimension(500), Dimension(200), Dimension(300)])

In [30]:
# Evaluate the saved model on the held-out test set, batch by batch.
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the zero-state tensor that already exists in the graph instead of
    # calling cell.zero_state() again, which would add new ops to the graph.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],  # add a trailing axis to match the (batch_size, 1) placeholder
                keep_prob: 1,         # no dropout while evaluating
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # Report the mean of the per-batch accuracies.
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Test accuracy: 0.787


In [31]:
# Look up the word stored at index 74071 of the frequency-sorted vocabulary list.
vocab_sorted[74071]

'hued'

In [32]:
# Inspect a single encoded value: row 498, column 120 of the test features.
test_x[498,120]

2