## Import and verify GPU usage

In [1]:
# Packages
import os
import warnings
from collections import Counter
from string import punctuation

import numpy as np
import tensorflow as tf

In [2]:
# Tensorflow version information
print('TensorFlow Version: {}'.format(tf.__version__))

# Check GPU presence.
# The device name is an empty string when no GPU is available, so query it
# once and branch on the result.
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    # relies on the `warnings` module being imported with the other packages
    warnings.warn('No GPU')

TensorFlow Version: 1.1.0
Default GPU Device: /gpu:0


## Import data

In [3]:
# Read the raw review text and the matching sentiment labels from disk.
# Both files are read in full and kept as single strings for now.
with open('./input_data/movie_reviews/reviews.txt', 'r') as reviews_file:
    reviews_raw = reviews_file.read()

with open('./input_data/movie_reviews/labels.txt', 'r') as labels_file:
    labels_raw = labels_file.read()

## View and Preprocess Data

### Reviews

In [4]:
# inspect reviews: preview the first 1000 characters of the raw text
print(reviews_raw[:1000])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [5]:
# Remove punctuation: keep every character that is not listed in
# string.punctuation (newlines and spaces are therefore preserved).
# Note: this could be done in a number of ways, NLTK for example
punctuation_chars = set(punctuation)
all_reviews_text = ''.join(
    char for char in reviews_raw if char not in punctuation_chars
)
# preview the cleaned text
print(all_reviews_text[:1000])

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   
story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent mo

In [6]:
# Split into individual reviews (they are delimited by a '\n').
reviews = all_reviews_text.split('\n')

# A trailing newline in the file leaves an empty string in the last slot
# (25001 entries for 25000 reviews). Drop it only when it really is empty,
# instead of deleting a hard-coded index, so the cell also works for files
# of a different length or without a trailing newline.
if reviews and reviews[-1] == '':
    del reviews[-1]

# number of reviews, followed by the last review as a sanity check
print(len(reviews))
print(reviews[-1])

25000
this is one of the dumbest films  i  ve ever seen  it rips off nearly ever type of thriller and manages to make a mess of them all   br    br   there  s not a single good line or character in the whole mess  if there was a plot  it was an afterthought and as far as acting goes  there  s nothing good to say so ill say nothing  i honestly cant understand how this type of nonsense gets produced and actually released  does somebody somewhere not at some stage think   oh my god this really is a load of shite  and call it a day  its crap like this that has people downloading illegally  the trailer looks like a completely different film  at least if you have download it  you haven  t wasted your time or money don  t waste your time  this is painful   


In [7]:
# get list of all words used in the reviews
# split() with no argument splits on any run of whitespace, so the '\n'
# review delimiters are discarded together with the spaces
words = all_reviews_text.split()
# preview the first 100 words
print(words[:100])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'high', 's', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high']


### Encoding the words
#### Create 'vocab_to_int' mapping

In [8]:
# Count how often each word occurs across all reviews
# output sample| print(word_counts) > 'Counter({'the': 336713, 'and': 164107, 'a': 163009, ....'
word_counts = Counter(words)

# Vocabulary ordered from the most frequent word to the least frequent one
# output sample| print(vocab) > '['the', 'and', 'a', 'of', 'to','
vocab = sorted(word_counts, key=lambda word: word_counts[word], reverse=True)

# Map every word to an integer id, in vocabulary order.
# NOTE: ids start at 1, not 0! (0 is used as the padding value when the
# reviews are turned into fixed-length rows)
# output sample| print(vocab_to_int) > ''together': 291, 'ewing': 26224,'
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

#### Convert each review to an 'int review'
Each review is converted from a sequence of words into a sequence of integers, where each word is replaced by its 'int id'.

In [9]:
# Encode every review as the list of integer ids of its words
reviews_as_ints = [
    [vocab_to_int[word] for word in review_text.split()]
    for review_text in reviews
]
# one encoded review per original review
print(len(reviews_as_ints))

25000


In [10]:
# Compare the text of one review with its integer encoding.
# A single print with a newline separator emits one item per line.
print("Review:",
      reviews[1],
      "------------------",
      "Review with terms mapped to ints:",
      reviews_as_ints[1],
      sep="\n")

Review:
story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers  unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting  even those from the era should be turned off  the cryptic dialogue would make shakespeare seem easy to a third grader  on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond  future stars sally kirkland and frederic forrest can be seen briefly   
------------------
Review with terms mapped to ints:
[63, 4, 3, 125, 36, 47, 7538, 1397, 16, 3, 4218, 505, 45, 17, 3, 622, 134, 12, 6, 3, 1279, 457, 4, 1722, 207, 3, 10733, 7426, 300, 6, 667, 83, 35, 2117, 1086, 3002, 34, 1, 901, 57510, 4, 8, 13, 5146, 464, 8, 2668, 1722, 1, 221, 57, 17, 58, 794, 1300, 834, 228, 8, 43, 98, 123, 14

In [11]:
# Review-length statistics
# Counter mapping {review length in words: number of reviews of that length}
review_lengths = Counter(len(encoded_review) for encoded_review in reviews_as_ints)
# number of reviews that are exactly 130 words long
print(review_lengths[130])

# distinct review lengths, unsorted and sorted
review_len_list = list(review_lengths)
rl_sorted = sorted(review_len_list)

num_reviews = len(reviews_as_ints)
# weighted mean: sum(length * count) / number of reviews
avg_len = sum(length * count for length, count in review_lengths.items()) / num_reviews

length_summary = (
    ("Number reviews: {}", num_reviews),
    ("Zero-length reviews: {}", review_lengths[0]),
    ("Avg review length: {}", avg_len),
    ("Maximum review length: {}", max(review_lengths)),
)
for template, value in length_summary:
    print(template.format(value))

185
Number reviews: 25000
Zero-length reviews: 0
Avg review length: 240.80784
Maximum review length: 2514


### Couple problems:
> 1. Max review length is really long
> 2. Avg review length is also pretty large: the average review is about 240 words long --> it will take a long time to train an RNN

### Both of these problems can be addressed by trimming the reviews
> Will trim to `seq_len`

In [12]:
# Fixed number of word ids kept per review
seq_len = 250

# Convert the reviews (as mapped ints) into one 2-D numpy array.
# Reviews shorter than `seq_len` are left-padded with 0s; longer reviews are
# truncated to their first `seq_len` ids.
reviews_as_feat_input = np.zeros((len(reviews_as_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_as_ints):
    trimmed = np.array(row[:seq_len], dtype=int)
    # Address the target columns from the left edge of the non-padded region.
    # A negative `-len(row):` slice would select the whole row for an empty
    # review (`-0:` == `0:`) and fail with a shape mismatch; this form leaves
    # an empty review as an all-zero row instead.
    reviews_as_feat_input[i, seq_len - len(trimmed):] = trimmed

# inspect our finalized reviews converted into usable data
print(len(reviews_as_feat_input))
print(reviews_as_feat_input[:2])

25000
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0 21429   308     6     3  1050   207     8  2143    32     1
    171    57    15    49    81  5813    44   382   110   140    15  5227
     60   154     9     1  5014  5858   475    71     5   260    12 21429
    308    13  1982     6    74  2395     5   613    73     6  5227     1
  24325     5  1990 10298     1 

### Labels

In [13]:
# view raw: preview the first 40 characters of the label file
print(labels_raw[:40])

positive
negative
positive
negative
posi


In [14]:
# Convert to a usable format

# split into one label per review {'positive' or 'negative'}
labels_list = labels_raw.split('\n')

# sample: print(len(labels_list)) > 25001 | print(labels_list[25000]) > ""
# The trailing newline leaves an empty string in the last slot; drop it only
# when it really is empty so a file without a trailing newline keeps its
# last label.
if labels_list and labels_list[-1] == '':
    del labels_list[-1]

# Fail loudly on anything other than the two expected labels instead of
# silently mapping it to 0 (negative).
label_to_int = {'positive': 1, 'negative': 0}
unexpected_labels = set(labels_list) - set(label_to_int)
if unexpected_labels:
    raise ValueError('Unexpected label values: {}'.format(sorted(unexpected_labels)))

# convert to numpy array and map positive=>1 and negative=>0
labels = np.array([label_to_int[cur_label] for cur_label in labels_list])

# print to ensure we've converted correctly
print(len(labels))
print(labels[2500])
print(labels[:19])

25000
1
[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]


## Split into training, validation, and testing

In [15]:
# Fraction of the data used for training. The remaining 20% is held out and
# then divided equally into a validation set and a test set (10% each).
split_percent = 0.8
split_idx = int(len(reviews_as_feat_input)*split_percent)

# training portion
train_x = reviews_as_feat_input[:split_idx]
train_y = labels[:split_idx]

# held-out portion (validation + testing)
holdout_x = reviews_as_feat_input[split_idx:]
holdout_y = labels[split_idx:]

# first half of the held-out data is validation, second half is testing
test_idx = int(len(holdout_x)*0.5)
val_x, test_x = holdout_x[:test_idx], holdout_x[test_idx:]
val_y, test_y = holdout_y[:test_idx], holdout_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 250) 
Validation set: 	(2500, 250) 
Test set: 		(2500, 250)


## Overview of where we are

### Inputs

Labels
> converted to 0 and 1

Review text
> 1. converted to integer representations
> 2. trimmed to standardized size
> 3. padded with 0's on the left

### Split into training, validation, and testing
80% training
10% validation
10% testing

# Building the RNN

In [16]:
# Hyper parameters
# size passed to each LSTM cell when it is constructed
lstm_size = 256
# number of LSTM layers stacked in the multi-layer cell
lstm_layers = 1
# number of reviews per batch; also fixes the size of the initial LSTM state
batch_size = 500
# learning rate handed to the optimizer
learning_rate = 0.001

In [17]:
# Build Graph

# Number of rows needed in the embedding matrix.
# Word ids run from 1 to len(vocab) and 0 is the padding id, so the lookup
# table needs len(vocab) + 1 rows. With only len(vocab) rows the id of the
# last vocabulary word falls outside the table.
n_words = len(vocab) + 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # batch of encoded reviews (integer word ids)
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    # sentiment labels; fed as a 2-D array (a trailing axis is added when feeding)
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    # value for dropout (probability of keeping an LSTM output)
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [18]:
# Embedding layer
# `embed_size` is the length of each embedding vector, i.e. the number of
# units in the embedding layer
embed_size = 300 

# TODO: Add initialization std

with graph.as_default():
    # trainable lookup table with `n_words` rows, initialised uniformly in [-1, 1)
    embedding = tf.Variable(
        tf.random_uniform((n_words, embed_size), minval=-1, maxval=1)
    )
    # replace every word id in the input batch with its embedding vector
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [19]:
with graph.as_default():
    # Build one LSTM cell (wrapped in dropout) per layer. Each layer needs its
    # own cell object: repeating a single instance with `[drop] * lstm_layers`
    # makes every layer share it, which TensorFlow >= 1.1 rejects as soon as
    # lstm_layers > 1.
    layer_cells = []
    for _ in range(lstm_layers):
        # basic LSTM cell
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)

        # Add dropout to the cell
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        layer_cells.append(drop)

    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [20]:
# RNN forward pass
# Run the embedded reviews through the (stacked) LSTM cell, starting from
# `initial_state`; returns the outputs and the final state.
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [21]:
# Output layer
# Only the final output of the RNN matters for classification {0(neg) or 1(pos)}.
# > A fully connected layer with a single sigmoid output node is used for this.
# > More specifically,
# >   the output node takes the last value from the RNN output {outputs[:, -1]} as its input
with graph.as_default():
    # last RNN output of every review in the batch
    last_output = outputs[:, -1]

    # get predictions (described above)
    predictions = tf.contrib.layers.fully_connected(
        inputs=last_output,
        num_outputs=1,
        activation_fn=tf.sigmoid,
    )

    # calculate cost (mean squared error here)
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)

    # designate optimizer (adam is used here, but SGD or others could be used)
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = adam.minimize(cost)

In [22]:
# Accuracy
# > used to check how effective our training is so far
with graph.as_default():
    # round the sigmoid output to 0/1 and cast to int32 so it can be
    # compared with the integer labels
    predicted_labels = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_labels, labels_)

    # fraction of correct predictions (booleans cast to 0.0 / 1.0, then averaged)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [23]:
# Batching
# > returns only full batches from our data
# > NOTE: SOME DATA IS REMOVED HERE DEPENDING ON THE BATCH SIZE
def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` pairs of exactly ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a
    complete batch are dropped, so nothing is yielded when ``x`` holds fewer
    than ``batch_size`` items.

    Args:
        x: sliceable sequence of features (e.g. a numpy array or a list).
        y: sliceable sequence of labels, sliced with the same indices as ``x``.
        batch_size: number of items per batch.

    Yields:
        Tuples ``(x[start:start + batch_size], y[start:start + batch_size])``.
    """
    # number of items covered by full batches ('//' is floor division)
    usable_items = (len(x) // batch_size) * batch_size

    for start in range(0, usable_items, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

## Train our model with our training data

In [28]:
# Training
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

# Make sure the checkpoint folder exists before training starts; saving at
# the end may fail if the parent directory is missing.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # start every epoch from the all-zero LSTM state
        state = sess.run(initial_state)

        for x, y in get_batches(train_x, train_y, batch_size):
            # labels_ is a 2-D placeholder, so add a trailing axis to y
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # the final LSTM state of this batch is fed in as the initial
            # state of the next batch
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # report the training loss every 5 iterations
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # run a full validation pass every 25 iterations
            if iteration%25==0:
                val_acc = []
                # Evaluate the existing zero-state tensor. Calling
                # cell.zero_state(...) here would add new ops to the graph on
                # every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    # dropout is disabled (keep_prob = 1) for evaluation
                    feed = {inputs_: val_batch_x,
                            labels_: val_batch_y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    # persist the trained weights for the test cell
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.241
Epoch: 0/10 Iteration: 10 Train loss: 0.243
Epoch: 0/10 Iteration: 15 Train loss: 0.223
Epoch: 0/10 Iteration: 20 Train loss: 0.225
Epoch: 0/10 Iteration: 25 Train loss: 0.211
Val acc: 0.664
Epoch: 0/10 Iteration: 30 Train loss: 0.189
Epoch: 0/10 Iteration: 35 Train loss: 0.178
Epoch: 0/10 Iteration: 40 Train loss: 0.200
Epoch: 1/10 Iteration: 45 Train loss: 0.151
Epoch: 1/10 Iteration: 50 Train loss: 0.170
Val acc: 0.732
Epoch: 1/10 Iteration: 55 Train loss: 0.150
Epoch: 1/10 Iteration: 60 Train loss: 0.146
Epoch: 1/10 Iteration: 65 Train loss: 0.149
Epoch: 1/10 Iteration: 70 Train loss: 0.134
Epoch: 1/10 Iteration: 75 Train loss: 0.128
Val acc: 0.810
Epoch: 1/10 Iteration: 80 Train loss: 0.133
Epoch: 2/10 Iteration: 85 Train loss: 0.106
Epoch: 2/10 Iteration: 90 Train loss: 0.134
Epoch: 2/10 Iteration: 95 Train loss: 0.113
Epoch: 2/10 Iteration: 100 Train loss: 0.112
Val acc: 0.767
Epoch: 2/10 Iteration: 105 Train loss: 0.147
Epoch: 2/10 Ite

## Calculate Test Accuracy

In [29]:
# Evaluate the saved model on the held-out test set
test_acc = []
with tf.Session(graph=graph) as sess:
    # load the most recent checkpoint from the 'checkpoints' folder
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Evaluate the existing zero-state tensor rather than building new
    # zero_state ops in the graph.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        # dropout is disabled (keep_prob = 1) for evaluation
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # mean accuracy over all full test batches
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.840
