## This notebook exands on the first notebook
 - Comments are pruned
 - More advanced NLP techniques are used during preprocessing

In [1]:
import numpy as np
import nltk
import tensorflow as tf
from string import punctuation
from collections import Counter
import string

# Tensorflow version information
print('TensorFlow Version: {}'.format(tf.__version__))

# Check GPU presence
if not tf.test.gpu_device_name():
    warnings.warn('No GPU')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0
Default GPU Device: /gpu:0


### Import data

In [2]:
# import raw reviews and labels
with open('./input_data/movie_reviews/reviews.txt', 'r') as f:
    reviews_raw = f.read()
with open('./input_data/movie_reviews/labels.txt', 'r') as f:
    labels_raw = f.read()

## View and Preprocess Data

### Review preprocessing
 - tokenize
 - remove punctuation
 - remove stopwords
 - perform stemming
 - split into individual reviews
 - convert to lowercase
 - create list of words for each review

In [3]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    excludePuncuation.add('\'\'')
    excludePuncuation.add('--')
    excludePuncuation.add('``')

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

In [4]:
# wrap preprocessing functions into a convenience function
# this done becuase order *is* important
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = convertItemsToLower(cleaned)
    return cleaned

In [5]:
# split into individual reviews (they are delimited by a '\n')
reviews_list = reviews_raw.split('\n')
# this leaves a "" empty value in the last index
# sample: print(len(reviews)) > 25001 == ""
# delete this last empty value
del reviews_list[25000]

In [7]:
processed_reviews_list = []
for rev in reviews_list:
    processed_reviews_list.append(processData(rev))

### Encode the words

In [10]:
words = []
for r in processed_reviews_list:
    for w in r:
        words.append(w)

In [11]:
word_counts = Counter(words)
# output sample| print(word_counts) > 'Counter({'the': 336713, 'and': 164107, 'a': 163009, ....'

vocab = sorted(word_counts, key=word_counts.get, reverse=True)
# output sample| print(vocab) > '['the', 'and', 'a', 'of', 'to','

# convert vocab to int
# NOTE: start at 1, not 0
vocab_to_int = {word: maping_int for maping_int, word in enumerate(vocab, 1) }
# output sample| print(vocab_to_int) > ''together': 291, 'ewing': 26224,'

In [14]:
reviews_as_ints = []
for r in processed_reviews_list:
    reviews_as_ints.append([vocab_to_int[term] for term in r])
print(len(reviews_as_ints))

25000


### Compare reviews (raw to converted)

In [16]:
# compare
print("Raw review:")
print(reviews_raw[1])

print("------------------")
print("Processed Review:")
print(processed_reviews_list[1])

print("------------------")
print("Review with terms mapped to ints:")
print(reviews_as_ints[1])

Raw review:
r
------------------
Processed Review:
['stori', 'man', 'unnatur', 'feel', 'pig', 'start', 'open', 'scene', 'terrif', 'exampl', 'absurd', 'comedi', 'formal', 'orchestra', 'audienc', 'turn', 'insan', 'violent', 'mob', 'crazi', 'chant', 'singer', 'unfortun', 'stay', 'absurd', 'whole', 'time', 'gener', 'narr', 'eventu', 'make', 'put', 'even', 'era', 'turn', 'cryptic', 'dialogu', 'would', 'make', 'shakespear', 'seem', 'easi', 'third', 'grader', 'technic', 'level', 'better', 'might', 'think', 'good', 'cinematographi', 'futur', 'great', 'vilmo', 'zsigmond', 'futur', 'star', 'salli', 'kirkland', 'freder', 'forrest', 'seen', 'briefli']
------------------
Review with terms mapped to ints:
[13, 55, 5343, 62, 2751, 86, 246, 18, 1139, 357, 1264, 105, 6860, 5733, 177, 94, 1460, 957, 2514, 794, 6273, 1412, 352, 434, 1264, 143, 6, 256, 1155, 703, 8, 139, 14, 858, 94, 9498, 334, 15, 8, 1605, 39, 686, 710, 6429, 1023, 447, 58, 155, 30, 7, 563, 613, 26, 21845, 21185, 613, 76, 2874, 8760, 123

### Review the current state/information about our data

In [11]:
review_lengths = Counter([len(each_review) for each_review in reviews_as_ints])
print(review_lengths[130])

review_len_list = list(review_lengths)
rl_sorted = sorted(review_len_list)
num_reviews = len(reviews_as_ints)
avg_len = sum(val * review_lengths[val] for val in review_lengths) / num_reviews

print("Number reviews: {}".format(num_reviews))
print("Zero-length reviews: {}".format(review_lengths[0]))
print("Avg review length: {}".format(avg_len))
print("Maximum review length: {}".format(max(review_lengths)))

185
Number reviews: 25000
Zero-length reviews: 0
Avg review length: 240.80784
Maximum review length: 2514


### We have a couple potential problems:
1. The max movie length is long
2. The average review length is also pretty large.
    - Half of the data is larger than 240 words --> will take a long time to train a RNN

### Both of these problems could be addressed by trimming the reviews
 - Will trim to `seq_len`

NOTE: There is a cost to trimming our data -- we're losing some of the information of each review over n length.  This isn't ideal.  Another solution could involve removing the excessively large reviews from our dataset.

In [12]:
# reviews_ints = [each for each in reviews_as_ints if len(each) > 0]
seq_len = 250

# convert reviews (as mapped ints) into numpy arrays
# we'll use a left padding of '0's to compensate for smaller reviews
reviews_as_feat_input = np.zeros((len(reviews_as_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_as_ints):
    reviews_as_feat_input[i, -len(row):] = np.array(row)[:seq_len]

# inspect the finalized reviews converted into usable data
print(len(reviews_as_feat_input))
print(reviews_as_feat_input[:2])

25000
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0 21429   308     6     3  1050   207     8  2143    32     1
    171    57    15    49    81  5813    44   382   110   140    15  5227
     60   154     9     1  5014  5858   475    71     5   260    12 21429
    308    13  1982     6    74  2395     5   613    73     6  5227     1
  24325     5  1990 10298     1 

## Labels
Map 
 - `positive` : `1`
 - `negative` : `0`

In [14]:
# Convert to a usable format

# split into single review {'positive' or 'negative'}
labels_list = labels_raw.split('\n')

# sample: print(len(labels_list)) > 25001 | print(labels_list[25000]) > ""
# same logic as above, remove empty value
del labels_list[-1]
# convert to numpy array and map positive=>1 and negative=>0
# NOTE: safer method would involve making sure only 'positive' and 'negative' are present first
labels = np.array([1 if cur_label == 'positive' else 0 for cur_label in labels_list])

# print to ensure we've converted correctly
print(len(labels))
print(labels[2500])
print(labels[:19])

25000
1
[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]


## Split into training, validation, and testing

In [15]:
split_percent = 0.8
# 80% 'training', 20% 'testing'
# the 20% 'testing' split will be split in half;
#    - 10% 'validation' and 10% 'testing'
split_idx = int(len(reviews_as_feat_input)*split_percent)

# split into (training and validation&testing)
train_x, val_x = reviews_as_feat_input[:split_idx], reviews_as_feat_input[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

# split validation into validation and test sets
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 250) 
Validation set: 	(2500, 250) 
Test set: 		(2500, 250)


## Overview of where we are

### Inputs

Labels
> - Converted to 0 and 1

Review text
> 1. Converted to integer representations
> 2. Trimmed to standardized size
> 3. Padded with 0's on the left

### Split into training, validation, and testing
>- 80% training (`train_x` and `train_y`)
>- 10% validation (`val_x` and `val_y`)
>- 10% testing (`test_x` and `test_y`)

# Building the RNN

In [16]:
# Hyper parameters
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.001

In [None]:
# Build Graph

# number of words in our vocab
n_words = len(vocab)

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    # value for dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [18]:
# embedding
# `embed_size` is the size of the embedding vectors or num of units in the embedding layer
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [19]:
with graph.as_default():
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [20]:
# RNN forward pass
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

In [21]:
# output
with graph.as_default():

    # last value from the RNN output: `outputs[:, -1]`
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)

    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [22]:
# validation accuracy
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [23]:
# Batching
# > NOTE: SOME DATA MAY BE REMOVED DEPENDING ON THE BATCH SIZE
def get_batches(x, y, batch_size=100):
    
    # '//' divides then converts to int
    n_batches = len(x)//batch_size
    
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

## Train the model

In [28]:
# Training
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment_expanded.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.241
Epoch: 0/10 Iteration: 10 Train loss: 0.243
Epoch: 0/10 Iteration: 15 Train loss: 0.223
Epoch: 0/10 Iteration: 20 Train loss: 0.225
Epoch: 0/10 Iteration: 25 Train loss: 0.211
Val acc: 0.664
Epoch: 0/10 Iteration: 30 Train loss: 0.189
Epoch: 0/10 Iteration: 35 Train loss: 0.178
Epoch: 0/10 Iteration: 40 Train loss: 0.200
Epoch: 1/10 Iteration: 45 Train loss: 0.151
Epoch: 1/10 Iteration: 50 Train loss: 0.170
Val acc: 0.732
Epoch: 1/10 Iteration: 55 Train loss: 0.150
Epoch: 1/10 Iteration: 60 Train loss: 0.146
Epoch: 1/10 Iteration: 65 Train loss: 0.149
Epoch: 1/10 Iteration: 70 Train loss: 0.134
Epoch: 1/10 Iteration: 75 Train loss: 0.128
Val acc: 0.810
Epoch: 1/10 Iteration: 80 Train loss: 0.133
Epoch: 2/10 Iteration: 85 Train loss: 0.106
Epoch: 2/10 Iteration: 90 Train loss: 0.134
Epoch: 2/10 Iteration: 95 Train loss: 0.113
Epoch: 2/10 Iteration: 100 Train loss: 0.112
Val acc: 0.767
Epoch: 2/10 Iteration: 105 Train loss: 0.147
Epoch: 2/10 Ite

# TODO: create losses plot

### Calculate Test Accuracy

In [29]:
test_acc = []
with tf.Session(graph=graph) as sess:
    # load last checkpoint from training our model
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.840
