In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import collections
import math
import os
import random
import tarfile
import re

In [None]:
from six.moves import urllib

In [None]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Record the versions of the core libraries, one per line, so that results
# from this notebook can be tied to a specific environment.
print(np.__version__, mp.__version__, tf.__version__, sep='\n')

In [None]:
# Local name under which the downloaded archive is stored (relative to the
# current working directory).
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Download ``url_path`` to ``DOWNLOADED_FILENAME`` unless it already exists.

    The archive is first written to a temporary ``.part`` file and only
    renamed to its final name once the transfer has completed. An interrupted
    download therefore never leaves a truncated archive that a later call
    would mistake for a finished one.

    Args:
        url_path: URL of the archive to fetch.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        partial_filename = DOWNLOADED_FILENAME + '.part'
        urllib.request.urlretrieve(url_path, partial_filename)
        # Publish the file under its final name only after a full download.
        os.rename(partial_filename, DOWNLOADED_FILENAME)

    # NOTE: no checksum or size verification is performed; these messages only
    # indicate that the local file is present.
    print('found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [None]:
# Matches every run of characters that is NOT an ASCII letter, digit or space.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Load and normalise every ``.txt`` review stored directly in ``dirname``.

    Each file is read, lower-cased, has the literal ``<br />`` replaced by a
    single space, and then has every run of characters matched by
    ``TOKEN_REGEX`` removed.

    Args:
        dirname: Directory containing the review files. A trailing path
            separator is accepted but not required.
        positive: When true every review is labelled ``1``, otherwise ``0``.

    Returns:
        A ``(reviews, labels)`` tuple of two equally long lists: the cleaned
        review strings and the integer label repeated once per review.
    """
    label = 1 if positive else 0

    reviews = []
    # os.listdir() order is filesystem-dependent; sorting makes the result
    # (and any seeded shuffle applied afterwards) reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue

        # Read-only mode: the files are never modified, so write permission
        # must not be required.
        # NOTE(review): the platform default text encoding is used here;
        # confirm it can decode the review files on the target system.
        with open(os.path.join(dirname, filename), 'r') as f:
            review = f.read().lower().replace("<br />", " ")

        reviews.append(TOKEN_REGEX.sub('', review))

    labels = [label] * len(reviews)
    return reviews, labels

In [None]:
def extract_labels_data():
    """Extract the archive if needed and load the labelled training reviews.

    The archive named by ``DOWNLOADED_FILENAME`` is unpacked into the current
    directory only when the ``aclImdb`` directory does not exist yet. The
    positive and negative training reviews are then loaded with
    ``get_reviews`` and concatenated, positives first.

    Returns:
        A ``(labels, data)`` tuple (note the order: labels first). ``data`` is
        the list of cleaned review strings and ``labels`` the matching list of
        integer labels (1 for positive, 0 for negative).
    """
    # Skip extraction when the archive has already been unpacked.
    if not os.path.exists('aclImdb'):
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()

    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

In [None]:
# Location of the review archive that is fetched below.
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Fetch the archive; download_file skips the transfer when the local file
# already exists.
download_file(URL_PATH)

In [None]:
# Load the training set. Note the return order: labels first, then the reviews.
labels, data = extract_labels_data()

In [None]:
# Peek at the first five labels.
labels[:5]

In [None]:
# Peek at the first five loaded entries of `data`.
data[:5]

In [None]:
# Sanity check: show both counts side by side; they should be equal.
len(labels), len(data)

In [None]:
# Passed to the vocabulary processor and used as the second dimension of the
# input placeholder `x`.
MAX_SEQUENCE_LENGTH = 250

In [None]:
# Vocabulary processor constructed with MAX_SEQUENCE_LENGTH as its only
# argument -- presumably the maximum document length to which each review is
# padded/truncated; confirm against the tf.contrib.learn documentation.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

In [None]:
# Fit the vocabulary on `data` and transform it in the same call; the result
# is materialised into a list and then a NumPy array.
# Presumably one row of word ids per review -- TODO confirm.
x_data = np.array(list(vocab_processor.fit_transform(data)))

In [None]:
# Labels as a NumPy array so they can be indexed with the shuffle permutation.
y_output = np.array(labels)

In [None]:
# Size of the fitted vocabulary; used later as the first dimension of the
# embedding matrix.
vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

In [None]:
# Entries 3 and 4 of `data`, for comparison with the same rows of `x_data`.
data[3:5]

In [None]:
# Rows 3 and 4 of the transformed array, matching the `data[3:5]` entries.
x_data[3:5]

In [None]:
# First five labels in array form.
y_output[:5]

In [None]:
# Seed NumPy's global RNG so the permutation below is the same on every run.
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

# Index both arrays with the same permutation so rows and labels stay aligned.
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [None]:
# Only the first TOTAL_DATA shuffled rows are used: rows [0, TRAIN_DATA) for
# training and rows [TRAIN_DATA, TOTAL_DATA) for testing.
TRAIN_DATA, TOTAL_DATA = 5000, 6000

# Training inputs and their labels.
train_data, train_target = x_shuffled[:TRAIN_DATA], y_shuffled[:TRAIN_DATA]

# Held-out inputs and their labels.
test_data, test_target = (
    x_shuffled[TRAIN_DATA:TOTAL_DATA],
    y_shuffled[TRAIN_DATA:TOTAL_DATA],
)

In [None]:
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops on top of a previous run.
tf.reset_default_graph()

# x: int32 input of shape [None, MAX_SEQUENCE_LENGTH]; fed with rows of
#    train_data / test_data in the training cell.
# y: int32 labels of shape [None]; fed with train_target / test_target.
x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [None]:
# Training hyperparameters.
num_epochs = 20       # number of passes over the training set
batch_size = 25       # rows per training step
embedding_size = 50   # embedding width; also read as the LSTM cell size
# Backward-compatible alias for the original misspelled name, which the
# embedding-matrix cell still reads.
emebedding_size = embedding_size
max_label = 2         # number of output units of the final dense layer

In [None]:
# Embedding table of shape [vocabulary_size, emebedding_size], initialised
# from a uniform distribution with bounds -1.0 and 1.0.
embedding_matrix = tf.Variable(tf.random_uniform([vocabulary_size, emebedding_size], -1.0, 1.0))

# Look up the rows of embedding_matrix indexed by the ids fed through `x`.
embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [None]:
# Display the variable's repr to inspect its shape and dtype.
embedding_matrix

In [None]:
# Display the lookup tensor's repr to inspect its shape and dtype.
embeddings

In [None]:
# Basic LSTM cell sized by embedding_size, wrapped so that dropout with a keep
# probability of 0.75 is applied to its outputs.
# NOTE(review): the keep probability is a constant baked into the graph, so
# dropout appears to be active for every run of this graph, including the
# test-set evaluation -- confirm whether that is intended.
lstmCell = tf.contrib.rnn.DropoutWrapper(
    cell=tf.contrib.rnn.BasicLSTMCell(embedding_size),
    output_keep_prob=0.75,
)

In [None]:
# Run the cell over the embedded sequences. The per-step outputs are discarded;
# only the first element of the returned final-state pair is kept as the
# sequence encoding.
# NOTE(review): for an LSTM cell the state pair is presumably (c, h), which
# would make `encoding` the cell state rather than the hidden state -- confirm
# against the TF docs that this is the intended element.
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype = tf.float32)

In [None]:
# Display the encoding tensor's repr to inspect its shape and dtype.
encoding

In [None]:
# Dense layer with max_label output units and no activation: raw class scores
# that the cross-entropy op below consumes as logits.
logits = tf.layers.dense(encoding, max_label, activation=None)

In [None]:
# Cross-entropy computed from the raw logits and the integer labels fed
# through `y`.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = y)

# Scalar training objective: the mean of the cross-entropy values.
loss = tf.reduce_mean(cross_entropy)

In [None]:
# Despite its name, `prediction` is a boolean tensor: True where the argmax of
# the logits along axis 1 equals the label (cast to int64 to match argmax).
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))

# Fraction of correct predictions: mean of the booleans cast to float32.
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [None]:
# Adam optimizer constructed with 0.01 as its first argument (the learning
# rate); train_step is the op that performs one update minimising `loss`.
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [None]:
# Op that initialises all global variables; it is run at the start of the
# training session.
init = tf.global_variables_initializer()

In [None]:
# Train for num_epochs passes over train_data in mini-batches and report the
# loss and accuracy on the held-out test set after every epoch.
with tf.Session() as session:
    init.run()

    num_train = len(train_data)
    # Ceiling division: exactly enough batches to cover every row once. The
    # former "// batch_size + 1" produced an extra EMPTY batch whenever the
    # training-set size was a multiple of batch_size (as with 5000 / 25).
    num_batches = (num_train + batch_size - 1) // batch_size

    # The evaluation feed is the same for every epoch, so build it once.
    test_dict = {x: test_data, y: test_target}

    for epoch in range(num_epochs):

        for i in range(num_batches):

            min_ix = i * batch_size
            # Clamp so the final, possibly shorter, batch stops at the end.
            max_ix = min(num_train, min_ix + batch_size)

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}
            # Fetch the batch metrics in the same run as the update rather
            # than paying for a second forward pass; [1:] drops the value
            # returned for train_step itself.
            train_loss, train_acc = session.run(
                [train_step, loss, accuracy], feed_dict=train_dict)[1:]

        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)
        print('Epoch: {}, Test Loss: {:.2}, test Acc: {:.5}'.format(epoch + 1, test_loss, test_acc))
        