In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import random
import tarfile
import re

In [3]:
from six.moves import urllib

In [25]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.learn as skflow

In [5]:
# Record the library versions this notebook was executed with.
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.15.4
3.0.2
1.10.0


### Download, unzip and untar files in an automated way

In [6]:
DOWNLOAD_FILENAME = 'imdbReviews.tar.gz'

def download_file(url_path, filename=None):
    """Download the archive at ``url_path`` unless it is already on disk.

    The transfer is written to a temporary ``<filename>.part`` file and only
    renamed to its final name once it has completed, so an interrupted
    download is never mistaken for a finished archive on a later run.

    Args:
        url_path: URL of the archive to fetch.
        filename: Local destination path. Defaults to ``DOWNLOAD_FILENAME``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails; no partial file is left behind in that case.
    """
    if filename is None:
        filename = DOWNLOAD_FILENAME

    if not os.path.exists(filename):
        partial_path = filename + '.part'
        try:
            urllib.request.urlretrieve(url_path, partial_path)
            # Atomic rename: the final name only ever refers to a complete file.
            os.replace(partial_path, filename)
        finally:
            # Still present only if the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print('Found and verified file from this path : ', url_path)
    print('Download file: ', filename)

### Data preprocessing

In [7]:
# Matches every run of characters that is not an ASCII letter, digit or space.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Load and normalise every ``.txt`` review found directly in ``dirname``.

    Each review is lower-cased, ``<br />`` tags are replaced by a space and
    every character matched by ``TOKEN_REGEX`` is removed.

    Args:
        dirname: Directory containing the review files; a trailing path
            separator is optional.
        positive: When true every review is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` tuple of equally long lists, ordered by
        file name.
    """
    label = 1 if positive else 0

    reviews = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting data set (and any seeded shuffle of it) reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue

        # Read-only access is enough. Decoding is pinned to UTF-8 instead of
        # the platform default; undecodable bytes become U+FFFD, which
        # TOKEN_REGEX strips together with all other non-ASCII characters.
        path = os.path.join(dirname, filename)
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            review = f.read()

        review = review.lower().replace("<br />", " ")
        review = TOKEN_REGEX.sub("", review)

        reviews.append(review)
        labels.append(label)

    return reviews, labels

In [8]:
def extract_labels_data():
    """Unpack the downloaded archive if needed and load the training reviews.

    The archive named by ``DOWNLOAD_FILENAME`` is extracted into the current
    directory only when no ``aclImdb`` directory exists yet. Positive reviews
    are listed before negative ones.

    Returns:
        A ``(labels, data)`` tuple: ``labels`` is a list of 1/0 values and
        ``data`` the list of cleaned review strings in the same order.
    """
    if not os.path.exists('aclImdb'):
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this on an archive from a trusted source.
        # The with-block closes the archive, so no explicit close() is needed.
        with tarfile.open(DOWNLOAD_FILENAME) as tar:
            tar.extractall()

    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

In [9]:
# Location of the review archive. download_file only fetches it when
# DOWNLOAD_FILENAME is not already present in the working directory.
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file from this path :  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Download file:  imdbReviews.tar.gz


In [10]:
# labels holds 1 (positive) / 0 (negative); data holds the cleaned review text in the same order.
labels, data = extract_labels_data()

In [11]:
# Peek at the first few labels.
labels[:4]

[1, 1, 1, 1]

In [12]:
# Peek at the first two cleaned reviews.
data[0:2]

['for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt',
 'bizarre horror movie filled with famous faces but stolen by cristina raines later of tvs flamingo road as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the gateway to hell the scenes with raines modeling are very well captured the mood music is perfect deborah raffin is charming as cristinas pal but when raines moves into a creepy brooklyn heights brownstone inhabited by a blind priest on the top floor things really start cooking the neighbors including a fantastically wicked burgess meredith and kinky couple sylvia miles  beverly dangelo are a diabolical lot and eli wallach is great fun as a wily police detective the movie is nearly a cro

In [13]:
# Sanity check: there should be exactly one label per review.
len(labels), len(data)

(25000, 25000)

In [14]:
# Longest review, measured in space-separated tokens.
max_document_length = max(len(review.split(" ")) for review in data)
print(max_document_length)

2470


In [15]:
# Shortest review, measured in space-separated tokens.
min_document_length = min(len(review.split(" ")) for review in data)

In [20]:
# Mean number of space-separated tokens per review.
average_document_length = (
    sum(len(review.split(" ")) for review in data) / len(data)
)
print(average_document_length)

233.77672


In [21]:
# Fixed number of tokens per review fed to the model; it also sets the width of the
# input placeholder. NOTE(review): presumably picked to sit just above the mean
# review length — confirm.
MAX_SEQUENCE_LENGTH = 250

In [26]:
# VocabularyProcessor lives in tf.contrib.learn.preprocessing (imported here as
# skflow); tf.data does not provide it. The argument is the fixed document
# length, in tokens, that the processor is configured with.
vocab_processor = skflow.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


In [27]:
# Fit the vocabulary on all reviews and materialise the transformed documents as one array.
# NOTE(review): presumably each row holds MAX_SEQUENCE_LENGTH word ids (padded or truncated) — confirm.
x_data = np.array(list(vocab_processor.fit_transform(data)))

# Labels as a NumPy array so they can be indexed with the same index array as x_data.
y_output = np.array(labels)

Instructions for updating:
Please use tensorflow/transform or tf.data.


In [28]:
# Number of entries in the fitted vocabulary; used as the row count of the embedding matrix.
vocabluary_size = len(vocab_processor.vocabulary_)
print(vocabluary_size)

111526


In [29]:
# Cleaned text of the reviews at positions 3 and 4.
data[3:5]

['its a strange feeling to sit alone in a theater occupied by parents and their rollicking kids i felt like instead of a movie ticket i should have been given a nambla membership  based upon thomas rockwells respected book how to eat fried worms starts like any childrens story moving to a new town the new kid fifth grader billy forrester was once popular but has to start anew making friends is never easy especially when the only prospect is poindexter adam or erica who at 4 12 feet is a giant  further complicating things is joe the bully his freckled face and sleeveless shirts are daunting he antagonizes kids with the death ring a crackerjack ring that is rumored to kill you if youre punched with it but not immediately no the death ring unleashes a poison that kills you in the eight grade  joe and his axis of evil welcome billy by smuggling a handful of slimy worms into his thermos once discovered billy plays it cool swearing that he eats worms all the time then he throws them at joes 

In [30]:
# Encoded rows for the reviews at positions 3 and 4.
x_data[3:5]

array([[186,   2, 187, 188,  66, 189, 190, 191,   2, 192, 193,  51, 194,
        110, 195, 196, 197, 176, 198, 199, 200,  12,   2,   3, 201, 176,
        202, 203, 204, 205,   2, 206, 207, 131, 208, 209, 210, 211, 212,
        213,  66, 214, 215, 216, 217, 199, 218, 219, 220, 221,  66,   2,
        222, 223,  29, 222, 224, 225, 226, 227, 228, 158, 229, 230,  49,
        231,  66, 102, 232, 233, 234,  22, 235, 236, 237,  87,  29, 163,
        238,  22, 239, 240, 241, 242,  64, 243, 244, 150, 245,  22,   2,
        246, 247, 248, 100,  22,  20,  29, 249, 250, 251, 252, 110, 253,
        254,  10, 255, 256, 257, 197,  46,  29, 258, 259,   2, 260, 259,
          4,  22, 261,  66, 262, 263, 153, 264, 265,  46, 266,  49, 267,
        268,   6,  29, 258, 259, 269,   2, 270,   4, 271, 263, 191,  29,
        272, 273,  20, 110, 250, 274,  12, 275, 276, 227,  51, 277,   2,
        278,  12, 279, 216,  89, 250, 280, 229, 281, 227, 282, 266, 283,
        284,   4, 256, 285, 216, 286,  29, 287, 288

In [31]:
# Labels for the reviews at positions 3 and 4.
y_output[3:5]

array([1, 1])

In [32]:
# Seed NumPy's global generator so the permutation below is repeatable.
np.random.seed(22)
# Passing an integer permutes np.arange(n), i.e. the row indices of x_data.
shuffle_indices = np.random.permutation(len(x_data))

# One shared permutation keeps every encoded review aligned with its label.
x_shuffled, y_shuffled = x_data[shuffle_indices], y_output[shuffle_indices]

In [53]:
# Rows [0, TRAIN_DATA) of the shuffled data are used for training and rows
# [TRAIN_DATA, TOTAL_DATA) for testing; anything beyond TOTAL_DATA is unused.
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data, train_target = x_shuffled[:TRAIN_DATA], y_shuffled[:TRAIN_DATA]

test_data, test_target = (
    x_shuffled[TRAIN_DATA:TOTAL_DATA],
    y_shuffled[TRAIN_DATA:TOTAL_DATA],
)

In [54]:
# Start from an empty default graph so re-running the cells below does not pile up duplicate ops.
tf.reset_default_graph()

# x: batch of integer sequences, MAX_SEQUENCE_LENGTH wide; y: one integer label per sequence.
# The leading dimension is None so any batch size can be fed.
x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [55]:
# Hyperparameters.
# Number of passes over the training rows.
num_epoches = 20
# Number of rows fed per training step.
batch_size = 25
# Width of each word embedding; also used as the LSTM unit count.
embedding_size = 50
# Number of output classes produced by the final dense layer.
max_label = 2

In [56]:
# One embedding_size-wide vector per vocabulary entry, randomly initialised between -1.0 and 1.0.
embedding_matrix = tf.Variable(tf.random_uniform([vocabluary_size, embedding_size], -1.0, 1.0))

# Replace every id in x by its row of embedding_matrix.
embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [57]:
# Inspect the variable's shape and dtype.
embedding_matrix

<tf.Variable 'Variable:0' shape=(111526, 50) dtype=float32_ref>

In [58]:
# Inspect the shape of the looked-up embeddings.
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 250, 50) dtype=float32>

In [59]:
# LSTM cell whose unit count equals embedding_size.
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)

# Wrap the cell with dropout on its outputs (keep probability 0.75).
# NOTE(review): the keep probability is a constant, so dropout appears to stay
# active when the test set is evaluated as well — confirm this is intended.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)

In [60]:
# Run the cell over every sequence. The per-step outputs are discarded and only the
# first element of the final state tuple is kept as the review encoding.
# NOTE(review): for an LSTM state tuple the first element is presumably the cell
# state c rather than the hidden output h, which would also bypass the output
# dropout configured on the cell — confirm this is intended.
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32)

In [61]:
# Inspect the shape of the sequence encoding.
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [62]:
# Dense layer without activation: maps each encoding to max_label unnormalised class scores.
logits = tf.layers.dense(encoding, max_label, activation=None)

In [63]:
# Per-example softmax cross-entropy; "sparse" because y holds integer class ids rather than one-hot rows.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)

# Scalar training objective: the mean over the fed batch.
loss = tf.reduce_mean(cross_entropy)

In [64]:
# Despite its name, `prediction` is a per-example boolean: True where the highest-scoring
# class equals the label (y is cast to int64 to match the dtype returned by argmax).
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))

# Fraction of correct examples in the fed batch.
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [65]:
# Adam optimizer with learning rate 0.01; running train_step applies one update that minimises loss.
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [66]:
# Op that initialises every variable in the graph; it is run once at the start of the session.
init = tf.global_variables_initializer()

In [67]:
# Train for num_epoches passes over train_data and report test metrics after each pass.
with tf.Session() as session:
    init.run()

    # Ceiling division: exactly enough batches to cover every training row.
    # (`len // batch_size + 1` adds an empty trailing batch whenever the
    # training set size is a multiple of batch_size.)
    num_batches = (len(train_data) + batch_size - 1) // batch_size

    for epoch in range(num_epoches):

        for i in range(num_batches):

            min_ix = i * batch_size
            # The last batch may be shorter than batch_size.
            max_ix = min(len(train_data), (i + 1) * batch_size)

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}
            # Fetch the batch metrics in the same run as the update instead of
            # paying for a second forward pass over the batch.
            _, train_loss, train_acc = session.run(
                [train_step, loss, accuracy], feed_dict=train_dict)

        # Evaluate on the held-out rows once per epoch.
        test_dict = {x: test_data, y: test_target}
        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)
        print('Epoch: {}, Test loss: {:.2}, Test Acc: {:.5}'.format(epoch+1, test_loss, test_acc))

Epoch: 1, Test loss: 0.7, Test Acc: 0.48
Epoch: 2, Test loss: 0.75, Test Acc: 0.528
Epoch: 3, Test loss: 0.68, Test Acc: 0.73
Epoch: 4, Test loss: 0.79, Test Acc: 0.795
Epoch: 5, Test loss: 0.88, Test Acc: 0.806
Epoch: 6, Test loss: 1.0, Test Acc: 0.818
Epoch: 7, Test loss: 1.1, Test Acc: 0.814
Epoch: 8, Test loss: 1.1, Test Acc: 0.812
Epoch: 9, Test loss: 1.2, Test Acc: 0.82
Epoch: 10, Test loss: 1.3, Test Acc: 0.819
Epoch: 11, Test loss: 1.3, Test Acc: 0.821
Epoch: 12, Test loss: 1.4, Test Acc: 0.825
Epoch: 13, Test loss: 1.5, Test Acc: 0.822
Epoch: 14, Test loss: 1.5, Test Acc: 0.821
Epoch: 15, Test loss: 1.6, Test Acc: 0.822
Epoch: 16, Test loss: 1.6, Test Acc: 0.818
Epoch: 17, Test loss: 1.7, Test Acc: 0.817
Epoch: 18, Test loss: 1.7, Test Acc: 0.816
Epoch: 19, Test loss: 1.7, Test Acc: 0.815
Epoch: 20, Test loss: 1.8, Test Acc: 0.817
