In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import random
import tarfile
import re

In [3]:
from six.moves import urllib

In [4]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.learn as skflow

In [5]:
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.16.3
3.0.2
1.10.0


In [6]:
DOWNLOAD_FILENAME = 'imdbReviews.tar.gz'

def download_file(url_path):
    
    if not os.path.exists(DOWNLOAD_FILENAME):
        filename , _ = urllib.request.urlretrieve(url_path, DOWNLOAD_FILENAME)
    
    print('Found and verified file from this path : ', url_path)
    print('Download file: ', DOWNLOAD_FILENAME)

In [7]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    
    reviews = []
    labels = []
    
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+') as f:
                
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, "", review)
                
                reviews.append(review)
                labels.append(label)
    
    return reviews, labels

In [8]:
def extract_labels_data():
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOAD_FILENAME) as tar:
            tar.extractall()
            tar.close()
    
    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive=False)
    
    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels
    
    return labels, data

In [9]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file from this path :  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Download file:  imdbReviews.tar.gz


In [10]:
labels, data = extract_labels_data()

In [11]:
labels[:4]

[1, 1, 1, 1]

In [12]:
data[0:2]

['for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt',
 'bizarre horror movie filled with famous faces but stolen by cristina raines later of tvs flamingo road as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the gateway to hell the scenes with raines modeling are very well captured the mood music is perfect deborah raffin is charming as cristinas pal but when raines moves into a creepy brooklyn heights brownstone inhabited by a blind priest on the top floor things really start cooking the neighbors including a fantastically wicked burgess meredith and kinky couple sylvia miles  beverly dangelo are a diabolical lot and eli wallach is great fun as a wily police detective the movie is nearly a cro

In [13]:
len(labels), len(data)

(25000, 25000)

In [14]:
max_document_length = max([len(x.split(" ")) for x in data])
print(max_document_length)

2470


In [15]:
min_document_length = min([len(x.split(" ")) for x in data])
print(min_document_length)

10


In [16]:
average_document_length = sum(len(x.split(" ")) for x in data) / len(data)
print(average_document_length)

233.77672


In [17]:
MAX_SEQUENCE_LENGTH = 250

In [18]:
words = np.load('wordsList.npy')
words=np.array([x.decode() for x in words])

In [19]:
words[:5], len(words)

(array(['0', ',', '.', 'of', 'to'], dtype='<U68'), 400000)

In [20]:
def get_word_index_dictonary(words):
    dictonary = {}
    
    index = 0
    for word in words:
        dictonary[word] = index
        index += 1
    
    return dictonary

In [21]:
dictonary = get_word_index_dictonary(words)

In [22]:
dictonary['and'], dictonary['the'], dictonary['of'], dictonary['that']

(5, 201534, 3, 12)

In [23]:
review_ids = []

def convert_reviews_to_ids(data, words):
    word_list = words.tolist()
    
    progress = 0
    
    
    for review in data:
        
        review_id = []
        
        index = 0
        
        for word in review:
            if index >= MAX_SEQUENCE_LENGTH:
                break;
            
            
            try:
                review_id.append(dictonary[word])
            except KeyError:
                review_id.append(0)
            
            index += 1
        
        if len(review_id) < MAX_SEQUENCE_LENGTH:
            review_id = np.pad(review_id, (0, MAX_SEQUENCE_LENGTH - index), 'constant')
        
        review_ids.append(np.array(review_id))
        progress += 1
        
        if progress % 1000 == 0:
            print("Completed: ", progress)

In [24]:
convert_reviews_to_ids(data, words)

Completed:  1000
Completed:  2000
Completed:  3000
Completed:  4000
Completed:  5000
Completed:  6000
Completed:  7000
Completed:  8000
Completed:  9000
Completed:  10000
Completed:  11000
Completed:  12000
Completed:  13000
Completed:  14000
Completed:  15000
Completed:  16000
Completed:  17000
Completed:  18000
Completed:  19000
Completed:  20000
Completed:  21000
Completed:  22000
Completed:  23000
Completed:  24000
Completed:  25000


In [25]:
review_ids[10]

array([   7,   41, 5025, 1110, 1110, 3814,    0, 3410, 4868, 3814, 1534,
          7, 5025, 2404, 1110, 1534,    0, 1993, 3524,    0, 3410,   41,
       1911, 5025, 3880, 1911,   41, 1110, 3814, 1968,    0,   41, 1534,
          0,   41, 3814,    0, 2159, 5918,   41, 1534,    0, 3880,   41,
       5025, 1993,    0, 3420, 5025,    7, 3524,   41, 3814, 3410,    0,
          7,    0, 1534, 1110, 1864, 1911, 1110, 2159,    7, 1911, 3524,
          0,    7, 2159,    0, 2159, 5918, 1110,    0, 1993,    7,   41,
       3814,    0, 1864, 5918,    7, 1911,    7, 1864, 2159, 1110, 1911,
       1534,    0, 1556,    7, 3814, 4652,    0, 1534, 5918, 1110,    0,
       5918,    7, 1534,    0,    7,    0, 5025, 4868, 2404, 1110, 5025,
       3524,    0, 1534, 1864, 1110, 3814, 1110,    0, 5140,   41, 2159,
       5918,    0, 1911, 4868, 1534, 5918,    7, 3814,    0, 1534, 1110,
       2159, 5918,    0,   41, 3814,    0,    7,    0, 1911, 1110, 1534,
       2159,    7, 6479, 1911,    7, 3814, 2159,   

In [26]:
review_ids = np.load('idsMatrix.npy')

In [27]:
review_ids.shape

(25000, 250)

In [28]:
review_ids[:5]

array([[174943,    152,     14, ...,      0,      0,      0],
       [ 26494,     46, 399999, ...,   2153,    144,      7],
       [  6520, 399999,     21, ...,      0,      0,      0],
       [    37,     14,   2407, ...,      0,      0,      0],
       [    37,     14,     36, ...,      0,      0,      0]], dtype=int32)

In [29]:
x_data = review_ids
y_output = np.array(labels)

In [30]:
vocabulary_size = len(words)
print(vocabulary_size)

400000


In [31]:
data[:5]

['for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt',
 'bizarre horror movie filled with famous faces but stolen by cristina raines later of tvs flamingo road as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the gateway to hell the scenes with raines modeling are very well captured the mood music is perfect deborah raffin is charming as cristinas pal but when raines moves into a creepy brooklyn heights brownstone inhabited by a blind priest on the top floor things really start cooking the neighbors including a fantastically wicked burgess meredith and kinky couple sylvia miles  beverly dangelo are a diabolical lot and eli wallach is great fun as a wily police detective the movie is nearly a cro

In [32]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [33]:
TRAIN_DATA = 22000
TOTAL_DATA = 25000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [34]:
tf.reset_default_graph()
x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [49]:
num_epoches = 100
batch_size = 90
embedding_size = 50
max_label = 2

In [36]:
saved_embeddings = np.load('wordVectors.npy')

In [37]:
embeddings = tf.nn.embedding_lookup(saved_embeddings, x)

In [38]:
saved_embeddings.shape

(400000, 50)

In [39]:
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 250, 50) dtype=float32>

In [40]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)

In [41]:
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32)

In [42]:
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [43]:
logits = tf.layers.dense(encoding, max_label, activation=None)

In [44]:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)

loss = tf.reduce_mean(cross_entropy)

In [45]:
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))

accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [46]:
optimizer = tf.train.AdadeltaOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [47]:
init = tf.global_variables_initializer()

In [50]:
with tf.Session() as session:
    init.run()
    
    for epoch in range(num_epoches):
        num_batches = int(len(train_data) // batch_size) + 1
        
        for i in range(num_batches):
            
            min_ix = i * batch_size
            max_ix = np.min([len(train_data), ((i+1) * batch_size)])
            
            x_train_batch = train_data[min_ix: max_ix]
            y_train_batch = train_target[min_ix:max_ix]
            
            train_dict = {x: x_train_batch, y: y_train_batch}
            session.run(train_step, feed_dict=train_dict)
            
            train_loss, train_acc = session.run([loss, accuracy], feed_dict=train_dict)
            
        test_dict = {x: test_data, y: test_target}
        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)
        print('Epoch: {}, Test loss: {:.2}, Test Acc: {:.5}'.format(epoch+1, test_loss, test_acc))

Epoch: 1, Test loss: 0.7, Test Acc: 0.51567
Epoch: 2, Test loss: 0.7, Test Acc: 0.52033
Epoch: 3, Test loss: 0.7, Test Acc: 0.52
Epoch: 4, Test loss: 0.7, Test Acc: 0.523
Epoch: 5, Test loss: 0.7, Test Acc: 0.52467
Epoch: 6, Test loss: 0.7, Test Acc: 0.526
Epoch: 7, Test loss: 0.69, Test Acc: 0.52667
Epoch: 8, Test loss: 0.69, Test Acc: 0.527
Epoch: 9, Test loss: 0.69, Test Acc: 0.52633
Epoch: 10, Test loss: 0.69, Test Acc: 0.52667
Epoch: 11, Test loss: 0.69, Test Acc: 0.52667
Epoch: 12, Test loss: 0.69, Test Acc: 0.52667
Epoch: 13, Test loss: 0.69, Test Acc: 0.52633
Epoch: 14, Test loss: 0.69, Test Acc: 0.526
Epoch: 15, Test loss: 0.69, Test Acc: 0.52633
Epoch: 16, Test loss: 0.69, Test Acc: 0.526
Epoch: 17, Test loss: 0.69, Test Acc: 0.527
Epoch: 18, Test loss: 0.69, Test Acc: 0.52767
Epoch: 19, Test loss: 0.69, Test Acc: 0.52833
Epoch: 20, Test loss: 0.69, Test Acc: 0.529
Epoch: 21, Test loss: 0.69, Test Acc: 0.53033
Epoch: 22, Test loss: 0.69, Test Acc: 0.531
Epoch: 23, Test loss: 