In [0]:
import collections
import math
import os
import random
import tarfile
import re

In [0]:
import tensorflow as tf
import numpy as np
import matplotlib as mp
from matplotlib import pyplot as plt
from google.colab import drive

In [0]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def read_words():
  with zipfile.ZipFile(file='/content/gdrive/My Drive/SentimentAnalysisTensorFlow/ImdbReviews.tar.gz') as myZip:
    firstFile = myZip.namelist()[0]
    filestring = tf.compat.as_str((myZip.read(firstFile)))
    words = filestring.split()

  return words

In [0]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9]+")

def get_reviews(dirname, positive=True):
  label = 1 if positive else 0

  reviews = []
  labels = []

  for filename in os.listdir(path=dirname):
    if filename.endswith(".txt"):
      with open(dirname + filename, mode='r+') as f:
        review = f.read()
        # print(review)
        review = review.lower().replace("<br />"," ")
        review = re.sub(TOKEN_REGEX, ' ', review)

        reviews.append(review)
        labels.append(label)
  return reviews, labels

In [0]:
def extract_labels_data():
  with tarfile.open(name='/content/gdrive/My Drive/SentimentAnalysisTensorFlow/ImdbReviews.tar.gz') as tar:
    tar.extractall()
    tar.close()

  positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
  negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive=False)

  data = positive_reviews + negative_reviews
  labels = positive_labels + negative_labels

  return labels, data

In [0]:
labels, data = extract_labels_data()

In [0]:
# pad shorter review, truncate longer reviews
# so that all of them are of same length
MAX_SEQUENCE_LENGTH = 250
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)
x_data = np.array(list(vocab_processor.fit_transform(data)))
y_output = np.array(labels)
vocabulary_size = len(vocab_processor.vocabulary_)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


In [0]:
x_data[:3]

array([[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  13,  19,  20,  21,   2,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  24,  33,  34,  26,  19,
         35,  36,  37,  16,  38,  39,  40,  24,  41,  42,  43,  44,  45,
         46,  47,   9,  48,  49,  50,  51,  52,  53,  23,  43,  38,  54,
         44,  55,  56,  57,  58,  59,  19,  60,  28,  61,  24,  62,  63,
         44,  64,  16,  28,  65,  66,  67,  68,  44,  69,  70,  71,  72,
         73,  74,  75,  76,  77,  78,   8,   2,   3,  73,  79,  80,  81,
         24,  82,  83,  84,  85,  13,  77,  78,  86,  44,  87,  88,  89,
         23,  24,  90,  91,  92,  74,  56,  76,  93,  94,  95,  44,  96,
         88,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [0]:
data[:3]

['more tv movies ought to be made like this one i saw it way back in 93 when it was first on tv helen hunt and steven weber were both terrific giving very gritty and realistic performances weber was especially good turning in an exceptionally creepy and understated performance as the child molester killer this film really increased my respect for hunt as an actress the director also directed hoosiers which was somehow both formulaic and exciting but the direction in both of these works has the same stark simple realism that is so appealing if you like tv movies that aren t predictable and filled with overacting see it if you can the side story about hunt and fahey s affair is also appealing without detracting from the main story ',
 'this was the first regular filmed columbo movie episode but yet it aired as the second after steven spielberg s columbo murder by the book it s also at the same time among one of the better ones bernard l kowalski was one great creative director no wonder 

In [0]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [0]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA: TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA: TOTAL_DATA]

In [0]:
tf.reset_default_graph()

x = tf.placeholder(dtype=tf.int32, shape=[None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(dtype=tf.int32, shape=[None])

In [0]:
num_epochs = 20
batch_size = 25
embedding_size = 50
max_label = 2

In [0]:
embedding_matrix = tf.Variable(
    initial_value=tf.random_uniform(
        shape=[vocabulary_size, embedding_size],
        minval=-1.0,
        maxval=1
    )
)

In [0]:
embeddings = tf.nn.embedding_lookup(params=embedding_matrix, ids=x)

In [0]:
# shape=[batch_size, n_steps, n_inputs)
# n_steps: no of instances in time. n_steps=250 because
# the sequence length is 250 and the rnn cell will be
# unrolled 250 times in time

# the input at any point of time is a single word
# and is embedded with a dimensionality of 50 , i.e, n_inputs

embeddings

<tf.Tensor 'embedding_lookup/Identity:0' shape=(?, 250, 50) dtype=float32>

In [0]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)

In [0]:
# Unrolls the rnn throught the time
# encoding is the final state of the rnn
_, (encoding, _) = tf.nn.dynamic_rnn(cell=lstmCell, inputs=embeddings, dtype=tf.float32)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [0]:
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [0]:
logits = tf.layers.dense(encoding, max_label)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits,
    labels=y
)
loss = tf.reduce_mean(input_tensor=cross_entropy)
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [0]:
# output with the highest probability is
# the prediction is accurate if predicted label
# is equal to the actual label
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [0]:
init = tf.global_variables_initializer()

In [0]:
with tf.Session() as sess:
  sess.run(init)

  for epoch in range(num_epochs):

    num_batches = int(len(train_data) // batch_size) + 1

    for i in range(num_batches):

      min_ind = i * batch_size
      max_ind = np.min([len(train_data), ((i+1)*batch_size)])

      x_train_batch = train_data[min_ind: max_ind]
      y_train_batch = train_target[min_ind: max_ind]

      train_dict = {x: x_train_batch, y: y_train_batch}
      sess.run(train_step, feed_dict=train_dict)
      train_loss, train_acc = sess.run([loss, accuracy], feed_dict=train_dict)
    
    test_dict = {x: test_data, y: test_target}

    test_loss, test_acc = sess.run([loss, accuracy], feed_dict=test_dict)

    print('Epoch: {}, Test Loss: {:.2}, Test Acc {:.5}' .format(epoch+1, test_loss, test_acc))

Epoch: 1, Test Loss: 0.7, Test Acc 0.504
Epoch: 2, Test Loss: 0.86, Test Acc 0.514
Epoch: 3, Test Loss: 1.3, Test Acc 0.616
Epoch: 4, Test Loss: 0.81, Test Acc 0.743
Epoch: 5, Test Loss: 0.92, Test Acc 0.766
Epoch: 6, Test Loss: 1.2, Test Acc 0.775
Epoch: 7, Test Loss: 1.3, Test Acc 0.783
Epoch: 8, Test Loss: 1.4, Test Acc 0.79
Epoch: 9, Test Loss: 1.4, Test Acc 0.789
Epoch: 10, Test Loss: 1.5, Test Acc 0.791
Epoch: 11, Test Loss: 1.5, Test Acc 0.792
Epoch: 12, Test Loss: 1.5, Test Acc 0.793
Epoch: 13, Test Loss: 1.6, Test Acc 0.794


KeyboardInterrupt: ignored