In [2]:
# !wget https://raw.githubusercontent.com/soni-ratnesh/Sentiment-Analysis-RNN/master/data/reviews.txt

In [3]:
# !wget https://raw.githubusercontent.com/soni-ratnesh/Sentiment-Analysis-RNN/master/data/labels.txt 

# Import Dependencies

In [4]:
from string import punctuation
from collections import Counter
import numpy as np
import pandas as pd
import random

# Load in the Data

In [5]:
# Read data from text files
with open("reviews.txt") as f:
  reviews = f.read()

with open("labels.txt") as f:
  labels = f.read()

# Data Exploration

In [6]:
reviews[: 1000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [7]:
# Characters present in reviews 
set(reviews)

{'\n',
 ' ',
 '.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [8]:
labels[: 1000]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\

# Data Preprocessing

In [9]:
# Convert alphabets to lowercase
text = reviews.lower()

# Remove the punctuations
text = ''.join([char  for char in text  if char not in punctuation])

# Remove '\n' character
reviews_split = text.split('\n') # List of reviews
text = ' '.join(reviews_split)

text[: 5000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t    story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent m

In [10]:
labels = labels.split()
labels[: 10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [11]:
# Check if zero-length reviews are present
review_lengths = Counter(len(review)  for review in reviews_split)
print(f"Zero-length reviews: {review_lengths[0]}")

Zero-length reviews: 1


In [12]:
# Get index of reviews having non-zero length
non_zero_idxs = [idx  for idx, review in enumerate(reviews_split)  if len(review)!=0]

# Remove zero-length reviews and their corresponding labels
reviews_split = [reviews_split[idx]  for idx in non_zero_idxs]
labels = [labels[idx]  for idx in non_zero_idxs]

In [13]:
print(f"No. of reviews: {len(reviews_split)}")
print(f"No. of labels : {len(labels)}")

No. of reviews: 25000
No. of labels : 25000


In [14]:
# Get a list of words used in reviews
words = text.split()
print(words[: 15])
print(f"\nThere are {len(words)} words in reviews.")

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other']

There are 6020196 words in reviews.


In [15]:
# Get vocabulary
vocabulary = set(words)
print(f"There are {len(vocabulary)} words in the vocabulary.")

There are 74072 words in the vocabulary.


### Create Dictionary


The embedding lookup requires that we pass in integers to our network. The easiest way to do this is to create dictionaries that map the words in the vocabulary to integers. Then we can convert each of our reviews into integers so they can be passed into the network.

    Now you're going to encode the words with integers. Build a dictionary that maps words to integers. Later we're going to pad our input vectors with zeros, so make sure the integers start at 1, not 0. Also, convert the reviews to integers and store the reviews in a new list called reviews_ints.


In [16]:
# Encode the words to integers
# The integers start from 1 (and not 0) as we'll pad our input vectors with zeros later
vocab_to_int = {word: idx  for idx, word in enumerate(vocabulary, 1)}


dict(list(vocab_to_int.items())[:10])

{'wright': 1,
 'dullish': 2,
 'hidalgo': 3,
 'lease': 4,
 'jaret': 5,
 'classicks': 6,
 'rancher': 7,
 'sewanee': 8,
 'pollen': 9,
 'zaps': 10}

In [17]:
# int_vocab = dict(enumerate(unique_words))

# dict(list(int_vocab.items())[:10])

### Tokenize Reviews

In [55]:
reviews_int = [] # Will store tokenized reviews

# Tokenize each review
for review in reviews_split:
  reviews_int.append([vocab_to_int[word]  for word in review.split()])

In [56]:
rand_idx = random.randint(0, len(reviews_split))

print(f"REVIEW:\n   {reviews_split[rand_idx]}\n")
print(f"   No. of words in the review: {len(reviews_split[rand_idx].split())}\n")

print("-" * 20)

print(f"\nTOKENIZED VERSION:\n   {reviews_int[rand_idx]}\n")
print(f"   No. of tokens in the tokenized version: {len(reviews_int[rand_idx])}")


REVIEW:
   why didn  t the producers give that show a chance of all the junk on tv  why didn  t the producers give six degrees a chance  will the series go on video  i would love to see how it ends  put season one on video and sell it  i was a loyal fan of six degrees and waited for it  s return  i set my recorder to tape all of the shows  thank god for that  i just found out that the show was canceled and i  m heart broken  i wish i knew it was going to be canceled  why didn  t they tell us  i thought the show was just developing some depth in the characters  the writing was pretty good also  steven  campbell scott  is my all time favorite  i am so sorry to see it go   

   No. of words in the review: 140

--------------------

TOKENIZED VERSION:
   [34613, 1859, 66231, 57837, 31361, 41845, 14677, 20940, 67116, 66365, 1861, 48910, 57837, 54250, 62973, 50266, 34613, 1859, 66231, 57837, 31361, 41845, 3771, 20551, 67116, 66365, 72109, 57837, 56061, 64799, 62973, 20254, 29100, 13990, 5668

### Encode Labels

In [57]:
encoded_labels = np.array([1 if label=="positive" else 0  for label in labels])
print(f"No. of labels: {len(encoded_labels)}")
print(encoded_labels[:20])

No. of labels: 25000
[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0]


In [58]:
# Get length of the longest review
max_len = max(len(review)  for review in reviews_int)

# Get the average length of reviews
avg_len = sum([len(review)  for review in reviews_int]) / len(reviews_int)

print(f"The length of the longest review is {max_len}.")
print(f"The average length of reviews is {avg_len:.2f}")

The length of the longest review is 2514.
The average length of reviews is 240.81


In [60]:
review_lengths[2514]

0

In [22]:
def pad_reviews_with_zeros(reviews_int, seq_length):
  features = np.zeros((len(reviews_int), seq_length), dtype=np.int32)

  for idx, review in enumerate(reviews_int):
    features[idx][-len(review):] = np.array(review)[:seq_length] 

  return features

In [23]:
features = pad_reviews_with_zeros(reviews_int, seq_length=512)
features

array([[    0,     0,     0, ..., 62951, 24473, 66231],
       [    0,     0,     0, ..., 32075, 71251, 42278],
       [    0,     0,     0, ..., 60617, 10820, 56695],
       ...,
       [    0,     0,     0, ..., 45492, 20139, 23994],
       [    0,     0,     0, ..., 15931, 73481,  7962],
       [    0,     0,     0, ..., 21582,  9243, 38232]])

In [24]:
type(features), type(encoded_labels)

(numpy.ndarray, numpy.ndarray)

In [25]:
import tensorflow as tf

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels,
                                                    test_size=0.2,
                                                    shuffle=True)


print(f"""
X_train shape: {X_train.shape}
y_train shape: {y_train.shape}
X_test shape : {X_test.shape}
y_test shape : {y_test.shape}""")


X_train shape: (20000, 512)
y_train shape: (20000,)
X_test shape : (5000, 512)
y_test shape : (5000,)


In [28]:
one, zero = 1, 0

for  i in y_train[:100]:
  if i == 1:
    one = one+1
  else:
    zero = zero+1

print(one,zero)

54 47


In [29]:
y_test[:20]

array([1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0])

### Generate Datasets

In [30]:
def get_training_dataset(X_train, y_train, batch_size=32):
  train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  train_dataset = train_dataset.shuffle(buffer_size=len(X_train), reshuffle_each_iteration=True)
  train_dataset = train_dataset.batch(batch_size)
  train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

  return train_dataset

In [31]:
def get_validation_dataset(X_test, y_test, batch_size=32):
  validation_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  validation_dataset = validation_dataset.batch(batch_size)

  return validation_dataset

In [32]:
# Get training dataset
train_dataset = get_training_dataset(X_train, y_train, batch_size=32)

# Get validaton dataset
validation_dataset = get_validation_dataset(X_test, y_test, batch_size=32)

In [33]:
# Get sample training dataset
sample_x, sample_y = train_dataset.as_numpy_iterator().next()

print(f'Input shape: {sample_x.shape}') 
print(f'Input: \n{sample_x}')
print("\n")
print(f'Label shape: {sample_y.shape}') 
print(f'Label: \n{sample_y}')

Input shape: (32, 512)
Input: 
[[    0     0     0 ...  5805 49061 17805]
 [    0     0     0 ... 67116 38232 64893]
 [    0     0     0 ...  1861 57837 31873]
 ...
 [    0     0     0 ... 48910  1861 14677]
 [    0     0     0 ... 12990 11815 64001]
 [    0     0     0 ... 47257  7801 28306]]


Label shape: (32,)
Label: 
[1 1 1 0 0 1 0 0 0 1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 1]


# Model Engineering

In [34]:
from tensorflow.keras import layers

In [35]:
embedding = layers.Embedding(input_dim=len(vocab_to_int)+1, # +1 for 0 padding
                             output_dim=512,
                             embeddings_initializer="uniform",
                             input_length=512)

In [36]:
embedding(features[0])

<tf.Tensor: shape=(512, 512), dtype=float32, numpy=
array([[ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       [ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       [ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       ...,
       [ 0.02234991, -0.0158273 , -0.04439408, ..., -0.00085592,
         0.00873505, -0.01997552],
       [-0.01615602, -0.01785319, -0.04988476, ...,  0.03678954,
         0.00559428, -0.01309067],
       [ 0.01313286, -0.04200964,  0.03257532, ...,  0.04810972,
         0.00552788,  0.02484614]], dtype=float32)>

In [37]:
inputs = layers.Input(shape=(512), dtype=tf.float32)
x = layers.LSTM(units=256, dropout=)

SyntaxError: invalid syntax (3214975404.py, line 2)

In [None]:
# rev = [[12, 13, 15],
#        [45, 85, 64, 45, 63, 78, 45],
#        [16, 75, 96, 42]]