# Import Dependencies

In [1]:
from string import punctuation
from collections import Counter
import numpy as np
import pandas as pd
import random
import tensorflow as tf

# Load in the Data

In [2]:
# Download reviews.txt
!wget https://raw.githubusercontent.com/GargPriyanshu1112/Sentiment-Analysis/main/reviews.txt

# Download labels.txt
!wget https://raw.githubusercontent.com/GargPriyanshu1112/Sentiment-Analysis/main/labels.txt

--2022-08-09 05:39:58--  https://raw.githubusercontent.com/GargPriyanshu1112/Sentiment-Analysis/main/reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33678267 (32M) [text/plain]
Saving to: ‘reviews.txt’


2022-08-09 05:40:02 (354 MB/s) - ‘reviews.txt’ saved [33678267/33678267]

--2022-08-09 05:40:02--  https://raw.githubusercontent.com/GargPriyanshu1112/Sentiment-Analysis/main/labels.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 225000 (220K) [text/plain]
Saving to: ‘labels.txt’


2022-08-09 05:40:02 (63.3 MB/s) -

In [3]:
# Read data from text files
with open("reviews.txt") as f:
  reviews = f.read()

with open("labels.txt") as f:
  labels = f.read()

# Data Exploration

In [4]:
reviews[: 1000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [5]:
# Characters present in reviews 
set(reviews)

{'\n',
 ' ',
 '.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [None]:
labels[: 1000]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\

>We need to eliminate `'\n'` character and other punctuations.

# Data Preprocessing

In [6]:
# Convert alphabets to lowercase
text = reviews.lower()

# Remove the punctuations
text = ''.join([char  for char in text  if char not in punctuation])

# Remove '\n' character
reviews_split = text.split('\n') # List of reviews
text = ' '.join(reviews_split)

text[: 1000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t    story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent m

In [7]:
# Separate the labels
labels = labels.split()

labels[: 10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [8]:
# Check if zero-length reviews are present
review_lengths = Counter(len(review.split())  for review in reviews_split)
print(f"Zero-length reviews: {review_lengths[0]}")

Zero-length reviews: 1


In [9]:
# Get index of reviews having non-zero length
non_zero_idxs = [idx  for idx, review in enumerate(reviews_split)  if len(review)!=0]

# Remove zero-length reviews and their corresponding labels
reviews_split = [reviews_split[idx]  for idx in non_zero_idxs]
labels = [labels[idx]  for idx in non_zero_idxs]

In [10]:
print(f"No. of reviews: {len(reviews_split)}")
print(f"No. of labels : {len(labels)}")

No. of reviews: 25000
No. of labels : 25000


# Get Vocabulary

In [11]:
# Get a list of words used in reviews
words = text.split()
print(f"\nThere are a total of {len(words)} words in reviews.")
print()
print(f"Some of the are:\n   {words[: 15]}")


There are a total of 6020196 words in reviews.

Some of the are:
   ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other']


In [31]:
# Get vocabulary
vocabulary = set(words)

# Get vocabulary size
vocab_size = len(vocabulary)

print(f"There are {vocab_size} words in the vocabulary.")

There are 74072 words in the vocabulary.


# Encoding the Words


>We pass in integers to our network. Therefore, we need to create dictionaries that map the words in the vocabulary to integers. Then we can convert each of our reviews into integers so they can be passed into the network.


In [13]:
# Encode the words to integers
# The integers start from 1 (and not 0) as we'll pad our input vectors with 0's later
vocab_to_int = {word: idx  for idx, word in enumerate(vocabulary, 1)}


dict(list(vocab_to_int.items())[:10])

{'animators': 6,
 'catchword': 2,
 'hermandad': 9,
 'meter': 10,
 'passive': 7,
 'perseus': 4,
 'slitting': 8,
 'suucks': 5,
 'witchdoctor': 1,
 'yates': 3}

# Encoding the Reviews

In [14]:
reviews_int = [] # Will store encoded reviews

# Encode each review
for review in reviews_split:
  reviews_int.append([vocab_to_int[word]  for word in review.split()])

In [15]:
rand_idx = random.randint(0, len(reviews_split))

print(f"REVIEW:\n   {reviews_split[rand_idx]}\n")
print(f"   No. of words in the review: {len(reviews_split[rand_idx].split())}\n")

print("-" * 20)

print(f"\nTOKENIZED VERSION:\n   {reviews_int[rand_idx]}\n")
print(f"   No. of tokens in the encoded version: {len(reviews_int[rand_idx])}")


REVIEW:
   i  m not sure why spike lee made this train wreck of a movie and conned poor stevie wonder into eternally pairing his beautiful music with this theatrical mess  i also resent the way he uses profanity as a part of the normal prose of professional blacks  the abuse of his hold on ethnic movie goers is a shame  scenes which seem to be contrived out the blue and have nothing to do with the theme or sub themes  play as if some college kid wrote this  i especially detest the ludicrous scene where the two leads are playfully sparring for no reason at all and the cops come and rough up snipes  the overacting of the leads makes one feel as if spike has no respect for his viewers or he has no clue what a movie is all about  the final scene appears to be thrown in to justify the use of a sledge hammer to tack a point in  this movie also supports the myth that all people of culture use the f  word in casual conversation  i am hoping he will realize that the rest of his movies are in th

# Encoding the Labels

In [16]:
rand_idx = random.randint(0, len(labels))

encoded_labels = np.array([1 if label=="positive" else 0  for label in labels])

print(f"LABEL:\n   {labels[rand_idx]}")
print(f"\nENCODED VERSION:\n   {encoded_labels[rand_idx]}\n")

LABEL:
   positive

ENCODED VERSION:
   1



In [17]:
# Get length of the longest review
max_len = max(len(review)  for review in reviews_int)

# Get the average length of reviews
avg_len = sum([len(review)  for review in reviews_int]) / len(reviews_int)

print(f"The length of the longest review is {max_len}.")
print(f"The average length of reviews is {avg_len:.2f}")

The length of the longest review is 2514.
The average length of reviews is 240.81


# Padding the Reviews

In [18]:
def pad_reviews_with_zeros(reviews_int, seq_length=512):
  features = np.zeros((len(reviews_int), seq_length), dtype=np.int32)

  for idx, review in enumerate(reviews_int):
    features[idx][-len(review):] = np.array(review)[:seq_length] 

  return features

In [19]:
features = pad_reviews_with_zeros(reviews_int, seq_length=512)
features

array([[    0,     0,     0, ..., 47852, 30002, 16855],
       [    0,     0,     0, ...,  5548, 72786, 60369],
       [    0,     0,     0, ..., 29020,  1936, 60318],
       ...,
       [    0,     0,     0, ..., 63635, 14119, 17378],
       [    0,     0,     0, ..., 29256, 36401, 60765],
       [    0,     0,     0, ..., 66263, 49076,  4629]], dtype=int32)

# Train-Test Split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels,
                                                    test_size=0.2,
                                                    shuffle=True)


print(f"""
X_train shape: {X_train.shape}
y_train shape: {y_train.shape}
X_test shape : {X_test.shape}
y_test shape : {y_test.shape}""")


X_train shape: (20000, 512)
y_train shape: (20000,)
X_test shape : (5000, 512)
y_test shape : (5000,)


# Generate Datasets

In [22]:
def get_training_dataset(X_train, y_train, batch_size=32):
  train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  train_dataset = train_dataset.shuffle(buffer_size=len(X_train), reshuffle_each_iteration=True)
  train_dataset = train_dataset.batch(batch_size)
  train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

  return train_dataset

In [23]:
def get_validation_dataset(X_test, y_test, batch_size=32):
  validation_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
  validation_dataset = validation_dataset.batch(batch_size)

  return validation_dataset

In [24]:
# Get training dataset
train_dataset = get_training_dataset(X_train, y_train, batch_size=32)

# Get validaton dataset
validation_dataset = get_validation_dataset(X_test, y_test, batch_size=32)

In [25]:
# Get sample training dataset
sample_x, sample_y = train_dataset.as_numpy_iterator().next()

print(f"\t*** SAMPLE TRAIN BATCH ***\n")
print(f"Input shape: {sample_x.shape}") 
print(f"Input: \n{sample_x}")
print("\n")
print(f"Label shape: {sample_y.shape}") 
print(f"Label: \n{sample_y}")

	*** SAMPLE TRAIN BATCH ***

Input shape: (32, 512)
Input: 
[[    0     0     0 ...  1537 64702 22991]
 [    0     0     0 ...  7689 36878  5069]
 [68480  4294 11322 ... 27792 68480 27450]
 ...
 [    0     0     0 ... 23426 40217 11569]
 [    0     0     0 ... 42541 38160 66854]
 [    0     0     0 ...  4177 54747 17843]]


Label shape: (32,)
Label: 
[1 0 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1]


# Model Engineering

In [29]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Model

In [30]:
def get_model(vocab_size):
  inputs = Input(shape=(512, ), dtype=tf.int32)
  embeddings = Embedding(input_dim=len(vocab_to_int)+1, output_dim=512, input_length=512)(inputs)
  x = LSTM(units=256, dropout=0.5, return_sequences=True)(embeddings)
  x = LSTM(units=256, dropout=0.5)(x)
  x = Dropout(0.3)(x)
  outputs = Dense(1, activation="sigmoid")(x)

  return Model(inputs, outputs, name="model")

In [39]:
# Get the model
model = get_model(vocab_size)

# Get model summary
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 512, 512)          37925376  
                                                                 
 lstm_4 (LSTM)               (None, 512, 256)          787456    
                                                                 
 lstm_5 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 39,238,401
Trainable params: 39,238,401
Non-tra

# Compile the Model

In [40]:
model.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Train the Model

In [41]:
EPOCHS = 4

model.fit(train_dataset,
          epochs=EPOCHS,
          steps_per_epoch=len(train_dataset),
          validation_data=validation_dataset,
          validation_steps=len(validation_dataset))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f6edfde55d0>

# Inferencing on User Generated Reviews

In [48]:
def predict(review, model):
  # Convert alphabets to lowercase
  text = review.lower()

  # Remove the punctuations
  text = ''.join([char  for char in text  if char not in punctuation])

  # Encode the review
  encoded_review = [vocab_to_int[word]  for word in text.split()]

  # Pad the review
  padded_review = pad_reviews_with_zeros([encoded_review])

  # Get prediction score
  pred = model.predict(padded_review)
  

  print(f"Given Review:\n   {review}\n")

  if pred >= 0.5:
    print("POSITIVE prediction detected !")
  else:
    print("NEGATIVE prediction detected !")

In [49]:
# Test reviews
test_review_1 = 'This movie had the best acting and the dialogue was so good. I loved it.'
test_review_2 = "I received the food that is one year old. No one can even smell it."
test_review_3 = "I love this sofa. Its too comfortable...."
test_review_4 = "This vaccum really sucks!!!"

In [54]:
# Predictions of test reviews
predict(test_review_1, model)
print("--------\n\n")
predict(test_review_2, model)
print("--------\n\n")
predict(test_review_3, model)
print("--------\n\n")
predict(test_review_4, model)
print("--------\n\n")

Given Review:
   This movie had the best acting and the dialogue was so good. I loved it.

POSITIVE prediction detected !
--------


Given Review:
   I received the food that is one year old. No one can even smell it.

NEGATIVE prediction detected !
--------


Given Review:
   I love this sofa. Its too comfortable....

POSITIVE prediction detected !
--------


Given Review:
   This vaccum really sucks!!!

NEGATIVE prediction detected !
--------


