# Import Dependencies

In [3]:
from string import punctuation
from collections import Counter
import numpy as np
import pandas as pd
import random
import tensorflow as tf

# Load in the Data

In [4]:
# Read data from text files
with open("reviews.txt") as f:
  reviews = f.read()

with open("labels.txt") as f:
  labels = f.read()

# Data Exploration

In [5]:
reviews[: 5000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [6]:
# Characters present in reviews 
set(reviews)

{'\n',
 ' ',
 '.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [7]:
labels[: 1000]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\

>We need to eliminate `'\n'` character and other punctuations.

# Data Preprocessing

In [8]:
# Convert alphabets to lowercase
text = reviews.lower()

# Remove the punctuations
text = ''.join([char  for char in text  if char not in punctuation])

# Remove '\n' character
reviews_split = text.split('\n') # List of reviews
text = ' '.join(reviews_split)

text[: 5000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t    story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent m

In [9]:
labels = labels.split()
labels[: 10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [10]:
# Check if zero-length reviews are present
review_lengths = Counter(len(review.split())  for review in reviews_split)
print(f"Zero-length reviews: {review_lengths[0]}")

Zero-length reviews: 1


In [11]:
# Get index of reviews having non-zero length
non_zero_idxs = [idx  for idx, review in enumerate(reviews_split)  if len(review)!=0]

# Remove zero-length reviews and their corresponding labels
reviews_split = [reviews_split[idx]  for idx in non_zero_idxs]
labels = [labels[idx]  for idx in non_zero_idxs]

In [12]:
print(f"No. of reviews: {len(reviews_split)}")
print(f"No. of labels : {len(labels)}")

No. of reviews: 25000
No. of labels : 25000


# Get Vocabulary

In [13]:
# Get a list of words used in reviews
words = text.split()
print(f"\nThere are a total of {len(words)} words in reviews.")
print()
print(f"Some of the are:\n   {words[: 15]}")


There are a total of 6020196 words in reviews.

Some of the are:
   ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other']


In [14]:
# Get vocabulary
vocabulary = set(words)
print(f"There are {len(vocabulary)} words in the vocabulary.")

There are 74072 words in the vocabulary.


# Encoding the Words


>We pass in integers to our network. Therefore, we need to create dictionaries that map the words in the vocabulary to integers. Then we can convert each of our reviews into integers so they can be passed into the network.


In [15]:
# Encode the words to integers
# The integers start from 1 (and not 0) as we'll pad our input vectors with zeros later
vocab_to_int = {word: idx  for idx, word in enumerate(vocabulary, 1)}


dict(list(vocab_to_int.items())[:10])

{'cahoots': 2,
 'college': 4,
 'contless': 8,
 'hogg': 6,
 'hooooottttttttttt': 5,
 'huntley': 9,
 'inferno': 7,
 'lassiter': 10,
 'overstuffed': 1,
 'revelling': 3}

In [23]:
# int_vocab = dict(enumerate(unique_words))

# dict(list(int_vocab.items())[:10])

# Encoding the Reviews

In [16]:
reviews_int = [] # Will store encoded reviews

# Encode each review
for review in reviews_split:
  reviews_int.append([vocab_to_int[word]  for word in review.split()])

In [18]:
rand_idx = random.randint(0, len(reviews_split))

print(f"REVIEW:\n   {reviews_split[rand_idx]}\n")
print(f"   No. of words in the review: {len(reviews_split[rand_idx].split())}\n")

print("-" * 20)

print(f"\nTOKENIZED VERSION:\n   {reviews_int[rand_idx]}\n")
print(f"   No. of tokens in the encoded version: {len(reviews_int[rand_idx])}")


REVIEW:
   cinderella is a beautiful film  with beautiful songs of course  in fact  it  s one of the best films of the      s   br    br   i think all the characters are portrayed amazingly  you can see the cruelness of cinderella  s stepsisters and her stepmother  the sweetness of cinderella  the mice are funny and sweet too   br    br   i think they changed the tale a bit  but i think it  s for the best  it  s such a nice film  and i don  t think anyone could resist it deep down   br    br   i give it a      i don  t think it  s the best disney film  but it sure is a true classic   

   No. of words in the review: 113

--------------------

TOKENIZED VERSION:
   [53792, 66441, 43872, 29999, 45180, 61694, 29999, 46575, 6022, 39994, 27913, 65847, 28811, 41376, 72132, 6022, 1126, 60282, 9642, 6022, 1126, 41376, 52756, 52756, 69604, 1194, 32147, 1126, 71363, 72610, 58970, 31031, 54493, 72931, 55891, 1126, 18814, 6022, 53792, 41376, 176, 48771, 5251, 61443, 1126, 70267, 6022, 53792, 1126,

# Encoding the Labels

In [19]:
rand_idx = random.randint(0, len(labels))

encoded_labels = np.array([1 if label=="positive" else 0  for label in labels])

print(f"LABEL:\n   {labels[rand_idx]}")
print(f"\nENCODED VERSION:\n   {encoded_labels[rand_idx]}\n")

LABEL:
   negative

ENCODED VERSION:
   0



In [28]:
# Get length of the longest review
max_len = max(len(review)  for review in reviews_int)

# Get the average length of reviews
avg_len = sum([len(review)  for review in reviews_int]) / len(reviews_int)

print(f"The length of the longest review is {max_len}.")
print(f"The average length of reviews is {avg_len:.2f}")

The length of the longest review is 2514.
The average length of reviews is 240.81


In [49]:
lengths = [len(review)  for review in reviews_int]
np.percentile(lengths, q=90)

471.0

# Padding of Reviews

In [48]:
def pad_reviews_with_zeros(reviews_int, seq_length):
  features = np.zeros((len(reviews_int), seq_length), dtype=np.int32)

  for idx, review in enumerate(reviews_int):
    features[idx][-len(review):] = np.array(review)[:seq_length] 

  return features

In [50]:
features = pad_reviews_with_zeros(reviews_int, seq_length=512)
features

array([[    0,     0,     0, ..., 28811, 19072, 68285],
       [    0,     0,     0, ..., 62372, 73237, 27042],
       [    0,     0,     0, ..., 36557, 61051, 71753],
       ...,
       [    0,     0,     0, ..., 43632, 14261, 27368],
       [    0,     0,     0, ..., 63797, 10235, 53450],
       [    0,     0,     0, ..., 40013, 66441,   283]], dtype=int32)

# Train-Test Split

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels,
                                                    test_size=0.2,
                                                    shuffle=True)


print(f"""
X_train shape: {X_train.shape}
y_train shape: {y_train.shape}
X_test shape : {X_test.shape}
y_test shape : {y_test.shape}""")


X_train shape: (20000, 512)
y_train shape: (20000,)
X_test shape : (5000, 512)
y_test shape : (5000,)


# Generate Datasets

In [57]:
def get_training_dataset(X_train, y_train, batch_size=32):
  train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  train_dataset = train_dataset.shuffle(buffer_size=len(X_train), reshuffle_each_iteration=True)
  train_dataset = train_dataset.batch(batch_size)
  train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

  return train_dataset

In [58]:
def get_validation_dataset(X_test, y_test, batch_size=32):
  validation_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  validation_dataset = validation_dataset.batch(batch_size)

  return validation_dataset

In [59]:
# Get training dataset
train_dataset = get_training_dataset(X_train, y_train, batch_size=32)

# Get validaton dataset
validation_dataset = get_validation_dataset(X_test, y_test, batch_size=32)

In [63]:
# Get sample training dataset
sample_x, sample_y = train_dataset.as_numpy_iterator().next()

print(f"\t*** SAMPLE TRAIN BATCH ***\n")
print(f'Input shape: {sample_x.shape}') 
print(f'Input: \n{sample_x}')
print("\n")
print(f'Label shape: {sample_y.shape}') 
print(f'Label: \n{sample_y}')

	*** SAMPLE TRAIN BATCH ***

Input shape: (32, 512)
Input: 
[[    0     0     0 ... 28811 35984 71938]
 [    0     0     0 ... 31758 66441 33276]
 [    0     0     0 ... 68355 52756 52756]
 ...
 [    0     0     0 ... 34703 40395  5964]
 [    0     0     0 ... 35282 48771 13926]
 [    0     0     0 ... 54493 27913 32393]]


Label shape: (32,)
Label: 
[0 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 1 1 1]


# Model Engineering

In [None]:
from tensorflow.keras import layers

In [None]:
embedding = layers.Embedding(input_dim=len(vocab_to_int)+1, # +1 for 0 padding
                             output_dim=512,
                             embeddings_initializer="uniform",
                             input_length=512)

In [None]:
embedding(features[0])

<tf.Tensor: shape=(512, 512), dtype=float32, numpy=
array([[ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       [ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       [ 0.01910799, -0.03634294, -0.01152693, ...,  0.04791348,
        -0.04234217, -0.00322814],
       ...,
       [ 0.02234991, -0.0158273 , -0.04439408, ..., -0.00085592,
         0.00873505, -0.01997552],
       [-0.01615602, -0.01785319, -0.04988476, ...,  0.03678954,
         0.00559428, -0.01309067],
       [ 0.01313286, -0.04200964,  0.03257532, ...,  0.04810972,
         0.00552788,  0.02484614]], dtype=float32)>

In [None]:
inputs = layers.Input(shape=(512), dtype=tf.float32)
x = layers.LSTM(units=256, dropout=)

SyntaxError: invalid syntax (3214975404.py, line 2)

In [None]:
# rev = [[12, 13, 15],
#        [45, 85, 64, 45, 63, 78, 45],
#        [16, 75, 96, 42]]