<a href="https://colab.research.google.com/github/JakeSiewJK64/colabNotebooks/blob/master/movieReviewRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Review Sentiment Analysis
Dataset: from keras import imdb

In [5]:
%tensorflow_version 2.x

import keras.preprocessing.sequence as sequence
import keras.datasets.imdb as imdb

import tensorflow as tf
import keras
import os
import numpy as np

In [6]:
VOCAB_SIZE = 88584
MAXLEN = 250
BATCH_SIZE = 64

In [7]:
(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words=VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


## Data Preprocessing
- if the review has less than 250 words we add more 0s until we reach 250
- else, if we exceed 250 words we trim off the extra words

In [8]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [9]:
train_data[0]
# 0 is the padding we employed for the correct length

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     1,    14,    22,    16,
          43,   530,   973,  1622,  1385,    65,   458,  4468,    66,
        3941,     4,   173,    36,   256,     5,    25,   100,    43,
         838,   112,    50,   670, 22665,     9,    35,   480,   284,
           5,   150,     4,   172,   112,   167, 21631,   336,   385,
          39,     4,   172,  4536,  1111,    17,   546,    38,    13,
         447,     4,   192,    50,    16,     6,   147,  2025,    19,
          14,    22,     4,  1920,  4613,   469,     4,    22,    71,
          87,    12,    16,    43,   530,    38,    76,    15,    13,
        1247,     4,    22,    17,   515,    17,    12,    16,   626,
          18, 19193,     5,    62,   386,    12,     8,   316,     8,
         106,     5,

## Creating the model
- use word embedding layer as base layer
- add a LSTM layer to feed into dense node to get predicted sentiment


In [10]:
model = tf.keras.Sequential([
          tf.keras.layers.Embedding(VOCAB_SIZE, 32), # find more meaningful representation for the numbers
          tf.keras.layers.LSTM(32), # 32 dimensions for each word
          tf.keras.layers.Dense(1, activation='sigmoid') 
          # predict the sentiment 0 to 1 if more than 0.5 is positive review
          # we didnt use relu because anything less than 0 is immediately 0
])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


### output for Dense layer is 32 + 1 (bias) = 33

## Training the model

In [12]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
history = model.fit(train_data, train_labels, epochs=5, validation_split=.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


hyperparameters:
- loss: binary_crossentropy: how far we are from the correct probability (0 or 1)
- optimizer: rmsprop (not important)
- validation_split: 20% split (20% of training data)


## Evaluating the results

In [18]:
results = model.evaluate(test_data, test_labels)
print("Loss: {:.4f} \nAccuracy: {:.4f}".format(results[0], results[1]))

Loss: 0.3718 
Accuracy: 0.8715


## making predictions

In [20]:
# word to integer

word_index = imdb.get_word_index() 

def encode_text(text):
  tokens = keras.preprocessing.text.text_to_word_sequence(text) # token: individual words
  print("Text to word Sequence: {}".format(tokens))

  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens], MAXLEN)[0] # works on a list of sequences

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

Text to word Sequence: ['that', 'movie', 'was', 'just', 'amazing', 'so', 'amazing']
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0

In [22]:
# decode function
# integer to word

reverse_word_index = {value: key for (key, value) in word_index.items()}
def decode_integers(integers):
  PAD = 0 # nothing is there if 0
  text = "" 
  for num in integers:
    if num != PAD:
      text += reverse_word_index[num] + " "

  return text[:-1]

print("Text: {}".format(text))
print("Decoded: {}".format(decode_integers(encoded)))
print("Encoded: {}".format(encoded))

Text: that movie was just amazing, so amazing
Decoded: that movie was just amazing so amazing
Encoded: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0 

## Make Prediction

In [38]:
def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1, 250))
  pred[0] = encoded_text
  result = model.predict(pred)
  print("Positive")  if result[0] > .5 else print("Negative")
  print(result[0])

positive_review = "that movie really great. I really loved it and would definitely watch it again. Was one of the best things I've ever watched"
negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"

predict(positive_review)
predict(negative_review)

Text to word Sequence: ['that', 'movie', 'really', 'great', 'i', 'really', 'loved', 'it', 'and', 'would', 'definitely', 'watch', 'it', 'again', 'was', 'one', 'of', 'the', 'best', 'things', "i've", 'ever', 'watched']
Positive
[0.78255945]
Text to word Sequence: ['that', 'movie', 'really', 'sucked', 'i', 'hated', 'it', 'and', "wouldn't", 'watch', 'it', 'again', 'was', 'one', 'of', 'the', 'worst', 'things', "i've", 'ever', 'watched']
Negative
[0.23189381]
