In [1]:
# Harry Chong
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from collections import Counter

In [2]:
# Load IMDB Reviews
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [3]:
# Setup vocabulary table
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.regex_replace(X_batch, "<[^>]+>",  " ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

# Print some reviews and labels from dataset
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        #print("Review:", review.decode("utf-8")[:200], "...")
        #print("Label:", label, "= Positive" if label else "= Negative")
        #print()
        pass
    
preprocess(X_batch, y_batch)

vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [4]:
# Tokenize each word to ID
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]

word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [5]:
LABELS = ["NEGATIVE", "POSITIVE"]
def preprocess_input(X_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.regex_replace(X_batch, "<[^>]+>",  " ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>")

def get_prediction(review):
    # Preprocessing
    review_array = table.lookup(tf.constant([review.split()]))
    
    # Prediction score that the item is encoded as 1 (Positive)
    threshold_confidence = 0.5
    score = float(model.predict(review_array)[0][0])
    
    if score > threshold_confidence:
        actual_predict, actual_proba = "POSITIVE", round(score, 5)
        other_predict, other_proba = "NEGATIVE", round(1 - score, 5)
    else:
        actual_predict, actual_proba = "NEGATIVE", round(1 - score, 5)
        other_predict, other_proba = "POSITIVE", round(score, 5)
    
    print('Review:', review, '\nPrediction:', actual_predict, 
          '\nPredicted probability that the review is {}: {}'.format(actual_predict, actual_proba),
          '\nPredicted probabiltiy that the review is {}: {}\n'.format(other_predict, other_proba))
    
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

In [6]:
# Load model
model = keras.models.load_model('model.h5')

In [8]:
# [USER PARAMETER] Number of IMDB reviews to pass into model for prediction.
num_review = 10

# Pull and shuffle the number of reviews from the imdb dataset, preprocess it, and
# evaluate on it.
data = tfds.load(name="imdb_reviews", split=('test'), as_supervised=True)
review, label = next(iter(data.shuffle(num_review).batch(num_review)))
preprocess(review, label)

# Predict and output result. Print out the prediction and predicted probability for each review.
for idx in range(num_review):
    get_prediction(review[idx].numpy())


Review: b"I remember seeing this at my local Blockbuster and picked it up cause I was curious. I liked movies about mythological creatures. I like movies about werewolves, vampires, zombies, etc. This is based on half-caste, a half-human half-leopard creature that preys on the people of Africa.<br /><br />The movie is horrendous! The actors are terrible! There is no script whatsoever! It's all improvised! The whole thing is filmed at night because they say that is the only time you ever see it. It's obvious bull*bleep*! They film at night to make it scary. But, they have failed to scare me. After the first person was killed, I put this back in the case and took it back to Blockbuster. One of the most boring movies I've ever seen.<br /><br />Now you are probably saying I have no right to review a movie if I haven't finished it. Well, this is one of the films that didn't deserve to be watched all the way through.<br /><br />1 star out of 10. This is really BAD!" 
Prediction: NEGATIVE 
Pr

Review: b"This is the first movie I've seen from Singapore and it's great. If you don't know a lot about Asia, its languages and its culture, then this film may be a bit confusing for the non-informed people. As an Asian-American who's double majoring in two Asian languages (one of them being Mandarin) and has taken some Asian American Studies classes, this film was easier for me to understand, but even without that kind of knowledge, I believe this movie is still accessible to its foreign audiences as long as you keep in mind that it's a coming-of-age type of movie. The film is definitely worth seeing just so that you get the chance to see what kind of issues Singapore's young teenage boys are struggling and having to deal with. This is an awesome coming-of-age movie, but filmed and shown in a more artistic and original way. The actors are outstanding." 
Prediction: POSITIVE 
Predicted probability that the review is POSITIVE: 0.99988 
Predicted probabiltiy that the review is NEGATIVE: