**SENTIMENT ANALYSIS DATASET**

We will be using the IMDb Movie Review Dataset at https://www.tensorflow.org/tutorials/text/text_classification_rnn
This dataset is already preprocessed and has a label as either positive or negative.

In [2]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584
MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words= VOCAB_SIZE)

In [3]:
# MORE PREPROCESSING

# here we notice that the reviews are in different lengths. So awe need to normalize them to the same size.
# () If the review is grater than 250 words we trim off the extra words
# () If the review is less then 250 words we pad to make it equal to 250
# US keras already have a function to do that

train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [7]:
# CREATING THE MODEL

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [12]:
# FREEING THE TF TO ALLOCATE ALL OF THE MEMORY OF THE GPU

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU ativada com crescimento de memória.")
    except RuntimeError as e:
        print(e)


In [13]:
# TRAINING

model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2, batch_size=BATCH_SIZE)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 55ms/step - acc: 0.8907 - loss: 0.2832 - val_acc: 0.8762 - val_loss: 0.2920
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 65ms/step - acc: 0.9219 - loss: 0.2038 - val_acc: 0.8864 - val_loss: 0.2854
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 69ms/step - acc: 0.9336 - loss: 0.1769 - val_acc: 0.8450 - val_loss: 0.4593
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 71ms/step - acc: 0.9498 - loss: 0.1416 - val_acc: 0.8782 - val_loss: 0.2940
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 64ms/step - acc: 0.9611 - loss: 0.1190 - val_acc: 0.8036 - val_loss: 0.6345
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 69ms/step - acc: 0.9630 - loss: 0.1060 - val_acc: 0.8606 - val_loss: 0.4587
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0

In [14]:
# EVALUATION 

results = model.evaluate(test_data, test_labels)
print(results)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - acc: 0.8712 - loss: 0.4454
[0.44756847620010376, 0.8714799880981445]


In [None]:
# MAKING PREDICTIONS
# the process of encoding

word_index = imdb.get_word_index() # mapping

def encode_text(text):
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "What a great movie, absolutely amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0  48   3  8

In [27]:
# making the prediction

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result =  model.predict(pred)
    if result[0] > 0.5:
        print(result[0], "POSITIVE")
    else:
        print(result[0], "NEGATIVE")
    
positive_review = "That movie was! really loved it and would great watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie really sucked. Hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[0.9102439] POSITIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[0.31682834] NEGATIVE
