# Movie Review Sentiment Analysis

In [76]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import string

In [54]:
data = keras.datasets.imdb

In [55]:
(train_data, train_label), (test_data, test_label) = data.load_data(num_words=880000)

In [56]:
word_index = data.get_word_index()

In [57]:
word_index = {k:(v+3) for k, v in word_index.items()}

In [58]:
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3

In [59]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [60]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(test_data[0]))

<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss


In [61]:
# different size
print(len(test_data[2]), len(test_data[20]))

603 230


In [62]:
# preprocess to the same size
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index['<PAD>'], padding='post', maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index['<PAD>'], padding='post', maxlen=250)

In [63]:
print(len(train_data[2]), len(test_data[20]))

250 250


In [64]:
model = keras.Sequential()
model.add(keras.layers.Embedding(880000, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [65]:
# embedding layer: group words based on similarity
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          14080000  
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 14,080,289
Trainable params: 14,080,289
Non-trainable params: 0
_________________________________________________________________


In [66]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [67]:
# validation set
x_val = train_data[:10000]
y_val = train_label[:10000]

In [68]:
# training set
x_train = train_data[10000:]
y_train = train_label[10000:]

In [69]:
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)

Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [70]:
results = model.evaluate(test_data, test_label)



In [127]:
test_review = test_data[0]
predict = model.predict(test_review)
print('Review: ')
print(decode_review(test_review))
print('Prediction: ' + str(predict[0]))
print('Actual: ' + str(test_label[0]))
print(results)

Review: 
<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [72]:
model.save('model.h5')

In [73]:
model = keras.models.load_model('model.h5')

In [101]:
def review_encode(s):
    encoded = [1]
    for word in s:
        if word.lower() in word_index:
            encoded.append(word_index[word.lower()])
        else:
            encoded.append(2)
    return encoded

In [118]:
with open('Irishman.txt', encoding ='ISO-8859-1') as f:
    for line in f.readlines():
        nline = line.translate(str.maketrans('', '', string.punctuation)).split()
        encode = review_encode(nline)
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index['<PAD>'], padding='post', maxlen=250)
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])

We begin by giving thanks to the local publicist who was swell enough to arrange a press preview ofÊThe IrishmanÊin an auditorium to my liking. (Reading Cinemas Grossmont #5.) That said, it angers me to no end that I will be one of just a few thousand Americans afforded a shot at seeing a film directed by Marvel-denier and ruler of the cinematic universe, Martin Scorsese, in the way it was intended. Or was it? Martin Scorsese shotÊRaging BullÊin black-and-white to protest what he understood to be a crisis in unstable color film stock. He was also the first to decry the multiplexing of American single screens in the name of staggered showtimes. Knowing that greater numbers of viewers were watching films on home video, he waited until 1991 Ñ when TV screens were big enough and viewers finally tolerant enough to deal with a letterboxed image Ñ to at last shoot a picture in Panavision. No one in my lifetime has done more to preserve film and encourage the theatrical life cycle of motion pi

In [122]:
with open('Winchester.txt', encoding ='ISO-8859-1') as f:
    for line in f.readlines():
        nline = line.translate(str.maketrans('', '', string.punctuation)).split()
        encode = review_encode(nline)
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index['<PAD>'], padding='post', maxlen=250)
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])

The Winchester Mystery House in San Jose, California, is the ultimate horror movie locationÑa vast, four-story mansion constructed in crazy-quilt fashion from 1883 to 1922 by Sarah Winchester, heir to the gun manufacturing fortune. It comes with its own legend too, that Winchester built each room to mollify the spirit of a person killed by one of her company's weapons. That premise speaks to our current gun crisis, but directors Michael and Peter Spierig, reworking a script by Tom Vaughan, bring little passion or eloquence to the dialogue, frustrating such capable actors as Helen Mirren (playing Winchester) and Jason Clarke (as a laudanum-addicted doctor sent by the company board to assess Mrs. Winchester's sanity). The scares are standard haunted-house stuff; whenever someone in a movie like this starts fooling around with a hinged mirror, grab your armrests.
[[    1     4  5829   736   313    11  2614  7139  2642     9     4  2095
    189    20     2  4311     2  3025  4481    11    

In [129]:
def sentiment(file):
    with open(file, encoding ='ISO-8859-1') as f:
        for line in f.readlines():
            nline = line.translate(str.maketrans('', '', string.punctuation)).split()
            encode = review_encode(nline)
            encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index['<PAD>'], padding='post', maxlen=250)
            predict = model.predict(encode)
            print(line)
            print(encode)
            print(predict[0])

In [130]:
print(sentiment('Parasite.txt'))

"Parasite" moves effortlessly between capitalist parable, horror film and situation comedy as the natures of the two families are revealed: The Kims, grasping and greedy yet unfairly stigmatized by poverty (even in their newly bought clothes, they give off an odor that can't be masked), and the Parks, harmless and well-intentioned yet coasting through life on the backs of others. Who's the parasite, then? So recognizable is this picture of society that you may forget you aren't watching an American movie. (The spell breaks now and then when Mrs. Park tosses off an English catchphrase to prove her internationalist bona-fides.)
[[    1 16422  1102  8233   200 11242 12214   189    22     5   904   212
     17     4 18035     7     4   107  2166    26  2029     4     2 17641
      5  4636   246  9996     2    34  3460    60    11    68  4700  1247
   1649    36   202   125    35 43801    15  2488    30  7566     5     4
   8664  5623     5     2   246 35620   143   113    23     4  5462   

In [139]:
print(sentiment('test.txt'))

I can't think of any reason to leave a bad review for this movie. It is fantastic.
[[   1   13 2488  104    7  101  282    8  563    6   78  733   18   14
    20   12    9  777    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  