# Machine Learning - Sentiment Analysis IMDb Dataset (using LSTM, GRU)

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [2]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [6]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [7]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [8]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews) 

# pad sequences
max_length = 100 # try other options like mean
# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [9]:
print(vocab_size)

125602


In [10]:
%%time
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          12560200  
_________________________________________________________________
gru (GRU)                    (None, 32)                12864     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 12,573,097
Trainable params: 12,573,097
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
%%time
print('Train...')

model.fit(X_train_pad, y_train, batch_size=128, epochs=1, validation_data=(X_test_pad, y_test), verbose=2)

Train...
196/196 - 36s - loss: 0.4989 - accuracy: 0.7383 - val_loss: 0.3622 - val_accuracy: 0.8421


<tensorflow.python.keras.callbacks.History at 0x7f4247f76668>

In [12]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.3622249960899353
Test accuracy: 0.8420799970626831
Accuracy: 84.21%


In [13]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.94047296],
       [0.8659663 ],
       [0.7311486 ],
       [0.3931977 ],
       [0.8207176 ],
       [0.56545186],
       [0.8265602 ],
       [0.35915428]], dtype=float32)

In [15]:
classes

array([[0.16667822],
       [0.11424267],
       [0.8912412 ],
       [0.38223457],
       [0.94416475],
       [0.8655189 ],
       [0.8781737 ],
       [0.18162912],
       [0.74845517],
       [0.95269245]], dtype=float32)

In [14]:
#let us check how the model predicts
classes = model.predict(X_test_pad[:10], batch_size=128)
for i in range (0,10):
    if(classes[i] > 0.5 and y_test[i] == 1 or (classes[i] <= 0.5 and y_test[i] == 0)):
        print( classes[i], "\t",y_test[i], " Right prdiction")
    else :
        print( classes[i], "\t",y_test[i], " Wrong prdiction")

[0.16667822] 	 1  Wrong prdiction
[0.11424267] 	 1  Wrong prdiction
[0.8912412] 	 1  Right prdiction
[0.38223457] 	 1  Wrong prdiction
[0.94416475] 	 1  Right prdiction
[0.8655189] 	 1  Right prdiction
[0.8781737] 	 1  Right prdiction
[0.18162912] 	 1  Wrong prdiction
[0.74845517] 	 1  Right prdiction
[0.95269245] 	 1  Right prdiction


In [12]:
%%time
from keras.datasets import imdb
from keras.models import Sequential
from tensorflow.python.keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print('Build model...')

model = Sequential()
model.add(Embedding(top_words, 100, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Build model...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          500000    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 517,057
Trainable params: 517,057
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
%%time
print('Train...')

model.fit(X_train, y_train, batch_size=128, epochs=1, validation_data=(X_test, y_test), verbose=2)

Train...


KeyboardInterrupt: 

In [None]:
score, acc = model.evaluate(X_test, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: %.2f%%" % (acc*100))

The time to train a GRU is less than LSTM network.