# Importing Libraries

In [12]:
#importing libraries
import keras
import pandas as pd
import numpy
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import Flatten
from keras.layers import LSTM
from keras.datasets import imdb

In [13]:
max_features = 40000
# cut texts after this number of words (among top max_features most common words)
maxlen = 500
batch_size = 64


In [14]:
# The code below does following things:
# 1)It downloads the data
# 2)It downloads the first 20000 top words for each review
# 3)It splits the data into a test and a training set.

# Load Data

In [15]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
pd.DataFrame(x_train).head()

Loading data...
25000 train sequences
25000 test sequences


Unnamed: 0,0
0,"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ..."
1,"[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,..."
2,"[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5..."
3,"[1, 4, 18609, 16085, 33, 2804, 4, 2040, 432, 1..."
4,"[1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1..."


# Preprocess Data

In [16]:
# We want to trim each review to its first 500 words. 
# We need to have text samples of the same length before we feed them to the network. 
# If reviews are shorter than 500 words we will pad them with 0.
# Keras offers a super easy function for that.

In [17]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 500)
x_test shape: (25000, 500)


# Model

In [18]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 500))
model.add(Dense(250, activation='relu'))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

Build model...


# Train and Evaluate Model

In [19]:
adam = keras.optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print(model.summary())
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 500)         20000000  
_________________________________________________________________
dense_3 (Dense)              (None, None, 250)         125250    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               140400    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 20,265,751
Trainable params: 20,265,751
Non-trainable params: 0
_________________________________________________________________
None
Test score: 0.5664715587806701
Test accuracy: 0.8419599999809265


# Predict

In [27]:
#predict sentiment from reviews
word_to_id = imdb.get_word_index()
bad = "worst movie ever"
good = "i liked the movie it was fun"
for review in [good,bad]:
    tmp = []
    for word in review.split(" "):
        tmp.append(word_to_id[word])
    tmp_padded = sequence.pad_sequences([tmp], maxlen=maxlen) 
    print("%s. Sentiment: %s" % (review,model.predict(([tmp_padded][0]))[0][0]))

i liked the movie it was fun. Sentiment: 0.6240958
worst movie ever. Sentiment: 0.82107025
