# Sentiment Analysis
Simple LSTM implementation for analyzing movie reviews.

Dataset: Keras' built-in [IMDb movie reviews dataset](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification).

In [1]:
from keras.datasets import imdb

# Vocabulary cap passed to the loader as num_words
vocabulary_size = 5000

# load_data returns a (train, test) pair; each element is itself an (X, y) pair
train_split, test_split = imdb.load_data(num_words=vocabulary_size)
X_train, y_train = train_split
X_test, y_test = test_split

# Report how many samples ended up in each split
summary_template = "{} training samples, {} test samples"
print(summary_template.format(len(X_train), len(X_test)))

Using TensorFlow backend.


25000 training samples, 25000 test samples


**Peek at Data**

In [2]:
# Show one training sample three ways: raw word ids, decoded words, and label.
print("--- Indices ---")
print(X_train[7])

print()

# imdb.load_data (with its default arguments) reserves the low ids:
#   0 = padding, 1 = start-of-sequence marker, 2 = out-of-vocabulary word,
# and shifts every real word id by index_from (default 3). The dictionary
# returned by get_word_index() is NOT shifted, so the offset has to be added
# here; otherwise every id decodes to the wrong word.
index_from = 3
word2id = imdb.get_word_index()
id2word = {i + index_from: word for word, i in word2id.items()}
id2word.update({0: "<PAD>", 1: "<START>", 2: "<OOV>"})
print("--- Words ---")
# Ids with no entry in the mapping fall back to a blank placeholder.
print([id2word.get(i, " ") for i in X_train[7]])

print()

print("--- Label ---")
print(y_train[7])

--- Indices ---
[1, 13, 28, 110, 14, 22, 23, 3356, 2, 5, 1562, 510, 12, 257, 58, 14, 9, 669, 688, 8, 4, 1334, 2, 2, 6, 87, 524, 19, 1048, 272, 935, 101, 22, 1184, 827, 2, 5, 2801, 4675, 9, 2725, 8, 30, 441]

--- Label ---
1


**Pad Sequences (Preprocess)**

In [4]:
from keras.preprocessing import sequence

# Fixed sequence length (in words) applied to both the training and test reviews
max_words = 500

# Run both splits through pad_sequences with the same maxlen and rebind each
# name to its padded result
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (X_train, X_test)
)

**RNN model**

In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Model hyperparameters: width of each word embedding vector and the number
# of units in the LSTM layer.
embedding_size = 32
LSTM_units = 100

# Stack: Embedding over the vocabulary -> single LSTM layer -> one sigmoid
# unit for the binary label.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length = max_words))
model.add(LSTM(LSTM_units))
model.add(Dense(1, activation="sigmoid"))

# check model configuration
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


**Train and Evaluate**

In [6]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [7]:
# Training hyperparameters
batch_size = 64
num_epochs = 5

# Hold out the first `batch_size` samples (one batch) as the validation set
X_valid = X_train[:batch_size]
y_valid = y_train[:batch_size]

# Everything after the held-out batch is used for fitting
X_train2 = X_train[batch_size:]
y_train2 = y_train[batch_size:]

# Fit the model, evaluating on the held-out batch alongside each epoch
model.fit(
    X_train2,
    y_train2,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_valid, y_valid),
)

Train on 24936 samples, validate on 64 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f039e2ecba8>

**Test Performance**

In [11]:
# Score the trained model on the test split without progress output.
# The model was compiled with a single metric (accuracy), so index 1 of the
# returned scores is the accuracy; index 0 is the loss.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Test accuracy:", test_accuracy)

Test accuracy: 0.87016
