# Import dataset

In [37]:
from tensorflow import keras
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

train_set = tfds.load('imdb_reviews', split='train', as_supervised=True).take(5000)
test_set = tfds.load('imdb_reviews', split='test', as_supervised=True).take(1000)

# Prepare tokens

In [45]:
max_text_length = 0

# training
X_train = []
y_train = []
for i, j in train_set:
    i = str(i.numpy())
    max_text_length = max(max_text_length, len(i))
    X_train.append(i)
    y_train.append(int(j))

tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_text_length, padding='post')

X_train = np.array(X_train)
y_train = np.array(y_train)

# testing
X_test = []
y_test = []
for i, j in test_set:
    i = str(i.numpy())
    X_test.append(i)
    y_test.append(int(j))

X_test = tokenizer.texts_to_sequences(X_test)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_text_length, padding='post')

X_test = np.array(X_test)
y_test = np.array(y_test)

# Build the RNN
We need to use an RNN to help predict positive/negative reviews because the order of words obviously matters in sentences. An embedding layer is needed to convert sequences of tokens into sequences of vectors that can be easily understood by the RNN layer.

In [39]:
model = keras.Sequential()
model.add(keras.layers.Embedding(10001, 128, mask_zero=True))
model.add(keras.layers.SimpleRNN(100, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[keras.metrics.BinaryAccuracy()])

# Train the model

In [40]:
epochs = 10

model.fit(X_train, y_train, epochs=epochs, validation_split=0.1, batch_size=256)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9f60314850>

# Evaluate on test set

In [46]:
print(model.metrics_names)
print(model.evaluate(X_test, y_test))

['loss', 'binary_accuracy']
[0.6939795613288879, 0.5479999780654907]
