In [None]:
import os

# Root of the dataset on disk; reviews live in <imdb_dir>/imdb/{neg,pos}/*.txt.
imdb_dir = 'input'
train_dir = os.path.join(imdb_dir, 'imdb')

# Parallel lists: texts[i] is the raw review, labels[i] its sentiment
# (0 = negative, 1 = positive).
labels = []
texts = []

# Negative reviews are appended first, then positive ones; later cells rely
# on this ordering when they slice the lists by position.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary, platform-dependent order.
    # Sorting makes the positional hold-out split reproducible across runs
    # and machines.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # `with` guarantees the handle is closed even if read() raises.
            # The explicit encoding avoids depending on the platform default
            # (assumes the review files are UTF-8 -- TODO confirm for this
            # copy of the dataset).
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

This step reads every review file into the `texts` list and records its sentiment in the parallel `labels` list (0 = negative, 1 = positive).

In [None]:
# Cut the label/text lists into four positional ranges, in this order:
#   1: [0, 1250)       2: [12500, 13750)
#   3: [1250, 12500)   4: [13750, 25000)
# Presumably ranges 1/3 are negative reviews and 2/4 positive ones, i.e. the
# lists hold 12,500 negatives followed by 12,500 positives -- TODO confirm
# against the dataset actually loaded.
# Each slice is a fresh list, so later in-place edits do not touch the
# original `labels` / `texts` objects.
(L1, T1), (L2, T2), (L3, T3), (L4, T4) = (
    (labels[start:stop], texts[start:stop])
    for start, stop in ((0, 1250), (12500, 13750), (1250, 12500), (13750, 25000))
)

In [None]:
# Concatenate the paired slices in place: L1/T1 absorb L2/T2 and L3/T3
# absorb L4/T4. `+=` on a list mutates the existing object, exactly like
# .extend(), so this cell must not be re-run without re-running the slicing
# cell first.
L1 += L2
L3 += L4
T1 += T2
T3 += T4

# L1/T1 become the test set; L3/T3 replace `labels` / `texts` for the
# following cells.
labels_test, texts_test = L1, T1
labels, texts = L3, T3

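This step holds out a test set (items 0–1,249 and 12,500–13,749 of the loaded lists); the remaining reviews are kept in `texts` / `labels` and are split into training and validation sets further below.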

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# --- Preprocessing settings -------------------------------------------------
max_words = 10000          # only the top 10,000 words in the dataset are considered
maxlen = 500               # every review is padded / cut to 500 tokens
training_samples = 17500   # number of samples used for training
validation_samples = 5000  # number of samples used for validation

# Build the vocabulary from the review texts.
# Tokenizer reference: https://keras.io/preprocessing/text/
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Turn each review string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to length `maxlen`.
# NOTE(review): pad_sequences is documented to pad and truncate at the *front*
# by default, so long reviews would keep their last `maxlen` tokens -- confirm
# this is the intended behaviour.
data = pad_sequences(sequences, maxlen=maxlen)

# Labels go from a Python list to a NumPy array so they can be fancy-indexed.
labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

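This step tokenizes the remaining review texts into fixed-length integer sequences and converts the labels to a NumPy array; the data are split into training and validation sets in the next step.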

In [None]:
# Split the data into a training set and a validation set
# But first, shuffle the data, since we started from data
# where sample are ordered (all negative first, then all positive).
# Fixed seed so that Restart & Run All always yields the same
# training/validation split. A private RandomState is used so the global
# NumPy random state is left untouched.
shuffle_seed = 42
indices = np.random.RandomState(shuffle_seed).permutation(data.shape[0])

# Apply the same permutation to samples and labels so the pairs stay aligned.
data = data[indices]
labels = labels[indices]

# First `training_samples` rows are for training, the next
# `validation_samples` rows are for validation.
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

This step shuffles the samples (and their labels) into random order, then splits them into a training set and a validation set.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Binary sentiment classifier, built as a three-layer stack:
#   1. Embedding: maps each of the `max_words` token ids to a 32-dim vector,
#      for inputs of length `maxlen`.
#   2. LSTM: 32 units, with 5% dropout on both inputs and recurrent state.
#   3. Dense: a single sigmoid unit.
model = Sequential([
    Embedding(max_words, 32, input_length=maxlen),
    LSTM(32, dropout=0.05, recurrent_dropout=0.05),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Configure training: RMSprop optimiser, binary cross-entropy loss, and
# accuracy reported under the key 'acc'.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Train for 10 epochs in mini-batches of 128, evaluating on the validation
# split after each epoch. The returned object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)

# Persist the learned weights only (not the architecture).
model.save_weights('lstm_movie_validate.h5')