# Keras LSTM Tutorial on IMDB Sentiment Analysis with Theano Backend 

Keras is available at https://github.com/fchollet/keras
The original data-loading code is available here (https://github.com/fchollet/keras/blob/master/keras/datasets/imdb.py).
I modified that code so the data can be loaded easily, and removed the parts of the original that are not needed here.
The LSTM code is available here (https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py).
Play with the code to get a good grasp of it.

In [1]:
from keras.models import Sequential

Using Theano backend.


Using gpu device 0: GeForce GT 750M


In [2]:
# Create an empty Sequential model; `model` is re-created from scratch
# in the LSTM training cell further down.
model = Sequential()

In [7]:
import os
from collections import namedtuple, defaultdict
from random import shuffle, randint
#----------------------------------------------------
__docformat__ = 'restructedtext en'

import cPickle
import gzip
import os
import sys
import timeit

import numpy
import numpy as np
import theano
import theano.tensor as T

def get_dataset_file(dataset, default_dataset, origin):
    '''Return the path of a dataset file, downloading it first if needed.

    The file is fetched from ``origin`` only when ``dataset`` does not
    exist on disk AND its file name equals ``default_dataset``. Any other
    path is returned untouched, so a missing non-default file is reported
    by whoever opens it.

    :type dataset: String
    :param dataset: path (or bare file name) of the dataset file.
    :type default_dataset: String
    :param default_dataset: the only file name for which a download is
        attempted.
    :type origin: String
    :param origin: URL the default dataset is downloaded from.
    :return: ``dataset``, unchanged.

    '''
    data_file = os.path.basename(dataset)

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        # urlretrieve lives in urllib.request on Python 3 and directly in
        # urllib on Python 2; import locally so both interpreters work.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve
        urlretrieve(origin, dataset)
    return dataset



def load_data(path="imdb.pkl", n_words=10000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Load the IMDB training data and split it into train/validation sets.

    Only the first pickled object in the file (the training set) is read;
    no test set is loaded or returned.

    :type path: String
    :param path: The path to the dataset (here IMDB). A file ending in
        ".gz" is opened with gzip.
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        Every word index >= n_words is replaced by the unknown token (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: If set, only sequences strictly shorter than maxlen
        are kept (applied before the train/validation split).
    :type sort_by_len: bool
    :param sort_by_len: Sort the train and validation sets by sequence
        length. This allows faster execution as it causes less padding
        per minibatch. Another mechanism must be used to shuffle the
        train set at each epoch.
    :return: ``(train, valid)``; each is a tuple
        ``(list of sequences, list of labels)``.

    '''
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    # SECURITY NOTE: unpickling can execute arbitrary code, and this file
    # may just have been downloaded over plain HTTP. Only load data you
    # trust.
    opener = gzip.open if path.endswith(".gz") else open
    # The context manager closes the file even if unpickling fails.
    with opener(path, 'rb') as f:
        train_set = cPickle.load(f)

    # Sanity check: first tokens and label of the second and last samples.
    print(train_set[0][1][:10])
    print(train_set[1][1])
    print(train_set[0][-1][:10])
    print(train_set[1][-1])

    if maxlen:
        # Drop every sample whose sequence is not shorter than maxlen.
        kept = [(x, y) for x, y in zip(train_set[0], train_set[1])
                if len(x) < maxlen]
        train_set = ([x for x, _ in kept], [y for _, y in kept])
        del kept

    # Randomly split the training data into train and validation parts.
    all_x, all_y = train_set
    n_samples = len(all_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    train_idx, valid_idx = sidx[:n_train], sidx[n_train:]
    valid_set_x = [all_x[s] for s in valid_idx]
    valid_set_y = [all_y[s] for s in valid_idx]
    train_set_x = [all_x[s] for s in train_idx]
    train_set_y = [all_y[s] for s in train_idx]

    def remove_unk(x):
        # Map out-of-vocabulary word indices to the unknown token (1).
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)

    def sort_pair_by_len(xs, ys):
        # Reorder sequences and labels together, shortest sequence first
        # (stable sort, so equal lengths keep their relative order).
        order = sorted(range(len(xs)), key=lambda i: len(xs[i]))
        return [xs[i] for i in order], [ys[i] for i in order]

    if sort_by_len:
        valid_set_x, valid_set_y = sort_pair_by_len(valid_set_x, valid_set_y)
        train_set_x, train_set_y = sort_pair_by_len(train_set_x, train_set_y)

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)

    return train, valid

# Load the IMDB data, holding out 10% of the training set for validation.
train, valid = load_data(
    path="imdb.pkl",
    n_words=10000,
    valid_portion=0.1,
    maxlen=None,
    sort_by_len=True,
)
print(type(train))


[1018, 94, 18493, 46, 37, 4271, 31, 17, 25, 20]
1
[480, 2, 29, 665, 28, 440, 3, 17, 203, 8]
0
<type 'tuple'>


In [8]:
import pprint

# Peek at the data: the first three training sequences ...
pprint.pprint(train[0][:3])
# ... and the first five training labels.
pprint.pprint(train[1][:5])

[[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4],
 [16, 586, 32, 885, 17, 39, 68, 31, 2994, 2389, 328, 4],
 [1, 2, 1, 139, 6, 130, 1, 5, 6, 25, 105, 4730, 40]]
[0, 0, 0, 1, 0]


In [23]:
'''Train a LSTM on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF+LogReg.
Notes:
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
GPU command:
    THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_lstm.py
'''

from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

# NOTE: np_utils and imdb are imported but not used by the code below
# (imdb only appears in the commented-out loader call).
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb

# Vocabulary size given to the Embedding layer; same value as the
# n_words=10000 passed to load_data earlier in this notebook.
max_features = 10000
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
# Keras' own loader is disabled; the train/valid tuples produced by
# load_data earlier in this notebook are used instead.
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
 #                                                     test_split=0.2)

X_train, y_train=train[0], train[1]
# Sanity check: type of the first token of the first two sequences.
print(type(X_train[0][0]))
print(type(X_train[1][0]))

# NOTE: the "test" data is the validation split; no separate test set is
# loaded in this notebook.
X_test, y_test= valid[0], valid[1]

#X_train, y_train = train_vecs, train_tags
#X_test, y_test = dev_vecs, dev_tags#

#test_vecs, test_tags

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print("Pad sequences (samples x time)")
# Turn the variable-length lists into arrays with maxlen columns
# (see the shapes printed right after).
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
# Layer stack: Embedding -> LSTM -> Dropout -> Dense(1) -> sigmoid,
# i.e. a single output in (0, 1) for the binary sentiment label.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(LSTM(128))  # try using a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              class_mode="binary")

print("Train...")
# The same (X_test, y_test) data is used for validation during training
# and for the final evaluation, so the reported "Test" score/accuracy
# are really validation figures.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
          validation_data=(X_test, y_test), show_accuracy=True)
# The result of evaluate is unpacked as (score, accuracy) here.
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size,
                            show_accuracy=True)
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...
<type 'int'>
<type 'int'>
22500 train sequences
2500 test sequences
Pad sequences (samples x time)
X_train shape: (22500, 100)
X_test shape: (2500, 100)
Build model...
Train...
Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.441658455276
Test accuracy: 0.836
