LSTM Which uses Keras' embedding layer

Accuraccy: 70% 

In [1]:
import pickle
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
from Word2VecUtility3 import Word2VecUtility3
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
import keras
import tensorflow
print(keras.__version__)
print(tensorflow.__version__)

2.0.6
1.2.1


In [3]:
def get_volcabulary_and_list_words(data):
    reviews_words = []
    volcabulary = []
    for review in data["text"]:
        review_words = Word2VecUtility3.review_to_wordlist(
            review, remove_stopwords=True)
        reviews_words.append(review_words)
        for word in review_words:
            volcabulary.append(word)
    volcabulary = set(volcabulary)
    return volcabulary, reviews_words

def get_reviews_word_index(reviews_words, volcabulary, max_words, max_length):
    word2index = {word: i for i, word in enumerate(volcabulary)}
    # use w in volcabulary to limit index within max_words
    reviews_words_index = [[start] + [(word2index[w] + index_from) for w in review] for review in reviews_words]
    # in word2vec embedding, use (i < max_words + index_from) because we need the exact index for each word, in order to map it to its vector. And then its max_words is 5003 instead of 5000.
    reviews_words_index = [[i if (i < max_words) else oov for i in index] for index in reviews_words_index]
    # padding with 0, each review has max_length now.
    reviews_words_index = sequence.pad_sequences(reviews_words_index, maxlen=max_length, padding='post', truncating='post')
    return reviews_words_index

In [4]:
# data processing para
max_words = 20000
max_length = 500

# model training parameters
batch_size = 32
embedding_dims = 100 #was 100
# nb_filter = 250
# filter_length = 3
hidden_dims = 250
epochs = 2

# index trick parameters
index_from = 3
start = 1
# padding = 0
oov = 2

data = pd.read_csv('yelp_review_sub100k.csv', sep=',', index_col=False)
print('get volcabulary...')
volcabulary, reviews_words = get_volcabulary_and_list_words(data)
print('get reviews_words_index...')
reviews_words_index = get_reviews_word_index(reviews_words, volcabulary, max_words, max_length)

print(reviews_words_index[:20, :12])
print(reviews_words_index.shape)

labels = data["stars"]
labels[labels <= 2] = 0
labels[labels >= 4] = 1

pickle.dump((reviews_words_index, labels), open("100000by500reviews_words_index.pkl", 'wb'))

# with oov, index_from, start and padding, we have 4999 + 4 = 5003 indexes.
(reviews_words_index, labels) = pickle.load(open("100000by500reviews_words_index.pkl", 'rb'))

index = np.arange(reviews_words_index.shape[0])
train_index, valid_index = train_test_split(
    index, train_size=0.8, random_state=520)

train_data = reviews_words_index[train_index]
valid_data = reviews_words_index[valid_index]
train_labels = labels[train_index]
valid_labels = labels[valid_index]
print(train_data.shape)
print(valid_data.shape)

del(labels, train_index, valid_index)

get volcabulary...




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


get reviews_words_index...
[[    1     2  9094     2     2     2     2 10594 12437     2     2     2]
 [    1     2     2     2     2     2     2     2     2 10146     2 17723]
 [    1     2     2     2 16399     2     2     2 18239     2     2  3059]
 [    1 11518     2 11616     2     2  6832     2     2 12902     2  4242]
 [    1     2     2  8753     2 18513     2 17029  8935     2   615     2]
 [    1     2     2     2     2     2     2     2  7219     2     2     2]
 [    1     2     2  2161     2     2     2 11518 18579     2     2 16399]
 [    1     2 11597 14090     2     2     2     2     2     2     2     2]
 [    1     2     2     2     2     2     2     2     2     2  3106  1426]
 [    1     2     2     2 15410  4242     2     2     2     2     2     2]
 [    1 11518     2  8912     2     2  1975     2     2     2     2  9572]
 [    1  3866     2 11463     2     2     2     2  3866     2 18513     2]
 [    1     2     2     2 16522     2     2     2     2  7460 18663     2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(79999, 500)
(20000, 500)


In [5]:
print(len(train_data), 'train sequences')
print(len(valid_labels), 'test sequences')

79999 train sequences
20000 test sequences


In [6]:
# #might not be necessary 
# print('Pad sequences (samples x time)')
# train_data = sequence.pad_sequences(train_data, maxlen=max_length)
# valid_data = sequence.pad_sequences(valid_data, maxlen=max_length)
# print('train_data shape:', train_data.shape)
# print('valid_data:', valid_data.shape)

In [7]:
# create the model
model = Sequential()
#model.add(Embedding(max_words, 32, input_length=max_length))
model.add(Embedding(max_words + index_from, embedding_dims, \
                    input_length=max_length))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          2000300   
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 14,500,801
Trainable params: 14,500,801
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Fit the model
model.fit(train_data, train_labels, validation_data=(valid_data, valid_labels), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(valid_data, valid_labels, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# filepath="imp-{epoch:02d}-{val_acc:.2f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# csv_logger = CSVLogger('training_history.csv')
#Accuracy: 70.83%

Train on 79999 samples, validate on 20000 samples
Epoch 1/10
16s - loss: -7.4220e-01 - acc: 0.6611 - val_loss: -1.5056e+00 - val_acc: 0.6846
Epoch 2/10
15s - loss: -1.6613e+00 - acc: 0.6923 - val_loss: -1.5732e+00 - val_acc: 0.6859
Epoch 3/10
15s - loss: -1.9765e+00 - acc: 0.7102 - val_loss: -1.6478e+00 - val_acc: 0.6967
Epoch 4/10
15s - loss: -2.2641e+00 - acc: 0.7271 - val_loss: -1.4945e+00 - val_acc: 0.7058
Epoch 5/10
15s - loss: -2.5923e+00 - acc: 0.7488 - val_loss: -1.5496e+00 - val_acc: 0.7045
Epoch 6/10
15s - loss: -2.8325e+00 - acc: 0.7679 - val_loss: -1.4926e+00 - val_acc: 0.6962
Epoch 7/10
14s - loss: -3.0209e+00 - acc: 0.7854 - val_loss: -1.4313e+00 - val_acc: 0.7005
Epoch 8/10
14s - loss: -3.1441e+00 - acc: 0.7977 - val_loss: -1.2266e+00 - val_acc: 0.7015
Epoch 9/10
15s - loss: -3.2176e+00 - acc: 0.8086 - val_loss: -1.2207e+00 - val_acc: 0.7024
Epoch 10/10
14s - loss: -3.2670e+00 - acc: 0.8153 - val_loss: -1.1059e+00 - val_acc: 0.7029
Accuracy: 70.29%


In [9]:
# print("start training model...")

# model = Sequential()

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_words + index_from, embedding_dims, \
#                     input_length=max_length))
# model.add(Dropout(0.25))

# # we add a Convolution1D, which will learn nb_filter
# # word group filters of size filter_length:

# # filter_length is like filter size, subsample_length is like step in 2D CNN.
# model.add(Convolution1D(nb_filter=nb_filter,
#                         filter_length=filter_length,
#                         border_mode='valid',
#                         activation='relu',
#                         subsample_length=1))
# # we use standard max pooling (halving the output of the previous layer):
# model.add(MaxPooling1D(pool_length=2))

# # We flatten the output of the conv layer,
# # so that we can add a vanilla dense layer:
# model.add(Flatten())

# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.25))
# model.add(Activation('relu'))

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='rmsprop',
#               class_mode='binary',
#              metrics=['accuracy'])
# model.fit(train_data, train_labels, validation_data=(valid_data, valid_labels), batch_size=batch_size, epochs=epochs)