In [8]:
import os
import json
import numpy

import keras.backend as kbe
# Text preprocessing libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
# Model libraries
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [61]:
# Notebook-wide configuration: everything a reader might want to tune.

# --- Data preparation ---
MAX_SEQUENCE_LENGTH = 50  # number of tokens every review is padded/truncated to
VALIDATION_SPLIT = 0.2  # fraction of all examples held out for validation
TEST_SPLIT = 0.1  # fraction of all examples held out for the final test

# --- Training ---
BATCH_SIZE = 128  # examples per gradient update

# --- Input files (paths are relative to the working directory) ---
POSITIVE_EXAMPLES_FILE_NAME = "pos_amazon_cell_phone_reviews.json"  # reviews labelled 1
NEGATIVE_EXAMPLES_FILE_NAME = "neg_amazon_cell_phone_reviews.json"  # reviews labelled 0
GLOVE_EMBEDDING_FILE_NAME = "glove.6B/glove.6B.50d.txt"  # pre-trained word vectors, text format

In [48]:
# Auxiliary functions
def precision(y_true, y_pred):
    """Precision (positive predictive value): TP / (TP + FP).

    This is the share of predicted positives that are really positive. It is
    not the true positive rate; that quantity is recall.

    Args:
        y_true: Tensor of ground-truth labels. It is clipped to [0, 1] after
            being multiplied with ``y_pred``, so 0/1 labels are expected.
        y_pred: Tensor of predicted scores. Values are clipped to [0, 1] and
            rounded to the nearest integer to obtain hard 0/1 predictions.

    Returns:
        Scalar tensor with the precision of the given predictions.

    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, so the reported number may differ from the precision
    over the whole dataset -- confirm for the Keras version in use.
    """
    # An element counts as a true positive when label * prediction rounds to 1.
    true_positives = kbe.sum(kbe.round(kbe.clip(y_true * y_pred, 0, 1)))
    predicted_positives = kbe.sum(kbe.round(kbe.clip(y_pred, 0, 1)))
    # epsilon keeps the ratio finite when nothing was predicted positive.
    return true_positives / (predicted_positives + kbe.epsilon())

def recall(y_true, y_pred):
    """Recall (true positive rate): TP / (TP + FN).

    This is the share of actual positives that the model found. It is the
    complement of the false negative rate, not the false negative rate itself.

    Args:
        y_true: Tensor of ground-truth labels. Values are clipped to [0, 1]
            and rounded, so 0/1 labels are expected.
        y_pred: Tensor of predicted scores. It only enters through the
            product with ``y_true``, which is clipped to [0, 1] and rounded.

    Returns:
        Scalar tensor with the recall of the given predictions.

    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, so the reported number may differ from the recall
    over the whole dataset -- confirm for the Keras version in use.
    """
    # An element counts as a true positive when label * prediction rounds to 1.
    true_positives = kbe.sum(kbe.round(kbe.clip(y_true * y_pred, 0, 1)))
    # Denominator is the number of ACTUAL positives (TP + FN), taken from y_true.
    actual_positives = kbe.sum(kbe.round(kbe.clip(y_true, 0, 1)))
    # epsilon keeps the ratio finite when the batch has no positive labels.
    return true_positives / (actual_positives + kbe.epsilon())

In [16]:
# Reading data from file
def _load_reviews(file_name):
    """Return the entries stored under the top-level "root" key of a JSON file.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes. It is decoded as UTF-8 explicitly so the result does
    not depend on the platform's default encoding.
    """
    with open(file_name, "r", encoding="utf-8") as review_file:
        return json.load(review_file)["root"]


positive_data = _load_reviews(POSITIVE_EXAMPLES_FILE_NAME)
negative_data = _load_reviews(NEGATIVE_EXAMPLES_FILE_NAME)
print("Positive data loaded: ", len(positive_data), "entries")
print("Negative data loaded: ", len(negative_data), "entries")

Positive data loaded:  108664 entries
Negative data loaded:  13279 entries


In [67]:
# Turn every review into one text: its summary, a single space, then its body
positive_examples = [review["summary"] + " " + review["text"] for review in positive_data]
negative_examples = [review["summary"] + " " + review["text"] for review in negative_data]

# Positives are listed first, so the labels are a run of 1s followed by a run of 0s
training_examples = positive_examples + negative_examples
training_labels = numpy.array([1] * len(positive_examples) + [0] * len(negative_examples))

In [66]:
# Convert each example into a fixed-length sequence of word indices for the LSTM network
tokenizer = Tokenizer()

# Learn the vocabulary: every distinct word is assigned a numerical index
tokenizer.fit_on_texts(training_examples)

# Word -> index map produced by the fit above
word_indices = tokenizer.word_index

# Replace each word in the examples with its numerical index
training_sequences = tokenizer.texts_to_sequences(training_examples)

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries: shorter ones are
# padded with 0s at the front, longer ones lose their leading tokens
# ("pre" is the pad_sequences default for both, written out here for clarity)
training_sequences_padded = sequence.pad_sequences(
    training_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="pre",
)

In [65]:
# Split data into training, validation and test sets
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split

sample_count = training_labels.shape[0]

# One shared permutation is applied to inputs and labels so they stay aligned.
# A private RandomState is used so the global numpy RNG state is left untouched.
random_indices = numpy.random.RandomState(SPLIT_SEED).permutation(sample_count)
# NOTE: these two names are overwritten with their shuffled versions, so
# re-running this cell alone permutes the already-shuffled data again.
training_sequences_padded = training_sequences_padded[random_indices]
training_labels = training_labels[random_indices]

# validation_size counts the validation AND test examples together, i.e.
# everything that is removed from the end of the training data.
validation_size = int((VALIDATION_SPLIT + TEST_SPLIT) * sample_count)
test_size = int(TEST_SPLIT * sample_count)

# Slice with non-negative boundaries. Negative-index slicing misbehaves when
# a size is 0 ([-0:] selects everything and [:-0] selects nothing).
train_end = sample_count - validation_size
val_end = sample_count - test_size

x_train = training_sequences_padded[:train_end]
y_train = training_labels[:train_end]
x_val = training_sequences_padded[train_end:val_end]
y_val = training_labels[train_end:val_end]
x_test = training_sequences_padded[val_end:]
y_test = training_labels[val_end:]

# The three parts must cover every example exactly once.
assert len(x_train) + len(x_val) + len(x_test) == sample_count

In [56]:
# GloVe embedding to map similarities into an embedding matrix
glove_embeddings = {}  # word -> its embedding vector (1-D float32 array)
# Decode as UTF-8 explicitly so reading does not depend on the platform default.
with open(GLOVE_EMBEDDING_FILE_NAME, "r", encoding="utf-8") as glove_file:
    for line in glove_file:
        embedding_list = line.split()
        if not embedding_list:
            continue  # skip blank lines instead of failing on embedding_list[0]
        word = embedding_list[0]
        glove_embeddings[word] = numpy.asarray(embedding_list[1:], dtype='float32')

if not glove_embeddings:
    # Without any vector the frozen embedding layer would be all zeros.
    raise ValueError("No embeddings could be read from " + GLOVE_EMBEDDING_FILE_NAME)

# The vector width is taken from the loaded data. It is unrelated to
# MAX_SEQUENCE_LENGTH (tokens per review), even though both are 50 here.
embedding_dim = len(next(iter(glove_embeddings.values())))

# One row per word index (row 0 is never assigned because the loop below only
# writes rows that appear in word_indices). Words that are not present in the
# GloVe file keep an all-zero row.
embedding_matrix = numpy.zeros((len(word_indices) + 1, embedding_dim))
for word, index in word_indices.items():
    embedding_list = glove_embeddings.get(word)
    if embedding_list is not None:
        embedding_matrix[index] = embedding_list


In [75]:
# Build LSTM model
# Both Embedding dimensions are read from the weight matrix so the layer always
# matches the weights it is given. The output width is the size of a word
# vector, not the sequence length; the two are independent quantities.
vocabulary_size, embedding_dim = embedding_matrix.shape
embedding_layer = Embedding(vocabulary_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)  # keep the pre-trained vectors frozen

model_lstm = Sequential()
model_lstm.add(embedding_layer)
model_lstm.add(LSTM(64))
model_lstm.add(Dropout(0.1))  # Dropout layer for regularization
model_lstm.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary label

In [76]:
# Train model and output training results
model_lstm.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy', precision, recall])

# validation_data is passed as a tuple (x_val, y_val), the documented form.
# A list can be interpreted as a multi-input x rather than an (inputs, targets) pair.
model_lstm.fit(x_train, y_train,
               batch_size=BATCH_SIZE,
               epochs=16,
               validation_data=(x_val, y_val))

# evaluate() returns the loss followed by the metrics in the order given to compile()
results = model_lstm.evaluate(x_test, y_test)

print("Loss: ", results[0])
print("Accuracy: ", results[1])
print("Precision: ", results[2])
print("Recall: ", results[3])

Train on 85361 samples, validate on 24388 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Loss:  0.1327534392131026
Accuracy:  0.9562899786780383
Precision:  0.9756934450789907
Recall:  0.9751856860047738
