In [1]:
import os
import json
import numpy

import keras.backend as kbe
# Text preprocessing libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
# Model libraries
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

Using TensorFlow backend.


In [14]:
# STATIC VARIABLES
POSITIVE_EXAMPLES_FILE_NAME = "pos_amazon_cell_phone_reviews.json"
NEGATIVE_EXAMPLES_FILE_NAME = "neg_amazon_cell_phone_reviews.json"
GLOVE_EMBEDDING_FILE_NAME = "glove.6B/glove.6B.50d.txt"
KINDLE_REVIEWS_FILE_NAME = "kindle_reviews.json"
HOTEL_REVIEWS_FILE_NAME = "trip_advisor_1.json"
MAX_SEQUENCE_LENGTH = 50
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.1
BATCH_SIZE = 128

In [15]:
# Auxiliary functions
def precision(y_true, y_pred): # True positive rate
    true_positives = kbe.sum(kbe.round(kbe.clip(y_true * y_pred, 0, 1))) # get true_positives from true vs predicted
    predicted_positives = kbe.sum(kbe.round(kbe.clip(y_pred, 0, 1)))
    precision = true_positives/kbe.sum(predicted_positives + kbe.epsilon())
    return precision

def recall(y_true, y_pred): # False negative rate
    true_positives = kbe.sum(kbe.round(kbe.clip(y_true * y_pred, 0, 1))) 
    predicted_positives = kbe.sum(kbe.round(kbe.clip(y_true, 0, 1)))
    recall = true_positives/kbe.sum(predicted_positives + kbe.epsilon())
    return recall

In [17]:
# Reading data from file
positive_data = json.loads(open(POSITIVE_EXAMPLES_FILE_NAME, "r").read())['root']
negative_data = json.loads(open(NEGATIVE_EXAMPLES_FILE_NAME, "r").read())['root']
print("Positive data loaded: ", len(positive_data), "entries")
print("Negative data loaded: ", len(negative_data), "entries")

# Transfer Learning file read
kindle_review_data = []
with open(KINDLE_REVIEWS_FILE_NAME, "r") as f:
    kindle_review_data = [json.loads(line) for line in f]
print("Number of kindle review entries: ", len(kindle_review_data), " entries")

# Second transfer learning - different domain
hotel_review_data = json.loads(open(HOTEL_REVIEWS_FILE_NAME, "r").read())["Reviews"]
print("Numer of hotel review entries: ", len(hotel_review_data), " entries")

Positive data loaded:  108664 entries
Negative data loaded:  13279 entries
Number of kindle review entries:  982619  entries
Numer of hotel review entries:  2920  entries


In [23]:
# Process reviews into examples
positive_examples = []
negative_examples = []

for example in positive_data:
    positive_examples.append(example["summary"] + " " + example["text"])
for example in negative_data:
    negative_examples.append(example["summary"] + " " + example["text"])

training_examples = positive_examples + negative_examples
training_labels = [1]*len(positive_examples) + [0]*len(negative_examples)
training_labels = numpy.array(training_labels)

# Transfer Learning Examples
transfer_training_examples = []
transfer_training_labels = []

for data in kindle_review_data:
    transfer_training_examples.append(data["summary"] + " " + data["reviewText"])
    label = 1 if data["overall"] > 2.5 else 0
    transfer_training_labels.append(label)
transfer_training_labels = numpy.array(transfer_training_labels)

# Hotel Review Transfer Learning Examples
hotel_examples = []
hotel_labels = []

for example in hotel_review_data:
    hotel_examples.append(example["Content"])
    label = 1 if float(example["Ratings"]["Overall"]) > 2.5 else 0
    hotel_labels.append(label)

hotel_labels = numpy.array(hotel_labels)

We stayed here because of its location near Xcaret and Xplor. There are some negatives and posititves about it. It is owned by the same company that owns Xcaret, so we got a discount going there. The grounds and pools are beautiful, but we were acosted several times in the lobby by agents wanting us to buy into the time share "opportunity". Late at night, drunk party goers sang and laughed in the halls and the echos were rather noisy. But we always carry earplugs with us, so that problem was solved easily. We don't drink,and are vegetarians,so I would rather have the money you pay for having alcohol included, to have better quality food options. Everything was oily and fried. We stayed in building 12, which was close to the lobby and fitness center. It was okay,but our room looked out over other rooms with a little canal between, not a great view. The beach here is mostly rock. Only a small bay was dredged to give guests a beach.
1


In [24]:
# Process each examples into sequences to be fed into the LSTM network
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_examples + transfer_training_examples + hotel_examples) # Map each word to a numerical index
word_indices = tokenizer.word_index # Get the word to index map
training_sequences = tokenizer.texts_to_sequences(training_examples)# Replace each word in the examples with it's equivalent numerical index
transfer_training_sequences = tokenizer.texts_to_sequences(transfer_training_examples)
hotel_sequences = tokenizer.texts_to_sequences(hotel_examples)
training_sequences_padded = sequence.pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH) # Pad examples that are too short with 0s
transfer_training_sequences_padded = sequence.pad_sequences(transfer_training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
hotel_sequences_padded = sequence.pad_sequences(hotel_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("Number of unique words: ", len(word_indices))

374942


In [25]:
# Split data into training, validation and test sets
random_indices = numpy.arange(training_labels.shape[0])
numpy.random.shuffle(random_indices) # Shuffle the indices randomly
training_sequences_padded = training_sequences_padded[random_indices] # Do array indexing by the random shuffled indices
training_labels = training_labels[random_indices]
random_indices = numpy.arange(transfer_training_labels.shape[0])
numpy.random.shuffle(random_indices)
transfer_training_sequences_padded = transfer_training_sequences_padded[random_indices]
transfer_training_labels = transfer_training_labels[random_indices]

validation_size = int((VALIDATION_SPLIT + TEST_SPLIT) * training_labels.shape[0])
test_size = int(TEST_SPLIT * training_labels.shape[0])

x_train = training_sequences_padded[:-validation_size]
y_train = training_labels[:-validation_size]
x_val = training_sequences_padded[-validation_size:-test_size]
y_val = training_labels[-validation_size:-test_size]
x_test = training_sequences_padded[-test_size:]
y_test = training_labels[-test_size:]

# Transfer Learning Set
x_transfer_train = training_sequences_padded[:-validation_size]
y_transfer_train = training_labels[:-validation_size]
x_transfer_val = training_sequences_padded[-validation_size:-test_size]
y_transfer_val = training_labels[-validation_size:-test_size]
x_transfer_test = training_sequences_padded[-test_size:]
y_transfer_test = training_labels[-test_size:]

In [26]:
# GloVe embedding to map similarities into an embedding matrix
glove_embeddings = {} # dictionary of word to it's respective embedding list
with open(GLOVE_EMBEDDING_FILE_NAME, "r", encoding='utf-8') as glove_file:
    for line in glove_file:
        embedding_list = line.split()
        word = embedding_list[0]
        glove_embeddings[word] = numpy.asarray(embedding_list[1:], dtype='float32')

embedding_matrix = numpy.zeros((len(word_indices) + 1, MAX_SEQUENCE_LENGTH)) # initialize matrix of embeddings for each index in word_indices. Words that are not present in the embedding are initialized to 0
for word, index in word_indices.items():
    embedding_list = glove_embeddings.get(word)
    if embedding_list is not None:
        embedding_matrix[index] = embedding_list

In [27]:
# Build LSTM model
embedding_layer = Embedding(len(word_indices) + 1, 
                            MAX_SEQUENCE_LENGTH, 
                            weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, 
                            trainable=False)

model_lstm = Sequential()
model_lstm.add(embedding_layer)
model_lstm.add(LSTM(32))
model_lstm.add(Dropout(0.4)) #Dropout layer for regularization
model_lstm.add(Dense(1, activation='sigmoid')) # output layer using sigmoid activation function

In [28]:
# Train model and output training results
model_lstm.compile('adam', 'binary_crossentropy', metrics=['accuracy', precision, recall])
model_lstm.fit(x_train, y_train,
         batch_size=BATCH_SIZE,
         epochs=20,
         validation_data=[x_val, y_val])
results = model_lstm.evaluate(x_test, y_test)

print("Loss: ", results[0])
print("Accuracy: ", results[1])
print("Precision: ", results[2])
print("Recall: ", results[3])

Train on 85361 samples, validate on 24388 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss:  0.1274158432074284
Accuracy:  0.9543217976053797
Precision:  0.9604440935912906
Recall:  0.9896093709206452


In [29]:
# Test kindle review test set
results = model_lstm.evaluate(x_transfer_test, y_transfer_test)

print("Loss: ", results[0])
print("Accuracy: ", results[1])
print("Precision: ", results[2])
print("Recall: ", results[3])

Loss:  0.1274158432074284
Accuracy:  0.9543217976053797
Precision:  0.9604440935912906
Recall:  0.9896093709206452


In [30]:
# Test hotel review test set
results = model_lstm.evaluate(hotel_sequences_padded, hotel_labels)

print("Loss: ", results[0])
print("Accuracy: ", results[1])
print("Precision: ", results[2])
print("Recall: ", results[3])

Loss:  0.4860623988386703
Accuracy:  0.8496575342465753
Precision:  0.862609209099861
Recall:  0.9738314458768662
