In [None]:
# Relative path to the hotel reviews CSV; it is read with pd.read_csv further down.
PATH_TO_DATA = 'input/hotel/Hotel_Reviews.csv'

In [None]:
# some necessary imports
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Utility
import re
from collections import Counter
import logging
import time
import pickle
import itertools
nltk.download('stopwords')

In [None]:
# DATASET
# Character encoding handed to pd.read_csv when loading the reviews file.
DATASET_ENCODING = "ISO-8859-1"
# TEXT CLEANING
# Matches every run of characters that are not ASCII letters or digits;
# preprocess() replaces each such run with a single space.
TEXT_CLEANING_RE = "[^A-Za-z0-9]+"
# English stop-word list from NLTK.
# NOTE(review): stop_words is not referenced by any cell visible in this notebook.
stop_words = stopwords.words("english")
# English Snowball stemmer; preprocess() only uses it when called with stem=True.
stemmer = SnowballStemmer("english")
# Load the complete reviews table.
train_df = pd.read_csv(PATH_TO_DATA, encoding =DATASET_ENCODING)

In [None]:
def preprocess(text, stem=False):
    """Normalise a piece of review text into space-separated lowercase tokens.

    The input is converted with ``str()``, lower-cased, and every run of
    characters matched by ``TEXT_CLEANING_RE`` (anything other than ASCII
    letters and digits) is replaced by a single space.

    Args:
        text: Value to clean; non-string values are stringified first.
        stem: When true, each token is passed through the module-level
            ``stemmer`` before the tokens are re-joined.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives cleaning).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    words = cleaned.split()
    if stem:
        words = [stemmer.stem(word) for word in words]
    return " ".join(words)

In [None]:
# Keep only the two free-text review columns and clean them with preprocess().
# The explicit .copy() makes `df` an independent frame: assigning columns on a
# bare slice of train_df triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether train_df itself is modified.
df = train_df[['Negative_Review', 'Positive_Review']].copy()
df['Negative_Review'] = df['Negative_Review'].apply(preprocess)
df['Positive_Review'] = df['Positive_Review'].apply(preprocess)

This step removes noise (everything except letters and digits) from the review text and stores the cleaned text in a pandas DataFrame.

In [None]:
# Positional row ranges used for each split. Rows 6000-9999 are used by
# neither split, exactly as in the original hand-written loops.
TRAIN_ROWS = slice(10000, 70000)
TEST_ROWS = slice(0, 6000)

# Whole-review strings that are treated as "nothing was written" and skipped.
# "negative" / "positive" are the values this cell has always filtered on.
# NOTE(review): the "no ..." variants are added on the assumption that the raw
# CSV marks blank reviews as "No Negative" / "No Positive"; preprocess() keeps
# the word "no", so the single-word check alone would never match such rows.
# Confirm against the data.
NEGATIVE_PLACEHOLDERS = {"negative", "no negative"}
POSITIVE_PLACEHOLDERS = {"positive", "no positive"}


def collect_reviews(frame, rows):
    """Gather cleaned review texts and their sentiment labels for one split.

    Args:
        frame: DataFrame with 'Negative_Review' and 'Positive_Review' columns
            holding already-preprocessed text.
        rows: Positional slice selecting the rows that belong to the split.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. All negative reviews
        (label 0) come first, followed by all positive reviews (label 1);
        placeholder-only reviews are left out.
    """
    split_texts, split_labels = [], []
    for column, label, placeholders in (
        ('Negative_Review', 0, NEGATIVE_PLACEHOLDERS),
        ('Positive_Review', 1, POSITIVE_PLACEHOLDERS),
    ):
        kept = [review for review in frame[column].iloc[rows]
                if review not in placeholders]
        split_texts.extend(kept)
        split_labels.extend([label] * len(kept))
    return split_texts, split_labels


texts, labels = collect_reviews(df, TRAIN_ROWS)
texts_test, labels_test = collect_reviews(df, TEST_ROWS)

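This step builds the labelled training texts (rows 10000–69999) and test texts (rows 0–5999), using label 0 for negative reviews and 1 for positive reviews. The validation set is split off from the training data later, when the model is fitted.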

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# NOTE(review): pad_sequences appears to truncate from the front by default, which would keep the LAST 20 tokens of a long review — confirm this is intended.
maxlen = 20  # Every review is padded or truncated to 20 tokens by pad_sequences later on
max_words = 5000  # Vocabulary cap passed to the Tokenizer; also the input size of the Embedding layer

# Build the vocabulary from the training texts, then encode those same texts.
tokenizer = Tokenizer(num_words=max_words)  # https://keras.io/preprocessing/text/
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # This turns strings into lists of integer indices.

# NOTE(review): word_index presumably lists every distinct token seen, not only the top max_words — confirm.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

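This step fits the tokenizer on the training texts (which also contain the samples later held out for validation) and converts each text into a sequence of integer word indices.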

In [None]:
# Drop every sample whose token sequence is empty, keeping `sequences` and
# `labels` aligned. One filtering pass replaces the delete-while-iterating
# loop: repeated `del` on a list is quadratic and depended on index-offset
# bookkeeping that is easy to get wrong.
kept_pairs = [(seq, label) for seq, label in zip(sequences, labels) if len(seq) > 0]
d = len(sequences) - len(kept_pairs)  # number of samples removed, as the old counter reported
sequences = [seq for seq, _ in kept_pairs]
labels = [label for _, label in kept_pairs]

This step deletes empty sequences (reviews that produced no tokens) together with their labels.

In [None]:
# Pad or truncate every sequence to `maxlen` integers so the samples form a
# single 2-D array, and turn the labels into a NumPy array of matching length.
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation before training.
# The samples were collected in order (all negative first, then all positive),
# and the validation set is carved out of these arrays by model.fit, so an
# unshuffled array would give a badly skewed split.
# A dedicated, seeded generator makes the ordering (and therefore the
# train/validation split) reproducible on a fresh kernel without touching
# NumPy's global random state.
SHUFFLE_SEED = 42
indices = np.random.RandomState(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

This step pads the sequences to a fixed length and shuffles the data and labels into the same random order.

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Sentiment classifier: word-index embedding -> single LSTM layer -> sigmoid unit.
model = Sequential([
    # Maps each of the max_words token indices to a 64-dimensional vector.
    Embedding(max_words, 64, input_length=maxlen),
    # 64 recurrent units, with dropout on both the inputs and the recurrent state.
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    # One sigmoid output for the binary negative (0) / positive (1) label.
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary labels with a single sigmoid output -> binary cross-entropy loss; accuracy is tracked as 'acc'.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
# Train for 10 epochs in batches of 128, holding out 2/9 of the samples for validation.
# NOTE(review): Keras documents validation_split as taking the trailing fraction of the
# arrays without shuffling, so this relies on the shuffle done in the earlier cell — confirm
# for the installed Keras version.
# NOTE(review): `2/9` is a float only under Python 3 true division; under Python 2 it would be 0.
history = model.fit(data, labels,
                    epochs=10,
                    batch_size=128,
                    validation_split = 2/9)
# Write the trained weights to disk.
model.save_weights('lstm_hotel_validate.h5')