# Sentiment analysis on Amazon reviews

In [1]:
import warnings
# Silence every warning category for the whole kernel session so library
# deprecation notices do not clutter the cell outputs below.
warnings.simplefilter(action="ignore")

In [2]:
import numpy as np
import tensorflow as tf
from keras.layers import Embedding, Dense, InputLayer, LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

from src import embedding_helper, data_helper

Using TensorFlow backend.


In [3]:
# Seed shared by the train/validation split and the TensorFlow RNG.
seed = 42

# Folder holding the input files; the trailing slash matters because paths
# are built by plain string formatting.
data_dir = "data/"

# Fraction of the samples held out for validation.
test_size = 0.1

# Size of each word vector (also selects which GloVe file is loaded) and the
# fixed number of word indices every review is padded/truncated to.
embedding_dim = 100
max_sequence_length = 100

In [4]:
# Location of the reviews CSV inside data_dir.
file_name = "{}reviews.csv".format(data_dir)

# Name of the GloVe file matching the configured vector size; it is only
# consumed further down, when the embedding index is built.
embedding_file = "glove.6B.{}d.txt".format(embedding_dim)

# Read the review texts and their labels.
# NOTE(review): n_rows=None presumably means "load every row" - confirm in data_helper.
texts, scores = data_helper.extract_data(file_name, n_rows=None)

# Pre-processing

In [5]:
# Clean the raw review texts. Stopword removal is requested explicitly through
# remove_stopwords=True; the handling of contractions and unwanted characters
# happens inside data_helper.clean_data (not visible from this notebook).
# NOTE: `texts` is overwritten here, so re-running this cell on its own feeds
# already-cleaned text back into the cleaner.
texts = data_helper.clean_data(texts, remove_stopwords=True)
# Vocabulary size that is later handed to the Tokenizer and the Embedding layer.
# NOTE(review): the exact meaning of threshold=1 is defined in
# data_helper.calc_num_words - confirm before changing it.
num_words = data_helper.calc_num_words(texts, threshold=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Erik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Fit a word -> integer-index vocabulary on the cleaned reviews.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)

# word_index is the tokenizer's full word -> index mapping; per the Keras
# documentation the num_words cap is applied when texts are converted to
# sequences, not to this dictionary.
word_index = tokenizer.word_index

# Each review becomes a list of word indices.
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# Load the pre-trained vectors from data_dir/embedding_file into a lookup
# structure (presumably word -> vector; verify in embedding_helper).
embeddings_index = embedding_helper.create_embeddings_index(data_dir, embedding_file)
# Build the weight matrix that is later passed to the frozen Embedding layer.
# NOTE(review): assumed to have shape (num_words, embedding_dim), which is what
# Embedding(num_words, embedding_dim, weights=[...]) requires - confirm in the helper.
embedding_matrix = embedding_helper.create_embedding_matrix(embeddings_index, word_index, num_words, embedding_dim)

In [8]:
# Bring every review to exactly max_sequence_length indices so the data forms
# one rectangular array that Keras can batch. padding/truncating are spelled
# out with their Keras default ("pre"): short reviews get leading zeros and
# long reviews keep their last tokens.
X = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding="pre",
    truncating="pre",
)

# Hold out a seeded, fixed fraction of the samples for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    scores,
    test_size=test_size,
    random_state=seed,
)

# Modelling

In [9]:
# Seed both RNGs before any layer is created. TensorFlow's graph-level seed on
# its own is not enough: per the Keras reproducibility FAQ, NumPy's global
# generator must be put in a known state as well, because Keras relies on it
# when initialising weights.
np.random.seed(seed)
tf.set_random_seed(seed)

model = Sequential()
# Every sample is a fixed-length row of integer word indices.
model.add(InputLayer(input_shape=(max_sequence_length,), dtype='int32', name="input"))
# The pre-built embedding matrix is loaded as the layer weights and frozen
# (trainable=False), so only the LSTM and the output layer are trained.
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], trainable=False, name="embedded"))

# One LSTM reads the embedded sequence; the single sigmoid unit emits a value
# in (0, 1) that is trained with binary cross-entropy.
model.add(LSTM(128, name="LSTM_1"))
model.add(Dense(1, activation='sigmoid', name='output'))

In [10]:
# Print each layer's output shape and parameter count, plus the split between
# trainable and non-trainable (frozen embedding) parameters.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 100)               0         
_________________________________________________________________
embedded (Embedding)         (None, 100, 100)          6941400   
_________________________________________________________________
LSTM_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 7,058,777
Trainable params: 117,377
Non-trainable params: 6,941,400
_________________________________________________________________


In [11]:
# Re-seed right before training so the run does not depend on how many random
# draws earlier cells consumed. NumPy is seeded alongside TensorFlow because
# Keras uses NumPy's global RNG as well (e.g. when shuffling the training
# samples each epoch); see the Keras reproducibility FAQ.
np.random.seed(seed)
tf.set_random_seed(seed)

# Training hyper-parameters.
batch_size = 128
num_epochs = 5
learning_rate = 0.0005

optimizer = Adam(lr=learning_rate)

# NOTE(review): the device is hard-coded to the first GPU - confirm this is
# intended for machines that do not have one.
with tf.device('/gpu:0'):
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=["accuracy"])

    # Keep the returned History object so the per-epoch loss and accuracy
    # remain available after training instead of being discarded.
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_data=(X_val, y_val))

Train on 511608 samples, validate on 56846 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Example output

In [12]:
# Invert the tokenizer vocabulary (word -> index) into index -> word so that
# padded index rows can be decoded back into text.
reversed_word_index = {index: word for word, index in word_index.items()}

In [13]:
# Take the first validation review, keeping a leading batch axis of size 1
# because model.predict expects a batch.
example = X_val[0, :].reshape((1, max_sequence_length))

score = model.predict(example)
pred_sentiment = data_helper.sentiment(score)

# Decode the indices back into words. Zero entries are skipped: 0 is the fill
# value pad_sequences uses by default and has no entry in the vocabulary.
# Every word is prefixed with a space, matching the original output format.
words = [reversed_word_index[index] for index in example[0] if index != 0]
sentence = "".join(" " + word for word in words)

print("Sentence:{} \n".format(sentence))
print("Sentiment: {}".format(pred_sentiment))

Sentence: tried couple brands gluten free sandwich cookies best bunch crunchy true texture real cookies gluten free might think filling makes bit sweet means satisfied sweet tooth sooner chocolate version glutino good true chocolatey taste something gluten free brands 

Sentiment: positive


In [14]:
# Take the third validation review, keeping a leading batch axis of size 1
# because model.predict expects a batch.
example = X_val[2, :].reshape((1, max_sequence_length))

score = model.predict(example)
pred_sentiment = data_helper.sentiment(score)

# Decode the indices back into words. Zero entries are skipped: 0 is the fill
# value pad_sequences uses by default and has no entry in the vocabulary.
# Every word is prefixed with a space, matching the original output format.
words = [reversed_word_index[index] for index in example[0] if index != 0]
sentence = "".join(" " + word for word in words)

print("Sentence:{} \n".format(sentence))
print("Sentiment: {}".format(pred_sentiment))

Sentence: little less expected tends muddy taste expected since said favorite company 

Sentiment: negative
