# LSTM for Sentiment Analysis

In [2]:
import string
import functools
import pandas as pd
import numpy as np
import h5py
import html

import my_utils

from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords as sw

from keras import metrics
from keras import losses
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D

Using TensorFlow backend.


## 1 - Prepare Data

The data format expected for a LSTM network is not the same as what we previously used for NB model. LSTM is expecting sequence input, so we have to make our data into the right shape.

In [3]:
kindle_data = pd.read_csv('sampled_data.csv')
kindle_data_sample = kindle_data.sample(frac=0.02)

> Even if you are using the whole dataset, please make sure you do the shuffle. Otherwise the mini-batch traning will not work. 

In [4]:
len(kindle_data_sample)

2537

### Preprocessing

In [5]:
lemmatizer = WordNetLemmatizer()
stop = sw.words('english')
translation = str.maketrans(string.punctuation,' '*len(string.punctuation))

def preprocessing(line):
    line = html.unescape(str(line))
    line = str(line).translate(translation)
    line = word_tokenize(line.lower())
    
    line = [lemmatizer.lemmatize(t) for t in line if t not in stop]
    return ' '.join(line)

In [6]:
data = []
n = len(kindle_data_sample)
for i in range(n):
    data.append(preprocessing(kindle_data['reviewText'][i]))
    my_utils.print_progress(i + 1, n, decimals=0, bar_length=50)

labels = [1 if x == 'pos' else 0 for x in kindle_data_sample['overall']]



### Get Vocabulary

Since we're going to encoding text into word index sequence, we need to build a map between word and index in vocabulary (*usually from the most frequent one to the least frequent one*).

In [7]:
tokens = [w for line in data for w in line.split()]
freqdist = FreqDist(tokens)

In [8]:
freqdist_10 = {k:v for k, v in freqdist.items() if v >= 50}
len(freqdist_10)

343

In [11]:
freqdist.most_common(10)

[('book', 3189),
 ('read', 1599),
 ('story', 1572),
 ('one', 939),
 ('love', 834),
 ('character', 783),
 ('great', 740),
 ('good', 708),
 ('like', 657),
 ('well', 585)]

In [9]:
VOCA_SIZE = 300

In [10]:
vocabulary = {word[0]: idx + 1
              for idx, word in enumerate(list(freqdist.most_common(VOCA_SIZE)))}

### Encoding

In [12]:
def encoding(text, vocab, max_len=100):
    res = []
    
    # Text is preprocessed and splitable by white space
    for word in text.split():
        if word in vocab:
            res.append(vocab[word])
    # Fix lenghth
    if len(res) > max_len:
        res = res[:max_len]
    else:
        res = [0] * (max_len - len(res)) + res
    return res

In [13]:
# Statistics on sequence length
seq_len = np.asarray([len(line.split()) for line in data])
print('Average length:', np.average(seq_len))
print('Max length:', np.max(seq_len))
print('Min length:', np.min(seq_len))
print('StdDev:', np.std(seq_len))

Average length: 41.5534095388
Max length: 663
Min length: 2
StdDev: 53.061342941


In [14]:
SEQ_MAX_LEN = 50

In [15]:
X = []
for i in range(len(data)):
    X.append(encoding(data[i], vocabulary, max_len=SEQ_MAX_LEN))
    my_utils.print_progress(i + 1, len(data), decimals=0, bar_length=50, prefix='Encoding data:')



In [16]:
Y = labels

### Save Sequence Data

In [18]:
with h5py.File("amazon_sample_data.h5", "w") as f:
    f.create_dataset("X", data=X)
    f.create_dataset("Y", data=Y)

## 2 - Load Data

In [19]:
def load_dataset():
    dataset = h5py.File('amazon_sample_data.h5', "r")
    X = np.array(dataset["X"][:])
    Y = np.array(dataset["Y"][:])
    return X, Y

In [20]:
X, Y = load_dataset()

In [21]:
# Print shape
print('X', X.shape)
print('Y', Y.shape)

X (2537, 50)
Y (2537,)


## 3 - Build Model

In [22]:
# Some Parameters
LSTM_UNITS = 100
DROP_OUT = 0.3
RNN_DROP_OUT = 0.2
WORD_VEC_LEN = 64

In [23]:
model = Sequential()

# Word Embedding Layer
model.add(Embedding(
    input_dim=VOCA_SIZE + 1, 
    output_dim=WORD_VEC_LEN))

# LSTM Layer
model.add(LSTM(
    LSTM_UNITS, 
    dropout=DROP_OUT,
    recurrent_dropout=RNN_DROP_OUT))

# FC Layer
model.add(Dense(1, activation='sigmoid'))

In [24]:
model.compile(
    loss=losses.binary_crossentropy,
    optimizer='adam',
    metrics=[metrics.binary_accuracy])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          19264     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 85,365
Trainable params: 85,365
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Hyper-parameters
N_EPOCH = 10
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.1

In [None]:
history = model.fit(
    X, Y, 
    epochs=N_EPOCH,
    batch_size=BATCH_SIZE,)

In [None]:
model2 = Sequential()
model2.add(Embedding(
    input_dim= VOCA_SIZE + 1,
    output_dim=WORD_VEC_LEN))

model2.add(Conv1D(
    filters=32, 
    kernel_size=3,
    padding='same', 
    activation='relu'))

model2.add(MaxPooling1D(pool_size=2))

model2.add(LSTM(
    LSTM_UNITS, 
    dropout=0.2, 
    recurrent_dropout=0.2))

model2.add(Dense(1, activation='sigmoid'))

model2.compile(
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=[metrics.binary_accuracy])

model2.summary()

In [None]:
history2 = model2.fit(
    X, Y, epochs=10, batch_size=64, validation_split=0.2)