# Sentiment Classification using pre-trained word embeddings and a FeedForward net

1. Uses Word2vec word embeddings: 94.15% (2 epochs)
2. Learns own weights during training 92.93% (2 epochs)
3. Uses GLoVe word embeddings: 94.29% (2 epochs)

*Approach:*

Convert all text samples in the dataset into sequences of word indices. A "word index" would simply be an integer ID for the word. We will only consider the top 20,000 most commonly occuring words in the dataset, and we will truncate the sequences to a maximum length of 1000 words.

Prepare an "embedding matrix" which will contain at index i the embedding vector for the word of index i in our word index.

Load this embedding matrix into a Keras Embedding layer, set to be frozen (its weights, the embedding vectors, will not be updated during training).

Build on top of it a LSTM recurrent neural network.

In [2]:
from __future__ import print_function
import os
import sys
import numpy as np
import pandas as pd
import pickle
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import gensim

BASE_DIR = 'C:/Users/James/Desktop/NLP Summer/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
GLOVE_DIR = BASE_DIR + '/glove.42B.300d/'
TRAIN_DATA_FILE = BASE_DIR + '/AFF/review_sub100k.csv'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

Create text samples and labels and format into tensors that can be fed into a neural network. 
To do this, we will rely on Keras utilities keras.preprocessing.text.Tokenizer and keras.preprocessing.sequence.pad_sequences
as well as an external text processor (text_to_wordlist).

In [3]:
# second, prepare text samples and their labels
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Processing text dataset


Data Preparation

In [4]:
#create arrays for text data and labels

texts = [] 
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts.append(text_to_wordlist(values[1]))
        labels.append(int(values[0]))
print('Found %s texts in train.csv' % len(texts))

#store and load using pickle
pickle.dump((texts, labels), open("texts_labels_index.pkl", 'wb'))
(texts, labels) = pickle.load(open("texts_labels_index.pkl", 'rb'))


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

Found 100000 texts in train.csv
Found 57255 unique tokens
Shape of data tensor: (100000, 1000)
Shape of label tensor: (100000,)


In [5]:
print(x_train.shape)

(80000, 1000)


# Approach 1: using Word2Vec weights

In [6]:
## index word vectors

print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Indexing word vectors
Found 3000000 word vectors of word2vec


In [7]:
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix_w2v = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix_w2v[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix_w2v, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 20530


In [8]:
embedding_layer_w2v = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix_w2v],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

In [10]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer_w2v(sequence_input)

In [18]:
import tensorflow as tf
tf.__version__

'1.2.1'

In [23]:
print("start training model...")
model = Sequential()
#model.add(embedded_sequences)
model.add(Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix_w2v], input_length=MAX_SEQUENCE_LENGTH))
#model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, embedding_layer, input_length=MAX_SEQUENCE_LENGTH))
#model.add(Embedding(word_index, EMBEDDING_DIM, weights=[embedding_matrix_w2v], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
#model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              class_mode='binary')

model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128, verbose=2)

start training model...


ValueError: Error when checking target: expected activation_22 to have 3 dimensions, but got array with shape (80000, 1)

In [None]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'

In [None]:
from keras.layers import Conv1D, MaxPooling1D, Embedding

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = Sequential()
model.add(Dense(32, input_dim=784))
model.add(Activation('relu'))



x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=2, batch_size=128)

In [None]:
# import the necessary packages
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers import Activation
from keras.optimizers import SGD
from keras.layers import Dense

In [None]:
# define the architecture of the network

lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer_w2v(sequence_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)
x = Dense(num_dense, activation=act)(x)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)

preds = Dense(1, activation='sigmoid')(x)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = Sequential()
model.add(Dense(embedded_sequences,
	activation="relu"))
model.add(Dense(384, init="uniform", activation="relu"))
model.add(Dense(2))
model.add(Activation("softmax"))

In [None]:

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=2, batch_size=128)

In [None]:
from keras.optimizers import RMSprop
#lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = Sequential()
model.add(embedded_sequences)
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(10))
model.add(Activation('softmax'))

rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms, metrics=['accuracy'])
#print 'Model compield in {0} seconds'.format(time.time() - start_time)

# x = lstm_layer(embedded_sequences)
# x = Dropout(rate_drop_dense)(x)
# x = BatchNormalization()(x)
# x = Dense(num_dense, activation=act)(x)
# x = Dropout(rate_drop_dense)(x)
# x = BatchNormalization()(x)

preds = Dense(1, activation='sigmoid')(model)

# lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences_1 = embedding_layer(sequence_1_input)
# x1 = lstm_layer(embedded_sequences_1)

# sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences_2 = embedding_layer(sequence_2_input)
# y1 = lstm_layer(embedded_sequences_2)

# merged = concatenate([x1, y1])
# merged = Dropout(rate_drop_dense)(merged)
# merged = BatchNormalization()(merged)

# merged = Dense(num_dense, activation=act)(merged)
# merged = Dropout(rate_drop_dense)(merged)
# merged = BatchNormalization()(merged)

# preds = Dense(1, activation='sigmoid')(merged)

In [None]:
## train the model

#model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_val, y_val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# Approach 2: Initializing Embedding layer from scratch and learning its weights during training

We can also test how well we would have performed by not using pre-trained word embeddings, but instead initializing our Embedding layer from scratch and learning its weights during training. We just need to replace our Embedding layer with the following:

In [None]:
embedding_layer_self = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH)

# Hence, no weights paramenter

In [None]:
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer_self(sequence_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)
x = Dense(num_dense, activation=act)(x)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)

preds = Dense(1, activation='sigmoid')(x)

In [None]:
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_val, y_val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# Approach 3: Use GloVe Weights:

Next, we compute an index mapping words to known embeddings (GloVe), by parsing the data dump of pre-trained embeddings:

In [None]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.42B.300d.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
## prepare embeddings

print('Preparing embedding matrix')

num_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix_glove = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix_glove[i] = embedding_vector


In [None]:
# load pre-trained word embeddings into an Embedding layer works
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_glove],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

Develop LSTM Model

In [None]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'

In [None]:
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)
x = Dense(num_dense, activation=act)(x)
x = Dropout(rate_drop_dense)(x)
x = BatchNormalization()(x)

preds = Dense(1, activation='sigmoid')(x)

In [None]:
## train the model

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()


In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_val, y_val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Other kind of LSTM Model: Use in case of backup 

In [None]:
# create the model
# model = Sequential()
# #Word2Vec
# model.add(Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# #for pre-trained weights - GloVe
# #model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# #initializing our Embedding layer from scratch and learning its weights during training.
# #model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
# #model.add(Embedding(embedding_layer))
# model.add(Flatten())
# model.add(Dense(250, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())