In [2]:
import pandas as pd
from keras.layers import LSTM, Dense, Embedding, Masking
from keras.optimizers import Adam
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
import numpy as np
from keras.utils import np_utils
from keras.models import load_model
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import word2vec

In [3]:
embedding_dim = 50

In [4]:
train_file_text = './imdbDataset/imdb_train_text.txt'
train_file_label = './imdbDataset/imdb_train_labels.txt'
test_file_text = './imdbDataset/imdb_test_text.txt'
test_file_label = './imdbDataset/imdb_test_labels.txt'

In [5]:
training_reviews = []
with open(train_file_text) as fin:
    for l in fin:
        training_reviews.append(l.replace('<br /><br />', " ").strip("\"\n"))

In [6]:
test_reviews = []
with open(test_file_text) as fin:
    for l in fin:
        test_reviews.append(l.replace('<br /><br />', " ").strip("\"\n"))

In [7]:
training_labels = []
with open(train_file_label) as fin:
    for l in fin:
        training_labels.append(int(l.strip()))

In [8]:
test_labels = []
with open(test_file_label) as fin:
    for l in fin:
        test_labels.append(int(l.strip()))

In [11]:
def dataprep(reviews):
    table = str.maketrans('', '', string.punctuation)
    eng_stopwords = stopwords.words('english')
    # word-tokenize
    words_in_reviews = [review.split() for review in reviews]
    print("tokenization done..")
    # removing punctuation
    words_in_reviews = [[word.translate(table) for word in sent] for sent in words_in_reviews]
    print("punctuations removed...")
    # removing stop-words
    words_in_reviews = [[w.lower() for w in review_words if w.lower() not in eng_stopwords and len(w) > 2] for review_words in words_in_reviews]
    print("stopwords removed...")
    # stemming
    #stemmer = PorterStemmer()
    #stemmed_reviews = [[stemmer.stem(w) for w in words] for words in words_in_reviews]
    #print("stemming done...")
    #return stemmed_reviews
    return words_in_reviews

In [12]:
test_processed_reviews = dataprep(test_reviews)

tokenization done..
punctuations removed...
stopwords removed...


In [13]:
train_processed_reviews = dataprep(training_reviews)

tokenization done..
punctuations removed...
stopwords removed...


In [87]:
word2vecModel = word2vec.Word2Vec(train_processed_reviews + test_processed_reviews, iter=10)

In [18]:
word2vecModel.corpus_count

50000

In [19]:
word2vecModel.corpus_total_words

5906919

In [20]:
word2vecModel.wv.most_similar('love')

[('asleep', 0.5823028087615967),
 ('bermuda', 0.5716625452041626),
 ('apart', 0.524666428565979),
 ('wayside', 0.5025482177734375),
 ('loves', 0.4977417588233948),
 ('romantic', 0.47469043731689453),
 ('loved', 0.4740931987762451),
 ('flat', 0.46361207962036133),
 ('hate', 0.46154505014419556),
 ('friendship', 0.44779422879219055)]

In [21]:
word2vecModel.wv.similar_by_word(word='worst')

[('stupidest', 0.7889276742935181),
 ('best', 0.7459392547607422),
 ('scariest', 0.7380221486091614),
 ('cheesiest', 0.7299014329910278),
 ('funniest', 0.7029139995574951),
 ('greatest', 0.676495373249054),
 ('poorest', 0.6579285264015198),
 ('weirdest', 0.6461619138717651),
 ('finest', 0.6428632736206055),
 ('dumbest', 0.6420060396194458)]

In [22]:
def getIndex(words, word2vecModel):
    indices = []
    for w in words:
        try:
            indices.append(word2vecModel.wv.vocab[w].index)
        except KeyError:
            pass
    return np.array(indices)

In [23]:
train_index = np.array([getIndex(sentence, word2vecModel) for sentence in train_processed_reviews])

In [24]:
test_index = np.array([getIndex(sentence, word2vecModel) for sentence in test_processed_reviews])

###### Experiment 1: without any data cleaning training embedding layer

In [26]:
vocab_size = len(word2vecModel.wv.vocab) + 1

In [27]:
vocab_size

42505

In [32]:
max_length = max([len(x) for x in np.concatenate((train_index,test_index))])

In [33]:
avg_length = sum([len(x) for x in np.concatenate((train_index,test_index))])/(len(train_index) + len(test_index))

In [34]:
max_sequence_length = 500

In [35]:
print("Average length is {} and max length is {}\nSetting max sequence length to be {}".format(avg_length, max_length, max_sequence_length))

Average length is 114.60574 and max length is 1337
Setting max sequence length to be 500


In [36]:
tokenizer = Tokenizer()

In [37]:
#train_vector = tokenizer.texts_to_sequences(texts=train_processed_reviews)
train_sequences = pad_sequences(sequences=train_index, maxlen=max_sequence_length, padding='post', value=-1)

In [38]:
#test_vector = tokenizer.texts_to_sequences(texts=test_processed_reviews)
test_sequences = pad_sequences(sequences=test_index, maxlen=max_sequence_length, padding='post', value=-1)

In [39]:
trainY = np.zeros(len(training_labels))
trainY[np.array(training_labels) > 6] = 1
testY = np.zeros(len(test_labels))
testY[np.array(test_labels) > 6] = 1

In [40]:
train_label_one_hot = np_utils.to_categorical(np.array(training_labels)-1)
test_label_one_hot = np_utils.to_categorical(np.array(test_labels)-1)

In [41]:
len(train_label_one_hot[0])

10

In [46]:
model = Sequential()
model.add(Masking(mask_value=-1.))
model.add(word2vecModel.wv.get_keras_embedding(False))
#model.add(LSTM(units=300,dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(units=50,dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [43]:
from keras.callbacks import ModelCheckpoint

In [44]:
chkpoint = ModelCheckpoint('weights-rnn-gensim-{epoch:02d}-{val_loss:.2f}.hdf5', mode='min', verbose=1, monitor='val_loss', save_best_only=True)

In [126]:
model.fit(x=train_sequences, y=trainY, epochs=15, batch_size=128, initial_epoch=0, validation_data=(test_sequences, testY), callbacks=[chkpoint])

Train on 25000 samples, validate on 25000 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.69272, saving model to weights-rnn-gensim-01-0.69.hdf5
Epoch 2/15

Epoch 00002: val_loss improved from 0.69272 to 0.69257, saving model to weights-rnn-gensim-02-0.69.hdf5
Epoch 3/15

Epoch 00003: val_loss improved from 0.69257 to 0.69241, saving model to weights-rnn-gensim-03-0.69.hdf5
Epoch 4/15

Epoch 00004: val_loss did not improve from 0.69241
Epoch 5/15

Epoch 00005: val_loss did not improve from 0.69241
Epoch 6/15

Epoch 00006: val_loss did not improve from 0.69241
Epoch 7/15

Epoch 00007: val_loss improved from 0.69241 to 0.69240, saving model to weights-rnn-gensim-07-0.69.hdf5
Epoch 8/15

Epoch 00008: val_loss improved from 0.69240 to 0.55059, saving model to weights-rnn-gensim-08-0.55.hdf5
Epoch 9/15

Epoch 00009: val_loss improved from 0.55059 to 0.43724, saving model to weights-rnn-gensim-09-0.44.hdf5
Epoch 10/15

Epoch 00010: val_loss improved from 0.43724 to 0.42331, 

<keras.callbacks.History at 0x7f5231e1ee48>

In [51]:
#model = load_model('weights-rnn-gensim-15-0.39.hdf5')

In [52]:
model.fit(x=train_sequences, y=trainY, epochs=30, batch_size=128, initial_epoch=15, validation_data=(test_sequences, testY), callbacks=[chkpoint])

Train on 25000 samples, validate on 25000 samples
Epoch 16/30

Epoch 00016: val_loss improved from inf to 0.40934, saving model to weights-rnn-gensim-16-0.41.hdf5
Epoch 17/30

Epoch 00017: val_loss improved from 0.40934 to 0.39609, saving model to weights-rnn-gensim-17-0.40.hdf5
Epoch 18/30

Epoch 00018: val_loss did not improve from 0.39609
Epoch 19/30

Epoch 00019: val_loss improved from 0.39609 to 0.35673, saving model to weights-rnn-gensim-19-0.36.hdf5
Epoch 20/30

Epoch 00020: val_loss did not improve from 0.35673
Epoch 21/30

Epoch 00021: val_loss did not improve from 0.35673
Epoch 22/30

Epoch 00022: val_loss improved from 0.35673 to 0.35602, saving model to weights-rnn-gensim-22-0.36.hdf5
Epoch 23/30

Epoch 00023: val_loss did not improve from 0.35602
Epoch 24/30

Epoch 00024: val_loss improved from 0.35602 to 0.35018, saving model to weights-rnn-gensim-24-0.35.hdf5
Epoch 25/30

Epoch 00025: val_loss improved from 0.35018 to 0.33769, saving model to weights-rnn-gensim-25-0.34.h

<keras.callbacks.History at 0x7f030d7bb908>

In [54]:
model.fit(x=train_sequences, y=trainY, epochs=40, batch_size=128, initial_epoch=30, validation_data=(test_sequences, testY), callbacks=[chkpoint])

Train on 25000 samples, validate on 25000 samples
Epoch 31/40

Epoch 00031: val_loss improved from 0.33131 to 0.32965, saving model to weights-rnn-gensim-31-0.33.hdf5
Epoch 32/40

Epoch 00032: val_loss improved from 0.32965 to 0.31007, saving model to weights-rnn-gensim-32-0.31.hdf5
Epoch 33/40

Epoch 00033: val_loss improved from 0.31007 to 0.30433, saving model to weights-rnn-gensim-33-0.30.hdf5
Epoch 34/40

Epoch 00034: val_loss improved from 0.30433 to 0.30158, saving model to weights-rnn-gensim-34-0.30.hdf5
Epoch 35/40

Epoch 00035: val_loss did not improve from 0.30158
Epoch 36/40

Epoch 00036: val_loss improved from 0.30158 to 0.29683, saving model to weights-rnn-gensim-36-0.30.hdf5
Epoch 37/40

Epoch 00037: val_loss improved from 0.29683 to 0.28225, saving model to weights-rnn-gensim-37-0.28.hdf5
Epoch 38/40

Epoch 00038: val_loss did not improve from 0.28225
Epoch 39/40

Epoch 00039: val_loss did not improve from 0.28225
Epoch 40/40

Epoch 00040: val_loss did not improve from 

<keras.callbacks.History at 0x7f030d920780>

In [129]:
word2vecModel.save('gensim_imbd_word_embeddings')

In [55]:
model.save('rnn-gensim-embedding-epoch-40.hdf5')