In [1]:
import re
from os.path import join
from gensim.models import KeyedVectors
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPool1D, Dense, Dropout, Flatten
import numpy as np
import random

Word2Vec_PATH = '../resources/GoogleNews-vectors-negative300.bin'

Using TensorFlow backend.


In [2]:
# Load the Word2Vec data. We'll need this later

class Embeddings:
    """Loads and manages the Word2vec Data"""

    def __init__(self):
        print('Loading word vectors. This will take a while...')
        self.data = KeyedVectors.load_word2vec_format(Word2Vec_PATH, binary=True)
        print('done.')

    def lookup(self, word):
        """
        return the vector representation of a word. falls back to a zero vector if no match was found
        :param word: the word to lookup
        :return: the vector representation
        """
        if word == '//pad//':
            return [0 for i in range(300)]
        try:
            return np.asarray(self.data.word_vec(word))
        # Replace words not found in vocabulary with zeros
        except KeyError:
            return np.zeros(300)
    
emb = Embeddings()

Loading word vectors. This will take a while...
done.


In [3]:
# Test the lookup

print("The vector for cat is: %s" % emb.lookup("cat"))

The vector for cat is: [ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656  0.08056641 -0.5859375
 -0.00445557 -0.296875   -0.01312256 -0.08349609  0.05053711  0.15136719
 -0.44921875 -0.0135498   0.21484375 -0.14746094  0.22460938 -0.125
 -0.09716797  0.24902344 -0.2890625   0.36523438  0.41210938 -0.0859375
 -0.07861328 -0.19726562 -0.09082031 -0.14160156 -0.10253906  0.13085938
 -0.00346375  0.07226562  0.04418945  0.34570312  0.07470703 -0.11230469
  0.06738281  0.11230469  0.01977539 -0.12353516  0.20996094 -0.07226562
 -0.02783203  0.05541992 -0.33398438  0.08544922  0.34375     0.13964844
  0.04931641 -0.13476562  0.16308594 -0.37304688  0.39648438  0.10693359
  0.22167969  0.21289062 -0.08984375  0.20703125  0.08935547 -0.08251953
  0.05957031  0.10205078 -0.19238281 -0.09082031  0.4921875   0.03955078
 -0.07080078 -0.0019989  -0.23046875  0.25585938  0.08984375 -0.10644531
  0.00105286 -0.05883789  0.0510253

In [4]:
"""
Now lets load the dataset. We are going to use the imbd dataset.
"""
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [5]:
# lets take a look at our data
print("training data: %s" % np.shape(x_train))
print("testing data: %s" % np.shape(x_test))

print("number of unique words: %d" % len(np.unique(np.hstack(x_train + x_test))))
print("average length of review: %d" % np.average([len(x) for x in x_train]))

training data: 25000
testing data: 25000
number of unique words: 88585
average length of review: 238


In [6]:
"""
Now lets construct our model. We are going to use a convolutional layer with 250 filters of size 3.
This is followed by a MaxPool layer, Dropout and finally one densely
connected layer.
For our output we'll use a normal softmax.
"""
model = Sequential()
"""
Now we add the convolutional layer. Keras expects us to provide a shape for the first layer.
We'll use sequences of length 400 and of dimensionality 300 (innate to word2vec).
Further we'll use relu as the activation funcition.
"""
model.add(Conv1D(filters=250, kernel_size=3, input_shape=(400, 300), activation="relu"))
"""
Now we'll apply a max pooling layer with size 5
"""
model.add(GlobalMaxPool1D())
# one dense layer with 500 neurons
model.add(Dense(1000))
# Now some dropout
model.add(Dropout(0.2))
# and our output layer
model.add(Dense(1, activation='sigmoid'))

# finally our optimizer. we are going with adam here. Note that this is a binary classification dataset.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              251000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1001      
Total params: 477,251
Trainable params: 477,251
Non-trainable params: 0
_________________________________________________________________


In [8]:
"""
The dataset is provided as indexes in a dict of words. If we want to use our word embeddings we have
to first transform the indexes back to normal text. Then we can look them up in our embeddings
"""
def embed(data, length):
    tokens = [inv_dict.get(index) for index in data]
    tokens += ['//pad//'] * (length - len(tokens))
    return np.asarray([emb.lookup(word) for word in tokens[:length]])
    
"""
As the dataset becomes quite big once we transform into its word embedded form we are going to define a generator here.
We are also going to pad/cut the texts to a fixed length of 400 words.
"""

inv_dict = {value: key for key, value in imdb.get_word_index().items()}
def data_generator(data, target, length=400, num=20):
    while True:
        for i in range(0, len(data), num):
            texts = np.asarray([embed(entry, length) for entry in data[i:i+num]])
            results = np.asarray(target[i:i+num])
            yield (texts, results)


# now lets train our model
model.fit_generator(data_generator(x_train, y_train), steps_per_epoch=1250, epochs=3, max_q_size=20)


# and test its performance
scores = model.evaluate_generator(data_generator(x_test, y_test), 1250, max_q_size=20)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 84.42%
