# Creating Corpus

In [1]:
import os
import zipfile
import multiprocessing

from gensim.models import word2vec
from keras.utils.data_utils import get_file

Using TensorFlow backend.


# Downloading the dataset
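The cell below downloads the zipped text8 training corpus (if it is not already present) and extracts it. The word-similarity evaluation data used later is read from the gensim test package, so it needs no separate download.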

In [2]:
def maybe_download(url):
    """
    Fetch the file behind ``url`` through ``get_file`` and return its path.

    The name passed to ``get_file`` is the part of ``url`` after its last
    ``/``; the value returned by ``get_file`` is handed back unchanged.
    """
    # Everything after the final slash serves as the file name.
    basename = url.rsplit('/', 1)[-1]
    return get_file(basename, url)
    

def unzip(zip_filename):
    """
    Extract the first file stored in a zip archive and return its path.

    The member is extracted relative to the current working directory,
    keeping any directory structure recorded in the archive. Only the first
    file member is extracted; directory entries are skipped.

    Args:
        zip_filename: path to the zip archive.

    Returns:
        Absolute path of the extracted file.

    Raises:
        ValueError: if the archive contains no file member.
    """
    with zipfile.ZipFile(zip_filename) as f:
        for member in f.namelist():
            # Names ending in '/' are directory entries, not files.
            if member.endswith('/'):
                continue
            # extract() already recreates the member's directories below the
            # target directory, so the member's own dirname must not be passed
            # as the target (that would nest it twice, e.g. a/a/b.txt).
            # Its return value is the path that was actually written.
            extracted = f.extract(member)
            return os.path.abspath(extracted)
    # Fail loudly instead of returning None, which would only break later.
    raise ValueError('no file found in zip archive: {}'.format(zip_filename))
            

# Location of the zipped text8 archive.
url = 'http://mattmahoney.net/dc/text8.zip'
# Fetch the archive through maybe_download and keep the path it returns.
filename = maybe_download(url)
# Unpack the archive; text_file is the path returned by unzip and is the
# input handed to the corpus reader in the training cell.
text_file = unzip(filename)

# Training the Model
You can read training data from a text file using the `word2vec.Text8Corpus` class.
It takes the path of the extracted text8 file.
Next, we train one skip-gram model (`sg=1`) per context window size by passing the corpus to `word2vec.Word2Vec`, and save each trained model to disk:

In [3]:
# Sentence iterator over the extracted corpus file.
sentences = word2vec.Text8Corpus(text_file)

# Context window sizes to compare; one model is trained and saved per size.
window_sizes = [1, 2, 4, 8, 16]

# File names of the saved models, in the same order as window_sizes.
# TODO: consider a {window_size: model_name} mapping instead of a plain list.
model_names = []

for window_size in window_sizes:
    out_model = "text8.window_size_{}.model".format(window_size)
    model_names.append(out_model)

    print('Starting training with window size {}...'.format(window_size))
    # Skip-gram (sg=1) with 200-dimensional vectors, one worker per CPU.
    model = word2vec.Word2Vec(
        sentences,
        size=200,
        sg=1,
        window=window_size,
        workers=multiprocessing.cpu_count(),
    )
    model.save(out_model)
    print('Finished')

Starting training with window size 1...


Finished
Starting training with window size 2...


Finished
Starting training with window size 4...


Finished
Starting training with window size 8...


Finished
Starting training with window size 16...


Finished


# Building the classifier

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

# Settings for the IMDB classifier.
max_features = 20000  # vocabulary size passed to the loader and the embedding
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Pad/truncate both splits to the same sequence length.
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Embedding -> LSTM -> single sigmoid unit, declared as one layer list.
print('Build model...')
model = Sequential([
    Embedding(max_features, 128, dropout=0.2),
    LSTM(128, dropout_W=0.2, dropout_U=0.2),  # a GRU is a drop-in alternative to try
    Dense(1),
    Activation('sigmoid'),
])

# Other optimizers / optimizer configurations are worth experimenting with.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The test split doubles as validation data during training.
print('Train...')
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    nb_epoch=15,
    validation_data=(X_test, y_test),
)

# Evaluation
## Evaluation of Classifier

In [None]:
# Evaluate the trained classifier on the test split and report both values
# returned by evaluate().
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
for label, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(label, value)

## Evaluation of word embeddings

In [13]:
import gensim.test
from gensim.models import KeyedVectors
# Directory of the installed gensim.test package; the WordSim-353 file is
# looked up beneath it.
module_path = gensim.test.__path__[0]
# The evaluation file is the same for every model, so build its path once.
wordsim_path = os.path.join(module_path, 'test_data', 'wordsim353.tsv')
for model_name in model_names:
    # The files were written with Word2Vec.save in the training cell, so they
    # are restored with the matching Word2Vec.load. KeyedVectors.load only
    # worked because unpickling handed back the full Word2Vec object.
    model = word2vec.Word2Vec.load(model_name)
    # evaluate_word_pairs returns three values; only the second one (the
    # Spearman result) is reported here.
    _, spearman, _ = model.wv.evaluate_word_pairs(wordsim_path)
    print(spearman.correlation)

0.552537658195


0.625260056936


0.673896272619


0.713052918355


0.708916706703


# Result