In [1]:
import gensim
import numpy as np
from nltk.tokenize import word_tokenize

# Seed NumPy's global RNG so that the np.random.shuffle call further down
# produces the same sentence order on every fresh run.
# NOTE(review): gensim's Word2Vec takes its own `seed` argument; this call
# presumably does not make the training itself deterministic — confirm.
np.random.seed(1001)

In [2]:
# Load the raw sentences, one list entry per line of the file
# (readlines() keeps each line's trailing newline).
file_path = 'data/snli_sentences_100k.txt'
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so the same file could decode differently (or fail) on another machine.
with open(file_path, 'r', encoding='utf-8') as f:
    snli_data = f.readlines()

In [3]:
# Peek at the first ten raw lines (still carrying their trailing '\n').
snli_data[:10]

['a smiling woman with a black smear across her face holds an umbrella.\n',
 'a woman walks through a bad neighborhood.\n',
 'a passenger jet lands on a black runway.\n',
 'a woman weaves cloth.\n',
 'a man with a yellow hat on is leaning on a wall of some sort.\n',
 'two little blond girls twirl their hair.\n',
 'a man of asian descent wearing red is leaning over another man of asian descent wearing blue who has fallen on the grass in the park on a sunny day.\n',
 'the two dogs are looking at each other.\n',
 'there are two guys partying.\n',
 'the photographer is shooting the models on a ledge\n']

In [4]:
# Report how many lines were read from the corpus file.
n_sentences = len(snli_data)
print('[INFO] Number of sentences = {}'.format(n_sentences))

[INFO] Number of sentences = 100000


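It is good practice to pre-process the raw data before training, e.g. by converting it to lower case and removing numbers, special characters, etc. The sample shown above is already lower case, so the next cell only strips the surrounding whitespace (the trailing newline) from each sentence.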

In [5]:
# Remove leading/trailing whitespace (including the newline left by
# readlines()) from every raw line, then preview the first ten results.
sentences = list(map(str.strip, snli_data))
sentences[:10]

['a smiling woman with a black smear across her face holds an umbrella.',
 'a woman walks through a bad neighborhood.',
 'a passenger jet lands on a black runway.',
 'a woman weaves cloth.',
 'a man with a yellow hat on is leaning on a wall of some sort.',
 'two little blond girls twirl their hair.',
 'a man of asian descent wearing red is leaning over another man of asian descent wearing blue who has fallen on the grass in the park on a sunny day.',
 'the two dogs are looking at each other.',
 'there are two guys partying.',
 'the photographer is shooting the models on a ledge']

In [6]:
# Shuffle the sentence list in place using NumPy's global RNG (seeded in the
# first cell). Because this mutates `sentences`, re-running the cell without
# re-running the cells above yields a different order.
np.random.shuffle(sentences)

In [7]:
# Split every sentence string into a list of tokens with NLTK.
# NOTE(review): this rebinds `sentences` from a list of strings to a list of
# token lists, so the cell should not be re-run on its own — re-run from the
# strip cell instead.
sentences = list(map(word_tokenize, sentences))

In [8]:
# Inspect the first two tokenised sentences (each is now a list of tokens).
sentences[:2]

[['three',
  'people',
  'are',
  'looking',
  'at',
  'merchandise',
  'of',
  'a',
  'jewelry',
  'kiosk',
  '.'],
 ['children', 'play', 'at', 'the', 'park']]

### Gensim Model Building

In [9]:
%%time
w2v_model = gensim.models.Word2Vec(
    sentences,
    size=300, # Dimension of the word embedding
    window=2, # The maximum distance between the current and predicted word within a sentence.
    min_count=1, # Ignores all words with total frequency lower than this.
    sg=1, # If 1, skip-gram is employed; otherwise, CBOW is used.
    negative=10, # Number of negative samples to be drawn
    iter=20, # Number of epochs over the corpus
)
    

CPU times: user 1min 37s, sys: 88 ms, total: 1min 37s
Wall time: 33.2 s


In [10]:
# Write the trained model to disk under the data/ directory.
model_path = 'data/w2v_300d_snli_data.pkl'
w2v_model.save(model_path)