# Vector store

In [5]:
!pip install -q gensim

In [6]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [None]:

# 1. Parameters
# The first four values are forwarded to the Word2Vec constructor in the
# training cell under the keyword of the same (lower-cased) name.
VECTOR_SIZE = 100  # -> Word2Vec(vector_size=...)
WINDOW = 5  # -> Word2Vec(window=...)
MIN_COUNT = 5  # -> Word2Vec(min_count=...)
WORKERS = 4  # -> Word2Vec(workers=...)

# Batch-size limit, taken from
# https://github.com/piskvorky/gensim/blob/develop/gensim/models/word2vec_inner.pyx#L27
# NOTE(review): CustomLineSentence in this notebook defaults its
# max_sentence_length to 10000, not 1000 -- confirm that 1000 is intended.
MAX_WORDS_IN_BATCH = 1000

### CustomLineSentence

In [8]:
import itertools
from gensim import utils

class CustomLineSentence:
    """Iterate over a one-sentence-per-line corpus as lists of tokens.

    Each line is decoded with ``utils.to_unicode`` and split on whitespace.
    Lines holding more than ``max_sentence_length`` tokens are emitted as
    several consecutive chunks of at most that many tokens each.

    Parameters
    ----------
    source
        Either an object exposing ``seek`` that yields lines when iterated
        (it is rewound to position 0 on every pass), or anything accepted
        by ``utils.open`` (used when ``source`` has no ``seek`` attribute).
    limit
        Maximum number of lines to read per pass; ``None`` reads them all.
    max_sentence_length
        Maximum number of tokens per yielded chunk; must be at least 1.

    Raises
    ------
    ValueError
        If ``max_sentence_length`` is smaller than 1 (a non-positive step
        could never advance through a line).
    """

    def __init__(self, source, limit=None, max_sentence_length=10000):
        if max_sentence_length < 1:
            raise ValueError(
                "max_sentence_length must be >= 1, got %r" % (max_sentence_length,)
            )
        self.source = source
        self.limit = limit
        self.max_sentence_length = max_sentence_length

    def _split_lines(self, lines):
        """Yield token chunks for at most ``self.limit`` entries of *lines*."""
        step = self.max_sentence_length
        for raw_line in itertools.islice(lines, self.limit):
            tokens = utils.to_unicode(raw_line).split()
            # An empty line produces an empty range and therefore no chunk.
            for start in range(0, len(tokens), step):
                yield tokens[start:start + step]

    def __iter__(self):
        """Iterate through the lines in the source."""
        # Only the rewind is guarded: an AttributeError raised later, while
        # lines are being processed, must surface instead of being mistaken
        # for "source is a path" after sentences were already yielded.
        try:
            self.source.seek(0)
        except AttributeError:
            # No ``seek`` attribute: treat source as something to open.
            with utils.open(self.source, 'rb') as fin:
                yield from self._split_lines(fin)
        else:
            yield from self._split_lines(self.source)

In [None]:
import os

# Where the trained model is written. The directory is created up front so
# that a missing 'models/' folder fails fast (or is fixed) before training,
# rather than raising at save time and discarding the finished run.
MODEL_PATH = 'models/text8_w2v_100d.model'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# 2. Load the text8 corpus as a stream of sentences
sentences = LineSentence('data/text8')

# 3. Train Word2Vec with the parameters defined in the "1. Parameters" cell
model = Word2Vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS
)

# 4. Save the trained model
model.save(MODEL_PATH)

# 5. Quick sanity check: print the 10 nearest neighbours of 'king'
print(model.wv.most_similar('king', topn=10))