# Vector store

In [1]:
!pip install -q gensim

In [2]:
import os
from collections import defaultdict, namedtuple

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [3]:
# 1. Parameters
# Hyper-parameters handed to both model constructors further down.
VECTOR_SIZE = 100  # passed as `vector_size=`
WINDOW = 5  # passed as `window=`
MIN_COUNT = 5  # passed as `min_count=`
WORKERS = 4  # passed as `workers=`

# Upper bound on the number of tokens CustomLineSentence yields per chunk.
# Value taken from gensim:
# https://github.com/piskvorky/gensim/blob/develop/gensim/models/word2vec_inner.pyx#L27
MAX_WORDS_IN_BATCH = 10_000

### CustomLineSentence

In [4]:
import itertools
from gensim import utils

class CustomLineSentence:
    """Stream a whitespace-tokenised text file as a sequence of token lists.

    Every line of ``source`` is decoded, split on whitespace and yielded in
    chunks of at most ``max_sentence_length`` tokens, so a corpus stored on one
    very long line still comes out as many bounded "sentences". The file is
    opened inside ``__iter__``, so the object can be iterated more than once.

    Parameters
    ----------
    source : str
        Path handed to ``gensim.utils.open``.
    max_sentence_length : int, optional
        Maximum number of tokens per yielded chunk. ``None`` (the default)
        falls back to the notebook-level ``MAX_WORDS_IN_BATCH``, looked up when
        iteration starts.
    limit : int, optional
        Read only the first ``limit`` lines of the file. ``None`` (the default)
        reads every line.
    """

    def __init__(self, source, max_sentence_length=None, limit=None):
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

    def __iter__(self):
        """Yield lists of tokens, each at most ``max_sentence_length`` long.

        Raises
        ------
        ValueError
            If the effective chunk size is smaller than 1 (it could never make
            progress through a line).
        """
        chunk_size = self.max_sentence_length
        if chunk_size is None:
            chunk_size = MAX_WORDS_IN_BATCH
        if chunk_size < 1:
            raise ValueError(f"max_sentence_length must be >= 1, got {chunk_size}")

        with utils.open(self.source, 'rb') as fin:
            # islice(fin, None) walks the whole file; an integer limit stops early.
            for line in itertools.islice(fin, self.limit):
                tokens = utils.to_unicode(line).split()
                for start in range(0, len(tokens), chunk_size):
                    yield tokens[start:start + chunk_size]

In [5]:
# 2. Wrap the text8 corpus in a re-iterable stream of token chunks
#    (nothing is read from disk until the object is iterated).
sentences = CustomLineSentence(source='data/text8')

In [None]:
class CustomWord2Vec:
    """Work-in-progress re-implementation of gensim's ``Word2Vec``.

    Only the vocabulary scan exists so far: constructing the object with a
    corpus counts word frequencies into ``raw_vocab`` and records the corpus
    size in ``corpus_count`` / ``corpus_total_words``. ``train`` is still a
    placeholder.
    """

    def __init__(self, sentences=None, vector_size=100, window=5, min_count=5, workers=3):
        """Store the hyper-parameters and, if a corpus is given, scan its vocabulary.

        Parameters
        ----------
        sentences : iterable of list of str, optional
            Corpus to scan. When ``None`` no scan happens and ``build_vocab``
            can be called later.
        vector_size, window, min_count, workers : int
            Stored unchanged on the instance; not used by the vocabulary scan.
        """
        self.sentences = sentences
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers

        # Fixed values taken from the Word2Vec parameters; not configurable here yet.
        self.shrink_windows = True
        self.compute_loss = False
        self.sorted_vocab = 1
        self.null_word = 0
        self.epochs = 5
        self.hashfxn = hash
        self.cbow_mean = 1
        self.ns_exponent = 0.75
        self.negative = 5
        self.hs = 0
        self.sg = 0
        self.min_alpha = 0.0001
        self.seed = 1
        self.sample = 0.001
        self.alpha = 0.025

        # Bookkeeping, filled in by build_vocab.
        self.train_count = 0
        self.total_train_time = 0
        self.running_training_loss = 0
        self.corpus_count = 0
        self.corpus_total_words = 0
        self.raw_vocab = None

        # Scanning requires a corpus; without one, leave the model empty
        # instead of failing on iteration over None.
        if sentences is not None:
            self.build_vocab(corpus_iterable=sentences, corpus_file=None, trim_rule=None)

    def scan_vocab(self, sentences=None, corpus_file=None):
        """Count word frequencies over the corpus and store them in ``raw_vocab``.

        Parameters
        ----------
        sentences : iterable of list of str, optional
            Corpus to scan.
        corpus_file : str, optional
            Path to a corpus file; when truthy it takes precedence over
            ``sentences`` and is read through ``CustomLineSentence``.

        Returns
        -------
        (int, int)
            Total number of words seen and number of sentences seen.

        Raises
        ------
        TypeError
            If neither ``sentences`` nor ``corpus_file`` is provided.
        """
        if corpus_file:
            sentences = CustomLineSentence(corpus_file)  # potentially this could be removed - TODO: test this later
        if sentences is None:
            raise TypeError("Either one of corpus_file or sentences must be provided")

        vocab = defaultdict(int)
        total_words = 0
        corpus_count = 0
        for sentence in sentences:
            for word in sentence:
                vocab[word] += 1
            total_words += len(sentence)
            corpus_count += 1

        self.raw_vocab = vocab
        return total_words, corpus_count

    def build_vocab(self, corpus_iterable, corpus_file=None, trim_rule=None, progress_per=10000):
        """Scan the corpus, record its size on the instance and print a summary.

        ``trim_rule`` and ``progress_per`` are accepted for signature
        compatibility but are not used yet.
        """
        total_words, corpus_count = self.scan_vocab(sentences=corpus_iterable, corpus_file=corpus_file)
        # Keep the totals on the instance; they were initialised to 0 in __init__.
        self.corpus_count = corpus_count
        self.corpus_total_words = total_words
        print(f"Total words: {total_words}, Corpus count: {corpus_count}")

    def train(self):
        """Placeholder: training is not implemented yet, so this does nothing."""
          


In [20]:
# Instantiate the hand-written model; its constructor scans the corpus
# vocabulary and prints the word and sentence totals.
model = CustomWord2Vec(sentences=sentences, vector_size=VECTOR_SIZE, window=WINDOW,
                       min_count=MIN_COUNT, workers=WORKERS)

Total words: 17005207, Corpus count: 1701


In [7]:
# 3. Train Word2Vec
model = Word2Vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS
)

# 4. Save the trained model
# Create the target directory first: on a fresh checkout it may not exist, and
# a failed save would throw away the model that was just trained.
MODEL_PATH = 'models/text8_w2v_100d.model'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)

# 5. Quick sanity check
print(model.wv.most_similar('king', topn=10))

[('queen', 0.7531704306602478), ('prince', 0.7207016944885254), ('emperor', 0.7185539603233337), ('throne', 0.700005292892456), ('vii', 0.675726592540741), ('aragon', 0.6742899417877197), ('kings', 0.6729216575622559), ('viii', 0.6662270426750183), ('pope', 0.6596798300743103), ('judah', 0.654106855392456)]
