# Word2Vec in Python 

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import logging
import os

from gensim.models.word2vec import Word2Vec

from utils import *
# Send INFO-and-above log records to the notebook output, each stamped with
# time and level, so the library's progress messages are visible while cells run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Data Parameters

In [8]:
# Year window passed to SentIterator as `date_range`; also embedded in the
# model filename below.
# NOTE(review): whether END_YEAR is inclusive depends on SentIterator — confirm in utils.
START_YEAR, END_YEAR = 1805, 1809

# First argument to SentIterator (presumably the corpus root directory — verify in utils).
ROOT = "../../data/ResearchDrive/"

# Path the trained model is saved to and later re-loaded from.
OUTPUT = f"../../models/{START_YEAR}-{END_YEAR}.w2v.model"

## Model Hyperparameters

In [9]:
# Values forwarded to the Word2Vec constructor in the training cell.
SIZE = 100  # passed as `size`: dimensionality of the learned word vectors
WINDOW = 5  # passed as `window`: maximum distance between a word and its context words
MIN_COUNT = 10  # passed as `min_count`: words with a lower total frequency are ignored

## Train model

In [None]:
# Corpus reader from utils, restricted to the configured year range.
# NOTE(review): it is consumed once by build_vocab and again on every training
# epoch, so it must be a re-iterable object rather than a one-shot generator —
# confirm against the SentIterator implementation.
sentences = SentIterator(ROOT, date_range=(START_YEAR, END_YEAR))

# Create an untrained model; no corpus is passed here, so vocabulary building
# and training are done explicitly in the steps below.
model = Word2Vec(size=SIZE, window=WINDOW, min_count=MIN_COUNT)

# First pass over the corpus: count words and build the vocabulary.
model.build_vocab(sentences=sentences)

# build_vocab records the number of sentences it saw; train() uses this count
# for progress logging and learning-rate decay.
total_examples = model.corpus_count

# Use the epoch count stored on the model instead of repeating a literal, so
# training stays in sync if the model is ever constructed with a different
# number of epochs (the default is 5, matching the previous hard-coded value).
model.train(sentences=sentences, total_examples=total_examples, epochs=model.epochs)

2019-11-07 17:16:08,735 : INFO : collecting all words and their counts
2019-11-07 17:16:09,405 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-07 17:16:10,412 : INFO : PROGRESS: at sentence #10000, processed 187706 words, keeping 41153 word types
2019-11-07 17:16:11,561 : INFO : PROGRESS: at sentence #20000, processed 400253 words, keeping 71531 word types
2019-11-07 17:16:12,626 : INFO : PROGRESS: at sentence #30000, processed 617115 words, keeping 99932 word types
2019-11-07 17:16:13,757 : INFO : PROGRESS: at sentence #40000, processed 818297 words, keeping 129718 word types
2019-11-07 17:16:14,975 : INFO : PROGRESS: at sentence #50000, processed 1023624 words, keeping 152928 word types
2019-11-07 17:16:16,164 : INFO : PROGRESS: at sentence #60000, processed 1231788 words, keeping 178708 word types
2019-11-07 17:16:17,446 : INFO : PROGRESS: at sentence #70000, processed 1437708 words, keeping 201671 word types
2019-11-07 17:16:18,628 : INFO : PROGRE

2019-11-07 17:17:24,534 : INFO : PROGRESS: at sentence #710000, processed 13244641 words, keeping 1213341 word types
2019-11-07 17:17:26,291 : INFO : PROGRESS: at sentence #720000, processed 13439102 words, keeping 1227841 word types
2019-11-07 17:17:28,040 : INFO : PROGRESS: at sentence #730000, processed 13619900 words, keeping 1242306 word types
2019-11-07 17:17:29,387 : INFO : PROGRESS: at sentence #740000, processed 13824742 words, keeping 1256772 word types
2019-11-07 17:17:30,446 : INFO : PROGRESS: at sentence #750000, processed 13994324 words, keeping 1271786 word types
2019-11-07 17:17:31,639 : INFO : PROGRESS: at sentence #760000, processed 14201197 words, keeping 1287657 word types
2019-11-07 17:17:32,737 : INFO : PROGRESS: at sentence #770000, processed 14376974 words, keeping 1305691 word types
2019-11-07 17:17:33,981 : INFO : PROGRESS: at sentence #780000, processed 14582838 words, keeping 1322388 word types
2019-11-07 17:17:34,987 : INFO : PROGRESS: at sentence #790000, 

## Save model

In [5]:
# Create the target directory if it is missing (e.g. in a fresh checkout), so a
# long training run is not lost to a missing-directory error at save time.
# `or "."` covers an OUTPUT that has no directory component.
os.makedirs(os.path.dirname(OUTPUT) or ".", exist_ok=True)

# Persist the trained model to OUTPUT.
model.save(OUTPUT)

2019-11-07 17:14:39,290 : INFO : saving Word2Vec object under ../../models/1805-1809.w2v.model, separately None
2019-11-07 17:14:39,291 : INFO : not storing attribute vectors_norm
2019-11-07 17:14:39,292 : INFO : not storing attribute cum_table
2019-11-07 17:14:39,844 : INFO : saved ../../models/1805-1809.w2v.model


## Load model

In [6]:
# Re-load the model from OUTPUT; this rebinds `model`, so the query cell below
# works on the copy read from disk.
# NOTE(review): gensim's load is pickle-based — only load model files you trust.
model = Word2Vec.load(OUTPUT)
# Print a one-line summary of the loaded model.
print(model)

2019-11-07 17:14:42,039 : INFO : loading Word2Vec object from ../../models/1805-1809.w2v.model
2019-11-07 17:14:42,467 : INFO : loading wv recursively from ../../models/1805-1809.w2v.model.wv.* with mmap=None
2019-11-07 17:14:42,467 : INFO : setting ignored attribute vectors_norm to None
2019-11-07 17:14:42,468 : INFO : loading vocabulary recursively from ../../models/1805-1809.w2v.model.vocabulary.* with mmap=None
2019-11-07 17:14:42,468 : INFO : loading trainables recursively from ../../models/1805-1809.w2v.model.trainables.* with mmap=None
2019-11-07 17:14:42,469 : INFO : setting ignored attribute cum_table to None
2019-11-07 17:14:42,470 : INFO : loaded ../../models/1805-1809.w2v.model


Word2Vec(vocab=66631, size=100, alpha=0.025)


## Query model

In [7]:
# Word whose nearest neighbours in the embedding space we want to inspect.
query = 'vrouw'

# Query through the model's KeyedVectors (`model.wv`): calling most_similar
# directly on the Word2Vec object is deprecated in gensim 3.x and removed in 4.0.
# Returns a list of (word, cosine similarity) pairs, best match first.
model.wv.most_similar(query)

  
2019-11-07 17:14:45,454 : INFO : precomputing L2-norms of word weight vectors


[('moeder', 0.7732312679290771),
 ('Vrouw', 0.7363426685333252),
 ('Moeder', 0.7242480516433716),
 ('Vronw', 0.7235069274902344),
 ('Dame', 0.7200417518615723),
 ('Schuur', 0.6727055907249451),
 ('dochter', 0.6695880889892578),
 ('bevallen', 0.6690483093261719),
 ('Bom', 0.6606713533401489),
 ('hagelbui', 0.6589257717132568)]