# Word2Vec in Python 

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import logging
from pathlib import Path

from gensim.models.word2vec import Word2Vec

from utils import *
# Show INFO-level log records (timestamp : level : message) in the notebook;
# this is what makes the vocabulary-scan and per-epoch progress lines appear below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Data Parameters

In [8]:
# Year bounds of the corpus slice; handed to SentIterator as `date_range`
# and embedded in the output file name.
# NOTE(review): whether END_YEAR is inclusive depends on SentIterator in utils - confirm.
START_YEAR, END_YEAR = 1805, 1809

# Corpus root directory (input) and path of the trained model file (output).
ROOT = "../../data/ResearchDrive/"
OUTPUT = f"../../models/{START_YEAR}-{END_YEAR}.w2v.model"

## Model Hyperparameters

In [9]:
# Values forwarded to the Word2Vec constructor in the training cell
# (meanings as given in the gensim Word2Vec documentation).
SIZE = 100  # passed as `size`: dimensionality of the word vectors
WINDOW = 5  # passed as `window`: maximum distance between the current and predicted word
MIN_COUNT = 10  # passed as `min_count`: words with a lower total frequency are ignored
# Add further Word2Vec hyperparameters here if necessary (and pass them to the constructor).

## Training a model

In [10]:
# Sentence source for the selected years. It is consumed several times below
# (one vocabulary pass plus every training epoch), so it must be a re-iterable
# object rather than a one-shot generator.
# NOTE(review): SentIterator presumably comes from the wildcard `utils` import - confirm.
sentences = SentIterator(ROOT, date_range=(START_YEAR, END_YEAR))

# Create the model without a corpus so that vocabulary building and training
# are explicit, separate steps.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it to `vector_size`.
model = Word2Vec(
    size=SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
)

# First pass over the corpus: count words and fix the vocabulary.
model.build_vocab(sentences=sentences)

# Number of sentences seen during the vocabulary pass; train() needs it up front.
total_examples = model.corpus_count

# Last expression of the cell, so its return value is displayed:
# an (effective words, raw words) tuple, as the final log line also reports.
model.train(
    sentences=sentences,
    total_examples=total_examples,
    epochs=5,
)

2019-11-07 17:16:08,735 : INFO : collecting all words and their counts
2019-11-07 17:16:09,405 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-07 17:16:10,412 : INFO : PROGRESS: at sentence #10000, processed 187706 words, keeping 41153 word types
2019-11-07 17:16:11,561 : INFO : PROGRESS: at sentence #20000, processed 400253 words, keeping 71531 word types
2019-11-07 17:16:12,626 : INFO : PROGRESS: at sentence #30000, processed 617115 words, keeping 99932 word types
2019-11-07 17:16:13,757 : INFO : PROGRESS: at sentence #40000, processed 818297 words, keeping 129718 word types
2019-11-07 17:16:14,975 : INFO : PROGRESS: at sentence #50000, processed 1023624 words, keeping 152928 word types
2019-11-07 17:16:16,164 : INFO : PROGRESS: at sentence #60000, processed 1231788 words, keeping 178708 word types
2019-11-07 17:16:17,446 : INFO : PROGRESS: at sentence #70000, processed 1437708 words, keeping 201671 word types
2019-11-07 17:16:18,628 : INFO : PROGRE

2019-11-07 17:17:24,534 : INFO : PROGRESS: at sentence #710000, processed 13244641 words, keeping 1213341 word types
2019-11-07 17:17:26,291 : INFO : PROGRESS: at sentence #720000, processed 13439102 words, keeping 1227841 word types
2019-11-07 17:17:28,040 : INFO : PROGRESS: at sentence #730000, processed 13619900 words, keeping 1242306 word types
2019-11-07 17:17:29,387 : INFO : PROGRESS: at sentence #740000, processed 13824742 words, keeping 1256772 word types
2019-11-07 17:17:30,446 : INFO : PROGRESS: at sentence #750000, processed 13994324 words, keeping 1271786 word types
2019-11-07 17:17:31,639 : INFO : PROGRESS: at sentence #760000, processed 14201197 words, keeping 1287657 word types
2019-11-07 17:17:32,737 : INFO : PROGRESS: at sentence #770000, processed 14376974 words, keeping 1305691 word types
2019-11-07 17:17:33,981 : INFO : PROGRESS: at sentence #780000, processed 14582838 words, keeping 1322388 word types
2019-11-07 17:17:34,987 : INFO : PROGRESS: at sentence #790000, 

2019-11-07 17:18:32,304 : INFO : EPOCH 1 - PROGRESS: at 2.97% examples, 88845 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:33,335 : INFO : EPOCH 1 - PROGRESS: at 3.69% examples, 90999 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:34,337 : INFO : EPOCH 1 - PROGRESS: at 4.30% examples, 92115 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:35,346 : INFO : EPOCH 1 - PROGRESS: at 4.99% examples, 93485 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:36,388 : INFO : EPOCH 1 - PROGRESS: at 5.69% examples, 94195 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:37,403 : INFO : EPOCH 1 - PROGRESS: at 6.43% examples, 94856 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:38,451 : INFO : EPOCH 1 - PROGRESS: at 7.09% examples, 95381 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:39,484 : INFO : EPOCH 1 - PROGRESS: at 7.78% examples, 95163 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:18:40,486 : INFO : EPOCH 1 - PROGRESS: at 8.45% examples, 95353 words/s, in_qsize 0, out_qsize 0
2

2019-11-07 17:19:48,373 : INFO : EPOCH 1 - PROGRESS: at 58.10% examples, 100066 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:49,373 : INFO : EPOCH 1 - PROGRESS: at 58.82% examples, 100110 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:50,409 : INFO : EPOCH 1 - PROGRESS: at 59.53% examples, 100082 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:51,456 : INFO : EPOCH 1 - PROGRESS: at 60.27% examples, 99996 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:52,537 : INFO : EPOCH 1 - PROGRESS: at 60.90% examples, 99866 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:53,547 : INFO : EPOCH 1 - PROGRESS: at 61.70% examples, 99681 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:54,583 : INFO : EPOCH 1 - PROGRESS: at 62.50% examples, 99683 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:55,616 : INFO : EPOCH 1 - PROGRESS: at 63.23% examples, 99604 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:19:56,653 : INFO : EPOCH 1 - PROGRESS: at 64.04% examples, 99617 words/s, in_qsize 0, o

2019-11-07 17:21:01,101 : INFO : EPOCH 2 - PROGRESS: at 10.79% examples, 98285 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:02,120 : INFO : EPOCH 2 - PROGRESS: at 11.57% examples, 98754 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:03,133 : INFO : EPOCH 2 - PROGRESS: at 12.22% examples, 99103 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:04,166 : INFO : EPOCH 2 - PROGRESS: at 12.93% examples, 99197 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:05,195 : INFO : EPOCH 2 - PROGRESS: at 13.65% examples, 99093 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:06,213 : INFO : EPOCH 2 - PROGRESS: at 14.36% examples, 98977 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:07,264 : INFO : EPOCH 2 - PROGRESS: at 15.07% examples, 99407 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:08,287 : INFO : EPOCH 2 - PROGRESS: at 15.84% examples, 98968 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:21:09,317 : INFO : EPOCH 2 - PROGRESS: at 16.59% examples, 99014 words/s, in_qsize 0, out_

2019-11-07 17:22:16,370 : INFO : EPOCH 2 - PROGRESS: at 65.50% examples, 99300 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:17,371 : INFO : EPOCH 2 - PROGRESS: at 66.22% examples, 99430 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:18,417 : INFO : EPOCH 2 - PROGRESS: at 67.00% examples, 99419 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:19,422 : INFO : EPOCH 2 - PROGRESS: at 67.82% examples, 99395 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:20,481 : INFO : EPOCH 2 - PROGRESS: at 68.58% examples, 99442 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:21,513 : INFO : EPOCH 2 - PROGRESS: at 69.51% examples, 99379 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:22,539 : INFO : EPOCH 2 - PROGRESS: at 70.31% examples, 99473 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:23,607 : INFO : EPOCH 2 - PROGRESS: at 71.10% examples, 99438 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:22:24,609 : INFO : EPOCH 2 - PROGRESS: at 71.89% examples, 99487 words/s, in_qsize 0, out_

2019-11-07 17:23:28,837 : INFO : EPOCH 3 - PROGRESS: at 18.80% examples, 103478 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:29,867 : INFO : EPOCH 3 - PROGRESS: at 19.72% examples, 103299 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:30,899 : INFO : EPOCH 3 - PROGRESS: at 20.48% examples, 103177 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:31,928 : INFO : EPOCH 3 - PROGRESS: at 21.38% examples, 103394 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:32,980 : INFO : EPOCH 3 - PROGRESS: at 22.16% examples, 103570 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:33,986 : INFO : EPOCH 3 - PROGRESS: at 22.97% examples, 103222 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:35,017 : INFO : EPOCH 3 - PROGRESS: at 23.73% examples, 103466 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:36,057 : INFO : EPOCH 3 - PROGRESS: at 24.53% examples, 103413 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:23:37,117 : INFO : EPOCH 3 - PROGRESS: at 25.40% examples, 103690 words/s, in_qsiz

2019-11-07 17:24:44,259 : INFO : EPOCH 3 - PROGRESS: at 76.24% examples, 103056 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:45,292 : INFO : EPOCH 3 - PROGRESS: at 77.03% examples, 102993 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:46,317 : INFO : EPOCH 3 - PROGRESS: at 77.75% examples, 103052 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:47,334 : INFO : EPOCH 3 - PROGRESS: at 78.53% examples, 103025 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:48,375 : INFO : EPOCH 3 - PROGRESS: at 79.27% examples, 103059 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:49,379 : INFO : EPOCH 3 - PROGRESS: at 79.99% examples, 103067 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:50,430 : INFO : EPOCH 3 - PROGRESS: at 80.75% examples, 103027 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:51,431 : INFO : EPOCH 3 - PROGRESS: at 81.48% examples, 103017 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:24:52,446 : INFO : EPOCH 3 - PROGRESS: at 82.24% examples, 103082 words/s, in_qsiz

2019-11-07 17:25:56,133 : INFO : EPOCH 4 - PROGRESS: at 30.71% examples, 106226 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:25:57,182 : INFO : EPOCH 4 - PROGRESS: at 31.58% examples, 106298 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:25:58,233 : INFO : EPOCH 4 - PROGRESS: at 32.40% examples, 106148 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:25:59,279 : INFO : EPOCH 4 - PROGRESS: at 33.27% examples, 106198 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:26:00,281 : INFO : EPOCH 4 - PROGRESS: at 34.03% examples, 106339 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:26:01,337 : INFO : EPOCH 4 - PROGRESS: at 34.89% examples, 106531 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:26:02,350 : INFO : EPOCH 4 - PROGRESS: at 35.67% examples, 106585 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:26:03,387 : INFO : EPOCH 4 - PROGRESS: at 36.55% examples, 106758 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:26:04,387 : INFO : EPOCH 4 - PROGRESS: at 37.38% examples, 106853 words/s, in_qsiz

2019-11-07 17:27:11,288 : INFO : EPOCH 4 - PROGRESS: at 87.95% examples, 104328 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:12,316 : INFO : EPOCH 4 - PROGRESS: at 88.79% examples, 104331 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:13,347 : INFO : EPOCH 4 - PROGRESS: at 89.55% examples, 104330 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:14,375 : INFO : EPOCH 4 - PROGRESS: at 90.50% examples, 104285 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:15,391 : INFO : EPOCH 4 - PROGRESS: at 91.22% examples, 104301 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:16,421 : INFO : EPOCH 4 - PROGRESS: at 92.13% examples, 104251 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:17,423 : INFO : EPOCH 4 - PROGRESS: at 92.89% examples, 104295 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:18,481 : INFO : EPOCH 4 - PROGRESS: at 93.80% examples, 104194 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:27:19,482 : INFO : EPOCH 4 - PROGRESS: at 94.60% examples, 104237 words/s, in_qsiz

2019-11-07 17:28:24,111 : INFO : EPOCH 5 - PROGRESS: at 40.84% examples, 99720 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:25,135 : INFO : EPOCH 5 - PROGRESS: at 41.70% examples, 99683 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:26,151 : INFO : EPOCH 5 - PROGRESS: at 42.45% examples, 99879 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:27,175 : INFO : EPOCH 5 - PROGRESS: at 43.25% examples, 99816 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:28,205 : INFO : EPOCH 5 - PROGRESS: at 44.00% examples, 100061 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:29,209 : INFO : EPOCH 5 - PROGRESS: at 44.79% examples, 99917 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:30,221 : INFO : EPOCH 5 - PROGRESS: at 45.56% examples, 100092 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:31,226 : INFO : EPOCH 5 - PROGRESS: at 46.37% examples, 99982 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:28:32,256 : INFO : EPOCH 5 - PROGRESS: at 47.14% examples, 100223 words/s, in_qsize 0, o

2019-11-07 17:29:39,136 : INFO : EPOCH 5 - PROGRESS: at 99.30% examples, 101940 words/s, in_qsize 0, out_qsize 0
2019-11-07 17:29:40,003 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-07 17:29:40,004 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-07 17:29:40,010 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-07 17:29:40,011 : INFO : EPOCH - 5 : training on 23500823 raw words (13602954 effective words) took 133.5s, 101889 effective words/s
2019-11-07 17:29:40,012 : INFO : training on a 117504115 raw words (68013079 effective words) took 672.8s, 101084 effective words/s


(68013079, 117504115)

## Save model

In [11]:
# Create the target directory if it is missing: save() writes straight to OUTPUT
# and would otherwise fail (e.g. on a fresh checkout without ../../models/),
# and that failure would only surface after the long training run.
Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)

# Persist the full model (vectors plus training state) so it can be reloaded later.
model.save(OUTPUT)

2019-11-07 18:19:06,551 : INFO : saving Word2Vec object under ../../models/1805-1809.w2v.model, separately None
2019-11-07 18:19:06,552 : INFO : not storing attribute vectors_norm
2019-11-07 18:19:06,552 : INFO : not storing attribute cum_table
2019-11-07 18:19:07,067 : INFO : saved ../../models/1805-1809.w2v.model


## Load model

In [12]:
# Read the model back from OUTPUT to check that the saved file is usable.
# This rebinds `model` to the loaded instance.
# NOTE(review): gensim's load is pickle-based - only load model files you trust.
model = Word2Vec.load(OUTPUT)
# print() shows the model's short summary (vocabulary size, vector size, alpha).
print(model)

2019-11-07 18:19:10,364 : INFO : loading Word2Vec object from ../../models/1805-1809.w2v.model
2019-11-07 18:19:10,760 : INFO : loading wv recursively from ../../models/1805-1809.w2v.model.wv.* with mmap=None
2019-11-07 18:19:10,761 : INFO : setting ignored attribute vectors_norm to None
2019-11-07 18:19:10,762 : INFO : loading vocabulary recursively from ../../models/1805-1809.w2v.model.vocabulary.* with mmap=None
2019-11-07 18:19:10,762 : INFO : loading trainables recursively from ../../models/1805-1809.w2v.model.trainables.* with mmap=None
2019-11-07 18:19:10,763 : INFO : setting ignored attribute cum_table to None
2019-11-07 18:19:10,763 : INFO : loaded ../../models/1805-1809.w2v.model


Word2Vec(vocab=61546, size=100, alpha=0.025)


## Query model

In [13]:
# Word whose nearest neighbours we want to inspect ('vrouw' is Dutch for 'woman').
query = 'vrouw'

# Look the word up through the KeyedVectors in `model.wv`: calling most_similar
# directly on the model is deprecated in gensim 3.x and removed in 4.0, whereas
# `model.wv.most_similar` works on both and returns the same result.
# Yields (word, cosine similarity) pairs, ten by default, most similar first.
# Raises KeyError if `query` is not in the trained vocabulary.
model.wv.most_similar(query)

  
2019-11-07 18:19:15,099 : INFO : precomputing L2-norms of word weight vectors


[('moeder', 0.6777811646461487),
 ('jufvrouw', 0.6739053130149841),
 ('dochter', 0.6621812582015991),
 ('echtgenote', 0.6530464887619019),
 ('weduwe', 0.6498156189918518),
 ('jonge', 0.6464175581932068),
 ('alida', 0.6319893598556519),
 ('johanna', 0.6296401619911194),
 ('stuurman', 0.6256107091903687),
 ('dame', 0.6101760268211365)]