In [77]:
# Imports
from time import time
from os.path import join as join_path
import numpy as np
import pandas as pd

import multiprocessing
cores = multiprocessing.cpu_count()

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import TaggedDocument
import logging # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import nltk
nltk.download('punkt')

from utils import clean_text
from tqdm.notebook import tqdm

from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load and prepare data

In [441]:
# Notebook-wide configuration

# Checkpoint naming and output directories for the two embedding models
saved_models_prefix = 'model'
d2v_saved_models_dir = 'models-doc2vec'
w2v_saved_models_dir = 'models-word2vec'

# File name of the per-epoch Word2Vec training log
train_logs_path = 'train_logs.txt'

# Location of the CORD-19 input table
cord_data_dir = 'data'
cord_data_path = join_path(cord_data_dir, 'cord-19-data.csv')

In [3]:
# Read the CORD-19 table and keep only the rows whose language is English
cord_data = pd.read_csv(cord_data_path)
cord_data_eng = cord_data.loc[cord_data['language'] == 'en']

# One row per document: column 0 is the cord_uid, column 1 the body text
eng_texts = cord_data_eng.loc[:, ['cord_uid', 'body_text']].values

In [4]:
# Total sentence count over all English documents; later cells use it as the
# progress-bar total and to derive the logging interval.
cord_num_sentences = sum(
    len(nltk.tokenize.sent_tokenize(body_text))
    for _, body_text in tqdm(eng_texts)
)
print(f'Total number of CORD-19 sentences: {cord_num_sentences}')

HBox(children=(IntProgress(value=0, max=32789), HTML(value='')))


Total number of CORD-19 sentences: 6522945


In [422]:
class CORDDataIteratorWord2Vec():
    '''Re-iterable stream of tokenised sentences for Word2Vec.

    Every call to `__iter__` walks the documents again, splits each one into
    sentences with NLTK, cleans each sentence with `clean_text` and yields it
    as a list of whitespace-separated tokens.

    Args:
        texts: Either a 1-D array of document texts, or an array of
            `(cord_uid, body_text)` rows such as `eng_texts`; for rows, the
            last element is taken as the text.
    '''
    def __init__(self, texts: np.ndarray):
        self.texts = texts

    def __iter__(self):
        for row in self.texts:
            # A bare string is the document itself; anything else is treated as a
            # (cord_uid, body_text) row, matching how the Doc2Vec iterator unpacks rows.
            text = row if isinstance(row, str) else row[-1]
            for sentence in nltk.tokenize.sent_tokenize(text):
                yield clean_text(sentence).split()

In [423]:
class CORDDataIteratorDoc2Vec():
    '''Re-iterable stream of tagged sentences for Doc2Vec.

    Every call to `__iter__` walks the documents again and yields one
    `TaggedDocument` per sentence. The words are the cleaned sentence split on
    whitespace, and the single tag is the cord_uid of the document the
    sentence came from, so all sentences of a document share one tag.

    Args:
        texts: Array of `(cord_uid, body_text)` rows.
    '''
    def __init__(self, texts: np.ndarray):
        self.texts = texts

    def __iter__(self):
        for cord_uid, cord_text in self.texts:
            for sentence in nltk.tokenize.sent_tokenize(cord_text):
                # TaggedDocument expects a list of tokens; handing it the raw
                # string would make every character count as a separate word.
                tokens = clean_text(sentence).split()
                yield TaggedDocument(tokens, [cord_uid])

In [424]:
# Re-iterable sentence stream over the English documents; the build_vocab() and
# train() calls below each iterate over it from the start.
cord_sentences = CORDDataIteratorDoc2Vec(eng_texts)

## Learn word embeddings using Doc2Vec

In [432]:
class DocEpochSaver(CallbackAny2Vec):
    '''Checkpoint callback: writes the model to disk whenever an epoch ends.

    Checkpoints are named `<prefix>_epoch_<n>.model` inside `output_dir`, where
    `n` begins at `start_epoch` and is incremented after every save.
    '''

    def __init__(self, output_dir: str, prefix: str, start_epoch: int = 1):
        self.epoch = start_epoch
        self.prefix = prefix
        self.output_dir = output_dir

    def on_epoch_end(self, model):
        # Build the file name first, then place it inside the output directory
        checkpoint_name = f'{self.prefix}_epoch_{self.epoch}.model'
        model.save(join_path(self.output_dir, checkpoint_name))
        self.epoch += 1

In [426]:
# Setup initial model
d2v_model = Doc2Vec(
    min_count=20,
    window=2,
    vector_size=300,
    negative=20,
    workers=cores-1,
    callbacks=[DocEpochSaver(d2v_saved_models_dir, saved_models_prefix)]
)
# Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [427]:
# Build vocabulary
t = time()
d2v_model.build_vocab(tqdm(cord_sentences, total=cord_num_sentences), progress_per=int(cord_num_sentences / 100))
print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

HBox(children=(IntProgress(value=0, max=6522945), HTML(value='')))

INFO - 19:06:49: collecting all words and their counts
INFO - 19:06:49: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 19:07:05: PROGRESS: at example #65229, processed 826691 words (53247/s), 50857 word types, 390 tags
INFO - 19:07:21: PROGRESS: at example #130458, processed 1692734 words (53884/s), 80232 word types, 792 tags
INFO - 19:07:36: PROGRESS: at example #195687, processed 2505331 words (52271/s), 102579 word types, 1078 tags
INFO - 19:07:53: PROGRESS: at example #260916, processed 3357970 words (52272/s), 127475 word types, 1242 tags
INFO - 19:08:08: PROGRESS: at example #326145, processed 4161317 words (53732/s), 142573 word types, 1366 tags
INFO - 19:08:24: PROGRESS: at example #391374, processed 5068645 words (54286/s), 163973 word types, 1546 tags
INFO - 19:08:40: PROGRESS: at example #456603, processed 5944613 words (55987/s), 178580 word types, 1635 tags
INFO - 19:08:56: PROGRESS: at example #521832, processed 6840406 words (54734/s), 1933

INFO - 19:27:11: PROGRESS: at example #4761717, processed 65590632 words (55566/s), 1105178 word types, 22569 tags
INFO - 19:27:27: PROGRESS: at example #4826946, processed 66530588 words (55359/s), 1121266 word types, 22890 tags
INFO - 19:27:44: PROGRESS: at example #4892175, processed 67481144 words (56033/s), 1135785 word types, 23259 tags
INFO - 19:28:02: PROGRESS: at example #4957404, processed 68470774 words (55967/s), 1151645 word types, 23625 tags
INFO - 19:28:20: PROGRESS: at example #5022633, processed 69437622 words (55488/s), 1166696 word types, 24006 tags
INFO - 19:28:37: PROGRESS: at example #5087862, processed 70380745 words (54919/s), 1181805 word types, 24353 tags
INFO - 19:28:54: PROGRESS: at example #5153091, processed 71301301 words (54109/s), 1197449 word types, 24706 tags
INFO - 19:29:10: PROGRESS: at example #5218320, processed 72191092 words (54216/s), 1212786 word types, 25144 tags
INFO - 19:29:26: PROGRESS: at example #5283549, processed 73015062 words (51031/




INFO - 19:34:38: effective_min_count=20 retains 94364 unique words (6% of original 1407864, drops 1313500)
INFO - 19:34:38: effective_min_count=20 leaves 86935046 word corpus (96% of original 90067643, drops 3132597)
INFO - 19:34:38: deleting the raw counts dictionary of 1407864 items
INFO - 19:34:38: sample=0.001 downsamples 17 most-common words
INFO - 19:34:38: downsampling leaves estimated 84170500 word corpus (96.8% of prior 86935046)
INFO - 19:34:38: estimated required memory for 94364 words and 300 dimensions: 319560200 bytes
INFO - 19:34:38: resetting layer weights


Time to build vocab: 28.2 mins


In [449]:
# Resume from the checkpoint written after epoch 2. The file holds a Doc2Vec
# model, so load it through Doc2Vec.load (not Word2Vec.load), and build the
# path from the shared constants instead of repeating the literal.
d2v_model = Doc2Vec.load(join_path(d2v_saved_models_dir, f'{saved_models_prefix}_epoch_2.model'))

INFO - 22:03:40: loading Word2Vec object from models-doc2vec/model_epoch_2.model
INFO - 22:03:41: loading vocabulary recursively from models-doc2vec/model_epoch_2.model.vocabulary.* with mmap=None
INFO - 22:03:41: loading trainables recursively from models-doc2vec/model_epoch_2.model.trainables.* with mmap=None
INFO - 22:03:41: loading syn1neg from models-doc2vec/model_epoch_2.model.trainables.syn1neg.npy with mmap=None
INFO - 22:03:41: loading wv recursively from models-doc2vec/model_epoch_2.model.wv.* with mmap=None
INFO - 22:03:41: loading vectors from models-doc2vec/model_epoch_2.model.wv.vectors.npy with mmap=None
INFO - 22:03:41: loading docvecs recursively from models-doc2vec/model_epoch_2.model.docvecs.* with mmap=None
INFO - 22:03:41: loaded models-doc2vec/model_epoch_2.model


In [451]:
# Train model
t = time()
d2v_model.train(cord_sentences, total_examples=d2v_model.corpus_count, epochs=8, report_delay=30, callbacks=[DocEpochSaver(d2v_saved_models_dir, saved_models_prefix, 3)])
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 22:05:05: training model with 7 workers on 94364 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=20 window=2
INFO - 22:05:06: EPOCH 1 - PROGRESS: at 0.04% examples, 26323 words/s, in_qsize 0, out_qsize 0
INFO - 22:05:36: EPOCH 1 - PROGRESS: at 1.29% examples, 34342 words/s, in_qsize 0, out_qsize 0
INFO - 22:06:06: EPOCH 1 - PROGRESS: at 2.51% examples, 34391 words/s, in_qsize 0, out_qsize 0
INFO - 22:06:37: EPOCH 1 - PROGRESS: at 3.21% examples, 29237 words/s, in_qsize 13, out_qsize 0
INFO - 22:07:07: EPOCH 1 - PROGRESS: at 3.98% examples, 27442 words/s, in_qsize 13, out_qsize 0
INFO - 22:07:37: EPOCH 1 - PROGRESS: at 4.86% examples, 26722 words/s, in_qsize 13, out_qsize 0
INFO - 22:08:08: EPOCH 1 - PROGRESS: at 5.66% examples, 26279 words/s, in_qsize 13, out_qsize 0
INFO - 22:08:40: EPOCH 1 - PROGRESS: at 6.52% examples, 25924 words/s, in_qsize 13, out_qsize 0
INFO - 22:09:10: EPOCH 1 - PROGRESS: at 7.32% examples, 25686 words/s, in_qsize 13, out_qsize 0
INFO

INFO - 22:47:03: EPOCH 1 - PROGRESS: at 66.45% examples, 23806 words/s, in_qsize 13, out_qsize 0
INFO - 22:47:33: EPOCH 1 - PROGRESS: at 67.27% examples, 23816 words/s, in_qsize 13, out_qsize 0
INFO - 22:48:03: EPOCH 1 - PROGRESS: at 68.06% examples, 23830 words/s, in_qsize 13, out_qsize 0
INFO - 22:48:33: EPOCH 1 - PROGRESS: at 68.84% examples, 23837 words/s, in_qsize 13, out_qsize 0
INFO - 22:49:03: EPOCH 1 - PROGRESS: at 69.61% examples, 23840 words/s, in_qsize 13, out_qsize 0
INFO - 22:49:34: EPOCH 1 - PROGRESS: at 70.38% examples, 23844 words/s, in_qsize 13, out_qsize 0
INFO - 22:50:04: EPOCH 1 - PROGRESS: at 71.16% examples, 23846 words/s, in_qsize 14, out_qsize 0
INFO - 22:50:34: EPOCH 1 - PROGRESS: at 71.92% examples, 23849 words/s, in_qsize 13, out_qsize 0
INFO - 22:51:04: EPOCH 1 - PROGRESS: at 72.67% examples, 23840 words/s, in_qsize 13, out_qsize 0
INFO - 22:51:35: EPOCH 1 - PROGRESS: at 73.44% examples, 23843 words/s, in_qsize 13, out_qsize 0
INFO - 22:52:05: EPOCH 1 - PRO

INFO - 23:24:38: EPOCH 2 - PROGRESS: at 28.86% examples, 25155 words/s, in_qsize 13, out_qsize 0
INFO - 23:25:08: EPOCH 2 - PROGRESS: at 29.67% examples, 25146 words/s, in_qsize 13, out_qsize 0
INFO - 23:25:38: EPOCH 2 - PROGRESS: at 30.53% examples, 25138 words/s, in_qsize 13, out_qsize 0
INFO - 23:26:10: EPOCH 2 - PROGRESS: at 31.40% examples, 25149 words/s, in_qsize 13, out_qsize 0
INFO - 23:26:40: EPOCH 2 - PROGRESS: at 32.26% examples, 25158 words/s, in_qsize 13, out_qsize 0
INFO - 23:27:10: EPOCH 2 - PROGRESS: at 33.07% examples, 25167 words/s, in_qsize 13, out_qsize 0
INFO - 23:27:41: EPOCH 2 - PROGRESS: at 33.94% examples, 25147 words/s, in_qsize 13, out_qsize 0
INFO - 23:28:11: EPOCH 2 - PROGRESS: at 34.83% examples, 25142 words/s, in_qsize 14, out_qsize 0
INFO - 23:28:41: EPOCH 2 - PROGRESS: at 35.63% examples, 25081 words/s, in_qsize 14, out_qsize 0
INFO - 23:29:11: EPOCH 2 - PROGRESS: at 36.41% examples, 25048 words/s, in_qsize 13, out_qsize 0
INFO - 23:29:42: EPOCH 2 - PRO

INFO - 00:07:37: EPOCH 2 - PROGRESS: at 97.31% examples, 24606 words/s, in_qsize 12, out_qsize 0
INFO - 00:08:08: EPOCH 2 - PROGRESS: at 98.07% examples, 24598 words/s, in_qsize 14, out_qsize 0
INFO - 00:08:38: EPOCH 2 - PROGRESS: at 98.84% examples, 24591 words/s, in_qsize 13, out_qsize 0
INFO - 00:09:08: EPOCH 2 - PROGRESS: at 99.63% examples, 24585 words/s, in_qsize 13, out_qsize 0
INFO - 00:09:17: worker thread finished; awaiting finish of 6 more threads
INFO - 00:09:17: worker thread finished; awaiting finish of 5 more threads
INFO - 00:09:18: worker thread finished; awaiting finish of 4 more threads
INFO - 00:09:18: worker thread finished; awaiting finish of 3 more threads
INFO - 00:09:18: worker thread finished; awaiting finish of 2 more threads
INFO - 00:09:18: worker thread finished; awaiting finish of 1 more threads
INFO - 00:09:18: worker thread finished; awaiting finish of 0 more threads
INFO - 00:09:18: EPOCH - 2 : training on 90067643 raw words (90693803 effective words) 

INFO - 00:45:12: EPOCH 3 - PROGRESS: at 55.68% examples, 23111 words/s, in_qsize 13, out_qsize 0
INFO - 00:45:42: EPOCH 3 - PROGRESS: at 56.44% examples, 23134 words/s, in_qsize 13, out_qsize 0
INFO - 00:46:13: EPOCH 3 - PROGRESS: at 57.22% examples, 23150 words/s, in_qsize 13, out_qsize 0
INFO - 00:46:43: EPOCH 3 - PROGRESS: at 57.98% examples, 23166 words/s, in_qsize 13, out_qsize 0
INFO - 00:47:14: EPOCH 3 - PROGRESS: at 58.65% examples, 23131 words/s, in_qsize 13, out_qsize 0
INFO - 00:47:45: EPOCH 3 - PROGRESS: at 59.58% examples, 23190 words/s, in_qsize 13, out_qsize 0
INFO - 00:48:15: EPOCH 3 - PROGRESS: at 60.33% examples, 23199 words/s, in_qsize 13, out_qsize 0
INFO - 00:48:45: EPOCH 3 - PROGRESS: at 61.09% examples, 23211 words/s, in_qsize 13, out_qsize 0
INFO - 00:49:15: EPOCH 3 - PROGRESS: at 61.83% examples, 23210 words/s, in_qsize 13, out_qsize 0
INFO - 00:49:45: EPOCH 3 - PROGRESS: at 62.58% examples, 23201 words/s, in_qsize 13, out_qsize 0
INFO - 00:50:15: EPOCH 3 - PRO

INFO - 01:23:00: EPOCH 4 - PROGRESS: at 15.49% examples, 24350 words/s, in_qsize 13, out_qsize 0
INFO - 01:23:31: EPOCH 4 - PROGRESS: at 16.33% examples, 24250 words/s, in_qsize 13, out_qsize 0
INFO - 01:24:01: EPOCH 4 - PROGRESS: at 17.18% examples, 24223 words/s, in_qsize 12, out_qsize 0
INFO - 01:24:31: EPOCH 4 - PROGRESS: at 17.97% examples, 24153 words/s, in_qsize 13, out_qsize 0
INFO - 01:25:01: EPOCH 4 - PROGRESS: at 18.78% examples, 24131 words/s, in_qsize 13, out_qsize 0
INFO - 01:25:32: EPOCH 4 - PROGRESS: at 19.61% examples, 24104 words/s, in_qsize 13, out_qsize 0
INFO - 01:26:02: EPOCH 4 - PROGRESS: at 20.41% examples, 24087 words/s, in_qsize 13, out_qsize 0
INFO - 01:26:32: EPOCH 4 - PROGRESS: at 21.22% examples, 24081 words/s, in_qsize 13, out_qsize 0
INFO - 01:27:02: EPOCH 4 - PROGRESS: at 22.02% examples, 24068 words/s, in_qsize 13, out_qsize 0
INFO - 01:27:33: EPOCH 4 - PROGRESS: at 22.80% examples, 24101 words/s, in_qsize 13, out_qsize 0
INFO - 01:28:03: EPOCH 4 - PRO

INFO - 02:05:57: EPOCH 4 - PROGRESS: at 82.95% examples, 24089 words/s, in_qsize 12, out_qsize 0
INFO - 02:06:27: EPOCH 4 - PROGRESS: at 83.72% examples, 24094 words/s, in_qsize 13, out_qsize 0
INFO - 02:06:57: EPOCH 4 - PROGRESS: at 84.52% examples, 24096 words/s, in_qsize 13, out_qsize 0
INFO - 02:07:28: EPOCH 4 - PROGRESS: at 85.34% examples, 24104 words/s, in_qsize 13, out_qsize 0
INFO - 02:07:58: EPOCH 4 - PROGRESS: at 86.13% examples, 24107 words/s, in_qsize 13, out_qsize 0
INFO - 02:08:28: EPOCH 4 - PROGRESS: at 86.92% examples, 24111 words/s, in_qsize 13, out_qsize 0
INFO - 02:08:58: EPOCH 4 - PROGRESS: at 87.69% examples, 24110 words/s, in_qsize 13, out_qsize 0
INFO - 02:09:29: EPOCH 4 - PROGRESS: at 88.45% examples, 24102 words/s, in_qsize 14, out_qsize 0
INFO - 02:09:59: EPOCH 4 - PROGRESS: at 89.20% examples, 24091 words/s, in_qsize 13, out_qsize 0
INFO - 02:10:29: EPOCH 4 - PROGRESS: at 89.99% examples, 24053 words/s, in_qsize 13, out_qsize 0
INFO - 02:11:00: EPOCH 4 - PRO

INFO - 02:43:43: EPOCH 5 - PROGRESS: at 42.20% examples, 23076 words/s, in_qsize 13, out_qsize 0
INFO - 02:44:14: EPOCH 5 - PROGRESS: at 43.00% examples, 23082 words/s, in_qsize 13, out_qsize 0
INFO - 02:44:44: EPOCH 5 - PROGRESS: at 43.75% examples, 23087 words/s, in_qsize 13, out_qsize 0
INFO - 02:45:14: EPOCH 5 - PROGRESS: at 44.55% examples, 23093 words/s, in_qsize 13, out_qsize 0
INFO - 02:45:44: EPOCH 5 - PROGRESS: at 45.33% examples, 23103 words/s, in_qsize 13, out_qsize 0
INFO - 02:46:14: EPOCH 5 - PROGRESS: at 46.08% examples, 23112 words/s, in_qsize 13, out_qsize 0
INFO - 02:46:44: EPOCH 5 - PROGRESS: at 46.82% examples, 23111 words/s, in_qsize 13, out_qsize 0
INFO - 02:47:14: EPOCH 5 - PROGRESS: at 47.60% examples, 23132 words/s, in_qsize 13, out_qsize 0
INFO - 02:47:45: EPOCH 5 - PROGRESS: at 48.35% examples, 23150 words/s, in_qsize 13, out_qsize 0
INFO - 02:48:15: EPOCH 5 - PROGRESS: at 49.15% examples, 23170 words/s, in_qsize 13, out_qsize 0
INFO - 02:48:45: EPOCH 5 - PRO

INFO - 03:21:33: EPOCH 6 - PROGRESS: at 2.57% examples, 32888 words/s, in_qsize 11, out_qsize 0
INFO - 03:22:03: EPOCH 6 - PROGRESS: at 3.43% examples, 30111 words/s, in_qsize 13, out_qsize 0
INFO - 03:22:34: EPOCH 6 - PROGRESS: at 4.31% examples, 28421 words/s, in_qsize 14, out_qsize 0
INFO - 03:23:04: EPOCH 6 - PROGRESS: at 5.19% examples, 27966 words/s, in_qsize 13, out_qsize 0
INFO - 03:23:35: EPOCH 6 - PROGRESS: at 6.00% examples, 27243 words/s, in_qsize 13, out_qsize 0
INFO - 03:24:05: EPOCH 6 - PROGRESS: at 6.87% examples, 26985 words/s, in_qsize 13, out_qsize 0
INFO - 03:24:35: EPOCH 6 - PROGRESS: at 7.68% examples, 26662 words/s, in_qsize 13, out_qsize 0
INFO - 03:25:06: EPOCH 6 - PROGRESS: at 8.53% examples, 26489 words/s, in_qsize 13, out_qsize 0
INFO - 03:25:37: EPOCH 6 - PROGRESS: at 9.38% examples, 26359 words/s, in_qsize 13, out_qsize 0
INFO - 03:26:07: EPOCH 6 - PROGRESS: at 10.19% examples, 26152 words/s, in_qsize 13, out_qsize 0
INFO - 03:26:37: EPOCH 6 - PROGRESS: at

INFO - 04:04:25: EPOCH 6 - PROGRESS: at 74.50% examples, 25595 words/s, in_qsize 13, out_qsize 0
INFO - 04:04:55: EPOCH 6 - PROGRESS: at 75.35% examples, 25607 words/s, in_qsize 14, out_qsize 0
INFO - 04:05:26: EPOCH 6 - PROGRESS: at 76.17% examples, 25620 words/s, in_qsize 13, out_qsize 0
INFO - 04:05:56: EPOCH 6 - PROGRESS: at 77.00% examples, 25629 words/s, in_qsize 14, out_qsize 0
INFO - 04:06:26: EPOCH 6 - PROGRESS: at 77.87% examples, 25640 words/s, in_qsize 13, out_qsize 0
INFO - 04:06:57: EPOCH 6 - PROGRESS: at 78.66% examples, 25626 words/s, in_qsize 13, out_qsize 0
INFO - 04:07:27: EPOCH 6 - PROGRESS: at 79.57% examples, 25636 words/s, in_qsize 13, out_qsize 0
INFO - 04:07:58: EPOCH 6 - PROGRESS: at 80.40% examples, 25614 words/s, in_qsize 13, out_qsize 0
INFO - 04:08:28: EPOCH 6 - PROGRESS: at 81.29% examples, 25602 words/s, in_qsize 13, out_qsize 0
INFO - 04:08:58: EPOCH 6 - PROGRESS: at 82.18% examples, 25601 words/s, in_qsize 13, out_qsize 0
INFO - 04:09:28: EPOCH 6 - PRO

INFO - 04:41:57: EPOCH 7 - PROGRESS: at 37.09% examples, 24392 words/s, in_qsize 13, out_qsize 0
INFO - 04:42:27: EPOCH 7 - PROGRESS: at 37.89% examples, 24376 words/s, in_qsize 13, out_qsize 0
INFO - 04:42:57: EPOCH 7 - PROGRESS: at 38.69% examples, 24349 words/s, in_qsize 13, out_qsize 0
INFO - 04:43:28: EPOCH 7 - PROGRESS: at 39.48% examples, 24338 words/s, in_qsize 14, out_qsize 0
INFO - 04:43:58: EPOCH 7 - PROGRESS: at 40.27% examples, 24321 words/s, in_qsize 13, out_qsize 0
INFO - 04:44:28: EPOCH 7 - PROGRESS: at 41.08% examples, 24301 words/s, in_qsize 13, out_qsize 0
INFO - 04:44:58: EPOCH 7 - PROGRESS: at 41.88% examples, 24269 words/s, in_qsize 14, out_qsize 0
INFO - 04:45:28: EPOCH 7 - PROGRESS: at 42.67% examples, 24257 words/s, in_qsize 14, out_qsize 0
INFO - 04:45:58: EPOCH 7 - PROGRESS: at 43.48% examples, 24269 words/s, in_qsize 13, out_qsize 0
INFO - 04:46:29: EPOCH 7 - PROGRESS: at 44.28% examples, 24254 words/s, in_qsize 13, out_qsize 0
INFO - 04:46:59: EPOCH 7 - PRO

INFO - 05:20:20: storing np array 'syn1neg' to models-doc2vec/model_epoch_9.model.trainables.syn1neg.npy
INFO - 05:20:20: storing np array 'vectors' to models-doc2vec/model_epoch_9.model.wv.vectors.npy
INFO - 05:20:21: saved models-doc2vec/model_epoch_9.model
INFO - 05:20:22: EPOCH 8 - PROGRESS: at 0.04% examples, 29194 words/s, in_qsize 0, out_qsize 0
INFO - 05:20:52: EPOCH 8 - PROGRESS: at 1.36% examples, 36578 words/s, in_qsize 0, out_qsize 0
INFO - 05:21:26: EPOCH 8 - PROGRESS: at 2.57% examples, 33409 words/s, in_qsize 11, out_qsize 0
INFO - 05:21:56: EPOCH 8 - PROGRESS: at 3.47% examples, 30860 words/s, in_qsize 13, out_qsize 0
INFO - 05:22:27: EPOCH 8 - PROGRESS: at 4.38% examples, 28975 words/s, in_qsize 13, out_qsize 0
INFO - 05:22:57: EPOCH 8 - PROGRESS: at 5.26% examples, 28525 words/s, in_qsize 13, out_qsize 0
INFO - 05:23:27: EPOCH 8 - PROGRESS: at 6.13% examples, 28051 words/s, in_qsize 14, out_qsize 0
INFO - 05:23:57: EPOCH 8 - PROGRESS: at 7.00% examples, 27734 words/s,

INFO - 06:01:48: EPOCH 8 - PROGRESS: at 71.22% examples, 25902 words/s, in_qsize 13, out_qsize 0
INFO - 06:02:18: EPOCH 8 - PROGRESS: at 72.08% examples, 25916 words/s, in_qsize 13, out_qsize 0
INFO - 06:02:48: EPOCH 8 - PROGRESS: at 72.92% examples, 25915 words/s, in_qsize 13, out_qsize 0
INFO - 06:03:19: EPOCH 8 - PROGRESS: at 73.76% examples, 25921 words/s, in_qsize 13, out_qsize 0
INFO - 06:03:49: EPOCH 8 - PROGRESS: at 74.63% examples, 25930 words/s, in_qsize 13, out_qsize 0
INFO - 06:04:19: EPOCH 8 - PROGRESS: at 75.47% examples, 25941 words/s, in_qsize 13, out_qsize 0
INFO - 06:04:49: EPOCH 8 - PROGRESS: at 76.30% examples, 25955 words/s, in_qsize 13, out_qsize 0
INFO - 06:05:19: EPOCH 8 - PROGRESS: at 77.13% examples, 25958 words/s, in_qsize 13, out_qsize 0
INFO - 06:05:50: EPOCH 8 - PROGRESS: at 78.00% examples, 25967 words/s, in_qsize 13, out_qsize 0
INFO - 06:06:21: EPOCH 8 - PROGRESS: at 78.86% examples, 25968 words/s, in_qsize 13, out_qsize 0
INFO - 06:06:51: EPOCH 8 - PRO

Time to train the model: 494.0 mins


In [443]:
# d2v_model.save('models-doc2vec/model_epoch_2.model')

INFO - 22:02:24: saving Doc2Vec object under models-doc2vec/model_epoch_2.model, separately None
INFO - 22:02:24: storing np array 'syn1neg' to models-doc2vec/model_epoch_2.model.trainables.syn1neg.npy
INFO - 22:02:24: storing np array 'vectors' to models-doc2vec/model_epoch_2.model.wv.vectors.npy
INFO - 22:02:25: saved models-doc2vec/model_epoch_2.model


In [420]:
# Prototype search pipeline below
# The whole prototype is wrapped in a string literal, so none of it executes;
# the cell merely echoes the text. Sketch of the idea: infer a vector for a
# query sentence, stack every stored document vector into a matrix, rank the
# documents by cosine similarity to the query, then look up the best match.
# NOTE(review): the final lines split the tag on '_' as if tags had the form
# '<cord_uid>_<sentence index>', whereas CORDDataIteratorDoc2Vec tags sentences
# with the bare cord_uid -- confirm the tag scheme before re-enabling.
# NOTE(review): `query` is passed to infer_vector exactly as returned by
# clean_text; the Word2Vec iterator calls .split() on that output, so it is
# presumably an un-tokenised string here -- confirm.
'''
len(d2v_model.docvecs.index2entity)

query = clean_text('The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.')
query_vec = d2v_model.infer_vector(query, epochs=100)

eng_texts[0][0]

doc_weight_mat = np.zeros((len(d2v_model.docvecs.index2entity), 300))
for i, cord_uid in enumerate(tqdm(d2v_model.docvecs.index2entity)):
    doc_weight_mat[i] = d2v_model.docvecs[cord_uid]

def cosine_sim(vec: np.ndarray, mat: np.ndarray):
    return vec @ mat.T / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))

query_vec.shape, doc_weight_mat.shape

# Find closest document
#keys = d2v_model.docvecs.index2entity
similarities = cosine_sim(query_vec, doc_weight_mat)

top_n = 10
sorted_indicies = similarities.argsort()[::-1]
top_sim = list(zip(np.array(d2v_model.docvecs.index2entity)[sorted_indicies][:top_n], similarities[sorted_indicies][:top_n]))
top_sim

top_cord_uid = top_sim[0][0]
best_text = cord_data[cord_data['cord_uid'] == top_cord_uid.split('_')[0]].body_text.values[0]
best_text_sentences = nltk.tokenize.sent_tokenize(best_text)
best_text_sentences[int(top_cord_uid.split('_')[1])]
'''

"\nlen(d2v_model.docvecs.index2entity)\n\nquery = clean_text('The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.')\nquery_vec = d2v_model.infer_vector(query, epochs=100)\n\neng_texts[0][0]\n\ndoc_weight_mat = np.zeros((len(d2v_model.docvecs.index2entity), 300))\nfor i, cord_uid in enumerate(tqdm(d2v_model.docvecs.index2entity)):\n    doc_weight_mat[i] = d2v_model.docvecs[cord_uid]\n\ndef cosine_sim(vec: np.ndarray, mat: np.ndarray):\n    return vec @ mat.T / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))\n\nquery_vec.shape, doc_weight_mat.shape\n\n# Find closest document\n#keys = d2v_model.docvecs.index2entity\nsimilarities = cosine_sim(query_vec, doc_weight_mat)\n\ntop_n = 10\nsorted_indicies = similarities.argsort()[::-1]\ntop_sim = list(zip(np.array(d2v_model.docvecs.index2entity)[sorted_indicies][:top_n], similarities[sorted_indicies][:top_n]))\ntop_sim\n\ntop_cord_uid = top_sim[0][0]\nbest_text = cord_data[cord_d

## Learn word embeddings using Word2Vec

In [40]:
class EpochSaver(CallbackAny2Vec):
    '''Callback that logs the loss of each epoch and saves the model after it.

    The value returned by `model.get_latest_training_loss()` is treated as a
    running total over the epochs of one training run, so the loss of a single
    epoch is the difference between the current total and the total seen at
    the end of the previous epoch.

    Args:
        output_dir: Directory that receives both the log file and the
            `<prefix>_epoch_<n>.model` checkpoints (`n` starts at 0).
        prefix: File-name prefix of the checkpoints.
        logs_filename: Name of the log file inside `output_dir`; one line is
            appended per epoch.
    '''

    def __init__(self, output_dir: str, prefix: str, logs_filename: str):
        self.output_dir = output_dir
        self.prefix = prefix
        self.logs_filename = logs_filename
        self.epoch = 0
        # Running total reported at the end of the previous epoch (nothing yet)
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        cum_loss = model.get_latest_training_loss()
        loss = cum_loss - self.loss_previous_step
        # Remember the running total, not the per-epoch delta; storing the delta
        # would corrupt every difference from the third epoch onwards.
        self.loss_previous_step = cum_loss
        with open(join_path(self.output_dir, self.logs_filename), 'a+') as file:
            file.write(f'Epoch #{self.epoch}, loss: {loss}\n')

        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1

In [46]:
# Setup initial model
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    size=300,
    sample=1e-5,
    alpha=0.025,
    min_alpha=0.0005,
    negative=20,
    workers=cores-1,
    callbacks=[EpochSaver(w2v_saved_models_dir, saved_models_prefix, train_logs_path)]
)

In [47]:
# Build vocabulary
t = time()
w2v_model.build_vocab(tqdm(cord_sentences, total=cord_num_sentences), progress_per=int(cord_num_sentences / 100))
print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

HBox(children=(IntProgress(value=0, max=6522945), HTML(value='')))

INFO - 20:42:25: collecting all words and their counts
INFO - 20:42:25: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 20:42:36: PROGRESS: at sentence #65229, processed 822775 words, keeping 53450 word types
INFO - 20:42:46: PROGRESS: at sentence #130458, processed 1682936 words, keeping 83744 word types
INFO - 20:42:57: PROGRESS: at sentence #195687, processed 2494896 words, keeping 107266 word types
INFO - 20:43:08: PROGRESS: at sentence #260916, processed 3348805 words, keeping 133645 word types
INFO - 20:43:18: PROGRESS: at sentence #326145, processed 4150375 words, keeping 149386 word types
INFO - 20:43:29: PROGRESS: at sentence #391374, processed 5056104 words, keeping 171433 word types
INFO - 20:43:39: PROGRESS: at sentence #456603, processed 5930671 words, keeping 186592 word types
INFO - 20:43:50: PROGRESS: at sentence #521832, processed 6825462 words, keeping 201622 word types
INFO - 20:44:01: PROGRESS: at sentence #587061, processed 7707620 words, k

INFO - 20:57:12: PROGRESS: at sentence #5283549, processed 72805141 words, keeping 1242555 word types
INFO - 20:57:24: PROGRESS: at sentence #5348778, processed 73658257 words, keeping 1257489 word types
INFO - 20:57:35: PROGRESS: at sentence #5414007, processed 74601054 words, keeping 1278917 word types
INFO - 20:57:46: PROGRESS: at sentence #5479236, processed 75549958 words, keeping 1289450 word types
INFO - 20:57:57: PROGRESS: at sentence #5544465, processed 76469151 words, keeping 1298324 word types
INFO - 20:58:07: PROGRESS: at sentence #5609694, processed 77397295 words, keeping 1307909 word types
INFO - 20:58:18: PROGRESS: at sentence #5674923, processed 78319132 words, keeping 1317168 word types
INFO - 20:58:29: PROGRESS: at sentence #5740152, processed 79245559 words, keeping 1325557 word types
INFO - 20:58:40: PROGRESS: at sentence #5805381, processed 80153081 words, keeping 1334353 word types
INFO - 20:58:50: PROGRESS: at sentence #5870610, processed 80970626 words, keeping




INFO - 21:00:38: effective_min_count=20 retains 100466 unique words (7% of original 1424699, drops 1324233)
INFO - 21:00:38: effective_min_count=20 leaves 86688229 word corpus (96% of original 89871314, drops 3183085)
INFO - 21:00:38: deleting the raw counts dictionary of 1424699 items
INFO - 21:00:38: sample=1e-05 downsamples 5370 most-common words
INFO - 21:00:38: downsampling leaves estimated 36494016 word corpus (42.1% of prior 86688229)
INFO - 21:00:39: estimated required memory for 100466 words and 300 dimensions: 291351400 bytes
INFO - 21:00:39: resetting layer weights


Time to build vocab: 18.54 mins


In [48]:
# Train model
t = time()
w2v_model.train(cord_sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=30)
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 21:01:12: training model with 7 workers on 100466 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=2
INFO - 21:01:13: EPOCH 1 - PROGRESS: at 0.09% examples, 27104 words/s, in_qsize 0, out_qsize 0
INFO - 21:01:43: EPOCH 1 - PROGRESS: at 2.97% examples, 32075 words/s, in_qsize 0, out_qsize 0
INFO - 21:02:13: EPOCH 1 - PROGRESS: at 5.84% examples, 33382 words/s, in_qsize 0, out_qsize 0
INFO - 21:02:43: EPOCH 1 - PROGRESS: at 8.65% examples, 33875 words/s, in_qsize 0, out_qsize 0
INFO - 21:03:13: EPOCH 1 - PROGRESS: at 11.48% examples, 34215 words/s, in_qsize 0, out_qsize 0
INFO - 21:03:44: EPOCH 1 - PROGRESS: at 14.34% examples, 34224 words/s, in_qsize 0, out_qsize 0
INFO - 21:04:14: EPOCH 1 - PROGRESS: at 17.26% examples, 34268 words/s, in_qsize 0, out_qsize 0
INFO - 21:04:44: EPOCH 1 - PROGRESS: at 20.09% examples, 34264 words/s, in_qsize 0, out_qsize 0
INFO - 21:05:14: EPOCH 1 - PROGRESS: at 22.91% examples, 34216 words/s, in_qsize 0, out_qsize 0
INFO

INFO - 21:37:25: EPOCH 2 - PROGRESS: at 99.21% examples, 33380 words/s, in_qsize 0, out_qsize 0
INFO - 21:37:34: worker thread finished; awaiting finish of 6 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 5 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 4 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 3 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 2 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 1 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 0 more threads
INFO - 21:37:34: EPOCH - 2 : training on 89871314 raw words (36491174 effective words) took 1092.9s, 33389 effective words/s
INFO - 21:37:34: saving Word2Vec object under models/w2v_model_epoch_1.model, separately None
INFO - 21:37:34: storing np array 'vectors' to models/w2v_model_epoch_1.model.wv.vectors.npy
INFO - 21:37:34: not storing attribute vectors_norm
INFO - 21:37:34

INFO - 22:08:49: EPOCH 4 - PROGRESS: at 72.54% examples, 33681 words/s, in_qsize 0, out_qsize 0
INFO - 22:09:19: EPOCH 4 - PROGRESS: at 75.15% examples, 33643 words/s, in_qsize 0, out_qsize 0
INFO - 22:09:49: EPOCH 4 - PROGRESS: at 77.66% examples, 33568 words/s, in_qsize 0, out_qsize 0
INFO - 22:10:19: EPOCH 4 - PROGRESS: at 80.24% examples, 33487 words/s, in_qsize 0, out_qsize 0
INFO - 22:10:49: EPOCH 4 - PROGRESS: at 82.81% examples, 33374 words/s, in_qsize 0, out_qsize 0
INFO - 22:11:19: EPOCH 4 - PROGRESS: at 85.41% examples, 33341 words/s, in_qsize 0, out_qsize 0
INFO - 22:11:49: EPOCH 4 - PROGRESS: at 88.05% examples, 33333 words/s, in_qsize 0, out_qsize 0
INFO - 22:12:19: EPOCH 4 - PROGRESS: at 90.73% examples, 33188 words/s, in_qsize 0, out_qsize 0
INFO - 22:12:49: EPOCH 4 - PROGRESS: at 93.43% examples, 33095 words/s, in_qsize 0, out_qsize 0
INFO - 22:13:19: EPOCH 4 - PROGRESS: at 96.04% examples, 33062 words/s, in_qsize 0, out_qsize 0
INFO - 22:13:49: EPOCH 4 - PROGRESS: at 

INFO - 22:40:12: EPOCH 6 - PROGRESS: at 35.69% examples, 32937 words/s, in_qsize 0, out_qsize 0
INFO - 22:40:42: EPOCH 6 - PROGRESS: at 38.44% examples, 32926 words/s, in_qsize 0, out_qsize 0
INFO - 22:41:13: EPOCH 6 - PROGRESS: at 41.18% examples, 33001 words/s, in_qsize 0, out_qsize 0
INFO - 22:41:43: EPOCH 6 - PROGRESS: at 43.89% examples, 32891 words/s, in_qsize 0, out_qsize 0
INFO - 22:42:13: EPOCH 6 - PROGRESS: at 46.50% examples, 32848 words/s, in_qsize 0, out_qsize 0
INFO - 22:42:43: EPOCH 6 - PROGRESS: at 49.09% examples, 32799 words/s, in_qsize 0, out_qsize 0
INFO - 22:43:13: EPOCH 6 - PROGRESS: at 51.73% examples, 32774 words/s, in_qsize 0, out_qsize 0
INFO - 22:43:43: EPOCH 6 - PROGRESS: at 54.34% examples, 32753 words/s, in_qsize 0, out_qsize 0
INFO - 22:44:13: EPOCH 6 - PROGRESS: at 57.00% examples, 32800 words/s, in_qsize 0, out_qsize 0
INFO - 22:44:44: EPOCH 6 - PROGRESS: at 59.58% examples, 32783 words/s, in_qsize 0, out_qsize 0
INFO - 22:45:14: EPOCH 6 - PROGRESS: at 

INFO - 23:11:44: EPOCH 8 - PROGRESS: at 0.09% examples, 27200 words/s, in_qsize 0, out_qsize 0
INFO - 23:12:15: EPOCH 8 - PROGRESS: at 2.82% examples, 30356 words/s, in_qsize 0, out_qsize 0
INFO - 23:12:45: EPOCH 8 - PROGRESS: at 5.52% examples, 31360 words/s, in_qsize 0, out_qsize 0
INFO - 23:13:15: EPOCH 8 - PROGRESS: at 8.20% examples, 31986 words/s, in_qsize 0, out_qsize 0
INFO - 23:13:45: EPOCH 8 - PROGRESS: at 10.86% examples, 32378 words/s, in_qsize 0, out_qsize 0
INFO - 23:14:15: EPOCH 8 - PROGRESS: at 13.46% examples, 32217 words/s, in_qsize 5, out_qsize 0
INFO - 23:14:45: EPOCH 8 - PROGRESS: at 16.31% examples, 32549 words/s, in_qsize 0, out_qsize 0
INFO - 23:15:15: EPOCH 8 - PROGRESS: at 19.05% examples, 32561 words/s, in_qsize 0, out_qsize 0
INFO - 23:15:45: EPOCH 8 - PROGRESS: at 21.75% examples, 32466 words/s, in_qsize 0, out_qsize 0
INFO - 23:16:15: EPOCH 8 - PROGRESS: at 24.38% examples, 32583 words/s, in_qsize 0, out_qsize 0
INFO - 23:16:45: EPOCH 8 - PROGRESS: at 27.0

INFO - 23:49:10: EPOCH 9 - PROGRESS: at 93.47% examples, 31445 words/s, in_qsize 0, out_qsize 0
INFO - 23:49:40: EPOCH 9 - PROGRESS: at 96.06% examples, 31452 words/s, in_qsize 0, out_qsize 0
INFO - 23:50:10: EPOCH 9 - PROGRESS: at 98.64% examples, 31464 words/s, in_qsize 0, out_qsize 0
INFO - 23:50:26: worker thread finished; awaiting finish of 6 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 5 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 4 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 3 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 2 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 1 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 0 more threads
INFO - 23:50:26: EPOCH - 9 : training on 89871314 raw words (36497135 effective words) took 1159.8s, 31470 effective words/s
INFO - 23:50:26: saving Word2Vec object under models/w2v_model

INFO - 00:20:20: EPOCH 11 - PROGRESS: at 55.33% examples, 31718 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:50: EPOCH 11 - PROGRESS: at 57.84% examples, 31719 words/s, in_qsize 0, out_qsize 0
INFO - 00:21:20: EPOCH 11 - PROGRESS: at 60.34% examples, 31713 words/s, in_qsize 0, out_qsize 0
INFO - 00:21:50: EPOCH 11 - PROGRESS: at 62.83% examples, 31671 words/s, in_qsize 0, out_qsize 0
INFO - 00:22:20: EPOCH 11 - PROGRESS: at 65.33% examples, 31627 words/s, in_qsize 0, out_qsize 0
INFO - 00:22:50: EPOCH 11 - PROGRESS: at 67.82% examples, 31609 words/s, in_qsize 0, out_qsize 0
INFO - 00:23:20: EPOCH 11 - PROGRESS: at 70.32% examples, 31596 words/s, in_qsize 0, out_qsize 0
INFO - 00:23:50: EPOCH 11 - PROGRESS: at 72.82% examples, 31583 words/s, in_qsize 0, out_qsize 0
INFO - 00:24:21: EPOCH 11 - PROGRESS: at 75.32% examples, 31575 words/s, in_qsize 0, out_qsize 0
INFO - 00:24:51: EPOCH 11 - PROGRESS: at 77.83% examples, 31571 words/s, in_qsize 0, out_qsize 0
INFO - 00:25:21: EPOCH 11 - PR

INFO - 00:51:00: EPOCH 13 - PROGRESS: at 13.62% examples, 32523 words/s, in_qsize 2, out_qsize 1
INFO - 00:51:30: EPOCH 13 - PROGRESS: at 16.53% examples, 32894 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:00: EPOCH 13 - PROGRESS: at 19.29% examples, 32944 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:30: EPOCH 13 - PROGRESS: at 22.03% examples, 32860 words/s, in_qsize 0, out_qsize 0
INFO - 00:53:00: EPOCH 13 - PROGRESS: at 24.78% examples, 33086 words/s, in_qsize 0, out_qsize 0
INFO - 00:53:30: EPOCH 13 - PROGRESS: at 27.49% examples, 32964 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:00: EPOCH 13 - PROGRESS: at 30.24% examples, 32958 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:31: EPOCH 13 - PROGRESS: at 32.99% examples, 32986 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:01: EPOCH 13 - PROGRESS: at 35.74% examples, 32980 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:31: EPOCH 13 - PROGRESS: at 38.46% examples, 32930 words/s, in_qsize 0, out_qsize 0
INFO - 00:56:01: EPOCH 13 - PR

INFO - 01:26:04: worker thread finished; awaiting finish of 3 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 2 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 1 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 0 more threads
INFO - 01:26:04: EPOCH - 14 : training on 89871314 raw words (36495949 effective words) took 1130.6s, 32280 effective words/s
INFO - 01:26:04: saving Word2Vec object under models/w2v_model_epoch_13.model, separately None
INFO - 01:26:04: storing np array 'vectors' to models/w2v_model_epoch_13.model.wv.vectors.npy
INFO - 01:26:04: not storing attribute vectors_norm
INFO - 01:26:04: storing np array 'syn1neg' to models/w2v_model_epoch_13.model.trainables.syn1neg.npy
INFO - 01:26:04: not storing attribute cum_table
INFO - 01:26:04: saved models/w2v_model_epoch_13.model
INFO - 01:26:05: EPOCH 15 - PROGRESS: at 0.09% examples, 28386 words/s, in_qsize 0, out_qsize 0
INFO - 01:26:35: EPOCH 15 - PRO

INFO - 01:58:38: EPOCH 16 - PROGRESS: at 73.22% examples, 32916 words/s, in_qsize 0, out_qsize 0
INFO - 01:59:08: EPOCH 16 - PROGRESS: at 75.86% examples, 32921 words/s, in_qsize 0, out_qsize 0
INFO - 01:59:38: EPOCH 16 - PROGRESS: at 78.47% examples, 32920 words/s, in_qsize 0, out_qsize 0
INFO - 02:00:08: EPOCH 16 - PROGRESS: at 81.12% examples, 32833 words/s, in_qsize 0, out_qsize 0
INFO - 02:00:38: EPOCH 16 - PROGRESS: at 83.74% examples, 32806 words/s, in_qsize 0, out_qsize 0
INFO - 02:01:09: EPOCH 16 - PROGRESS: at 86.45% examples, 32829 words/s, in_qsize 0, out_qsize 0
INFO - 02:01:39: EPOCH 16 - PROGRESS: at 89.20% examples, 32869 words/s, in_qsize 0, out_qsize 0
INFO - 02:02:09: EPOCH 16 - PROGRESS: at 91.99% examples, 32697 words/s, in_qsize 0, out_qsize 0
INFO - 02:02:39: EPOCH 16 - PROGRESS: at 94.69% examples, 32697 words/s, in_qsize 0, out_qsize 0
INFO - 02:03:09: EPOCH 16 - PROGRESS: at 97.41% examples, 32716 words/s, in_qsize 0, out_qsize 0
INFO - 02:03:38: worker thread

INFO - 02:29:44: EPOCH 18 - PROGRESS: at 37.07% examples, 31692 words/s, in_qsize 0, out_qsize 0
INFO - 02:30:14: EPOCH 18 - PROGRESS: at 39.69% examples, 31729 words/s, in_qsize 0, out_qsize 0
INFO - 02:30:44: EPOCH 18 - PROGRESS: at 42.28% examples, 31619 words/s, in_qsize 0, out_qsize 0
INFO - 02:31:14: EPOCH 18 - PROGRESS: at 44.88% examples, 31595 words/s, in_qsize 0, out_qsize 0
INFO - 02:31:44: EPOCH 18 - PROGRESS: at 47.27% examples, 31484 words/s, in_qsize 0, out_qsize 0
INFO - 02:32:14: EPOCH 18 - PROGRESS: at 49.78% examples, 31464 words/s, in_qsize 0, out_qsize 0
INFO - 02:32:45: EPOCH 18 - PROGRESS: at 52.27% examples, 31417 words/s, in_qsize 0, out_qsize 0
INFO - 02:33:15: EPOCH 18 - PROGRESS: at 54.78% examples, 31428 words/s, in_qsize 0, out_qsize 0
INFO - 02:33:45: EPOCH 18 - PROGRESS: at 57.26% examples, 31428 words/s, in_qsize 0, out_qsize 0
INFO - 02:34:15: EPOCH 18 - PROGRESS: at 59.79% examples, 31450 words/s, in_qsize 0, out_qsize 0
INFO - 02:34:45: EPOCH 18 - PR

INFO - 03:00:47: not storing attribute cum_table
INFO - 03:00:47: saved models/w2v_model_epoch_18.model
INFO - 03:00:48: EPOCH 20 - PROGRESS: at 0.09% examples, 28207 words/s, in_qsize 0, out_qsize 0
INFO - 03:01:18: EPOCH 20 - PROGRESS: at 2.82% examples, 30341 words/s, in_qsize 0, out_qsize 0
INFO - 03:01:48: EPOCH 20 - PROGRESS: at 5.51% examples, 31294 words/s, in_qsize 0, out_qsize 0
INFO - 03:02:18: EPOCH 20 - PROGRESS: at 8.20% examples, 31960 words/s, in_qsize 0, out_qsize 0
INFO - 03:02:48: EPOCH 20 - PROGRESS: at 10.93% examples, 32556 words/s, in_qsize 0, out_qsize 0
INFO - 03:03:19: EPOCH 20 - PROGRESS: at 13.62% examples, 32314 words/s, in_qsize 3, out_qsize 0
INFO - 03:03:50: EPOCH 20 - PROGRESS: at 16.50% examples, 32650 words/s, in_qsize 0, out_qsize 0
INFO - 03:04:20: EPOCH 20 - PROGRESS: at 19.29% examples, 32772 words/s, in_qsize 0, out_qsize 0
INFO - 03:04:50: EPOCH 20 - PROGRESS: at 22.07% examples, 32785 words/s, in_qsize 0, out_qsize 0
INFO - 03:05:20: EPOCH 20 -

INFO - 03:37:26: EPOCH 21 - PROGRESS: at 97.91% examples, 32910 words/s, in_qsize 0, out_qsize 0
INFO - 03:37:49: worker thread finished; awaiting finish of 6 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 5 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 4 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 3 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 2 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 1 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 0 more threads
INFO - 03:37:49: EPOCH - 21 : training on 89871314 raw words (36494142 effective words) took 1108.6s, 32918 effective words/s
INFO - 03:37:49: saving Word2Vec object under models/w2v_model_epoch_20.model, separately None
INFO - 03:37:49: storing np array 'vectors' to models/w2v_model_epoch_20.model.wv.vectors.npy
INFO - 03:37:49: not storing attribute vectors_norm
INFO - 03:3

INFO - 04:08:26: EPOCH 23 - PROGRESS: at 65.29% examples, 32931 words/s, in_qsize 0, out_qsize 0
INFO - 04:08:56: EPOCH 23 - PROGRESS: at 68.08% examples, 33009 words/s, in_qsize 0, out_qsize 0
INFO - 04:09:26: EPOCH 23 - PROGRESS: at 70.89% examples, 33087 words/s, in_qsize 0, out_qsize 0
INFO - 04:09:56: EPOCH 23 - PROGRESS: at 73.66% examples, 33148 words/s, in_qsize 0, out_qsize 0
INFO - 04:10:26: EPOCH 23 - PROGRESS: at 76.45% examples, 33225 words/s, in_qsize 0, out_qsize 0
INFO - 04:10:56: EPOCH 23 - PROGRESS: at 79.18% examples, 33243 words/s, in_qsize 1, out_qsize 0
INFO - 04:11:27: EPOCH 23 - PROGRESS: at 81.87% examples, 33132 words/s, in_qsize 8, out_qsize 2
INFO - 04:11:57: EPOCH 23 - PROGRESS: at 84.73% examples, 33216 words/s, in_qsize 0, out_qsize 0
INFO - 04:12:27: EPOCH 23 - PROGRESS: at 87.45% examples, 33243 words/s, in_qsize 0, out_qsize 0
INFO - 04:12:57: EPOCH 23 - PROGRESS: at 90.23% examples, 33182 words/s, in_qsize 0, out_qsize 0
INFO - 04:13:27: EPOCH 23 - PR

INFO - 04:38:56: EPOCH 25 - PROGRESS: at 28.06% examples, 33370 words/s, in_qsize 5, out_qsize 2
INFO - 04:39:26: EPOCH 25 - PROGRESS: at 31.00% examples, 33508 words/s, in_qsize 0, out_qsize 0
INFO - 04:39:56: EPOCH 25 - PROGRESS: at 33.77% examples, 33559 words/s, in_qsize 0, out_qsize 0
INFO - 04:40:26: EPOCH 25 - PROGRESS: at 36.57% examples, 33526 words/s, in_qsize 0, out_qsize 0
INFO - 04:40:56: EPOCH 25 - PROGRESS: at 39.34% examples, 33558 words/s, in_qsize 0, out_qsize 0
INFO - 04:41:26: EPOCH 25 - PROGRESS: at 42.12% examples, 33476 words/s, in_qsize 0, out_qsize 0
INFO - 04:41:56: EPOCH 25 - PROGRESS: at 44.87% examples, 33438 words/s, in_qsize 0, out_qsize 0
INFO - 04:42:26: EPOCH 25 - PROGRESS: at 47.44% examples, 33337 words/s, in_qsize 0, out_qsize 0
INFO - 04:42:57: EPOCH 25 - PROGRESS: at 50.12% examples, 33332 words/s, in_qsize 0, out_qsize 0
INFO - 04:43:27: EPOCH 25 - PROGRESS: at 52.78% examples, 33290 words/s, in_qsize 0, out_qsize 0
INFO - 04:43:57: EPOCH 25 - PR

INFO - 05:09:44: EPOCH 27 - PROGRESS: at 0.10% examples, 29223 words/s, in_qsize 0, out_qsize 0
INFO - 05:10:15: EPOCH 27 - PROGRESS: at 3.03% examples, 32343 words/s, in_qsize 0, out_qsize 0
INFO - 05:10:45: EPOCH 27 - PROGRESS: at 5.91% examples, 33530 words/s, in_qsize 1, out_qsize 0
INFO - 05:11:15: EPOCH 27 - PROGRESS: at 8.80% examples, 34243 words/s, in_qsize 0, out_qsize 0
INFO - 05:11:45: EPOCH 27 - PROGRESS: at 11.54% examples, 34216 words/s, in_qsize 0, out_qsize 0
INFO - 05:12:15: EPOCH 27 - PROGRESS: at 14.40% examples, 34289 words/s, in_qsize 0, out_qsize 0
INFO - 05:12:46: EPOCH 27 - PROGRESS: at 17.38% examples, 34448 words/s, in_qsize 0, out_qsize 0
INFO - 05:13:16: EPOCH 27 - PROGRESS: at 20.34% examples, 34606 words/s, in_qsize 0, out_qsize 0
INFO - 05:13:46: EPOCH 27 - PROGRESS: at 23.26% examples, 34707 words/s, in_qsize 0, out_qsize 0
INFO - 05:14:16: EPOCH 27 - PROGRESS: at 26.16% examples, 34859 words/s, in_qsize 0, out_qsize 0
INFO - 05:14:46: EPOCH 27 - PROGRE

INFO - 05:44:48: worker thread finished; awaiting finish of 3 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 2 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 1 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 0 more threads
INFO - 05:44:48: EPOCH - 28 : training on 89871314 raw words (36495971 effective words) took 1054.9s, 34596 effective words/s
INFO - 05:44:48: saving Word2Vec object under models/w2v_model_epoch_27.model, separately None
INFO - 05:44:48: storing np array 'vectors' to models/w2v_model_epoch_27.model.wv.vectors.npy
INFO - 05:44:48: not storing attribute vectors_norm
INFO - 05:44:48: storing np array 'syn1neg' to models/w2v_model_epoch_27.model.trainables.syn1neg.npy
INFO - 05:44:48: not storing attribute cum_table
INFO - 05:44:49: saved models/w2v_model_epoch_27.model
INFO - 05:44:50: EPOCH 29 - PROGRESS: at 0.09% examples, 26529 words/s, in_qsize 0, out_qsize 0
INFO - 05:45:20: EPOCH 29 - PRO

INFO - 06:17:27: EPOCH 30 - PROGRESS: at 76.80% examples, 33354 words/s, in_qsize 0, out_qsize 0
INFO - 06:17:57: EPOCH 30 - PROGRESS: at 79.43% examples, 33328 words/s, in_qsize 0, out_qsize 0
INFO - 06:18:28: EPOCH 30 - PROGRESS: at 82.15% examples, 33241 words/s, in_qsize 0, out_qsize 0
INFO - 06:18:58: EPOCH 30 - PROGRESS: at 84.89% examples, 33273 words/s, in_qsize 0, out_qsize 0
INFO - 06:19:28: EPOCH 30 - PROGRESS: at 87.71% examples, 33336 words/s, in_qsize 0, out_qsize 0
INFO - 06:19:58: EPOCH 30 - PROGRESS: at 90.56% examples, 33268 words/s, in_qsize 0, out_qsize 0
INFO - 06:20:28: EPOCH 30 - PROGRESS: at 93.41% examples, 33217 words/s, in_qsize 0, out_qsize 0
INFO - 06:20:58: EPOCH 30 - PROGRESS: at 96.12% examples, 33211 words/s, in_qsize 0, out_qsize 0
INFO - 06:21:28: EPOCH 30 - PROGRESS: at 98.77% examples, 33200 words/s, in_qsize 0, out_qsize 0
INFO - 06:21:42: worker thread finished; awaiting finish of 6 more threads
INFO - 06:21:42: worker thread finished; awaiting fi

Time to train the model: 560.5 mins
