In [1]:
# Imports
import os
from os.path import join as join_path
from time import time

import numpy as np
import pandas as pd

import multiprocessing
cores = multiprocessing.cpu_count()

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import TaggedDocument
import logging # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import nltk
nltk.download('punkt')

from utils import clean_text
from tqdm.notebook import tqdm

from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load and prepare data

In [2]:
# Constants

# -- Input data --------------------------------------------------------------
cord_data_dir = 'data'
cord_data_path = join_path(cord_data_dir, 'cord-19-data.csv')

# -- Checkpoint / log layout -------------------------------------------------
# Each model family gets its own checkpoint directory; checkpoint files share
# the same filename prefix.
saved_models_prefix = 'model'
d2v_saved_models_dir = 'models-doc2vec'
w2v_saved_models_dir = 'models-word2vec-new'
# Despite the "_path" suffix this is a bare filename: EpochSaver joins it onto
# its output directory.
train_logs_path = 'train_logs.txt'

In [3]:
# Read the full CORD-19 table from disk.
cord_data = pd.read_csv(cord_data_path)

# Keep only the rows whose `language` column is exactly 'en'.
cord_data_eng = cord_data.loc[cord_data['language'].eq('en')]

# Two-column NumPy array of (cord_uid, body_text) rows consumed by the corpus iterators.
eng_texts = cord_data_eng[['cord_uid', 'body_text']].values

In [None]:
# Total number of sentences over all English documents. Later cells use this
# value as the tqdm `total` and to derive gensim's `progress_per` interval.
cord_num_sentences = sum(
    len(nltk.tokenize.sent_tokenize(text))
    for _, text in tqdm(eng_texts)
)
print(f'Total number of CORD-19 sentences: {cord_num_sentences}')

HBox(children=(IntProgress(value=0, max=32505), HTML(value='')))

In [None]:
class CORDDataIteratorWord2Vec():
    '''Re-iterable corpus of tokenised sentences.

    Every pass walks `texts` again, so the same instance can be handed to
    several consumers (vocabulary scan, then one pass per training epoch).

    Args:
        texts: Rows of exactly two items; only the second item (the document
            text) is used, the first is ignored.

    Yields:
        list of str: one sentence, i.e. the result of `clean_text(sentence)`
        split on whitespace.
    '''

    def __init__(self, texts: np.ndarray):
        self.texts = texts

    def __iter__(self):
        for _uid, body in self.texts:
            # Clean and tokenise all sentences of one document, then emit them in order.
            yield from [
                clean_text(raw_sentence).split()
                for raw_sentence in nltk.tokenize.sent_tokenize(body)
            ]

In [6]:
class CORDDataIteratorDoc2Vec():
    '''Re-iterable corpus of tagged sentences for Doc2Vec.

    Every sentence of a document becomes its own `TaggedDocument`, tagged with
    the `cord_uid` of the document it came from, so all sentences of one paper
    share a single tag.

    Args:
        texts: Rows of exactly two items, `(cord_uid, body_text)`.

    Yields:
        TaggedDocument: `words` is the list of whitespace-separated tokens of
        `clean_text(sentence)`, `tags` is `[cord_uid]`.
    '''

    def __init__(self, texts: np.ndarray):
        self.texts = texts

    def __iter__(self):
        for cord_uid, cord_text in self.texts:
            for sentence in nltk.tokenize.sent_tokenize(cord_text):
                # `words` must be a list of tokens. Passing the cleaned string
                # itself would make gensim iterate it character by character.
                words = clean_text(sentence).split()
                yield TaggedDocument(words, [cord_uid])

In [7]:
# Re-iterable stream of tokenised sentences (each item is a list of str tokens)
# built from the English (cord_uid, body_text) rows.
cord_sentences = CORDDataIteratorWord2Vec(eng_texts)

## Learn word embeddings using Doc2Vec

In [8]:
class DocEpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.

    Args:
        output_dir: Directory the checkpoints are written to. It is created on
            demand if it does not exist yet.
        prefix: Filename prefix of every checkpoint.
        start_epoch: Initial value of the internal epoch counter.

    Checkpoints are named `<prefix>_epoch_<counter + 1>.model`, where the
    counter starts at `start_epoch` and grows by one per finished epoch. With
    the default `start_epoch=1` the first file is therefore `..._epoch_2.model`.
    NOTE(review): confirm whether `start_epoch` is meant as "epochs already
    completed" (then the default should probably be 0) or as "number of the
    first epoch" (then the `+ 1` is off by one).
    '''

    def __init__(self, output_dir: str, prefix: str, start_epoch: int = 1):
        self.output_dir = output_dir
        self.prefix = prefix
        self.epoch = start_epoch

    def on_epoch_end(self, model):
        # Make sure the target directory exists; otherwise `model.save` raises
        # only after a whole epoch of training has already been spent.
        os.makedirs(self.output_dir, exist_ok=True)

        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch + 1}.model')
        model.save(output_path)
        self.epoch += 1

In [9]:
# Setup initial model
d2v_model = Doc2Vec(
    min_count=20,
    window=2,
    vector_size=300,
    negative=5,
    # Leave one core free, but never ask for zero workers: on a single-core
    # machine `cores - 1` is 0 and no training thread would be started.
    workers=max(1, cores - 1),
    callbacks=[DocEpochSaver(d2v_saved_models_dir, saved_models_prefix)]
)
# Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [10]:
# Build vocabulary
t = time()
# Doc2Vec needs TaggedDocument items (words + tags). `cord_sentences` yields
# plain token lists for Word2Vec, so build the tagged corpus here instead.
d2v_model.build_vocab(
    tqdm(CORDDataIteratorDoc2Vec(eng_texts), total=cord_num_sentences),
    # Report roughly every 1% of the corpus; keep the interval >= 1 so tiny
    # corpora (< 100 sentences) do not end up with a zero interval.
    progress_per=max(1, cord_num_sentences // 100),
)
print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

HBox(children=(IntProgress(value=0, max=6488379), HTML(value='')))

INFO - 23:49:26: collecting all words and their counts
INFO - 23:49:27: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 23:49:42: PROGRESS: at example #64883, processed 802744 words (52734/s), 47194 word types, 393 tags
INFO - 23:49:58: PROGRESS: at example #129766, processed 1634728 words (53760/s), 72675 word types, 803 tags
INFO - 23:50:14: PROGRESS: at example #194649, processed 2434336 words (51008/s), 94023 word types, 1057 tags
INFO - 23:50:28: PROGRESS: at example #259532, processed 3262468 words (56280/s), 116285 word types, 1221 tags
INFO - 23:50:43: PROGRESS: at example #324415, processed 4074622 words (54232/s), 129019 word types, 1376 tags
INFO - 23:50:59: PROGRESS: at example #389298, processed 4944028 words (55331/s), 147593 word types, 1532 tags
INFO - 23:51:14: PROGRESS: at example #454181, processed 5801460 words (55944/s), 159949 word types, 1622 tags
INFO - 23:51:30: PROGRESS: at example #519064, processed 6658970 words (55647/s), 17149

INFO - 00:08:41: PROGRESS: at example #4736459, processed 64234781 words (56574/s), 1009478 word types, 22347 tags
INFO - 00:08:57: PROGRESS: at example #4801342, processed 65150895 words (56709/s), 1023389 word types, 22658 tags
INFO - 00:09:13: PROGRESS: at example #4866225, processed 66085383 words (57292/s), 1036727 word types, 23017 tags
INFO - 00:09:30: PROGRESS: at example #4931108, processed 67051927 words (57846/s), 1050478 word types, 23379 tags
INFO - 00:09:46: PROGRESS: at example #4995991, processed 67994720 words (57845/s), 1063866 word types, 23744 tags
INFO - 00:10:03: PROGRESS: at example #5060874, processed 68917098 words (56826/s), 1077165 word types, 24086 tags
INFO - 00:10:19: PROGRESS: at example #5125757, processed 69833165 words (54872/s), 1091644 word types, 24439 tags
INFO - 00:10:35: PROGRESS: at example #5190640, processed 70712739 words (55690/s), 1107139 word types, 24854 tags
INFO - 00:10:51: PROGRESS: at example #5255523, processed 71532585 words (52322/




INFO - 00:15:55: effective_min_count=20 retains 88172 unique words (6% of original 1287568, drops 1199396)
INFO - 00:15:55: effective_min_count=20 leaves 85446429 word corpus (96% of original 88307056, drops 2860627)
INFO - 00:15:55: deleting the raw counts dictionary of 1287568 items
INFO - 00:15:55: sample=0.001 downsamples 15 most-common words
INFO - 00:15:55: downsampling leaves estimated 83093632 word corpus (97.2% of prior 85446429)
INFO - 00:15:55: estimated required memory for 88172 words and 300 dimensions: 301205800 bytes
INFO - 00:15:55: resetting layer weights


Time to build vocab: 26.86 mins


In [11]:
# Optional: resume from a saved Doc2Vec checkpoint instead of training from scratch.
# d2v_model = Doc2Vec.load('models-doc2vec/model_epoch_2.model')

In [12]:
# Train model
t = time()
d2v_model.train(
    # Doc2Vec must be trained on TaggedDocument items, not on the plain token
    # lists produced by `cord_sentences` (the Word2Vec corpus).
    CORDDataIteratorDoc2Vec(eng_texts),
    total_examples=d2v_model.corpus_count,
    epochs=10,
    report_delay=30,
    # With start_epoch=10 the saver names its first checkpoint `..._epoch_11.model`.
    callbacks=[DocEpochSaver(d2v_saved_models_dir, saved_models_prefix, 10)],
)
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 00:16:56: training model with 15 workers on 88172 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
INFO - 00:16:58: EPOCH 1 - PROGRESS: at 0.04% examples, 26813 words/s, in_qsize 0, out_qsize 0
INFO - 00:17:28: EPOCH 1 - PROGRESS: at 1.34% examples, 35038 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:06: EPOCH 1 - PROGRESS: at 2.54% examples, 30327 words/s, in_qsize 15, out_qsize 0
INFO - 00:18:36: EPOCH 1 - PROGRESS: at 3.33% examples, 27717 words/s, in_qsize 29, out_qsize 0
INFO - 00:19:07: EPOCH 1 - PROGRESS: at 4.16% examples, 26344 words/s, in_qsize 29, out_qsize 0
INFO - 00:19:37: EPOCH 1 - PROGRESS: at 5.06% examples, 26313 words/s, in_qsize 29, out_qsize 0
INFO - 00:20:07: EPOCH 1 - PROGRESS: at 5.90% examples, 25980 words/s, in_qsize 29, out_qsize 0
INFO - 00:20:38: EPOCH 1 - PROGRESS: at 6.78% examples, 25824 words/s, in_qsize 30, out_qsize 0
INFO - 00:21:08: EPOCH 1 - PROGRESS: at 7.62% examples, 25751 words/s, in_qsize 30, out_qsize 0
INF

INFO - 00:59:33: EPOCH 1 - PROGRESS: at 70.97% examples, 24774 words/s, in_qsize 30, out_qsize 0
INFO - 01:00:03: EPOCH 1 - PROGRESS: at 71.79% examples, 24780 words/s, in_qsize 29, out_qsize 0
INFO - 01:00:34: EPOCH 1 - PROGRESS: at 72.62% examples, 24784 words/s, in_qsize 29, out_qsize 0
INFO - 01:01:04: EPOCH 1 - PROGRESS: at 73.41% examples, 24779 words/s, in_qsize 29, out_qsize 0
INFO - 01:01:35: EPOCH 1 - PROGRESS: at 74.25% examples, 24784 words/s, in_qsize 29, out_qsize 0
INFO - 01:02:05: EPOCH 1 - PROGRESS: at 75.06% examples, 24790 words/s, in_qsize 30, out_qsize 0
INFO - 01:02:35: EPOCH 1 - PROGRESS: at 75.87% examples, 24806 words/s, in_qsize 29, out_qsize 0
INFO - 01:03:05: EPOCH 1 - PROGRESS: at 76.69% examples, 24817 words/s, in_qsize 29, out_qsize 0
INFO - 01:03:35: EPOCH 1 - PROGRESS: at 77.52% examples, 24826 words/s, in_qsize 28, out_qsize 0
INFO - 01:04:05: EPOCH 1 - PROGRESS: at 78.31% examples, 24821 words/s, in_qsize 30, out_qsize 0
INFO - 01:04:35: EPOCH 1 - PRO

INFO - 01:33:51: EPOCH 2 - PROGRESS: at 28.88% examples, 24701 words/s, in_qsize 30, out_qsize 0
INFO - 01:34:21: EPOCH 2 - PROGRESS: at 29.71% examples, 24701 words/s, in_qsize 29, out_qsize 0
INFO - 01:34:51: EPOCH 2 - PROGRESS: at 30.56% examples, 24689 words/s, in_qsize 29, out_qsize 0
INFO - 01:35:21: EPOCH 2 - PROGRESS: at 31.38% examples, 24709 words/s, in_qsize 29, out_qsize 0
INFO - 01:35:51: EPOCH 2 - PROGRESS: at 32.23% examples, 24717 words/s, in_qsize 29, out_qsize 0
INFO - 01:36:22: EPOCH 2 - PROGRESS: at 33.04% examples, 24714 words/s, in_qsize 29, out_qsize 0
INFO - 01:36:52: EPOCH 2 - PROGRESS: at 33.87% examples, 24685 words/s, in_qsize 30, out_qsize 0
INFO - 01:37:22: EPOCH 2 - PROGRESS: at 34.71% examples, 24655 words/s, in_qsize 29, out_qsize 0
INFO - 01:37:52: EPOCH 2 - PROGRESS: at 35.59% examples, 24646 words/s, in_qsize 29, out_qsize 0
INFO - 01:38:23: EPOCH 2 - PROGRESS: at 36.43% examples, 24646 words/s, in_qsize 29, out_qsize 0
INFO - 01:38:54: EPOCH 2 - PRO

INFO - 02:16:46: EPOCH 2 - PROGRESS: at 98.93% examples, 24736 words/s, in_qsize 29, out_qsize 0
INFO - 02:17:12: worker thread finished; awaiting finish of 14 more threads
INFO - 02:17:12: worker thread finished; awaiting finish of 13 more threads
INFO - 02:17:13: worker thread finished; awaiting finish of 12 more threads
INFO - 02:17:13: worker thread finished; awaiting finish of 11 more threads
INFO - 02:17:13: worker thread finished; awaiting finish of 10 more threads
INFO - 02:17:13: worker thread finished; awaiting finish of 9 more threads
INFO - 02:17:13: worker thread finished; awaiting finish of 8 more threads
INFO - 02:17:14: worker thread finished; awaiting finish of 7 more threads
INFO - 02:17:14: worker thread finished; awaiting finish of 6 more threads
INFO - 02:17:14: worker thread finished; awaiting finish of 5 more threads
INFO - 02:17:14: worker thread finished; awaiting finish of 4 more threads
INFO - 02:17:14: worker thread finished; awaiting finish of 3 more thread

INFO - 02:51:16: EPOCH 3 - PROGRESS: at 56.42% examples, 24410 words/s, in_qsize 29, out_qsize 0
INFO - 02:51:47: EPOCH 3 - PROGRESS: at 57.26% examples, 24430 words/s, in_qsize 29, out_qsize 0
INFO - 02:52:18: EPOCH 3 - PROGRESS: at 58.01% examples, 24409 words/s, in_qsize 21, out_qsize 0
INFO - 02:52:48: EPOCH 3 - PROGRESS: at 58.92% examples, 24443 words/s, in_qsize 30, out_qsize 0
INFO - 02:53:18: EPOCH 3 - PROGRESS: at 59.72% examples, 24452 words/s, in_qsize 29, out_qsize 0
INFO - 02:53:48: EPOCH 3 - PROGRESS: at 60.55% examples, 24468 words/s, in_qsize 29, out_qsize 0
INFO - 02:54:19: EPOCH 3 - PROGRESS: at 61.36% examples, 24477 words/s, in_qsize 29, out_qsize 0
INFO - 02:54:49: EPOCH 3 - PROGRESS: at 62.17% examples, 24478 words/s, in_qsize 30, out_qsize 0
INFO - 02:55:19: EPOCH 3 - PROGRESS: at 63.01% examples, 24480 words/s, in_qsize 30, out_qsize 0
INFO - 02:55:50: EPOCH 3 - PROGRESS: at 63.82% examples, 24483 words/s, in_qsize 30, out_qsize 0
INFO - 02:56:20: EPOCH 3 - PRO

INFO - 03:25:47: EPOCH 4 - PROGRESS: at 13.43% examples, 24696 words/s, in_qsize 29, out_qsize 0
INFO - 03:26:17: EPOCH 4 - PROGRESS: at 14.28% examples, 24589 words/s, in_qsize 29, out_qsize 0
INFO - 03:26:48: EPOCH 4 - PROGRESS: at 15.15% examples, 24473 words/s, in_qsize 29, out_qsize 0
INFO - 03:27:19: EPOCH 4 - PROGRESS: at 16.02% examples, 24404 words/s, in_qsize 29, out_qsize 0
INFO - 03:27:49: EPOCH 4 - PROGRESS: at 16.87% examples, 24323 words/s, in_qsize 29, out_qsize 0
INFO - 03:28:19: EPOCH 4 - PROGRESS: at 17.73% examples, 24351 words/s, in_qsize 29, out_qsize 0
INFO - 03:28:49: EPOCH 4 - PROGRESS: at 18.56% examples, 24324 words/s, in_qsize 29, out_qsize 0
INFO - 03:29:20: EPOCH 4 - PROGRESS: at 19.40% examples, 24299 words/s, in_qsize 29, out_qsize 0
INFO - 03:29:50: EPOCH 4 - PROGRESS: at 20.25% examples, 24304 words/s, in_qsize 29, out_qsize 0
INFO - 03:30:20: EPOCH 4 - PROGRESS: at 21.07% examples, 24258 words/s, in_qsize 30, out_qsize 0
INFO - 03:30:51: EPOCH 4 - PRO

INFO - 04:08:47: EPOCH 4 - PROGRESS: at 82.97% examples, 24396 words/s, in_qsize 29, out_qsize 0
INFO - 04:09:17: EPOCH 4 - PROGRESS: at 83.80% examples, 24411 words/s, in_qsize 30, out_qsize 0
INFO - 04:09:47: EPOCH 4 - PROGRESS: at 84.59% examples, 24413 words/s, in_qsize 29, out_qsize 0
INFO - 04:10:18: EPOCH 4 - PROGRESS: at 85.42% examples, 24421 words/s, in_qsize 29, out_qsize 0
INFO - 04:10:48: EPOCH 4 - PROGRESS: at 86.22% examples, 24425 words/s, in_qsize 29, out_qsize 0
INFO - 04:11:18: EPOCH 4 - PROGRESS: at 87.05% examples, 24437 words/s, in_qsize 29, out_qsize 0
INFO - 04:11:48: EPOCH 4 - PROGRESS: at 87.89% examples, 24451 words/s, in_qsize 29, out_qsize 0
INFO - 04:12:18: EPOCH 4 - PROGRESS: at 88.68% examples, 24449 words/s, in_qsize 29, out_qsize 0
INFO - 04:12:49: EPOCH 4 - PROGRESS: at 89.42% examples, 24429 words/s, in_qsize 29, out_qsize 0
INFO - 04:13:19: EPOCH 4 - PROGRESS: at 90.29% examples, 24407 words/s, in_qsize 29, out_qsize 0
INFO - 04:13:49: EPOCH 4 - PRO

INFO - 04:42:49: EPOCH 5 - PROGRESS: at 39.75% examples, 24176 words/s, in_qsize 29, out_qsize 0
INFO - 04:43:19: EPOCH 5 - PROGRESS: at 40.56% examples, 24148 words/s, in_qsize 29, out_qsize 0
INFO - 04:43:49: EPOCH 5 - PROGRESS: at 41.43% examples, 24156 words/s, in_qsize 29, out_qsize 0
INFO - 04:44:20: EPOCH 5 - PROGRESS: at 42.26% examples, 24153 words/s, in_qsize 29, out_qsize 0
INFO - 04:44:50: EPOCH 5 - PROGRESS: at 43.09% examples, 24171 words/s, in_qsize 29, out_qsize 0
INFO - 04:45:20: EPOCH 5 - PROGRESS: at 43.91% examples, 24171 words/s, in_qsize 29, out_qsize 0
INFO - 04:45:50: EPOCH 5 - PROGRESS: at 44.73% examples, 24188 words/s, in_qsize 29, out_qsize 0
INFO - 04:46:20: EPOCH 5 - PROGRESS: at 45.55% examples, 24200 words/s, in_qsize 29, out_qsize 0
INFO - 04:46:50: EPOCH 5 - PROGRESS: at 46.32% examples, 24201 words/s, in_qsize 29, out_qsize 0
INFO - 04:47:21: EPOCH 5 - PROGRESS: at 47.16% examples, 24215 words/s, in_qsize 29, out_qsize 0
INFO - 04:47:51: EPOCH 5 - PRO

INFO - 05:19:51: worker thread finished; awaiting finish of 0 more threads
INFO - 05:19:51: EPOCH - 5 : training on 88307056 raw words (89580565 effective words) took 3648.8s, 24551 effective words/s
INFO - 05:19:51: saving Doc2Vec object under models-doc2vec/model_epoch_5.model, separately None
INFO - 05:19:51: storing np array 'syn1neg' to models-doc2vec/model_epoch_5.model.trainables.syn1neg.npy
INFO - 05:19:51: storing np array 'vectors' to models-doc2vec/model_epoch_5.model.wv.vectors.npy
INFO - 05:19:52: saved models-doc2vec/model_epoch_5.model
INFO - 05:19:53: EPOCH 6 - PROGRESS: at 0.04% examples, 29251 words/s, in_qsize 0, out_qsize 0
INFO - 05:20:23: EPOCH 6 - PROGRESS: at 1.31% examples, 34342 words/s, in_qsize 0, out_qsize 0
INFO - 05:21:03: EPOCH 6 - PROGRESS: at 2.54% examples, 29751 words/s, in_qsize 15, out_qsize 0
INFO - 05:21:33: EPOCH 6 - PROGRESS: at 3.33% examples, 27359 words/s, in_qsize 30, out_qsize 0
INFO - 05:22:04: EPOCH 6 - PROGRESS: at 4.16% examples, 26029

INFO - 06:00:01: EPOCH 6 - PROGRESS: at 66.12% examples, 24424 words/s, in_qsize 29, out_qsize 0
INFO - 06:00:31: EPOCH 6 - PROGRESS: at 66.97% examples, 24435 words/s, in_qsize 29, out_qsize 0
INFO - 06:01:01: EPOCH 6 - PROGRESS: at 67.79% examples, 24449 words/s, in_qsize 30, out_qsize 0
INFO - 06:01:31: EPOCH 6 - PROGRESS: at 68.58% examples, 24456 words/s, in_qsize 29, out_qsize 0
INFO - 06:02:01: EPOCH 6 - PROGRESS: at 69.37% examples, 24454 words/s, in_qsize 29, out_qsize 0
INFO - 06:02:32: EPOCH 6 - PROGRESS: at 70.18% examples, 24460 words/s, in_qsize 29, out_qsize 0
INFO - 06:03:02: EPOCH 6 - PROGRESS: at 70.97% examples, 24458 words/s, in_qsize 29, out_qsize 0
INFO - 06:03:32: EPOCH 6 - PROGRESS: at 71.77% examples, 24461 words/s, in_qsize 30, out_qsize 0
INFO - 06:04:02: EPOCH 6 - PROGRESS: at 72.56% examples, 24456 words/s, in_qsize 29, out_qsize 0
INFO - 06:04:33: EPOCH 6 - PROGRESS: at 73.36% examples, 24460 words/s, in_qsize 29, out_qsize 0
INFO - 06:05:03: EPOCH 6 - PRO

INFO - 06:34:01: EPOCH 7 - PROGRESS: at 22.38% examples, 24185 words/s, in_qsize 29, out_qsize 0
INFO - 06:34:31: EPOCH 7 - PROGRESS: at 23.22% examples, 24210 words/s, in_qsize 29, out_qsize 0
INFO - 06:35:02: EPOCH 7 - PROGRESS: at 24.04% examples, 24222 words/s, in_qsize 29, out_qsize 0
INFO - 06:35:33: EPOCH 7 - PROGRESS: at 24.88% examples, 24221 words/s, in_qsize 28, out_qsize 0
INFO - 06:36:04: EPOCH 7 - PROGRESS: at 25.72% examples, 24261 words/s, in_qsize 29, out_qsize 0
INFO - 06:36:34: EPOCH 7 - PROGRESS: at 26.53% examples, 24253 words/s, in_qsize 29, out_qsize 0
INFO - 06:37:05: EPOCH 7 - PROGRESS: at 27.36% examples, 24217 words/s, in_qsize 29, out_qsize 0
INFO - 06:37:35: EPOCH 7 - PROGRESS: at 28.20% examples, 24216 words/s, in_qsize 29, out_qsize 0
INFO - 06:38:05: EPOCH 7 - PROGRESS: at 29.04% examples, 24218 words/s, in_qsize 29, out_qsize 0
INFO - 06:38:36: EPOCH 7 - PROGRESS: at 29.87% examples, 24215 words/s, in_qsize 29, out_qsize 0
INFO - 06:39:06: EPOCH 7 - PRO

INFO - 07:17:01: EPOCH 7 - PROGRESS: at 91.54% examples, 24323 words/s, in_qsize 28, out_qsize 0
INFO - 07:17:32: EPOCH 7 - PROGRESS: at 92.39% examples, 24311 words/s, in_qsize 29, out_qsize 0
INFO - 07:18:02: EPOCH 7 - PROGRESS: at 93.21% examples, 24317 words/s, in_qsize 29, out_qsize 0
INFO - 07:18:32: EPOCH 7 - PROGRESS: at 94.02% examples, 24322 words/s, in_qsize 29, out_qsize 0
INFO - 07:19:02: EPOCH 7 - PROGRESS: at 94.86% examples, 24327 words/s, in_qsize 29, out_qsize 0
INFO - 07:19:33: EPOCH 7 - PROGRESS: at 95.68% examples, 24329 words/s, in_qsize 29, out_qsize 0
INFO - 07:20:03: EPOCH 7 - PROGRESS: at 96.51% examples, 24335 words/s, in_qsize 29, out_qsize 0
INFO - 07:20:34: EPOCH 7 - PROGRESS: at 97.32% examples, 24334 words/s, in_qsize 30, out_qsize 0
INFO - 07:21:04: EPOCH 7 - PROGRESS: at 98.14% examples, 24340 words/s, in_qsize 29, out_qsize 0
INFO - 07:21:34: EPOCH 7 - PROGRESS: at 98.93% examples, 24339 words/s, in_qsize 29, out_qsize 0
INFO - 07:21:59: worker thread

INFO - 07:51:31: EPOCH 8 - PROGRESS: at 48.90% examples, 24170 words/s, in_qsize 29, out_qsize 0
INFO - 07:52:02: EPOCH 8 - PROGRESS: at 49.71% examples, 24184 words/s, in_qsize 29, out_qsize 0
INFO - 07:52:32: EPOCH 8 - PROGRESS: at 50.51% examples, 24204 words/s, in_qsize 29, out_qsize 0
INFO - 07:53:02: EPOCH 8 - PROGRESS: at 51.31% examples, 24217 words/s, in_qsize 30, out_qsize 0
INFO - 07:53:32: EPOCH 8 - PROGRESS: at 52.13% examples, 24234 words/s, in_qsize 30, out_qsize 0
INFO - 07:54:02: EPOCH 8 - PROGRESS: at 52.90% examples, 24232 words/s, in_qsize 24, out_qsize 0
INFO - 07:54:32: EPOCH 8 - PROGRESS: at 53.76% examples, 24260 words/s, in_qsize 29, out_qsize 0
INFO - 07:55:02: EPOCH 8 - PROGRESS: at 54.59% examples, 24285 words/s, in_qsize 29, out_qsize 0
INFO - 07:55:32: EPOCH 8 - PROGRESS: at 55.39% examples, 24301 words/s, in_qsize 29, out_qsize 0
INFO - 07:56:03: EPOCH 8 - PROGRESS: at 56.20% examples, 24320 words/s, in_qsize 29, out_qsize 0
INFO - 07:56:33: EPOCH 8 - PRO

INFO - 08:25:45: EPOCH 9 - PROGRESS: at 5.03% examples, 26036 words/s, in_qsize 29, out_qsize 0
INFO - 08:26:15: EPOCH 9 - PROGRESS: at 5.83% examples, 25639 words/s, in_qsize 29, out_qsize 0
INFO - 08:26:45: EPOCH 9 - PROGRESS: at 6.68% examples, 25416 words/s, in_qsize 29, out_qsize 0
INFO - 08:27:16: EPOCH 9 - PROGRESS: at 7.50% examples, 25267 words/s, in_qsize 29, out_qsize 0
INFO - 08:27:47: EPOCH 9 - PROGRESS: at 8.34% examples, 25157 words/s, in_qsize 29, out_qsize 0
INFO - 08:28:17: EPOCH 9 - PROGRESS: at 9.17% examples, 25096 words/s, in_qsize 29, out_qsize 0
INFO - 08:28:47: EPOCH 9 - PROGRESS: at 9.97% examples, 24934 words/s, in_qsize 29, out_qsize 0
INFO - 08:29:17: EPOCH 9 - PROGRESS: at 10.82% examples, 24870 words/s, in_qsize 28, out_qsize 0
INFO - 08:29:48: EPOCH 9 - PROGRESS: at 11.64% examples, 24777 words/s, in_qsize 29, out_qsize 0
INFO - 08:30:18: EPOCH 9 - PROGRESS: at 12.46% examples, 24614 words/s, in_qsize 29, out_qsize 0
INFO - 08:30:50: EPOCH 9 - PROGRESS: 

INFO - 09:08:51: EPOCH 9 - PROGRESS: at 75.05% examples, 24436 words/s, in_qsize 29, out_qsize 0
INFO - 09:09:21: EPOCH 9 - PROGRESS: at 75.83% examples, 24445 words/s, in_qsize 30, out_qsize 0
INFO - 09:09:51: EPOCH 9 - PROGRESS: at 76.60% examples, 24446 words/s, in_qsize 29, out_qsize 0
INFO - 09:10:21: EPOCH 9 - PROGRESS: at 77.40% examples, 24449 words/s, in_qsize 29, out_qsize 0
INFO - 09:10:51: EPOCH 9 - PROGRESS: at 78.22% examples, 24459 words/s, in_qsize 29, out_qsize 0
INFO - 09:11:21: EPOCH 9 - PROGRESS: at 79.04% examples, 24460 words/s, in_qsize 30, out_qsize 0
INFO - 09:11:51: EPOCH 9 - PROGRESS: at 79.84% examples, 24452 words/s, in_qsize 28, out_qsize 0
INFO - 09:12:22: EPOCH 9 - PROGRESS: at 80.66% examples, 24430 words/s, in_qsize 29, out_qsize 0
INFO - 09:12:52: EPOCH 9 - PROGRESS: at 81.45% examples, 24406 words/s, in_qsize 29, out_qsize 0
INFO - 09:13:23: EPOCH 9 - PROGRESS: at 82.37% examples, 24424 words/s, in_qsize 27, out_qsize 0
INFO - 09:13:53: EPOCH 9 - PRO

INFO - 09:42:55: EPOCH 10 - PROGRESS: at 31.83% examples, 24299 words/s, in_qsize 30, out_qsize 0
INFO - 09:43:26: EPOCH 10 - PROGRESS: at 32.66% examples, 24323 words/s, in_qsize 29, out_qsize 0
INFO - 09:43:56: EPOCH 10 - PROGRESS: at 33.48% examples, 24318 words/s, in_qsize 29, out_qsize 0
INFO - 09:44:26: EPOCH 10 - PROGRESS: at 34.33% examples, 24296 words/s, in_qsize 29, out_qsize 0
INFO - 09:44:56: EPOCH 10 - PROGRESS: at 35.13% examples, 24260 words/s, in_qsize 29, out_qsize 0
INFO - 09:45:26: EPOCH 10 - PROGRESS: at 36.01% examples, 24277 words/s, in_qsize 29, out_qsize 0
INFO - 09:45:57: EPOCH 10 - PROGRESS: at 36.83% examples, 24288 words/s, in_qsize 29, out_qsize 0
INFO - 09:46:27: EPOCH 10 - PROGRESS: at 37.66% examples, 24274 words/s, in_qsize 29, out_qsize 0
INFO - 09:46:57: EPOCH 10 - PROGRESS: at 38.49% examples, 24275 words/s, in_qsize 29, out_qsize 0
INFO - 09:47:28: EPOCH 10 - PROGRESS: at 39.31% examples, 24284 words/s, in_qsize 29, out_qsize 0
INFO - 09:47:58: EPO

INFO - 10:24:47: worker thread finished; awaiting finish of 13 more threads
INFO - 10:24:47: worker thread finished; awaiting finish of 12 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 11 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 10 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 9 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 8 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 7 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 6 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 5 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 4 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 3 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 2 more threads
INFO - 10:24:48: worker thread finished; awaiting finish of 1 more threads
INFO - 10:24:48: work

Time to train the model: 607.88 mins


In [443]:
# d2v_model.save('models-doc2vec/model_epoch_2.model')

INFO - 22:02:24: saving Doc2Vec object under models-doc2vec/model_epoch_2.model, separately None
INFO - 22:02:24: storing np array 'syn1neg' to models-doc2vec/model_epoch_2.model.trainables.syn1neg.npy
INFO - 22:02:24: storing np array 'vectors' to models-doc2vec/model_epoch_2.model.wv.vectors.npy
INFO - 22:02:25: saved models-doc2vec/model_epoch_2.model


In [420]:
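# Prototype search pipeline (disabled): everything below is wrapped in a
# triple-quoted string, so none of it executes -- the cell only echoes the string.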
'''
len(d2v_model.docvecs.index2entity)

query = clean_text('The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.')
query_vec = d2v_model.infer_vector(query, epochs=100)

eng_texts[0][0]

doc_weight_mat = np.zeros((len(d2v_model.docvecs.index2entity), 300))
for i, cord_uid in enumerate(tqdm(d2v_model.docvecs.index2entity)):
    doc_weight_mat[i] = d2v_model.docvecs[cord_uid]

def cosine_sim(vec: np.ndarray, mat: np.ndarray):
    return vec @ mat.T / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))

query_vec.shape, doc_weight_mat.shape

# Find closest document
#keys = d2v_model.docvecs.index2entity
similarities = cosine_sim(query_vec, doc_weight_mat)

top_n = 10
sorted_indicies = similarities.argsort()[::-1]
top_sim = list(zip(np.array(d2v_model.docvecs.index2entity)[sorted_indicies][:top_n], similarities[sorted_indicies][:top_n]))
top_sim

top_cord_uid = top_sim[0][0]
best_text = cord_data[cord_data['cord_uid'] == top_cord_uid.split('_')[0]].body_text.values[0]
best_text_sentences = nltk.tokenize.sent_tokenize(best_text)
best_text_sentences[int(top_cord_uid.split('_')[1])]
'''

"\nlen(d2v_model.docvecs.index2entity)\n\nquery = clean_text('The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.')\nquery_vec = d2v_model.infer_vector(query, epochs=100)\n\neng_texts[0][0]\n\ndoc_weight_mat = np.zeros((len(d2v_model.docvecs.index2entity), 300))\nfor i, cord_uid in enumerate(tqdm(d2v_model.docvecs.index2entity)):\n    doc_weight_mat[i] = d2v_model.docvecs[cord_uid]\n\ndef cosine_sim(vec: np.ndarray, mat: np.ndarray):\n    return vec @ mat.T / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))\n\nquery_vec.shape, doc_weight_mat.shape\n\n# Find closest document\n#keys = d2v_model.docvecs.index2entity\nsimilarities = cosine_sim(query_vec, doc_weight_mat)\n\ntop_n = 10\nsorted_indicies = similarities.argsort()[::-1]\ntop_sim = list(zip(np.array(d2v_model.docvecs.index2entity)[sorted_indicies][:top_n], similarities[sorted_indicies][:top_n]))\ntop_sim\n\ntop_cord_uid = top_sim[0][0]\nbest_text = cord_data[cord_d

## Learn word embeddings using Word2Vec

In [40]:
class EpochSaver(CallbackAny2Vec):
    '''Callback to log the per-epoch training loss and save the model after each epoch.

    Args:
        output_dir: Directory that receives both the loss log and the model
            checkpoints. It is created on demand if it does not exist yet.
        prefix: Filename prefix of every checkpoint.
        logs_filename: Name of the loss log file inside `output_dir`; lines
            are appended, so earlier runs are preserved.

    Epochs are numbered from 0: the first checkpoint is `<prefix>_epoch_0.model`.
    '''

    def __init__(self, output_dir: str, prefix: str, logs_filename: str):
        self.output_dir = output_dir
        self.prefix = prefix
        self.logs_filename = logs_filename
        self.epoch = 0
        # Cumulative loss reported at the end of the previous epoch.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        # gensim reports a running total, so the loss of this epoch alone is
        # the difference to the total seen at the end of the previous epoch.
        cum_loss = model.get_latest_training_loss()
        loss = cum_loss - self.loss_previous_step
        # Remember the *cumulative* value. Storing the delta instead would
        # corrupt every figure from the third epoch onwards.
        self.loss_previous_step = cum_loss

        # Make sure the target directory exists before writing log/checkpoint.
        os.makedirs(self.output_dir, exist_ok=True)
        with open(join_path(self.output_dir, self.logs_filename), 'a+') as file:
            file.write(f'Epoch #{self.epoch}, loss: {loss}\n')

        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1

In [46]:
# Setup initial model
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    size=300,
    sample=1e-5,
    alpha=0.025,
    min_alpha=0.0005,
    negative=20,
    # Leave one core free, but never ask for zero workers: on a single-core
    # machine `cores - 1` is 0 and no training thread would be started.
    workers=max(1, cores - 1),
    callbacks=[EpochSaver(w2v_saved_models_dir, saved_models_prefix, train_logs_path)]
)

In [47]:
# Build vocabulary
t = time()
w2v_model.build_vocab(
    tqdm(cord_sentences, total=cord_num_sentences),
    # Report roughly every 1% of the corpus; keep the interval >= 1 so tiny
    # corpora (< 100 sentences) do not end up with a zero interval.
    progress_per=max(1, cord_num_sentences // 100),
)
print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

HBox(children=(IntProgress(value=0, max=6522945), HTML(value='')))

INFO - 20:42:25: collecting all words and their counts
INFO - 20:42:25: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 20:42:36: PROGRESS: at sentence #65229, processed 822775 words, keeping 53450 word types
INFO - 20:42:46: PROGRESS: at sentence #130458, processed 1682936 words, keeping 83744 word types
INFO - 20:42:57: PROGRESS: at sentence #195687, processed 2494896 words, keeping 107266 word types
INFO - 20:43:08: PROGRESS: at sentence #260916, processed 3348805 words, keeping 133645 word types
INFO - 20:43:18: PROGRESS: at sentence #326145, processed 4150375 words, keeping 149386 word types
INFO - 20:43:29: PROGRESS: at sentence #391374, processed 5056104 words, keeping 171433 word types
INFO - 20:43:39: PROGRESS: at sentence #456603, processed 5930671 words, keeping 186592 word types
INFO - 20:43:50: PROGRESS: at sentence #521832, processed 6825462 words, keeping 201622 word types
INFO - 20:44:01: PROGRESS: at sentence #587061, processed 7707620 words, k

INFO - 20:57:12: PROGRESS: at sentence #5283549, processed 72805141 words, keeping 1242555 word types
INFO - 20:57:24: PROGRESS: at sentence #5348778, processed 73658257 words, keeping 1257489 word types
INFO - 20:57:35: PROGRESS: at sentence #5414007, processed 74601054 words, keeping 1278917 word types
INFO - 20:57:46: PROGRESS: at sentence #5479236, processed 75549958 words, keeping 1289450 word types
INFO - 20:57:57: PROGRESS: at sentence #5544465, processed 76469151 words, keeping 1298324 word types
INFO - 20:58:07: PROGRESS: at sentence #5609694, processed 77397295 words, keeping 1307909 word types
INFO - 20:58:18: PROGRESS: at sentence #5674923, processed 78319132 words, keeping 1317168 word types
INFO - 20:58:29: PROGRESS: at sentence #5740152, processed 79245559 words, keeping 1325557 word types
INFO - 20:58:40: PROGRESS: at sentence #5805381, processed 80153081 words, keeping 1334353 word types
INFO - 20:58:50: PROGRESS: at sentence #5870610, processed 80970626 words, keeping




INFO - 21:00:38: effective_min_count=20 retains 100466 unique words (7% of original 1424699, drops 1324233)
INFO - 21:00:38: effective_min_count=20 leaves 86688229 word corpus (96% of original 89871314, drops 3183085)
INFO - 21:00:38: deleting the raw counts dictionary of 1424699 items
INFO - 21:00:38: sample=1e-05 downsamples 5370 most-common words
INFO - 21:00:38: downsampling leaves estimated 36494016 word corpus (42.1% of prior 86688229)
INFO - 21:00:39: estimated required memory for 100466 words and 300 dimensions: 291351400 bytes
INFO - 21:00:39: resetting layer weights


Time to build vocab: 18.54 mins


In [48]:
# Train model
t = time()
w2v_model.train(
    cord_sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=30,
    # train() takes its own `compute_loss` flag (default False). Without it
    # the loss that EpochSaver reads via get_latest_training_loss() stays 0.
    compute_loss=True,
)
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 21:01:12: training model with 7 workers on 100466 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=2
INFO - 21:01:13: EPOCH 1 - PROGRESS: at 0.09% examples, 27104 words/s, in_qsize 0, out_qsize 0
INFO - 21:01:43: EPOCH 1 - PROGRESS: at 2.97% examples, 32075 words/s, in_qsize 0, out_qsize 0
INFO - 21:02:13: EPOCH 1 - PROGRESS: at 5.84% examples, 33382 words/s, in_qsize 0, out_qsize 0
INFO - 21:02:43: EPOCH 1 - PROGRESS: at 8.65% examples, 33875 words/s, in_qsize 0, out_qsize 0
INFO - 21:03:13: EPOCH 1 - PROGRESS: at 11.48% examples, 34215 words/s, in_qsize 0, out_qsize 0
INFO - 21:03:44: EPOCH 1 - PROGRESS: at 14.34% examples, 34224 words/s, in_qsize 0, out_qsize 0
INFO - 21:04:14: EPOCH 1 - PROGRESS: at 17.26% examples, 34268 words/s, in_qsize 0, out_qsize 0
INFO - 21:04:44: EPOCH 1 - PROGRESS: at 20.09% examples, 34264 words/s, in_qsize 0, out_qsize 0
INFO - 21:05:14: EPOCH 1 - PROGRESS: at 22.91% examples, 34216 words/s, in_qsize 0, out_qsize 0
INFO

INFO - 21:37:25: EPOCH 2 - PROGRESS: at 99.21% examples, 33380 words/s, in_qsize 0, out_qsize 0
INFO - 21:37:34: worker thread finished; awaiting finish of 6 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 5 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 4 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 3 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 2 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 1 more threads
INFO - 21:37:34: worker thread finished; awaiting finish of 0 more threads
INFO - 21:37:34: EPOCH - 2 : training on 89871314 raw words (36491174 effective words) took 1092.9s, 33389 effective words/s
INFO - 21:37:34: saving Word2Vec object under models/w2v_model_epoch_1.model, separately None
INFO - 21:37:34: storing np array 'vectors' to models/w2v_model_epoch_1.model.wv.vectors.npy
INFO - 21:37:34: not storing attribute vectors_norm
INFO - 21:37:34

INFO - 22:08:49: EPOCH 4 - PROGRESS: at 72.54% examples, 33681 words/s, in_qsize 0, out_qsize 0
INFO - 22:09:19: EPOCH 4 - PROGRESS: at 75.15% examples, 33643 words/s, in_qsize 0, out_qsize 0
INFO - 22:09:49: EPOCH 4 - PROGRESS: at 77.66% examples, 33568 words/s, in_qsize 0, out_qsize 0
INFO - 22:10:19: EPOCH 4 - PROGRESS: at 80.24% examples, 33487 words/s, in_qsize 0, out_qsize 0
INFO - 22:10:49: EPOCH 4 - PROGRESS: at 82.81% examples, 33374 words/s, in_qsize 0, out_qsize 0
INFO - 22:11:19: EPOCH 4 - PROGRESS: at 85.41% examples, 33341 words/s, in_qsize 0, out_qsize 0
INFO - 22:11:49: EPOCH 4 - PROGRESS: at 88.05% examples, 33333 words/s, in_qsize 0, out_qsize 0
INFO - 22:12:19: EPOCH 4 - PROGRESS: at 90.73% examples, 33188 words/s, in_qsize 0, out_qsize 0
INFO - 22:12:49: EPOCH 4 - PROGRESS: at 93.43% examples, 33095 words/s, in_qsize 0, out_qsize 0
INFO - 22:13:19: EPOCH 4 - PROGRESS: at 96.04% examples, 33062 words/s, in_qsize 0, out_qsize 0
INFO - 22:13:49: EPOCH 4 - PROGRESS: at 

INFO - 22:40:12: EPOCH 6 - PROGRESS: at 35.69% examples, 32937 words/s, in_qsize 0, out_qsize 0
INFO - 22:40:42: EPOCH 6 - PROGRESS: at 38.44% examples, 32926 words/s, in_qsize 0, out_qsize 0
INFO - 22:41:13: EPOCH 6 - PROGRESS: at 41.18% examples, 33001 words/s, in_qsize 0, out_qsize 0
INFO - 22:41:43: EPOCH 6 - PROGRESS: at 43.89% examples, 32891 words/s, in_qsize 0, out_qsize 0
INFO - 22:42:13: EPOCH 6 - PROGRESS: at 46.50% examples, 32848 words/s, in_qsize 0, out_qsize 0
INFO - 22:42:43: EPOCH 6 - PROGRESS: at 49.09% examples, 32799 words/s, in_qsize 0, out_qsize 0
INFO - 22:43:13: EPOCH 6 - PROGRESS: at 51.73% examples, 32774 words/s, in_qsize 0, out_qsize 0
INFO - 22:43:43: EPOCH 6 - PROGRESS: at 54.34% examples, 32753 words/s, in_qsize 0, out_qsize 0
INFO - 22:44:13: EPOCH 6 - PROGRESS: at 57.00% examples, 32800 words/s, in_qsize 0, out_qsize 0
INFO - 22:44:44: EPOCH 6 - PROGRESS: at 59.58% examples, 32783 words/s, in_qsize 0, out_qsize 0
INFO - 22:45:14: EPOCH 6 - PROGRESS: at 

INFO - 23:11:44: EPOCH 8 - PROGRESS: at 0.09% examples, 27200 words/s, in_qsize 0, out_qsize 0
INFO - 23:12:15: EPOCH 8 - PROGRESS: at 2.82% examples, 30356 words/s, in_qsize 0, out_qsize 0
INFO - 23:12:45: EPOCH 8 - PROGRESS: at 5.52% examples, 31360 words/s, in_qsize 0, out_qsize 0
INFO - 23:13:15: EPOCH 8 - PROGRESS: at 8.20% examples, 31986 words/s, in_qsize 0, out_qsize 0
INFO - 23:13:45: EPOCH 8 - PROGRESS: at 10.86% examples, 32378 words/s, in_qsize 0, out_qsize 0
INFO - 23:14:15: EPOCH 8 - PROGRESS: at 13.46% examples, 32217 words/s, in_qsize 5, out_qsize 0
INFO - 23:14:45: EPOCH 8 - PROGRESS: at 16.31% examples, 32549 words/s, in_qsize 0, out_qsize 0
INFO - 23:15:15: EPOCH 8 - PROGRESS: at 19.05% examples, 32561 words/s, in_qsize 0, out_qsize 0
INFO - 23:15:45: EPOCH 8 - PROGRESS: at 21.75% examples, 32466 words/s, in_qsize 0, out_qsize 0
INFO - 23:16:15: EPOCH 8 - PROGRESS: at 24.38% examples, 32583 words/s, in_qsize 0, out_qsize 0
INFO - 23:16:45: EPOCH 8 - PROGRESS: at 27.0

INFO - 23:49:10: EPOCH 9 - PROGRESS: at 93.47% examples, 31445 words/s, in_qsize 0, out_qsize 0
INFO - 23:49:40: EPOCH 9 - PROGRESS: at 96.06% examples, 31452 words/s, in_qsize 0, out_qsize 0
INFO - 23:50:10: EPOCH 9 - PROGRESS: at 98.64% examples, 31464 words/s, in_qsize 0, out_qsize 0
INFO - 23:50:26: worker thread finished; awaiting finish of 6 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 5 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 4 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 3 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 2 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 1 more threads
INFO - 23:50:26: worker thread finished; awaiting finish of 0 more threads
INFO - 23:50:26: EPOCH - 9 : training on 89871314 raw words (36497135 effective words) took 1159.8s, 31470 effective words/s
INFO - 23:50:26: saving Word2Vec object under models/w2v_model

INFO - 00:20:20: EPOCH 11 - PROGRESS: at 55.33% examples, 31718 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:50: EPOCH 11 - PROGRESS: at 57.84% examples, 31719 words/s, in_qsize 0, out_qsize 0
INFO - 00:21:20: EPOCH 11 - PROGRESS: at 60.34% examples, 31713 words/s, in_qsize 0, out_qsize 0
INFO - 00:21:50: EPOCH 11 - PROGRESS: at 62.83% examples, 31671 words/s, in_qsize 0, out_qsize 0
INFO - 00:22:20: EPOCH 11 - PROGRESS: at 65.33% examples, 31627 words/s, in_qsize 0, out_qsize 0
INFO - 00:22:50: EPOCH 11 - PROGRESS: at 67.82% examples, 31609 words/s, in_qsize 0, out_qsize 0
INFO - 00:23:20: EPOCH 11 - PROGRESS: at 70.32% examples, 31596 words/s, in_qsize 0, out_qsize 0
INFO - 00:23:50: EPOCH 11 - PROGRESS: at 72.82% examples, 31583 words/s, in_qsize 0, out_qsize 0
INFO - 00:24:21: EPOCH 11 - PROGRESS: at 75.32% examples, 31575 words/s, in_qsize 0, out_qsize 0
INFO - 00:24:51: EPOCH 11 - PROGRESS: at 77.83% examples, 31571 words/s, in_qsize 0, out_qsize 0
INFO - 00:25:21: EPOCH 11 - PR

INFO - 00:51:00: EPOCH 13 - PROGRESS: at 13.62% examples, 32523 words/s, in_qsize 2, out_qsize 1
INFO - 00:51:30: EPOCH 13 - PROGRESS: at 16.53% examples, 32894 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:00: EPOCH 13 - PROGRESS: at 19.29% examples, 32944 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:30: EPOCH 13 - PROGRESS: at 22.03% examples, 32860 words/s, in_qsize 0, out_qsize 0
INFO - 00:53:00: EPOCH 13 - PROGRESS: at 24.78% examples, 33086 words/s, in_qsize 0, out_qsize 0
INFO - 00:53:30: EPOCH 13 - PROGRESS: at 27.49% examples, 32964 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:00: EPOCH 13 - PROGRESS: at 30.24% examples, 32958 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:31: EPOCH 13 - PROGRESS: at 32.99% examples, 32986 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:01: EPOCH 13 - PROGRESS: at 35.74% examples, 32980 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:31: EPOCH 13 - PROGRESS: at 38.46% examples, 32930 words/s, in_qsize 0, out_qsize 0
INFO - 00:56:01: EPOCH 13 - PR

INFO - 01:26:04: worker thread finished; awaiting finish of 3 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 2 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 1 more threads
INFO - 01:26:04: worker thread finished; awaiting finish of 0 more threads
INFO - 01:26:04: EPOCH - 14 : training on 89871314 raw words (36495949 effective words) took 1130.6s, 32280 effective words/s
INFO - 01:26:04: saving Word2Vec object under models/w2v_model_epoch_13.model, separately None
INFO - 01:26:04: storing np array 'vectors' to models/w2v_model_epoch_13.model.wv.vectors.npy
INFO - 01:26:04: not storing attribute vectors_norm
INFO - 01:26:04: storing np array 'syn1neg' to models/w2v_model_epoch_13.model.trainables.syn1neg.npy
INFO - 01:26:04: not storing attribute cum_table
INFO - 01:26:04: saved models/w2v_model_epoch_13.model
INFO - 01:26:05: EPOCH 15 - PROGRESS: at 0.09% examples, 28386 words/s, in_qsize 0, out_qsize 0
INFO - 01:26:35: EPOCH 15 - PRO

INFO - 01:58:38: EPOCH 16 - PROGRESS: at 73.22% examples, 32916 words/s, in_qsize 0, out_qsize 0
INFO - 01:59:08: EPOCH 16 - PROGRESS: at 75.86% examples, 32921 words/s, in_qsize 0, out_qsize 0
INFO - 01:59:38: EPOCH 16 - PROGRESS: at 78.47% examples, 32920 words/s, in_qsize 0, out_qsize 0
INFO - 02:00:08: EPOCH 16 - PROGRESS: at 81.12% examples, 32833 words/s, in_qsize 0, out_qsize 0
INFO - 02:00:38: EPOCH 16 - PROGRESS: at 83.74% examples, 32806 words/s, in_qsize 0, out_qsize 0
INFO - 02:01:09: EPOCH 16 - PROGRESS: at 86.45% examples, 32829 words/s, in_qsize 0, out_qsize 0
INFO - 02:01:39: EPOCH 16 - PROGRESS: at 89.20% examples, 32869 words/s, in_qsize 0, out_qsize 0
INFO - 02:02:09: EPOCH 16 - PROGRESS: at 91.99% examples, 32697 words/s, in_qsize 0, out_qsize 0
INFO - 02:02:39: EPOCH 16 - PROGRESS: at 94.69% examples, 32697 words/s, in_qsize 0, out_qsize 0
INFO - 02:03:09: EPOCH 16 - PROGRESS: at 97.41% examples, 32716 words/s, in_qsize 0, out_qsize 0
INFO - 02:03:38: worker thread

INFO - 02:29:44: EPOCH 18 - PROGRESS: at 37.07% examples, 31692 words/s, in_qsize 0, out_qsize 0
INFO - 02:30:14: EPOCH 18 - PROGRESS: at 39.69% examples, 31729 words/s, in_qsize 0, out_qsize 0
INFO - 02:30:44: EPOCH 18 - PROGRESS: at 42.28% examples, 31619 words/s, in_qsize 0, out_qsize 0
INFO - 02:31:14: EPOCH 18 - PROGRESS: at 44.88% examples, 31595 words/s, in_qsize 0, out_qsize 0
INFO - 02:31:44: EPOCH 18 - PROGRESS: at 47.27% examples, 31484 words/s, in_qsize 0, out_qsize 0
INFO - 02:32:14: EPOCH 18 - PROGRESS: at 49.78% examples, 31464 words/s, in_qsize 0, out_qsize 0
INFO - 02:32:45: EPOCH 18 - PROGRESS: at 52.27% examples, 31417 words/s, in_qsize 0, out_qsize 0
INFO - 02:33:15: EPOCH 18 - PROGRESS: at 54.78% examples, 31428 words/s, in_qsize 0, out_qsize 0
INFO - 02:33:45: EPOCH 18 - PROGRESS: at 57.26% examples, 31428 words/s, in_qsize 0, out_qsize 0
INFO - 02:34:15: EPOCH 18 - PROGRESS: at 59.79% examples, 31450 words/s, in_qsize 0, out_qsize 0
INFO - 02:34:45: EPOCH 18 - PR

INFO - 03:00:47: not storing attribute cum_table
INFO - 03:00:47: saved models/w2v_model_epoch_18.model
INFO - 03:00:48: EPOCH 20 - PROGRESS: at 0.09% examples, 28207 words/s, in_qsize 0, out_qsize 0
INFO - 03:01:18: EPOCH 20 - PROGRESS: at 2.82% examples, 30341 words/s, in_qsize 0, out_qsize 0
INFO - 03:01:48: EPOCH 20 - PROGRESS: at 5.51% examples, 31294 words/s, in_qsize 0, out_qsize 0
INFO - 03:02:18: EPOCH 20 - PROGRESS: at 8.20% examples, 31960 words/s, in_qsize 0, out_qsize 0
INFO - 03:02:48: EPOCH 20 - PROGRESS: at 10.93% examples, 32556 words/s, in_qsize 0, out_qsize 0
INFO - 03:03:19: EPOCH 20 - PROGRESS: at 13.62% examples, 32314 words/s, in_qsize 3, out_qsize 0
INFO - 03:03:50: EPOCH 20 - PROGRESS: at 16.50% examples, 32650 words/s, in_qsize 0, out_qsize 0
INFO - 03:04:20: EPOCH 20 - PROGRESS: at 19.29% examples, 32772 words/s, in_qsize 0, out_qsize 0
INFO - 03:04:50: EPOCH 20 - PROGRESS: at 22.07% examples, 32785 words/s, in_qsize 0, out_qsize 0
INFO - 03:05:20: EPOCH 20 -

INFO - 03:37:26: EPOCH 21 - PROGRESS: at 97.91% examples, 32910 words/s, in_qsize 0, out_qsize 0
INFO - 03:37:49: worker thread finished; awaiting finish of 6 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 5 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 4 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 3 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 2 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 1 more threads
INFO - 03:37:49: worker thread finished; awaiting finish of 0 more threads
INFO - 03:37:49: EPOCH - 21 : training on 89871314 raw words (36494142 effective words) took 1108.6s, 32918 effective words/s
INFO - 03:37:49: saving Word2Vec object under models/w2v_model_epoch_20.model, separately None
INFO - 03:37:49: storing np array 'vectors' to models/w2v_model_epoch_20.model.wv.vectors.npy
INFO - 03:37:49: not storing attribute vectors_norm
INFO - 03:3

INFO - 04:08:26: EPOCH 23 - PROGRESS: at 65.29% examples, 32931 words/s, in_qsize 0, out_qsize 0
INFO - 04:08:56: EPOCH 23 - PROGRESS: at 68.08% examples, 33009 words/s, in_qsize 0, out_qsize 0
INFO - 04:09:26: EPOCH 23 - PROGRESS: at 70.89% examples, 33087 words/s, in_qsize 0, out_qsize 0
INFO - 04:09:56: EPOCH 23 - PROGRESS: at 73.66% examples, 33148 words/s, in_qsize 0, out_qsize 0
INFO - 04:10:26: EPOCH 23 - PROGRESS: at 76.45% examples, 33225 words/s, in_qsize 0, out_qsize 0
INFO - 04:10:56: EPOCH 23 - PROGRESS: at 79.18% examples, 33243 words/s, in_qsize 1, out_qsize 0
INFO - 04:11:27: EPOCH 23 - PROGRESS: at 81.87% examples, 33132 words/s, in_qsize 8, out_qsize 2
INFO - 04:11:57: EPOCH 23 - PROGRESS: at 84.73% examples, 33216 words/s, in_qsize 0, out_qsize 0
INFO - 04:12:27: EPOCH 23 - PROGRESS: at 87.45% examples, 33243 words/s, in_qsize 0, out_qsize 0
INFO - 04:12:57: EPOCH 23 - PROGRESS: at 90.23% examples, 33182 words/s, in_qsize 0, out_qsize 0
INFO - 04:13:27: EPOCH 23 - PR

INFO - 04:38:56: EPOCH 25 - PROGRESS: at 28.06% examples, 33370 words/s, in_qsize 5, out_qsize 2
INFO - 04:39:26: EPOCH 25 - PROGRESS: at 31.00% examples, 33508 words/s, in_qsize 0, out_qsize 0
INFO - 04:39:56: EPOCH 25 - PROGRESS: at 33.77% examples, 33559 words/s, in_qsize 0, out_qsize 0
INFO - 04:40:26: EPOCH 25 - PROGRESS: at 36.57% examples, 33526 words/s, in_qsize 0, out_qsize 0
INFO - 04:40:56: EPOCH 25 - PROGRESS: at 39.34% examples, 33558 words/s, in_qsize 0, out_qsize 0
INFO - 04:41:26: EPOCH 25 - PROGRESS: at 42.12% examples, 33476 words/s, in_qsize 0, out_qsize 0
INFO - 04:41:56: EPOCH 25 - PROGRESS: at 44.87% examples, 33438 words/s, in_qsize 0, out_qsize 0
INFO - 04:42:26: EPOCH 25 - PROGRESS: at 47.44% examples, 33337 words/s, in_qsize 0, out_qsize 0
INFO - 04:42:57: EPOCH 25 - PROGRESS: at 50.12% examples, 33332 words/s, in_qsize 0, out_qsize 0
INFO - 04:43:27: EPOCH 25 - PROGRESS: at 52.78% examples, 33290 words/s, in_qsize 0, out_qsize 0
INFO - 04:43:57: EPOCH 25 - PR

INFO - 05:09:44: EPOCH 27 - PROGRESS: at 0.10% examples, 29223 words/s, in_qsize 0, out_qsize 0
INFO - 05:10:15: EPOCH 27 - PROGRESS: at 3.03% examples, 32343 words/s, in_qsize 0, out_qsize 0
INFO - 05:10:45: EPOCH 27 - PROGRESS: at 5.91% examples, 33530 words/s, in_qsize 1, out_qsize 0
INFO - 05:11:15: EPOCH 27 - PROGRESS: at 8.80% examples, 34243 words/s, in_qsize 0, out_qsize 0
INFO - 05:11:45: EPOCH 27 - PROGRESS: at 11.54% examples, 34216 words/s, in_qsize 0, out_qsize 0
INFO - 05:12:15: EPOCH 27 - PROGRESS: at 14.40% examples, 34289 words/s, in_qsize 0, out_qsize 0
INFO - 05:12:46: EPOCH 27 - PROGRESS: at 17.38% examples, 34448 words/s, in_qsize 0, out_qsize 0
INFO - 05:13:16: EPOCH 27 - PROGRESS: at 20.34% examples, 34606 words/s, in_qsize 0, out_qsize 0
INFO - 05:13:46: EPOCH 27 - PROGRESS: at 23.26% examples, 34707 words/s, in_qsize 0, out_qsize 0
INFO - 05:14:16: EPOCH 27 - PROGRESS: at 26.16% examples, 34859 words/s, in_qsize 0, out_qsize 0
INFO - 05:14:46: EPOCH 27 - PROGRE

INFO - 05:44:48: worker thread finished; awaiting finish of 3 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 2 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 1 more threads
INFO - 05:44:48: worker thread finished; awaiting finish of 0 more threads
INFO - 05:44:48: EPOCH - 28 : training on 89871314 raw words (36495971 effective words) took 1054.9s, 34596 effective words/s
INFO - 05:44:48: saving Word2Vec object under models/w2v_model_epoch_27.model, separately None
INFO - 05:44:48: storing np array 'vectors' to models/w2v_model_epoch_27.model.wv.vectors.npy
INFO - 05:44:48: not storing attribute vectors_norm
INFO - 05:44:48: storing np array 'syn1neg' to models/w2v_model_epoch_27.model.trainables.syn1neg.npy
INFO - 05:44:48: not storing attribute cum_table
INFO - 05:44:49: saved models/w2v_model_epoch_27.model
INFO - 05:44:50: EPOCH 29 - PROGRESS: at 0.09% examples, 26529 words/s, in_qsize 0, out_qsize 0
INFO - 05:45:20: EPOCH 29 - PRO

INFO - 06:17:27: EPOCH 30 - PROGRESS: at 76.80% examples, 33354 words/s, in_qsize 0, out_qsize 0
INFO - 06:17:57: EPOCH 30 - PROGRESS: at 79.43% examples, 33328 words/s, in_qsize 0, out_qsize 0
INFO - 06:18:28: EPOCH 30 - PROGRESS: at 82.15% examples, 33241 words/s, in_qsize 0, out_qsize 0
INFO - 06:18:58: EPOCH 30 - PROGRESS: at 84.89% examples, 33273 words/s, in_qsize 0, out_qsize 0
INFO - 06:19:28: EPOCH 30 - PROGRESS: at 87.71% examples, 33336 words/s, in_qsize 0, out_qsize 0
INFO - 06:19:58: EPOCH 30 - PROGRESS: at 90.56% examples, 33268 words/s, in_qsize 0, out_qsize 0
INFO - 06:20:28: EPOCH 30 - PROGRESS: at 93.41% examples, 33217 words/s, in_qsize 0, out_qsize 0
INFO - 06:20:58: EPOCH 30 - PROGRESS: at 96.12% examples, 33211 words/s, in_qsize 0, out_qsize 0
INFO - 06:21:28: EPOCH 30 - PROGRESS: at 98.77% examples, 33200 words/s, in_qsize 0, out_qsize 0
INFO - 06:21:42: worker thread finished; awaiting finish of 6 more threads
INFO - 06:21:42: worker thread finished; awaiting fi

Time to train the model: 560.5 mins
