# Generate Language models from scratch

In [1]:
from argparse import Namespace

import copy
import glob
import os
import pandas as pd
import pickle
import re
import spacy
from time import time

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Read DB

In [2]:
# Sentence-level DataFrame stored as a pickle in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load this file when it
# comes from a trusted source.
db_path = "./db_trove_v002.pkl"
db_sentence = pd.read_pickle(db_path)
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,...,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup,ocr_dict_lookup_list,ocr_dict_perc,corrected_dict_lookup_list,corr_dict_perc,corrected_sentencizer_list,ocr_sentencizer_list,jaccard_similarity
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,...,"[[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, ...","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6...","[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6,...",79.439252,"[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, 5...",92.079208,"[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, SP...","[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I,...",0.305882
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,...,"[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, 2...",96.590909,"[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, 2...",98.876404,"[Natural, Childbirth, Sir,-We, nurses, have, s...","[Natural, Childbirth, Sir,-We, nurses, have, s...",0.52381
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,...,"[[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7...","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3...","[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3,...",82.0,"[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7,...",99.206349,"[FIRST, CHURCH, SERVICE, Presbyterian, Anniver...","[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, I...",0.191919
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,...,"[[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,...","[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1, 4, 5, 6, 6,...",95.58011,"[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1,...",95.652174,"["", Bob, "", Lulham, 's, Fight, Against, Thalli...","["", Bob, "", Lulham, 's, Fight, Against, Thalli...",0.282682
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,...,"[[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1...","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -...","[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -6...",82.882883,"[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1,...",86.486486,"[DIVORCE, Before, The, Judge, In, Divorce, ,, ...","[DIVORCE, Before, The, Judge, In, Divorce, ,, ...",0.147513


## Preprocess DB

In [3]:
def cleanup(myrow, col_name):
    """Lower-case every token of the tokenised sentences stored in one cell.

    Args:
        myrow: Row-like object indexable by column name (a DataFrame row when
            called through ``DataFrame.apply(..., axis=1)``).
        col_name: Key of the cell to read; the cell must hold an iterable of
            sentences, each of which is an iterable of string tokens.

    Returns:
        A new list of lists with the same nesting as the input cell, in which
        every token has been passed through ``str.lower()``.
    """
    return [[token.lower() for token in sent] for sent in myrow[col_name]]

In [4]:
# Add a lower-cased copy of each sentencizer column; cleanup() is applied
# row by row (axis=1) and receives the source column name as its extra argument.
for raw_col, clean_col in (
    ("ocr_sentencizer", "ocr_sentencizer_cleaned"),
    ("corrected_sentencizer", "corrected_sentencizer_cleaned"),
):
    db_sentence[clean_col] = db_sentence.apply(cleanup, axis=1, args=(raw_col,))

In [5]:
# Preview the frame again to check that the two *_cleaned columns were added.
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,...,ocr_dict_lookup,ocr_dict_lookup_list,ocr_dict_perc,corrected_dict_lookup_list,corr_dict_perc,corrected_sentencizer_list,ocr_sentencizer_list,jaccard_similarity,ocr_sentencizer_cleaned,corrected_sentencizer_cleaned
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,...,"[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6...","[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6,...",79.439252,"[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, 5...",92.079208,"[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, SP...","[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I,...",0.305882,"[[from, river, crossing, to, end, of, triäü, i...","[[from, river, crossing, to, end, of, trial, s..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,...,"[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, 2...",96.590909,"[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, 2...",98.876404,"[Natural, Childbirth, Sir,-We, nurses, have, s...","[Natural, Childbirth, Sir,-We, nurses, have, s...",0.52381,"[[natural, childbirth, sir,-we, nurses, have, ...","[[natural, childbirth, sir,-we, nurses, have, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,...,"[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3...","[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3,...",82.0,"[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7,...",99.206349,"[FIRST, CHURCH, SERVICE, Presbyterian, Anniver...","[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, I...",0.191919,"[[first, church, i, service, 1, presbyterian, ...","[[first, church, service, presbyterian, annive..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,...,"[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,...","[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1, 4, 5, 6, 6,...",95.58011,"[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1,...",95.652174,"["", Bob, "", Lulham, 's, Fight, Against, Thalli...","["", Bob, "", Lulham, 's, Fight, Against, Thalli...",0.282682,"[["", bob, "", lulham, 's, fight, against, thall...","[["", bob, "", lulham, 's, fight, against, thall..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,...,"[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -...","[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -6...",82.882883,"[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1,...",86.486486,"[DIVORCE, Before, The, Judge, In, Divorce, ,, ...","[DIVORCE, Before, The, Judge, In, Divorce, ,, ...",0.147513,"[[divorce, before, the, judge, in, divorce, ,,...","[[divorce, before, the, judge, in, divorce, ,,..."


## LM parameters

In [6]:
# Hyper-parameters shared by the Word2Vec models built in this notebook.
w2v_args = Namespace(
    # --- training schedule -------------------------------------------------
    epochs=5,             # forwarded as `iter=` to the constructor and as `epochs=` to train()
    alpha=0.03,           # initial learning rate
    min_alpha=0.0007,     # learning rate decays linearly to this value during training
    batch_words=10000,    # target batch size (in words) handed to the worker threads;
                          # longer individual texts produce larger batches, but the
                          # standard cython code truncates to this maximum
    workers=8,            # forwarded unchanged as Word2Vec(workers=...)
    seed=1364,            # seed for the random number generator
    compute_loss=True,    # Word2Vec only: store the loss so it can be read back with
                          # get_latest_training_loss()

    # --- model architecture ------------------------------------------------
    size=300,             # dimensionality of the word vectors
    window=5,             # max distance between current and predicted word in a sentence
    sg=1,                 # 1 = skip-gram, anything else = CBOW
    cbow_mean=1,          # CBOW only: 1 = mean of the context vectors, 0 = their sum
    hs=0,                 # 1 = hierarchical softmax; 0 with negative > 0 = negative sampling
    negative=20,          # number of "noise words" drawn per sample (usually 5-20);
                          # 0 disables negative sampling

    # --- vocabulary --------------------------------------------------------
    min_count=0,          # words with a total frequency below this are ignored
    sample=1e-3,          # down-sampling threshold for high-frequency words,
                          # useful range is (0, 1e-5)
    sorted_vocab=1,       # 1 = sort vocabulary by descending frequency before indexing
    null_word=0,          # forwarded unchanged as Word2Vec(null_word=...)
    trim_rule=None,       # forwarded unchanged as Word2Vec(trim_rule=...)

    # --- FastText-only options (kept for reference, not used by Word2Vec) ---
    #word_ngrams=1,       # 1 = enrich word vectors with subword (n-gram) information;
    #                     # 0 = equivalent to Word2Vec
    #min_n=2,             # minimum length of the character n-grams
    #max_n=15,            # maximum length of the character n-grams; set it below min_n
    #                     # to avoid using character n-grams
    #bucket=2000000       # number of hash buckets for the character n-grams
    #                     # (limits the memory usage of the model)
)

In [7]:
def _new_word2vec(args):
    """Create an untrained Word2Vec model configured from ``args``.

    Args:
        args: Object exposing the attributes ``size``, ``alpha``, ``min_alpha``,
            ``sg``, ``hs``, ``negative``, ``epochs``, ``min_count``, ``window``,
            ``sample``, ``workers``, ``cbow_mean``, ``null_word``, ``trim_rule``,
            ``sorted_vocab``, ``batch_words``, ``seed`` and ``compute_loss``
            (``w2v_args`` in this notebook).

    Returns:
        A ``Word2Vec`` instance constructed without a corpus, so neither
        the vocabulary nor the weights have been built yet.
    """
    return Word2Vec(
        size=args.size,
        alpha=args.alpha,
        min_alpha=args.min_alpha,
        sg=args.sg,
        hs=args.hs,
        negative=args.negative,
        iter=args.epochs,  # the constructor keyword is `iter`, the setting is named `epochs`
        min_count=args.min_count,
        window=args.window,
        sample=args.sample,
        workers=args.workers,
        cbow_mean=args.cbow_mean,
        null_word=args.null_word,
        trim_rule=args.trim_rule,
        sorted_vocab=args.sorted_vocab,
        batch_words=args.batch_words,
        seed=args.seed,
        compute_loss=args.compute_loss,
    )


# Only if a new LM needs to be trained (from scratch).
# Both models are built from the same settings through one helper so that the
# OCR and corrected configurations cannot drift apart.
embedding_model_ocr = _new_word2vec(w2v_args)
embedding_model_corrected = _new_word2vec(w2v_args)

## Preprocess before creating/updating LM

In [None]:
# Disabled cell: the helper below is wrapped in a bare string literal, so
# preprocess4LM is never actually defined.
# NOTE(review): the body calls `nlp`, which is not created anywhere in the
# visible notebook — confirm a spaCy pipeline is loaded before re-enabling.
"""
def preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):
    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]
    return txt
"""

In [None]:
# Disabled cell: wrapped in a bare string literal, so nothing here is executed.
# If re-enabled it would apply preprocess4LM to the first 10 rows only
# (db_sentence[0:10]); preprocess4LM itself is currently defined only inside a
# string literal.
# NOTE(review): the "ocrText_cleaned" / "corrected_cleaned" columns are not
# among the columns visible in the previews — confirm they exist.
"""
db_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)
db_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)
"""

## Train the model on the OCR text

In [None]:
# One entry per DataFrame row (i.e. per article); each entry is itself a list
# of tokenised sentences. The length printed here is therefore an article
# count, not a sentence count, so it is labelled accordingly.
list_sentences = db_sentence["ocr_sentencizer_cleaned"].to_list()
print('#articles: {}'.format(len(list_sentences)))

In [None]:
# Collapse the per-article nesting: the result is one flat list whose items are
# single tokenised sentences (lists of tokens).
flattened_list_sentences = [sentence for article in list_sentences for sentence in article]

In [None]:
# Sanity check: display the first flattened entry (one tokenised sentence).
flattened_list_sentences[0]

In [None]:
# Build the vocabulary, then train on the same sentences.
# NOTE: `flattened_list_sentences` is reused further down for the corrected
# text, so run this cell only while it still holds the OCR sentences.
embedding_model_ocr.build_vocab(flattened_list_sentences)
embedding_model_ocr.train(
    flattened_list_sentences,
    epochs=w2v_args.epochs,
    total_examples=embedding_model_ocr.corpus_count,
    compute_loss=w2v_args.compute_loss,
)

In [None]:
# Persist the OCR-trained model to the working directory.
ocr_model_path = "./embedding_model_ocr.model"
print("\n\n[INFO] Save the model")
embedding_model_ocr.save(ocr_model_path)

## Train the model on the corrected text

In [8]:
# One entry per DataFrame row (i.e. per article); each entry is itself a list
# of tokenised sentences. The length printed here is therefore an article
# count, not a sentence count, so it is labelled accordingly.
list_sentences = db_sentence["corrected_sentencizer_cleaned"].to_list()
print('#articles: {}'.format(len(list_sentences)))

#sentences: 30509


In [9]:
# Collapse the per-article nesting: the result is one flat list whose items are
# single tokenised sentences (lists of tokens).
flattened_list_sentences = [sentence for article in list_sentences for sentence in article]

In [10]:
# Sanity check: display the first flattened entry (one tokenised sentence).
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'trial',
 'splash',
 ':',
 'peler',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'during',
 'the',
 'elimination',
 'section',
 '.']

In [12]:
# Total number of sentences across all articles after flattening.
len(flattened_list_sentences)

499269

In [13]:
# Build the vocabulary, then train on the same sentences.
# NOTE: `flattened_list_sentences` is also used for the OCR text earlier in the
# notebook, so make sure it currently holds the corrected sentences.
# The value returned by train() is the cell's displayed output.
embedding_model_corrected.build_vocab(flattened_list_sentences)
embedding_model_corrected.train(
    flattened_list_sentences,
    epochs=w2v_args.epochs,
    total_examples=embedding_model_corrected.corpus_count,
    compute_loss=w2v_args.compute_loss,
)

2019-11-20 19:38:08,751 : INFO : collecting all words and their counts
2019-11-20 19:38:08,753 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-20 19:38:08,854 : INFO : PROGRESS: at sentence #10000, processed 238920 words, keeping 21363 word types
2019-11-20 19:38:08,958 : INFO : PROGRESS: at sentence #20000, processed 462924 words, keeping 30432 word types
2019-11-20 19:38:09,073 : INFO : PROGRESS: at sentence #30000, processed 706176 words, keeping 39228 word types
2019-11-20 19:38:09,218 : INFO : PROGRESS: at sentence #40000, processed 967747 words, keeping 46390 word types
2019-11-20 19:38:09,377 : INFO : PROGRESS: at sentence #50000, processed 1206328 words, keeping 52319 word types
2019-11-20 19:38:09,558 : INFO : PROGRESS: at sentence #60000, processed 1454073 words, keeping 57131 word types
2019-11-20 19:38:09,742 : INFO : PROGRESS: at sentence #70000, processed 1709317 words, keeping 61561 word types
2019-11-20 19:38:09,933 : INFO : PROGRESS: 

2019-11-20 19:39:34,445 : INFO : EPOCH 1 - PROGRESS: at 13.38% examples, 71885 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:39:35,529 : INFO : EPOCH 1 - PROGRESS: at 13.93% examples, 70552 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:39:36,544 : INFO : EPOCH 1 - PROGRESS: at 14.64% examples, 70009 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:39:37,583 : INFO : EPOCH 1 - PROGRESS: at 15.40% examples, 69409 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:39:38,597 : INFO : EPOCH 1 - PROGRESS: at 15.96% examples, 68261 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:39:39,598 : INFO : EPOCH 1 - PROGRESS: at 16.63% examples, 67614 words/s, in_qsize 14, out_qsize 1
2019-11-20 19:39:40,638 : INFO : EPOCH 1 - PROGRESS: at 17.32% examples, 66945 words/s, in_qsize 16, out_qsize 1
2019-11-20 19:39:41,675 : INFO : EPOCH 1 - PROGRESS: at 18.22% examples, 67193 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:39:42,717 : INFO : EPOCH 1 - PROGRESS: at 18.91% examples, 66545 words/s, in_qsize

2019-11-20 19:40:52,686 : INFO : EPOCH 1 - PROGRESS: at 64.93% examples, 57945 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:40:53,817 : INFO : EPOCH 1 - PROGRESS: at 65.65% examples, 57907 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:40:54,993 : INFO : EPOCH 1 - PROGRESS: at 66.45% examples, 57846 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:40:56,011 : INFO : EPOCH 1 - PROGRESS: at 67.20% examples, 57803 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:40:57,057 : INFO : EPOCH 1 - PROGRESS: at 68.00% examples, 57883 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:40:58,156 : INFO : EPOCH 1 - PROGRESS: at 68.66% examples, 57872 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:40:59,229 : INFO : EPOCH 1 - PROGRESS: at 69.32% examples, 57876 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:41:00,304 : INFO : EPOCH 1 - PROGRESS: at 70.04% examples, 57878 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:41:01,456 : INFO : EPOCH 1 - PROGRESS: at 70.80% examples, 57832 words/s, in_qsize

2019-11-20 19:42:03,802 : INFO : EPOCH 2 - PROGRESS: at 17.32% examples, 81608 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:04,841 : INFO : EPOCH 2 - PROGRESS: at 18.97% examples, 84326 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:05,995 : INFO : EPOCH 2 - PROGRESS: at 20.33% examples, 85657 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:07,072 : INFO : EPOCH 2 - PROGRESS: at 21.57% examples, 86793 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:08,104 : INFO : EPOCH 2 - PROGRESS: at 23.04% examples, 88663 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:42:09,203 : INFO : EPOCH 2 - PROGRESS: at 24.70% examples, 90061 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:42:10,248 : INFO : EPOCH 2 - PROGRESS: at 26.07% examples, 90718 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:11,305 : INFO : EPOCH 2 - PROGRESS: at 27.46% examples, 91565 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:42:12,311 : INFO : EPOCH 2 - PROGRESS: at 28.85% examples, 92482 words/s, in_qsize

2019-11-20 19:43:22,109 : INFO : EPOCH 2 - PROGRESS: at 86.96% examples, 75562 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:43:23,113 : INFO : EPOCH 2 - PROGRESS: at 87.63% examples, 75346 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:43:24,305 : INFO : EPOCH 2 - PROGRESS: at 88.39% examples, 75069 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:43:25,314 : INFO : EPOCH 2 - PROGRESS: at 88.97% examples, 74863 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:43:26,337 : INFO : EPOCH 2 - PROGRESS: at 89.65% examples, 74790 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:43:27,574 : INFO : EPOCH 2 - PROGRESS: at 90.32% examples, 74495 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:43:28,725 : INFO : EPOCH 2 - PROGRESS: at 91.09% examples, 74259 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:43:29,815 : INFO : EPOCH 2 - PROGRESS: at 91.80% examples, 74077 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:43:30,925 : INFO : EPOCH 2 - PROGRESS: at 92.57% examples, 73877 words/s, in_qsize

2019-11-20 19:44:33,417 : INFO : EPOCH 3 - PROGRESS: at 32.82% examples, 54875 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:44:34,422 : INFO : EPOCH 3 - PROGRESS: at 33.59% examples, 55004 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:44:35,471 : INFO : EPOCH 3 - PROGRESS: at 34.36% examples, 55090 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:44:36,475 : INFO : EPOCH 3 - PROGRESS: at 35.26% examples, 55350 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:44:37,956 : INFO : EPOCH 3 - PROGRESS: at 35.99% examples, 54970 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:44:39,096 : INFO : EPOCH 3 - PROGRESS: at 36.84% examples, 55085 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:44:40,289 : INFO : EPOCH 3 - PROGRESS: at 37.55% examples, 55015 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:44:41,320 : INFO : EPOCH 3 - PROGRESS: at 38.17% examples, 54980 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:44:42,350 : INFO : EPOCH 3 - PROGRESS: at 39.11% examples, 55275 words/s, in_qsize

2019-11-20 19:45:43,740 : INFO : EPOCH 4 - PROGRESS: at 7.39% examples, 61964 words/s, in_qsize 15, out_qsize 1
2019-11-20 19:45:44,946 : INFO : EPOCH 4 - PROGRESS: at 8.01% examples, 60221 words/s, in_qsize 15, out_qsize 1
2019-11-20 19:45:46,150 : INFO : EPOCH 4 - PROGRESS: at 8.72% examples, 58828 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:45:47,326 : INFO : EPOCH 4 - PROGRESS: at 9.35% examples, 57796 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:45:48,409 : INFO : EPOCH 4 - PROGRESS: at 10.02% examples, 57276 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:45:49,549 : INFO : EPOCH 4 - PROGRESS: at 10.65% examples, 56633 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:45:50,564 : INFO : EPOCH 4 - PROGRESS: at 11.32% examples, 56472 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:45:51,695 : INFO : EPOCH 4 - PROGRESS: at 11.96% examples, 56003 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:45:52,807 : INFO : EPOCH 4 - PROGRESS: at 12.58% examples, 55632 words/s, in_qsize 15,

2019-11-20 19:47:03,564 : INFO : EPOCH 4 - PROGRESS: at 58.40% examples, 54551 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:47:04,680 : INFO : EPOCH 4 - PROGRESS: at 59.09% examples, 54497 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:47:05,701 : INFO : EPOCH 4 - PROGRESS: at 60.01% examples, 54666 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:47:06,797 : INFO : EPOCH 4 - PROGRESS: at 60.73% examples, 54628 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:47:07,905 : INFO : EPOCH 4 - PROGRESS: at 61.40% examples, 54577 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:47:08,998 : INFO : EPOCH 4 - PROGRESS: at 62.05% examples, 54536 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:47:09,999 : INFO : EPOCH 4 - PROGRESS: at 62.52% examples, 54400 words/s, in_qsize 14, out_qsize 1
2019-11-20 19:47:11,037 : INFO : EPOCH 4 - PROGRESS: at 63.31% examples, 54536 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:47:12,195 : INFO : EPOCH 4 - PROGRESS: at 64.05% examples, 54525 words/s, in_qsize

2019-11-20 19:48:14,427 : INFO : EPOCH 5 - PROGRESS: at 7.39% examples, 80286 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:48:15,430 : INFO : EPOCH 5 - PROGRESS: at 9.01% examples, 86798 words/s, in_qsize 15, out_qsize 1
2019-11-20 19:48:16,444 : INFO : EPOCH 5 - PROGRESS: at 10.09% examples, 86943 words/s, in_qsize 16, out_qsize 3
2019-11-20 19:48:17,510 : INFO : EPOCH 5 - PROGRESS: at 11.40% examples, 88529 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:48:18,836 : INFO : EPOCH 5 - PROGRESS: at 13.27% examples, 91873 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:48:19,890 : INFO : EPOCH 5 - PROGRESS: at 14.47% examples, 92918 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:48:20,988 : INFO : EPOCH 5 - PROGRESS: at 15.80% examples, 93383 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:48:22,091 : INFO : EPOCH 5 - PROGRESS: at 17.24% examples, 94325 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:48:23,111 : INFO : EPOCH 5 - PROGRESS: at 18.67% examples, 95525 words/s, in_qsize 1

2019-11-20 19:49:33,327 : INFO : EPOCH 5 - PROGRESS: at 76.16% examples, 73611 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:49:34,543 : INFO : EPOCH 5 - PROGRESS: at 76.79% examples, 73218 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:49:35,777 : INFO : EPOCH 5 - PROGRESS: at 77.58% examples, 72826 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:49:37,187 : INFO : EPOCH 5 - PROGRESS: at 78.19% examples, 72300 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:49:38,845 : INFO : EPOCH 5 - PROGRESS: at 78.78% examples, 71594 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:49:40,341 : INFO : EPOCH 5 - PROGRESS: at 79.36% examples, 71044 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:49:41,824 : INFO : EPOCH 5 - PROGRESS: at 80.04% examples, 70518 words/s, in_qsize 16, out_qsize 0
2019-11-20 19:49:43,328 : INFO : EPOCH 5 - PROGRESS: at 80.75% examples, 69999 words/s, in_qsize 15, out_qsize 0
2019-11-20 19:49:44,612 : INFO : EPOCH 5 - PROGRESS: at 81.48% examples, 69647 words/s, in_qsize

(42047385, 60620435)

In [14]:
# Persist the model trained on the corrected text to the working directory.
corrected_model_path = "./embedding_model_scratch_corrected.model"
print("\n\n[INFO] Save the model")
embedding_model_corrected.save(corrected_model_path)

2019-11-20 19:50:18,394 : INFO : saving Word2Vec object under ./embedding_model_scratch_corrected.model, separately None
2019-11-20 19:50:18,413 : INFO : storing np array 'vectors' to ./embedding_model_scratch_corrected.model.wv.vectors.npy




[INFO] Save the model


2019-11-20 19:50:19,664 : INFO : not storing attribute vectors_norm
2019-11-20 19:50:19,703 : INFO : storing np array 'syn1neg' to ./embedding_model_scratch_corrected.model.trainables.syn1neg.npy
2019-11-20 19:50:21,165 : INFO : not storing attribute cum_table
2019-11-20 19:50:22,341 : INFO : saved ./embedding_model_scratch_corrected.model
