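# Generate language models from scratch

Train two Word2Vec models from scratch, one on the raw OCR sentences and one on the corrected sentences, and save each model to disk.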

In [1]:
from argparse import Namespace

import copy
import glob
import os
import pandas as pd
import pickle
import re
import spacy
from time import time

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Read DB

In [2]:
# Load the sentence-level DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
db_sentence = pd.read_pickle("./db_trove_sentence_with_lookup.pkl")
# Show the first rows as a quick check of the loaded columns.
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, ...","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7...","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1...","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -..."


## Preprocess DB

In [3]:
def cleanup(myrow, col_name):
    """Return the sentences stored in ``myrow[col_name]`` with every token lower-cased.

    Parameters
    ----------
    myrow : mapping-like
        Row indexed by column name, e.g. the row handed over by
        ``DataFrame.apply(..., axis=1)``.
    col_name : str
        Key of the entry holding an iterable of sentences, where each
        sentence is an iterable of string tokens.

    Returns
    -------
    list of list of str
        One new list per sentence, in the original order, containing the
        lower-cased tokens. The input is not modified.
    """
    return [[word.lower() for word in sentence] for sentence in myrow[col_name]]

In [4]:
# Lower-case every token of the OCR and of the corrected sentence lists,
# storing each result in a new "<column>_cleaned" column of db_sentence.
db_sentence["ocr_sentencizer_cleaned"] = db_sentence.apply(
    cleanup, axis=1, args=("ocr_sentencizer",)
)
db_sentence["corrected_sentencizer_cleaned"] = db_sentence.apply(
    cleanup, axis=1, args=("corrected_sentencizer",)
)

In [5]:
# Display the first rows again to check the two newly added "*_cleaned" columns.
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup,ocr_sentencizer_cleaned,corrected_sentencizer_cleaned
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, ...","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6...","[[from, river, crossing, to, end, of, triäü, i...","[[from, river, crossing, to, end, of, trial, s..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[natural, childbirth, sir,-we, nurses, have, ...","[[natural, childbirth, sir,-we, nurses, have, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7...","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3...","[[first, church, i, service, 1, presbyterian, ...","[[first, church, service, presbyterian, annive..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,...","[["", bob, "", lulham, 's, fight, against, thall...","[["", bob, "", lulham, 's, fight, against, thall..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1...","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -...","[[divorce, before, the, judge, in, divorce, ,,...","[[divorce, before, the, judge, in, divorce, ,,..."


## LM parameters

In [6]:
# Hyper-parameters shared by both Word2Vec models built in this notebook.
w2v_args = Namespace(
    # Number of training epochs.
    epochs=5,

    # --- only for Word2Vec ---
    # If True, computes and stores the loss value, which can be retrieved
    # using get_latest_training_loss().
    compute_loss=True,

    # Dimensionality of the word vectors.
    size=300,
    # The initial learning rate.
    alpha=0.03,
    # Learning rate will linearly drop to min_alpha as training progresses.
    min_alpha=0.0007,
    # Training algorithm: skip-gram if sg=1, otherwise CBOW.
    sg=1,
    # If 1, hierarchical softmax will be used for model training.
    # If set to 0, and negative is non-zero, negative sampling will be used.
    hs=0,
    # If > 0, negative sampling will be used; the int specifies how many
    # "noise words" should be drawn (usually between 5-20).
    # If set to 0, no negative sampling is used.
    negative=20,
    # The model ignores all words with total frequency lower than this.
    min_count=0,
    # The maximum distance between the current and predicted word within a sentence.
    window=5,
    # The threshold for configuring which higher-frequency words are randomly
    # downsampled, useful range is (0, 1e-5).
    sample=1e-3,
    # NOTE(review): presumably the number of worker threads used for training -- confirm.
    workers=8,
    # If 0, use the sum of the context word vectors. If 1, use the mean;
    # only applies when CBOW is used.
    cbow_mean=1,
    # NOTE(review): not documented here; passed straight through to Word2Vec.
    null_word=0,
    # NOTE(review): not documented here; passed straight through to Word2Vec.
    trim_rule=None,
    # If 1, sort the vocabulary by descending frequency before assigning word indices.
    sorted_vocab=1,
    # Target size (in words) for batches of examples passed to worker threads
    # (and thus cython routines). Larger batches will be passed if individual
    # texts are longer than 10000 words, but the standard cython code
    # truncates to that maximum.
    batch_words=10000,

    # Seed for the random number generator.
    # NOTE(review): gensim documents that a fully reproducible run also needs
    # a single worker thread -- confirm if exact reproducibility matters.
    seed=1364,

    # --- only for FastText (compare to word2vec) ---
    # word_ngrams=1,   # If 1, enriches word vectors with subword (n-gram) information. If 0, this is equivalent to Word2Vec.
    # min_n=2,         # Minimum length of char n-grams to be used for training word representations.
    # max_n=15,        # Max length of char n-grams to be used for training word representations. Set max_n to be less than min_n to avoid char n-grams being used.
    # bucket=2000000   # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model.
)

In [7]:
def new_word2vec_model(args):
    """Create an untrained Word2Vec model configured from ``args``.

    No corpus is passed to the constructor, so the returned model has no
    vocabulary yet; ``build_vocab`` and ``train`` must be called on it later.

    Parameters
    ----------
    args : argparse.Namespace
        Hyper-parameters; must provide ``size``, ``alpha``, ``min_alpha``,
        ``sg``, ``hs``, ``negative``, ``epochs``, ``min_count``, ``window``,
        ``sample``, ``workers``, ``cbow_mean``, ``null_word``, ``trim_rule``,
        ``sorted_vocab``, ``batch_words``, ``seed`` and ``compute_loss``.

    Returns
    -------
    gensim.models.Word2Vec
        A freshly constructed model.
    """
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names;
    # gensim >= 4.0 is reported to use `vector_size` and `epochs` instead --
    # confirm against the installed version before upgrading.
    return Word2Vec(
        size=args.size,
        alpha=args.alpha,
        min_alpha=args.min_alpha,
        sg=args.sg,
        hs=args.hs,
        negative=args.negative,
        iter=args.epochs,
        min_count=args.min_count,
        window=args.window,
        sample=args.sample,
        workers=args.workers,
        cbow_mean=args.cbow_mean,
        null_word=args.null_word,
        trim_rule=args.trim_rule,
        sorted_vocab=args.sorted_vocab,
        batch_words=args.batch_words,
        seed=args.seed,
        compute_loss=args.compute_loss,
    )


# Only if a new LM needs to be trained (from scratch).
# Both models are built from the same hyper-parameters, so any difference
# between them comes from the training corpus alone.
embedding_model_ocr = new_word2vec_model(w2v_args)
embedding_model_corrected = new_word2vec_model(w2v_args)

## Preprocess before creating/updating LM

In [None]:
# Disabled cell: the function below is wrapped in a string literal, so it is
# never defined. It also relies on `nlp`, which is not created anywhere in the
# visible part of this notebook.
"""
def preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):
    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]
    return txt
"""

In [None]:
# Disabled cell (string literal, never executed): it would apply preprocess4LM
# to the first 10 rows only. The "ocrText_cleaned" / "corrected_cleaned" columns
# it reads are not among the columns shown in the DataFrame previews.
"""
db_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)
db_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)
"""

## OCR

In [8]:
# One entry per DataFrame row; each entry is that row's list of lower-cased,
# tokenised OCR sentences.
list_sentences = db_sentence["ocr_sentencizer_cleaned"].to_list()
# len() counts rows here, not individual sentences, so the label says "rows"
# (the previous "#sentences" label under-reported the real sentence count).
print('#rows (each a list of sentences): {}'.format(len(list_sentences)))

#sentences: 30509


In [9]:
# Flatten one level: the per-row lists of sentences become a single list of
# sentences, each sentence being a list of tokens.
flattened_list_sentences = [
    sentence
    for row_sentences in list_sentences
    for sentence in row_sentences
]

In [10]:
# Inspect the first flattened OCR sentence (a list of lower-cased tokens).
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'triäü',
 'i',
 '^',
 'pi',
 'a^h',
 '"',
 'pclcr',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'jr',
 'la',
 'jil',
 '?',
 'during',
 '{',
 '|',
 ')',
 'c',
 'elimination',
 'section',
 '.']

In [11]:
# Build the vocabulary from the flattened OCR sentences, then train on the
# same corpus for the configured number of epochs.
embedding_model_ocr.build_vocab(flattened_list_sentences)
# train() is the last expression of the cell, so its return value is shown as
# the cell output.
# NOTE(review): presumably (effective word count, raw word count) -- confirm
# against the gensim documentation.
embedding_model_ocr.train(flattened_list_sentences, 
                          total_examples=embedding_model_ocr.corpus_count,
                          epochs=w2v_args.epochs,  
                          compute_loss=w2v_args.compute_loss)

2019-11-13 13:05:57,666 : INFO : collecting all words and their counts
2019-11-13 13:05:57,667 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-13 13:05:57,903 : INFO : PROGRESS: at sentence #10000, processed 525154 words, keeping 85319 word types
2019-11-13 13:05:58,069 : INFO : PROGRESS: at sentence #20000, processed 856174 words, keeping 117320 word types
2019-11-13 13:05:58,273 : INFO : PROGRESS: at sentence #30000, processed 1262312 words, keeping 157071 word types
2019-11-13 13:05:58,487 : INFO : PROGRESS: at sentence #40000, processed 1640602 words, keeping 191468 word types
2019-11-13 13:05:58,654 : INFO : PROGRESS: at sentence #50000, processed 1983065 words, keeping 219352 word types
2019-11-13 13:05:58,845 : INFO : PROGRESS: at sentence #60000, processed 2332908 words, keeping 244407 word types
2019-11-13 13:05:59,056 : INFO : PROGRESS: at sentence #70000, processed 2683098 words, keeping 272037 word types
2019-11-13 13:05:59,320 : INFO : PR

2019-11-13 13:11:20,738 : INFO : EPOCH 1 - PROGRESS: at 41.88% examples, 96148 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:21,769 : INFO : EPOCH 1 - PROGRESS: at 42.54% examples, 96317 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:22,840 : INFO : EPOCH 1 - PROGRESS: at 43.12% examples, 96234 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:24,003 : INFO : EPOCH 1 - PROGRESS: at 43.64% examples, 95939 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:25,187 : INFO : EPOCH 1 - PROGRESS: at 44.92% examples, 96103 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:26,370 : INFO : EPOCH 1 - PROGRESS: at 46.33% examples, 96200 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:27,594 : INFO : EPOCH 1 - PROGRESS: at 48.02% examples, 96249 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:28,595 : INFO : EPOCH 1 - PROGRESS: at 48.91% examples, 96597 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:11:29,737 : INFO : EPOCH 1 - PROGRESS: at 49.44% examples, 96115 words/s, in_qsize

2019-11-13 13:12:32,821 : INFO : EPOCH 2 - PROGRESS: at 12.11% examples, 93938 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:34,090 : INFO : EPOCH 2 - PROGRESS: at 13.71% examples, 93857 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:35,329 : INFO : EPOCH 2 - PROGRESS: at 15.21% examples, 94057 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:36,492 : INFO : EPOCH 2 - PROGRESS: at 16.91% examples, 94726 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:12:37,704 : INFO : EPOCH 2 - PROGRESS: at 18.56% examples, 95014 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:38,724 : INFO : EPOCH 2 - PROGRESS: at 19.90% examples, 95968 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:12:39,755 : INFO : EPOCH 2 - PROGRESS: at 20.81% examples, 95094 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:40,881 : INFO : EPOCH 2 - PROGRESS: at 22.27% examples, 95306 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:12:41,920 : INFO : EPOCH 2 - PROGRESS: at 23.72% examples, 95867 words/s, in_qsize

2019-11-13 13:13:50,694 : INFO : EPOCH 2 - PROGRESS: at 93.28% examples, 96034 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:51,802 : INFO : EPOCH 2 - PROGRESS: at 94.14% examples, 95956 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:52,813 : INFO : EPOCH 2 - PROGRESS: at 94.79% examples, 95910 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:53,816 : INFO : EPOCH 2 - PROGRESS: at 95.53% examples, 95944 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:13:54,910 : INFO : EPOCH 2 - PROGRESS: at 96.33% examples, 95895 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:55,926 : INFO : EPOCH 2 - PROGRESS: at 96.89% examples, 95920 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:56,928 : INFO : EPOCH 2 - PROGRESS: at 97.61% examples, 95879 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:13:57,977 : INFO : EPOCH 2 - PROGRESS: at 98.78% examples, 95864 words/s, in_qsize 12, out_qsize 0
2019-11-13 13:13:58,277 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-

2019-11-13 13:15:03,314 : INFO : EPOCH 3 - PROGRESS: at 59.63% examples, 86299 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:04,320 : INFO : EPOCH 3 - PROGRESS: at 60.28% examples, 86199 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:15:05,365 : INFO : EPOCH 3 - PROGRESS: at 61.16% examples, 85993 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:06,413 : INFO : EPOCH 3 - PROGRESS: at 62.18% examples, 86119 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:07,551 : INFO : EPOCH 3 - PROGRESS: at 62.99% examples, 85790 words/s, in_qsize 16, out_qsize 1
2019-11-13 13:15:08,796 : INFO : EPOCH 3 - PROGRESS: at 63.77% examples, 85439 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:09,832 : INFO : EPOCH 3 - PROGRESS: at 64.05% examples, 84728 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:10,841 : INFO : EPOCH 3 - PROGRESS: at 64.49% examples, 84387 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:15:11,861 : INFO : EPOCH 3 - PROGRESS: at 64.92% examples, 84036 words/s, in_qsize

2019-11-13 13:16:19,096 : INFO : EPOCH 4 - PROGRESS: at 17.84% examples, 64878 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:20,136 : INFO : EPOCH 4 - PROGRESS: at 18.91% examples, 65499 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:21,213 : INFO : EPOCH 4 - PROGRESS: at 19.90% examples, 65998 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:22,527 : INFO : EPOCH 4 - PROGRESS: at 20.71% examples, 65563 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:23,611 : INFO : EPOCH 4 - PROGRESS: at 22.11% examples, 66692 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:24,763 : INFO : EPOCH 4 - PROGRESS: at 23.09% examples, 66607 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:25,892 : INFO : EPOCH 4 - PROGRESS: at 24.31% examples, 67299 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:26,984 : INFO : EPOCH 4 - PROGRESS: at 25.18% examples, 67582 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:16:28,066 : INFO : EPOCH 4 - PROGRESS: at 25.93% examples, 67427 words/s, in_qsize

2019-11-13 13:17:38,803 : INFO : EPOCH 4 - PROGRESS: at 91.08% examples, 81217 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:39,867 : INFO : EPOCH 4 - PROGRESS: at 92.23% examples, 81257 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:40,904 : INFO : EPOCH 4 - PROGRESS: at 93.11% examples, 81391 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:41,918 : INFO : EPOCH 4 - PROGRESS: at 93.97% examples, 81472 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:17:42,944 : INFO : EPOCH 4 - PROGRESS: at 94.68% examples, 81630 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:43,992 : INFO : EPOCH 4 - PROGRESS: at 95.36% examples, 81684 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:45,113 : INFO : EPOCH 4 - PROGRESS: at 96.12% examples, 81763 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:46,285 : INFO : EPOCH 4 - PROGRESS: at 96.82% examples, 81869 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:17:47,352 : INFO : EPOCH 4 - PROGRESS: at 97.53% examples, 81981 words/s, in_qsize

2019-11-13 13:18:50,257 : INFO : EPOCH 5 - PROGRESS: at 57.76% examples, 87913 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:51,310 : INFO : EPOCH 5 - PROGRESS: at 58.51% examples, 87791 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:52,317 : INFO : EPOCH 5 - PROGRESS: at 59.46% examples, 88106 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:53,345 : INFO : EPOCH 5 - PROGRESS: at 60.14% examples, 87948 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:54,387 : INFO : EPOCH 5 - PROGRESS: at 61.43% examples, 88294 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:18:55,468 : INFO : EPOCH 5 - PROGRESS: at 62.41% examples, 88351 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:56,476 : INFO : EPOCH 5 - PROGRESS: at 63.36% examples, 88359 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:57,564 : INFO : EPOCH 5 - PROGRESS: at 64.20% examples, 88498 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:18:58,590 : INFO : EPOCH 5 - PROGRESS: at 64.87% examples, 88497 words/s, in_qsize

(47229637, 61784180)

In [12]:
# Persist the trained OCR model to a file in the working directory.
print("\n\n[INFO] Save the model")
embedding_model_ocr.save("./embedding_model_ocr.model")

2019-11-13 13:19:33,746 : INFO : saving Word2Vec object under ./embedding_model_ocr.model, separately None
2019-11-13 13:19:33,757 : INFO : storing np array 'vectors' to ./embedding_model_ocr.model.wv.vectors.npy




[INFO] Save the model


2019-11-13 13:19:35,913 : INFO : not storing attribute vectors_norm
2019-11-13 13:19:35,941 : INFO : storing np array 'syn1neg' to ./embedding_model_ocr.model.trainables.syn1neg.npy
2019-11-13 13:19:37,918 : INFO : not storing attribute cum_table
2019-11-13 13:19:40,296 : INFO : saved ./embedding_model_ocr.model


## Corrected

In [13]:
# One entry per DataFrame row; each entry is that row's list of lower-cased,
# tokenised corrected sentences.
# NOTE: this rebinds `list_sentences`, replacing the OCR list built earlier.
list_sentences = db_sentence["corrected_sentencizer_cleaned"].to_list()
# len() counts rows here, not individual sentences, so the label says "rows"
# (the previous "#sentences" label under-reported the real sentence count).
print('#rows (each a list of sentences): {}'.format(len(list_sentences)))

#sentences: 30509


In [14]:
# Flatten one level: the per-row lists of sentences become a single list of
# sentences, each sentence being a list of tokens.
flattened_list_sentences = [
    sentence
    for row_sentences in list_sentences
    for sentence in row_sentences
]

In [15]:
# Inspect the first flattened corrected sentence (a list of lower-cased tokens).
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'trial',
 'splash',
 ':',
 'peler',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'during',
 'the',
 'elimination',
 'section',
 '.']

In [16]:
# Build the vocabulary from the flattened corrected sentences, then train on
# the same corpus for the configured number of epochs.
embedding_model_corrected.build_vocab(flattened_list_sentences)
# train() is the last expression of the cell, so its return value is shown as
# the cell output.
# NOTE(review): presumably (effective word count, raw word count) -- confirm
# against the gensim documentation.
embedding_model_corrected.train(flattened_list_sentences, 
                                total_examples=embedding_model_corrected.corpus_count,
                                epochs=w2v_args.epochs,  
                                compute_loss=w2v_args.compute_loss)

2019-11-13 13:19:41,210 : INFO : collecting all words and their counts
2019-11-13 13:19:41,212 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-13 13:19:41,267 : INFO : PROGRESS: at sentence #10000, processed 238920 words, keeping 21363 word types
2019-11-13 13:19:41,314 : INFO : PROGRESS: at sentence #20000, processed 462924 words, keeping 30432 word types
2019-11-13 13:19:41,366 : INFO : PROGRESS: at sentence #30000, processed 706176 words, keeping 39228 word types
2019-11-13 13:19:41,429 : INFO : PROGRESS: at sentence #40000, processed 967747 words, keeping 46390 word types
2019-11-13 13:19:41,480 : INFO : PROGRESS: at sentence #50000, processed 1206328 words, keeping 52319 word types
2019-11-13 13:19:41,532 : INFO : PROGRESS: at sentence #60000, processed 1454073 words, keeping 57131 word types
2019-11-13 13:19:41,586 : INFO : PROGRESS: at sentence #70000, processed 1709317 words, keeping 61561 word types
2019-11-13 13:19:41,638 : INFO : PROGRESS: 

2019-11-13 13:20:28,469 : INFO : EPOCH 1 - PROGRESS: at 23.58% examples, 137723 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:29,542 : INFO : EPOCH 1 - PROGRESS: at 25.49% examples, 137449 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:30,608 : INFO : EPOCH 1 - PROGRESS: at 27.06% examples, 136552 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:31,686 : INFO : EPOCH 1 - PROGRESS: at 28.76% examples, 136387 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:32,701 : INFO : EPOCH 1 - PROGRESS: at 30.26% examples, 136369 words/s, in_qsize 16, out_qsize 1
2019-11-13 13:20:33,750 : INFO : EPOCH 1 - PROGRESS: at 31.90% examples, 136144 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:20:34,775 : INFO : EPOCH 1 - PROGRESS: at 33.59% examples, 136124 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:35,817 : INFO : EPOCH 1 - PROGRESS: at 35.49% examples, 136600 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:20:36,858 : INFO : EPOCH 1 - PROGRESS: at 37.15% examples, 136394 words/s,

2019-11-13 13:21:39,457 : INFO : EPOCH 2 - PROGRESS: at 21.26% examples, 108683 words/s, in_qsize 16, out_qsize 1
2019-11-13 13:21:40,577 : INFO : EPOCH 2 - PROGRESS: at 22.41% examples, 107633 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:21:41,599 : INFO : EPOCH 2 - PROGRESS: at 23.98% examples, 108724 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:21:42,638 : INFO : EPOCH 2 - PROGRESS: at 25.40% examples, 108531 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:21:43,681 : INFO : EPOCH 2 - PROGRESS: at 26.64% examples, 108076 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:21:44,727 : INFO : EPOCH 2 - PROGRESS: at 27.77% examples, 107314 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:21:45,773 : INFO : EPOCH 2 - PROGRESS: at 28.85% examples, 106306 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:21:46,846 : INFO : EPOCH 2 - PROGRESS: at 29.67% examples, 104730 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:21:47,911 : INFO : EPOCH 2 - PROGRESS: at 30.91% examples, 104681 words/s,

2019-11-13 13:22:49,739 : INFO : EPOCH 3 - PROGRESS: at 17.81% examples, 111165 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:50,745 : INFO : EPOCH 3 - PROGRESS: at 19.41% examples, 112864 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:51,845 : INFO : EPOCH 3 - PROGRESS: at 20.91% examples, 113728 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:52,861 : INFO : EPOCH 3 - PROGRESS: at 22.19% examples, 113784 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:22:53,896 : INFO : EPOCH 3 - PROGRESS: at 23.98% examples, 115690 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:55,021 : INFO : EPOCH 3 - PROGRESS: at 25.49% examples, 114949 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:56,118 : INFO : EPOCH 3 - PROGRESS: at 26.81% examples, 114162 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:22:57,119 : INFO : EPOCH 3 - PROGRESS: at 28.07% examples, 113978 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:22:58,183 : INFO : EPOCH 3 - PROGRESS: at 29.39% examples, 113458 words/s,

2019-11-13 13:23:59,366 : INFO : EPOCH 4 - PROGRESS: at 19.05% examples, 131764 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:24:00,399 : INFO : EPOCH 4 - PROGRESS: at 20.40% examples, 130838 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:24:01,556 : INFO : EPOCH 4 - PROGRESS: at 22.11% examples, 131291 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:24:02,596 : INFO : EPOCH 4 - PROGRESS: at 23.98% examples, 132710 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:24:03,611 : INFO : EPOCH 4 - PROGRESS: at 25.67% examples, 132411 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:24:04,798 : INFO : EPOCH 4 - PROGRESS: at 27.46% examples, 132140 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:24:05,839 : INFO : EPOCH 4 - PROGRESS: at 29.31% examples, 133224 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:24:06,858 : INFO : EPOCH 4 - PROGRESS: at 30.58% examples, 132280 words/s, in_qsize 14, out_qsize 1
2019-11-13 13:24:07,899 : INFO : EPOCH 4 - PROGRESS: at 32.48% examples, 133360 words/s,

2019-11-13 13:25:09,184 : INFO : EPOCH 5 - PROGRESS: at 29.94% examples, 133612 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:10,192 : INFO : EPOCH 5 - PROGRESS: at 31.74% examples, 134489 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:25:11,217 : INFO : EPOCH 5 - PROGRESS: at 33.26% examples, 133875 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:12,310 : INFO : EPOCH 5 - PROGRESS: at 35.26% examples, 134449 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:13,374 : INFO : EPOCH 5 - PROGRESS: at 36.84% examples, 133887 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:14,522 : INFO : EPOCH 5 - PROGRESS: at 38.57% examples, 133764 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:15,532 : INFO : EPOCH 5 - PROGRESS: at 40.40% examples, 134366 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:16,595 : INFO : EPOCH 5 - PROGRESS: at 41.85% examples, 133675 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:25:17,610 : INFO : EPOCH 5 - PROGRESS: at 43.30% examples, 133517 words/s,

(42049983, 60620435)

In [17]:
print("\n\n[INFO] Save the model")
embedding_model_corrected.save("./embedding_model_corrected.model")

2019-11-13 13:25:52,355 : INFO : saving Word2Vec object under ./embedding_model_corrected.model, separately None
2019-11-13 13:25:52,358 : INFO : storing np array 'vectors' to ./embedding_model_corrected.model.wv.vectors.npy




[INFO] Save the model


2019-11-13 13:25:53,004 : INFO : not storing attribute vectors_norm
2019-11-13 13:25:53,011 : INFO : storing np array 'syn1neg' to ./embedding_model_corrected.model.trainables.syn1neg.npy
2019-11-13 13:25:53,624 : INFO : not storing attribute cum_table
2019-11-13 13:25:54,038 : INFO : saved ./embedding_model_corrected.model
