# Generate Language models

In [1]:
from argparse import Namespace

import copy
import glob
import os
import pandas as pd
import pickle
import re
import spacy
from time import time

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
#nlp = spacy.load("en_core_web_lg")

## Initialize a pre-trained language model

In [3]:
# Supported model types (matched case-insensitively):
#   "w2v" / "word2vec": gensim Word2Vec
#   "ft"  / "fasttext": gensim FastText
model_type = "w2v"
# NOTE(review): machine-specific absolute path; point it at your own copy of the pre-trained model.
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model"

model_type_key = model_type.lower()
if model_type_key in ["w2v", "word2vec"]:
    # Word2Vec
    embedding_model = Word2Vec.load(model_path)
elif model_type_key in ["ft", "fasttext"]:
    # FastText
    # Imported here because the notebook's import cell only brings in Word2Vec;
    # without this import the branch fails with a NameError.
    from gensim.models import FastText
    embedding_model = FastText.load(model_path)
else:
    # Fail with a clear message instead of a NameError on `embedding_model` below.
    raise ValueError(
        "Unsupported model_type {!r}; expected one of 'w2v', 'word2vec', 'ft', 'fasttext'".format(model_type)
    )
print(embedding_model)
# Keep an untouched copy of the loaded model; later cells deep-copy it again before updating it.
embedding_model_orig = copy.deepcopy(embedding_model)

2019-11-13 13:20:52,898 : INFO : loading Word2Vec object from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model
2019-11-13 13:20:55,374 : INFO : loading wv recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.* with mmap=None
2019-11-13 13:20:55,378 : INFO : loading vectors from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.vectors.npy with mmap=None
2019-11-13 13:20:56,400 : INFO : setting ignored attribute vectors_norm to None
2019-11-13 13:20:56,403 : INFO : loading vocabulary recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/m

Word2Vec(vocab=434049, size=300, alpha=0.03)


## Read DB

In [4]:
# Load the pickled table from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
db_sentence = pd.read_pickle("./db_trove_sentence_with_lookup.pkl")
# Preview the first rows.
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, ...","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7...","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1...","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -..."


## Preprocess DB

In [5]:
# Disabled legacy regex-based cleanup, kept only for reference: the whole definition is
# wrapped in a string literal, so nothing in it is executed (the cell just echoes the string).
# NOTE(review): if this is ever re-enabled, `corpus` is read before it is assigned
# (the parameters are `myrow` and `colname`), so the body needs reworking first.
"""
def cleanup(myrow, colname="corrected"):
    # remove all # and @§
    
    corpus = [re.sub(r'#', '', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [re.sub(r'@', '', element, flags=re.IGNORECASE) for element in corpus]
    
    # --- remove 2 or more .
    corpus = [re.sub('[.]{2,}', '.', element) for element in corpus]
    # --- add a space before and after a list of punctuations
    corpus = [re.sub(r"([.,!?:;\"\'])", r" \1 ", element) for element in corpus]
    # --- remove everything except:
    #corpus = [re.sub(r"([^a-zA-Z\-.:;,!?\d+]+)", r" ", element) for element in corpus]
    corpus = [re.sub(r"([^a-zA-Z\d+]+)", r" ", element) for element in corpus]
    # --- replace numbers with <NUM>
    corpus = [re.sub(r'\b\d+\b', '<NUM>', element) for element in corpus]
    corpus = [re.sub('--', '', element) for element in corpus]
    # --- normalize white spaces
    corpus = [re.sub('\s+', ' ', element) for element in corpus]
    
    # remove multiple spaces
    corpus = [re.sub(r'\s+', ' ', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [element.strip() for element in corpus]
    #corpus = [element.lower() for element in corpus]
    return corpus
"""

'\ndef cleanup(myrow, colname="corrected"):\n    # remove all # and @§\n    \n    corpus = [re.sub(r\'#\', \'\', element, flags=re.IGNORECASE) for element in corpus]\n    corpus = [re.sub(r\'@\', \'\', element, flags=re.IGNORECASE) for element in corpus]\n    \n    # --- remove 2 or more .\n    corpus = [re.sub(\'[.]{2,}\', \'.\', element) for element in corpus]\n    # --- add a space before and after a list of punctuations\n    corpus = [re.sub(r"([.,!?:;"\'])", r" \x01 ", element) for element in corpus]\n    # --- remove everything except:\n    #corpus = [re.sub(r"([^a-zA-Z\\-.:;,!?\\d+]+)", r" ", element) for element in corpus]\n    corpus = [re.sub(r"([^a-zA-Z\\d+]+)", r" ", element) for element in corpus]\n    # --- replace numbers with <NUM>\n    corpus = [re.sub(r\'\x08\\d+\x08\', \'<NUM>\', element) for element in corpus]\n    corpus = [re.sub(\'--\', \'\', element) for element in corpus]\n    # --- normalize white spaces\n    corpus = [re.sub(\'\\s+\', \' \', element) for elem

In [6]:
def cleanup(myrow, col_name):
    """Lower-case every token of the tokenised sentences stored under one key of a row.

    Args:
        myrow: Row-like object indexable by ``col_name`` (for example the row
            that ``DataFrame.apply`` passes when ``axis=1``).
        col_name: Key of the entry holding an iterable of sentences, each of
            which is an iterable of string tokens.

    Returns:
        A new list of lists with the same nesting as the input, in which every
        token has been lower-cased. The input is not modified.
    """
    return [[word.lower() for word in sentence] for sentence in myrow[col_name]]

In [7]:
# Row-wise lower-casing of the tokenised sentences; each result is stored in a new
# "<column>_cleaned" column, leaving the source columns untouched.
db_sentence["ocr_sentencizer_cleaned"] = db_sentence.apply(
    cleanup, axis=1, args=("ocr_sentencizer",)
)
db_sentence["corrected_sentencizer_cleaned"] = db_sentence.apply(
    cleanup, axis=1, args=("corrected_sentencizer",)
)

In [8]:
# Preview the table again to check the two new "*_cleaned" columns.
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup,ocr_sentencizer_cleaned,corrected_sentencizer_cleaned
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[4, 5, 8, 2, 3, 2, 5, 6, 1, -5, -6, 8, 4, 4, ...","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[4, 5, 8, 2, 3, 2, -5, 1, 1, 2, -3, 1, -5, -6...","[[from, river, crossing, to, end, of, triäü, i...","[[from, river, crossing, to, end, of, trial, s..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[7, 10, -7, 6, 4, 4, 3, 3, 4, 5, 6, 4, 6, 5, ...","[[natural, childbirth, sir,-we, nurses, have, ...","[[natural, childbirth, sir,-we, nurses, have, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[5, 6, 7, 12, 11, 3, 5, 11, 2, 3, 5, 12, 6, 7...","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[5, 6, 1, 7, 1, 12, 1, 1, 11, 1, 3, 5, -12, 3...","[[first, church, i, service, 1, presbyterian, ...","[[first, church, service, presbyterian, annive..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 6, 6, 1, 1, 3, 1, 1...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[1, 3, 1, -6, 2, 5, 7, 8, 8, 8, 1], [4, 5, 6,...","[["", bob, "", lulham, 's, fight, against, thall...","[["", bob, "", lulham, 's, fight, against, thall..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 1, 7, -5, 7, 4, 1, 1...","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[7, 6, 3, 5, 2, 7, 1, 2, 7, -5, 7, 4, 1, 1, -...","[[divorce, before, the, judge, in, divorce, ,,...","[[divorce, before, the, judge, in, divorce, ,,..."


## Update a pre-trained LM

In [9]:
# Training settings used when updating the pre-trained model.
# Only `epochs` and `compute_loss` are active. The commented-out entries below are the
# additional hyper-parameters read by the disabled "train from scratch" template; they
# must be restored before that template can run.
w2v_args = Namespace(
    epochs=5,                                        # Passed as `epochs=` to train() in the update cells.
    # only for Word2Vec
    compute_loss=True,                               # If True, computes and stores loss value which can be retrieved using get_latest_training_loss().

#     size=100,                                        # Dimensionality of the word vectors.
#     alpha=0.03,                                      # The initial learning rate.
#     min_alpha=0.0007,                                # Learning rate will linearly drop to min_alpha as training progresses.
#     sg=1,                                            # Training algorithm: skip-gram if sg=1, otherwise CBOW.
#     hs=0,                                            # If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
#     negative=20,                                     # If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
#     min_count=5,                                     # The model ignores all words with total frequency lower than this.
#     window=5,                                        # The maximum distance between the current and predicted word within a sentence.
#     sample=1e-3,                                     # The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
#     workers=8,
#     cbow_mean=1,                                     # If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
#     null_word=0,                                     #
#     trim_rule=None,                                  #
#     sorted_vocab=1,                                  # If 1, sort the vocabulary by descending frequency before assigning word indices.
#     batch_words=10000,                               # Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual texts are longer than 10000 words, but the standard cython code truncates to that maximum.)

#     seed=1364,                                       # Seed for the random number generator.
#     # only for FastText (compare to word2vec)
#     #word_ngrams=1,                                   # If 1, enriches word vectors with subword (n-gram) information. If 0, this is equivalent to Word2Vec.
#     #min_n=2,                                         # Minimum length of char n-grams to be used for training word representations.
#     #max_n=15,                                        # Max length of char ngrams to be used for training word representations. Set max_n to be lesser than min_n to avoid char ngrams being used.
#     #bucket=2000000                                   # Character ngrams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model.
)

In [10]:
# Template for training a new Word2Vec model from scratch instead of updating a pre-trained one.
# It is disabled: the constructor call is wrapped in a string literal, so the cell only echoes the text.
# NOTE(review): it reads w2v_args.size, .alpha, .min_alpha, .sg, .hs, .negative, .min_count, .window,
# .sample, .workers, .cbow_mean, .null_word, .trim_rule, .sorted_vocab, .batch_words and .seed, which
# are currently commented out in the `w2v_args` definition; restore them before re-enabling this.
"""
w2v_model = Word2Vec(
                 size=w2v_args.size, 
                 alpha=w2v_args.alpha,
                 min_alpha=w2v_args.min_alpha, 
                 sg=w2v_args.sg, 
                 hs=w2v_args.hs, 
                 negative=w2v_args.negative, 
                 iter=w2v_args.epochs, 
                 min_count=w2v_args.min_count, 
                 window=w2v_args.window, 
                 sample=w2v_args.sample, 
                 workers=w2v_args.workers, 
                 cbow_mean=w2v_args.cbow_mean, 
                 null_word=w2v_args.null_word, 
                 trim_rule=w2v_args.trim_rule, 
                 sorted_vocab=w2v_args.sorted_vocab, 
                 batch_words=w2v_args.batch_words, 
                 seed=w2v_args.seed, 
                 compute_loss=w2v_args.compute_loss)
"""

'\nw2v_model = Word2Vec(\n                 size=w2v_args.size, \n                 alpha=w2v_args.alpha,\n                 min_alpha=w2v_args.min_alpha, \n                 sg=w2v_args.sg, \n                 hs=w2v_args.hs, \n                 negative=w2v_args.negative, \n                 iter=w2v_args.epochs, \n                 min_count=w2v_args.min_count, \n                 window=w2v_args.window, \n                 sample=w2v_args.sample, \n                 workers=w2v_args.workers, \n                 cbow_mean=w2v_args.cbow_mean, \n                 null_word=w2v_args.null_word, \n                 trim_rule=w2v_args.trim_rule, \n                 sorted_vocab=w2v_args.sorted_vocab, \n                 batch_words=w2v_args.batch_words, \n                 seed=w2v_args.seed, \n                 compute_loss=w2v_args.compute_loss)\n'

## Preprocess before creating/updating LM

In [11]:
# Disabled lemmatisation helper (wrapped in a string literal, so it is never defined).
# NOTE(review): it depends on `nlp`, whose `spacy.load(...)` call is commented out near the
# top of the notebook; that has to be restored before this can be re-enabled.
"""
def preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):
    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]
    return txt
"""

'\ndef preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):\n    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]\n    return txt\n'

In [12]:
# Disabled: would run preprocess4LM on the first 10 rows only (db_sentence[0:10]).
# NOTE(review): the "ocrText_cleaned" and "corrected_cleaned" columns it reads are not created
# anywhere in the visible part of this notebook; confirm they exist before re-enabling.
"""
db_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)
db_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)
"""

'\ndb_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)\ndb_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)\n'

## Update the pre-trained LM with the OCR text

In [13]:
# One entry per DataFrame row; each entry is that row's list of lower-cased, tokenised OCR sentences.
list_sentences = db_sentence["ocr_sentencizer_cleaned"].to_list()
# NOTE(review): despite the "#sentences" label this prints the number of rows; every row holds a
# list of sentences, so the number of individual sentences is only known after flattening.
print('#sentences: {}'.format(len(list_sentences)))

#sentences: 30509


In [14]:
# Collapse the per-row lists of sentences into one flat list of sentences (each a list of tokens).
flattened_list_sentences = [
    sentence
    for row_sentences in list_sentences
    for sentence in row_sentences
]

In [15]:
# Inspect the first flattened OCR sentence (a list of tokens) as a sanity check.
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'triäü',
 'i',
 '^',
 'pi',
 'a^h',
 '"',
 'pclcr',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'jr',
 'la',
 'jil',
 '?',
 'during',
 '{',
 '|',
 ')',
 'c',
 'elimination',
 'section',
 '.']

In [16]:
# Work on an independent copy so the loaded model in `embedding_model_orig` stays unchanged.
embedding_model_ocr = copy.deepcopy(embedding_model_orig)

In [17]:
# Override training settings on the copied model before its vocabulary is updated.
# NOTE(review): confirm that assigning these attributes on a loaded model is honoured by the
# installed gensim version.
embedding_model_ocr.workers = 8
# min_count is the frequency threshold below which words are ignored; 0 is presumably meant
# to keep every word of the new corpus, including rare OCR variants.
embedding_model_ocr.min_count = 0

  


In [18]:
# Extend the copied model's vocabulary with the words of the OCR sentences
# (update=True adds to the existing vocabulary instead of rebuilding it).
embedding_model_ocr.build_vocab(flattened_list_sentences, update=True)
# Continue training on the same sentences. build_vocab has to run first because
# total_examples is read from the model's corpus_count attribute after that call.
# The value displayed after the logs is the tuple returned by train().
embedding_model_ocr.train(flattened_list_sentences, 
                          total_examples=embedding_model_ocr.corpus_count,
                          epochs=w2v_args.epochs,  
                          compute_loss=w2v_args.compute_loss)

2019-11-13 13:22:22,416 : INFO : collecting all words and their counts
2019-11-13 13:22:22,417 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-13 13:22:22,802 : INFO : PROGRESS: at sentence #10000, processed 525154 words, keeping 85319 word types
2019-11-13 13:22:23,027 : INFO : PROGRESS: at sentence #20000, processed 856174 words, keeping 117320 word types
2019-11-13 13:22:23,286 : INFO : PROGRESS: at sentence #30000, processed 1262312 words, keeping 157071 word types
2019-11-13 13:22:23,515 : INFO : PROGRESS: at sentence #40000, processed 1640602 words, keeping 191468 word types
2019-11-13 13:22:23,709 : INFO : PROGRESS: at sentence #50000, processed 1983065 words, keeping 219352 word types
2019-11-13 13:22:23,937 : INFO : PROGRESS: at sentence #60000, processed 2332908 words, keeping 244407 word types
2019-11-13 13:22:24,181 : INFO : PROGRESS: at sentence #70000, processed 2683098 words, keeping 272037 word types
2019-11-13 13:22:24,440 : INFO : PR

2019-11-13 13:27:28,193 : INFO : EPOCH 1 - PROGRESS: at 51.82% examples, 134812 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:29,236 : INFO : EPOCH 1 - PROGRESS: at 53.12% examples, 134512 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:30,267 : INFO : EPOCH 1 - PROGRESS: at 54.47% examples, 134179 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:31,338 : INFO : EPOCH 1 - PROGRESS: at 56.52% examples, 134846 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:32,449 : INFO : EPOCH 1 - PROGRESS: at 57.95% examples, 134341 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:33,479 : INFO : EPOCH 1 - PROGRESS: at 59.29% examples, 134515 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:34,537 : INFO : EPOCH 1 - PROGRESS: at 60.52% examples, 134548 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:35,646 : INFO : EPOCH 1 - PROGRESS: at 62.11% examples, 134425 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:27:36,659 : INFO : EPOCH 1 - PROGRESS: at 63.51% examples, 134410 words/s,

2019-11-13 13:28:37,010 : INFO : EPOCH 2 - PROGRESS: at 49.44% examples, 133441 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:28:38,093 : INFO : EPOCH 2 - PROGRESS: at 50.20% examples, 132934 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:28:39,138 : INFO : EPOCH 2 - PROGRESS: at 51.31% examples, 132527 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:28:40,175 : INFO : EPOCH 2 - PROGRESS: at 52.56% examples, 132114 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:28:41,212 : INFO : EPOCH 2 - PROGRESS: at 53.94% examples, 131854 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:28:42,231 : INFO : EPOCH 2 - PROGRESS: at 55.68% examples, 132592 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:28:43,260 : INFO : EPOCH 2 - PROGRESS: at 57.32% examples, 132352 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:28:44,326 : INFO : EPOCH 2 - PROGRESS: at 58.65% examples, 132279 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:28:45,363 : INFO : EPOCH 2 - PROGRESS: at 59.91% examples, 132377 words/s,

2019-11-13 13:29:45,469 : INFO : EPOCH 3 - PROGRESS: at 47.85% examples, 135728 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:46,475 : INFO : EPOCH 3 - PROGRESS: at 48.94% examples, 135702 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:29:47,505 : INFO : EPOCH 3 - PROGRESS: at 49.74% examples, 135565 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:48,612 : INFO : EPOCH 3 - PROGRESS: at 50.66% examples, 135115 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:49,640 : INFO : EPOCH 3 - PROGRESS: at 52.31% examples, 135717 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:29:50,824 : INFO : EPOCH 3 - PROGRESS: at 53.80% examples, 135032 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:51,858 : INFO : EPOCH 3 - PROGRESS: at 55.50% examples, 135658 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:52,972 : INFO : EPOCH 3 - PROGRESS: at 57.24% examples, 135219 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:29:53,979 : INFO : EPOCH 3 - PROGRESS: at 58.57% examples, 135272 words/s,

2019-11-13 13:30:55,466 : INFO : EPOCH 4 - PROGRESS: at 48.21% examples, 136060 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:30:56,506 : INFO : EPOCH 4 - PROGRESS: at 49.27% examples, 136132 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:30:57,593 : INFO : EPOCH 4 - PROGRESS: at 50.13% examples, 136212 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:30:58,614 : INFO : EPOCH 4 - PROGRESS: at 51.31% examples, 136017 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:30:59,691 : INFO : EPOCH 4 - PROGRESS: at 52.73% examples, 135759 words/s, in_qsize 14, out_qsize 1
2019-11-13 13:31:00,696 : INFO : EPOCH 4 - PROGRESS: at 54.39% examples, 136293 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:31:01,826 : INFO : EPOCH 4 - PROGRESS: at 55.99% examples, 135943 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:31:02,879 : INFO : EPOCH 4 - PROGRESS: at 57.89% examples, 136341 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:31:03,925 : INFO : EPOCH 4 - PROGRESS: at 59.07% examples, 136032 words/s,

2019-11-13 13:32:05,424 : INFO : EPOCH 5 - PROGRESS: at 48.15% examples, 135581 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:06,463 : INFO : EPOCH 5 - PROGRESS: at 49.20% examples, 135687 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:07,469 : INFO : EPOCH 5 - PROGRESS: at 50.03% examples, 135856 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:32:08,521 : INFO : EPOCH 5 - PROGRESS: at 51.23% examples, 135783 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:09,610 : INFO : EPOCH 5 - PROGRESS: at 52.56% examples, 135294 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:10,610 : INFO : EPOCH 5 - PROGRESS: at 54.31% examples, 136072 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:11,768 : INFO : EPOCH 5 - PROGRESS: at 55.75% examples, 135440 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:32:12,832 : INFO : EPOCH 5 - PROGRESS: at 57.61% examples, 135395 words/s, in_qsize 16, out_qsize 2
2019-11-13 13:32:14,049 : INFO : EPOCH 5 - PROGRESS: at 59.07% examples, 135310 words/s,

(47229506, 61784180)

In [19]:
# Persist the OCR-updated model in the current working directory; per the log output, gensim
# also writes companion .npy files next to the main file.
print("\n\n[INFO] Save the model")
embedding_model_ocr.save("./w2v_005_embedding_model_ocr.model")

2019-11-13 13:32:42,913 : INFO : saving Word2Vec object under ./w2v_005_embedding_model_ocr.model, separately None
2019-11-13 13:32:42,914 : INFO : storing np array 'vectors' to ./w2v_005_embedding_model_ocr.model.wv.vectors.npy




[INFO] Save the model


2019-11-13 13:32:45,335 : INFO : not storing attribute vectors_norm
2019-11-13 13:32:45,336 : INFO : storing np array 'syn1neg' to ./w2v_005_embedding_model_ocr.model.trainables.syn1neg.npy
2019-11-13 13:32:47,731 : INFO : not storing attribute cum_table
2019-11-13 13:32:50,802 : INFO : saved ./w2v_005_embedding_model_ocr.model


## Update the pre-trained LM with the corrected text

In [20]:
# One entry per DataFrame row; each entry is that row's list of lower-cased, tokenised corrected sentences.
# This rebinds `list_sentences`, which previously held the OCR sentences.
list_sentences = db_sentence["corrected_sentencizer_cleaned"].to_list()
# NOTE(review): despite the "#sentences" label this prints the number of rows; every row holds a
# list of sentences, so the number of individual sentences is only known after flattening.
print('#sentences: {}'.format(len(list_sentences)))

#sentences: 30509


In [21]:
# Collapse the per-row lists of sentences into one flat list of sentences (each a list of tokens).
flattened_list_sentences = [
    sentence
    for row_sentences in list_sentences
    for sentence in row_sentences
]

In [22]:
# Inspect the first flattened corrected sentence (a list of tokens) as a sanity check.
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'trial',
 'splash',
 ':',
 'peler',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'during',
 'the',
 'elimination',
 'section',
 '.']

In [23]:
# Start again from the unmodified loaded model so this update is independent of the OCR one.
embedding_model_corrected = copy.deepcopy(embedding_model_orig)

In [24]:
# Override training settings on the copied model before its vocabulary is updated.
# NOTE(review): confirm that assigning these attributes on a loaded model is honoured by the
# installed gensim version.
embedding_model_corrected.workers = 8
# min_count is the frequency threshold below which words are ignored; 0 is presumably meant
# to keep every word of the new corpus.
embedding_model_corrected.min_count = 0

  


In [25]:
# Extend the copied model's vocabulary with the words of the corrected sentences
# (update=True adds to the existing vocabulary instead of rebuilding it).
embedding_model_corrected.build_vocab(flattened_list_sentences, update=True)
# Continue training on the same sentences. build_vocab has to run first because
# total_examples is read from the model's corpus_count attribute after that call.
# The value displayed after the logs is the tuple returned by train().
embedding_model_corrected.train(flattened_list_sentences, 
                                total_examples=embedding_model_corrected.corpus_count,
                                epochs=w2v_args.epochs,  
                                compute_loss=w2v_args.compute_loss)

2019-11-13 13:33:01,833 : INFO : collecting all words and their counts
2019-11-13 13:33:01,834 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-13 13:33:01,876 : INFO : PROGRESS: at sentence #10000, processed 238920 words, keeping 21363 word types
2019-11-13 13:33:01,913 : INFO : PROGRESS: at sentence #20000, processed 462924 words, keeping 30432 word types
2019-11-13 13:33:01,952 : INFO : PROGRESS: at sentence #30000, processed 706176 words, keeping 39228 word types
2019-11-13 13:33:01,996 : INFO : PROGRESS: at sentence #40000, processed 967747 words, keeping 46390 word types
2019-11-13 13:33:02,038 : INFO : PROGRESS: at sentence #50000, processed 1206328 words, keeping 52319 word types
2019-11-13 13:33:02,083 : INFO : PROGRESS: at sentence #60000, processed 1454073 words, keeping 57131 word types
2019-11-13 13:33:02,125 : INFO : PROGRESS: at sentence #70000, processed 1709317 words, keeping 61561 word types
2019-11-13 13:33:02,166 : INFO : PROGRESS: 

2019-11-13 13:33:36,106 : INFO : EPOCH 1 - PROGRESS: at 26.24% examples, 151265 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:37,142 : INFO : EPOCH 1 - PROGRESS: at 28.07% examples, 151362 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:38,268 : INFO : EPOCH 1 - PROGRESS: at 29.94% examples, 151063 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:39,358 : INFO : EPOCH 1 - PROGRESS: at 31.87% examples, 151133 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:40,422 : INFO : EPOCH 1 - PROGRESS: at 33.92% examples, 151424 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:41,513 : INFO : EPOCH 1 - PROGRESS: at 35.99% examples, 151415 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:42,585 : INFO : EPOCH 1 - PROGRESS: at 37.94% examples, 151523 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:43,616 : INFO : EPOCH 1 - PROGRESS: at 39.91% examples, 151841 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:33:44,658 : INFO : EPOCH 1 - PROGRESS: at 41.69% examples, 151581 words/s,

2019-11-13 13:34:47,974 : INFO : EPOCH 2 - PROGRESS: at 27.77% examples, 88295 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:34:48,985 : INFO : EPOCH 2 - PROGRESS: at 28.93% examples, 88546 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:34:50,039 : INFO : EPOCH 2 - PROGRESS: at 29.94% examples, 88668 words/s, in_qsize 16, out_qsize 1
2019-11-13 13:34:51,238 : INFO : EPOCH 2 - PROGRESS: at 31.50% examples, 89493 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:34:52,286 : INFO : EPOCH 2 - PROGRESS: at 33.26% examples, 91188 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:34:53,422 : INFO : EPOCH 2 - PROGRESS: at 34.91% examples, 92053 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:34:54,479 : INFO : EPOCH 2 - PROGRESS: at 36.59% examples, 93247 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:34:55,496 : INFO : EPOCH 2 - PROGRESS: at 37.78% examples, 93487 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:34:56,568 : INFO : EPOCH 2 - PROGRESS: at 38.83% examples, 93158 words/s, in_qsize

2019-11-13 13:35:57,947 : INFO : EPOCH 3 - PROGRESS: at 31.17% examples, 134910 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:35:58,947 : INFO : EPOCH 3 - PROGRESS: at 32.82% examples, 135133 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:35:59,972 : INFO : EPOCH 3 - PROGRESS: at 34.81% examples, 136075 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:36:01,025 : INFO : EPOCH 3 - PROGRESS: at 36.84% examples, 137012 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:36:02,037 : INFO : EPOCH 3 - PROGRESS: at 38.65% examples, 137815 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:36:03,219 : INFO : EPOCH 3 - PROGRESS: at 40.09% examples, 135920 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:36:04,286 : INFO : EPOCH 3 - PROGRESS: at 41.77% examples, 135928 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:36:05,292 : INFO : EPOCH 3 - PROGRESS: at 43.23% examples, 135728 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:36:06,660 : INFO : EPOCH 3 - PROGRESS: at 44.49% examples, 133075 words/s,

2019-11-13 13:37:07,052 : INFO : EPOCH 4 - PROGRESS: at 42.18% examples, 139273 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:08,106 : INFO : EPOCH 4 - PROGRESS: at 43.37% examples, 137915 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:09,268 : INFO : EPOCH 4 - PROGRESS: at 45.18% examples, 137625 words/s, in_qsize 16, out_qsize 1
2019-11-13 13:37:10,328 : INFO : EPOCH 4 - PROGRESS: at 46.80% examples, 136889 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:11,447 : INFO : EPOCH 4 - PROGRESS: at 48.81% examples, 136991 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:37:12,466 : INFO : EPOCH 4 - PROGRESS: at 50.70% examples, 137261 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:13,694 : INFO : EPOCH 4 - PROGRESS: at 52.37% examples, 136146 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:14,753 : INFO : EPOCH 4 - PROGRESS: at 53.77% examples, 135547 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:37:15,789 : INFO : EPOCH 4 - PROGRESS: at 55.68% examples, 136275 words/s,

2019-11-13 13:38:16,809 : INFO : EPOCH 5 - PROGRESS: at 52.45% examples, 133987 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:17,813 : INFO : EPOCH 5 - PROGRESS: at 54.23% examples, 134697 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:18,902 : INFO : EPOCH 5 - PROGRESS: at 56.09% examples, 135027 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:20,074 : INFO : EPOCH 5 - PROGRESS: at 57.46% examples, 133715 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:21,090 : INFO : EPOCH 5 - PROGRESS: at 58.91% examples, 133256 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:22,148 : INFO : EPOCH 5 - PROGRESS: at 60.47% examples, 132721 words/s, in_qsize 14, out_qsize 1
2019-11-13 13:38:23,161 : INFO : EPOCH 5 - PROGRESS: at 62.05% examples, 132668 words/s, in_qsize 16, out_qsize 0
2019-11-13 13:38:24,206 : INFO : EPOCH 5 - PROGRESS: at 63.16% examples, 131642 words/s, in_qsize 15, out_qsize 0
2019-11-13 13:38:25,207 : INFO : EPOCH 5 - PROGRESS: at 65.01% examples, 132121 words/s,

(42049881, 60620435)

In [26]:
# Persist the model updated on corrected text in the current working directory; per the log
# output, gensim also writes companion .npy files next to the main file.
print("\n\n[INFO] Save the model")
embedding_model_corrected.save("./w2v_005_embedding_model_corrected.model")

2019-11-13 13:38:46,350 : INFO : saving Word2Vec object under ./w2v_005_embedding_model_corrected.model, separately None
2019-11-13 13:38:46,354 : INFO : storing np array 'vectors' to ./w2v_005_embedding_model_corrected.model.wv.vectors.npy




[INFO] Save the model


2019-11-13 13:38:48,917 : INFO : not storing attribute vectors_norm
2019-11-13 13:38:48,918 : INFO : storing np array 'syn1neg' to ./w2v_005_embedding_model_corrected.model.trainables.syn1neg.npy
2019-11-13 13:38:50,261 : INFO : not storing attribute cum_table
2019-11-13 13:38:51,857 : INFO : saved ./w2v_005_embedding_model_corrected.model
