# Generate Language models from scratch

In [1]:
from argparse import Namespace

import copy
import glob
import os
import pandas as pd
import pickle
import re
import spacy
from time import time

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Read DB

In [2]:
db_sentence = pd.read_pickle("./db_trove_sentence_lookup.pkl")
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[], [], [], [], [], [], [], []]","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[], [], [], [], [], [], [], [], [], []]"
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]"
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[], [], [], [], [], [], [], [], [], [], [], []]","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[], []]"
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], [], [], [], [], [], [], []]","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], []]"


## preprocess DB

In [3]:
def cleanup(myrow, col_name):
    all_clean_rows = []
    for sent in myrow[col_name]:
        one_clean_row = []
        for token in sent:
            one_clean_row.append(token.lower())
        all_clean_rows.append(one_clean_row)
    return all_clean_rows

In [4]:
db_sentence["ocr_sentencizer_cleaned"] = db_sentence.apply(cleanup, args=["ocr_sentencizer"], axis=1)
db_sentence["corrected_sentencizer_cleaned"] = db_sentence.apply(cleanup, args=["corrected_sentencizer"], axis=1)

In [5]:
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_corrected,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup,ocr_sentencizer_cleaned,corrected_sentencizer_cleaned
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,0.847561,746,820,2,0,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[], [], [], [], [], [], [], []]","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[], [], [], [], [], [], [], [], [], []]","[[from, river, crossing, to, end, of, triäü, i...","[[from, river, crossing, to, end, of, trial, s..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",0.964119,641,630,1,0,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]","[[natural, childbirth, sir,-we, nurses, have, ...","[[natural, childbirth, sir,-we, nurses, have, ..."
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,0.738901,946,832,3,0,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[], [], [], [], [], [], [], [], [], [], [], []]","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[], []]","[[first, church, i, service, 1, presbyterian, ...","[[first, church, service, presbyterian, annive..."
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",0.493898,2950,2740,4,0,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [...","[["", bob, "", lulham, 's, fight, against, thall...","[["", bob, "", lulham, 's, fight, against, thall..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",0.894176,1219,1121,2,0,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], [], [], [], [], [], [], []]","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], []]","[[divorce, before, the, judge, in, divorce, ,,...","[[divorce, before, the, judge, in, divorce, ,,..."


## LM parameters

In [18]:
# args for Word2Vec
w2v_args = Namespace(
    epochs=5, 
    # only for Word2Vec
    compute_loss=True,                               # If True, computes and stores loss value which can be retrieved using get_latest_training_loss().

    size=300,                                        # Dimensionality of the word vectors.
    alpha=0.03,                                      # The initial learning rate.
    min_alpha=0.0007,                                # Learning rate will linearly drop to min_alpha as training progresses.
    sg=1,                                            # Training algorithm: skip-gram if sg=1, otherwise CBOW.
    hs=0,                                            # If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
    negative=20,                                     # If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used. 
    min_count=0,                                   # The model ignores all words with total frequency lower than this.
    window=5,                                        # The maximum distance between the current and predicted word within a sentence.
    sample=1e-3,                                     # The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
    workers=8, 
    cbow_mean=1,                                     # If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
    null_word=0,                                     # 
    trim_rule=None,                                  # 
    sorted_vocab=1,                                  # If 1, sort the vocabulary by descending frequency before assigning word indices.
    batch_words=10000,                               # Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
    
    seed=1364,                                       # Seed for the random number generator.
    # only for FastText (compare to word2vec)
    #word_ngrams=1,                                   # If 1, uses enriches word vectors with subword(n-grams) information. If 0, this is equivalent to Word2Vec. 
    #min_n=2,                                         # Minimum length of char n-grams to be used for training word representations.
    #max_n=15,                                        # Max length of char ngrams to be used for training word representations. Set max_n to be lesser than min_n to avoid char ngrams being used.
    #bucket=2000000                                  # Character ngrams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model.
)

In [19]:
# Only if a new LM needs to be trained (from scratch)
embedding_model_ocr = Word2Vec(
     size=w2v_args.size, 
     alpha=w2v_args.alpha,
     min_alpha=w2v_args.min_alpha, 
     sg=w2v_args.sg, 
     hs=w2v_args.hs, 
     negative=w2v_args.negative, 
     iter=w2v_args.epochs, 
     min_count=w2v_args.min_count, 
     window=w2v_args.window, 
     sample=w2v_args.sample, 
     workers=w2v_args.workers, 
     cbow_mean=w2v_args.cbow_mean, 
     null_word=w2v_args.null_word, 
     trim_rule=w2v_args.trim_rule, 
     sorted_vocab=w2v_args.sorted_vocab, 
     batch_words=w2v_args.batch_words, 
     seed=w2v_args.seed, 
     compute_loss=w2v_args.compute_loss)

# Only if a new LM needs to be trained (from scratch)
embedding_model_corrected = Word2Vec(
     size=w2v_args.size, 
     alpha=w2v_args.alpha,
     min_alpha=w2v_args.min_alpha, 
     sg=w2v_args.sg, 
     hs=w2v_args.hs, 
     negative=w2v_args.negative, 
     iter=w2v_args.epochs, 
     min_count=w2v_args.min_count, 
     window=w2v_args.window, 
     sample=w2v_args.sample, 
     workers=w2v_args.workers, 
     cbow_mean=w2v_args.cbow_mean, 
     null_word=w2v_args.null_word, 
     trim_rule=w2v_args.trim_rule, 
     sorted_vocab=w2v_args.sorted_vocab, 
     batch_words=w2v_args.batch_words, 
     seed=w2v_args.seed, 
     compute_loss=w2v_args.compute_loss)

## Preprocess before creating/updating LM

In [20]:
"""
def preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):
    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]
    return txt
"""

'\ndef preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):\n    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]\n    return txt\n'

In [21]:
"""
db_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)
db_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)
"""

'\ndb_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)\ndb_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)\n'

## OCR

In [22]:
list_sentences = db_sentence["ocr_sentencizer_cleaned"].to_list()
print('#sentences: {}'.format(len(list_sentences)))

#sentences: 30509


In [23]:
flattened_list_sentences = [val for sublist in list_sentences for val in sublist]

In [24]:
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'triäü',
 'i',
 '^',
 'pi',
 'a^h',
 '"',
 'pclcr',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'jr',
 'la',
 'jil',
 '?',
 'during',
 '{',
 '|',
 ')',
 'c',
 'elimination',
 'section',
 '.']

In [25]:
embedding_model_ocr.build_vocab(flattened_list_sentences)
embedding_model_ocr.train(flattened_list_sentences, 
                          total_examples=embedding_model_ocr.corpus_count,
                          epochs=w2v_args.epochs,  
                          compute_loss=w2v_args.compute_loss)

2019-11-12 11:52:47,555 : INFO : collecting all words and their counts
2019-11-12 11:52:47,556 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-12 11:52:47,802 : INFO : PROGRESS: at sentence #10000, processed 525154 words, keeping 85319 word types
2019-11-12 11:52:48,092 : INFO : PROGRESS: at sentence #20000, processed 856174 words, keeping 117320 word types
2019-11-12 11:52:48,422 : INFO : PROGRESS: at sentence #30000, processed 1262312 words, keeping 157071 word types
2019-11-12 11:52:48,771 : INFO : PROGRESS: at sentence #40000, processed 1640602 words, keeping 191468 word types
2019-11-12 11:52:49,062 : INFO : PROGRESS: at sentence #50000, processed 1983065 words, keeping 219352 word types
2019-11-12 11:52:49,341 : INFO : PROGRESS: at sentence #60000, processed 2332908 words, keeping 244407 word types
2019-11-12 11:52:49,650 : INFO : PROGRESS: at sentence #70000, processed 2683098 words, keeping 272037 word types
2019-11-12 11:52:49,940 : INFO : PR

2019-11-12 12:03:44,772 : INFO : EPOCH 1 - PROGRESS: at 21.30% examples, 31812 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:03:46,288 : INFO : EPOCH 1 - PROGRESS: at 22.04% examples, 31873 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:03:47,382 : INFO : EPOCH 1 - PROGRESS: at 22.77% examples, 32148 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:03:48,517 : INFO : EPOCH 1 - PROGRESS: at 23.09% examples, 31920 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:03:49,894 : INFO : EPOCH 1 - PROGRESS: at 23.64% examples, 31932 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:03:51,045 : INFO : EPOCH 1 - PROGRESS: at 24.31% examples, 32169 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:03:52,430 : INFO : EPOCH 1 - PROGRESS: at 24.63% examples, 31953 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:03:53,569 : INFO : EPOCH 1 - PROGRESS: at 25.00% examples, 31972 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:03:54,684 : INFO : EPOCH 1 - PROGRESS: at 25.59% examples, 32213 words/s, in_qsize

2019-11-12 12:05:13,881 : INFO : EPOCH 1 - PROGRESS: at 50.20% examples, 31376 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:05:14,942 : INFO : EPOCH 1 - PROGRESS: at 50.51% examples, 31470 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:05:16,041 : INFO : EPOCH 1 - PROGRESS: at 50.73% examples, 31396 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:05:17,131 : INFO : EPOCH 1 - PROGRESS: at 51.15% examples, 31419 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:05:18,727 : INFO : EPOCH 1 - PROGRESS: at 51.49% examples, 31294 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:05:19,807 : INFO : EPOCH 1 - PROGRESS: at 51.89% examples, 31318 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:05:20,830 : INFO : EPOCH 1 - PROGRESS: at 51.94% examples, 31163 words/s, in_qsize 16, out_qsize 1
2019-11-12 12:05:21,864 : INFO : EPOCH 1 - PROGRESS: at 52.36% examples, 31252 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:05:23,104 : INFO : EPOCH 1 - PROGRESS: at 52.56% examples, 31103 words/s, in_qsize

2019-11-12 12:06:42,321 : INFO : EPOCH 1 - PROGRESS: at 77.21% examples, 30556 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:06:43,585 : INFO : EPOCH 1 - PROGRESS: at 77.68% examples, 30546 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:06:44,651 : INFO : EPOCH 1 - PROGRESS: at 78.10% examples, 30565 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:06:45,656 : INFO : EPOCH 1 - PROGRESS: at 78.34% examples, 30529 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:06:46,724 : INFO : EPOCH 1 - PROGRESS: at 78.86% examples, 30575 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:06:48,047 : INFO : EPOCH 1 - PROGRESS: at 79.01% examples, 30470 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:06:49,210 : INFO : EPOCH 1 - PROGRESS: at 79.33% examples, 30447 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:06:50,446 : INFO : EPOCH 1 - PROGRESS: at 79.66% examples, 30418 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:06:51,598 : INFO : EPOCH 1 - PROGRESS: at 79.99% examples, 30399 words/s, in_qsize

2019-11-12 12:08:00,410 : INFO : EPOCH 2 - PROGRESS: at 0.09% examples, 4528 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:08:02,368 : INFO : EPOCH 2 - PROGRESS: at 0.58% examples, 19473 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:08:04,322 : INFO : EPOCH 2 - PROGRESS: at 1.08% examples, 23955 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:08:06,362 : INFO : EPOCH 2 - PROGRESS: at 1.46% examples, 26175 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:08:08,675 : INFO : EPOCH 2 - PROGRESS: at 1.87% examples, 26583 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:08:10,540 : INFO : EPOCH 2 - PROGRESS: at 2.43% examples, 27764 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:08:12,134 : INFO : EPOCH 2 - PROGRESS: at 2.93% examples, 29180 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:08:13,495 : INFO : EPOCH 2 - PROGRESS: at 3.81% examples, 30503 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:08:14,812 : INFO : EPOCH 2 - PROGRESS: at 4.43% examples, 31914 words/s, in_qsize 16, out_q

2019-11-12 12:09:32,875 : INFO : EPOCH 2 - PROGRESS: at 47.85% examples, 45844 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:33,925 : INFO : EPOCH 2 - PROGRESS: at 48.32% examples, 45920 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:35,051 : INFO : EPOCH 2 - PROGRESS: at 48.70% examples, 45885 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:36,278 : INFO : EPOCH 2 - PROGRESS: at 49.14% examples, 45964 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:09:37,291 : INFO : EPOCH 2 - PROGRESS: at 49.45% examples, 45972 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:38,468 : INFO : EPOCH 2 - PROGRESS: at 49.81% examples, 46064 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:39,500 : INFO : EPOCH 2 - PROGRESS: at 50.13% examples, 46146 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:40,557 : INFO : EPOCH 2 - PROGRESS: at 50.45% examples, 46132 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:09:41,568 : INFO : EPOCH 2 - PROGRESS: at 50.98% examples, 46206 words/s, in_qsize

2019-11-12 12:10:59,092 : INFO : EPOCH 2 - PROGRESS: at 89.10% examples, 46094 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:00,122 : INFO : EPOCH 2 - PROGRESS: at 89.67% examples, 46074 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:01,361 : INFO : EPOCH 2 - PROGRESS: at 90.36% examples, 46048 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:02,395 : INFO : EPOCH 2 - PROGRESS: at 90.84% examples, 45990 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:03,576 : INFO : EPOCH 2 - PROGRESS: at 91.44% examples, 45980 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:04,780 : INFO : EPOCH 2 - PROGRESS: at 92.16% examples, 45965 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:05,809 : INFO : EPOCH 2 - PROGRESS: at 92.68% examples, 45995 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:06,849 : INFO : EPOCH 2 - PROGRESS: at 92.97% examples, 45942 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:11:07,973 : INFO : EPOCH 2 - PROGRESS: at 93.50% examples, 45951 words/s, in_qsize

2019-11-12 12:12:13,320 : INFO : EPOCH 3 - PROGRESS: at 31.05% examples, 52285 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:14,436 : INFO : EPOCH 3 - PROGRESS: at 31.57% examples, 52311 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:15,541 : INFO : EPOCH 3 - PROGRESS: at 32.13% examples, 52363 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:12:16,654 : INFO : EPOCH 3 - PROGRESS: at 32.62% examples, 52402 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:12:17,810 : INFO : EPOCH 3 - PROGRESS: at 33.20% examples, 52395 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:18,940 : INFO : EPOCH 3 - PROGRESS: at 33.79% examples, 52417 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:20,092 : INFO : EPOCH 3 - PROGRESS: at 34.32% examples, 52408 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:21,313 : INFO : EPOCH 3 - PROGRESS: at 35.01% examples, 52378 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:12:22,399 : INFO : EPOCH 3 - PROGRESS: at 35.83% examples, 52430 words/s, in_qsize

2019-11-12 12:13:35,238 : INFO : EPOCH 3 - PROGRESS: at 72.61% examples, 51672 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:36,375 : INFO : EPOCH 3 - PROGRESS: at 73.23% examples, 51688 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:13:37,459 : INFO : EPOCH 3 - PROGRESS: at 73.97% examples, 51723 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:38,545 : INFO : EPOCH 3 - PROGRESS: at 74.64% examples, 51748 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:13:39,675 : INFO : EPOCH 3 - PROGRESS: at 75.30% examples, 51753 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:40,732 : INFO : EPOCH 3 - PROGRESS: at 76.03% examples, 51784 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:41,762 : INFO : EPOCH 3 - PROGRESS: at 77.03% examples, 51824 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:42,866 : INFO : EPOCH 3 - PROGRESS: at 77.84% examples, 51834 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:13:43,974 : INFO : EPOCH 3 - PROGRESS: at 78.53% examples, 51846 words/s, in_qsize

2019-11-12 12:14:49,014 : INFO : EPOCH 4 - PROGRESS: at 12.85% examples, 50874 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:14:50,112 : INFO : EPOCH 4 - PROGRESS: at 13.71% examples, 51022 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:51,256 : INFO : EPOCH 4 - PROGRESS: at 14.27% examples, 51073 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:14:52,342 : INFO : EPOCH 4 - PROGRESS: at 15.21% examples, 51222 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:53,373 : INFO : EPOCH 4 - PROGRESS: at 16.22% examples, 51421 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:54,441 : INFO : EPOCH 4 - PROGRESS: at 16.91% examples, 51655 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:55,593 : INFO : EPOCH 4 - PROGRESS: at 17.74% examples, 51655 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:56,662 : INFO : EPOCH 4 - PROGRESS: at 18.56% examples, 51813 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:14:57,826 : INFO : EPOCH 4 - PROGRESS: at 19.24% examples, 51851 words/s, in_qsize

2019-11-12 12:16:10,398 : INFO : EPOCH 4 - PROGRESS: at 59.35% examples, 52472 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:16:11,580 : INFO : EPOCH 4 - PROGRESS: at 59.84% examples, 52420 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:16:12,752 : INFO : EPOCH 4 - PROGRESS: at 60.36% examples, 52441 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:16:13,884 : INFO : EPOCH 4 - PROGRESS: at 61.07% examples, 52455 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:16:15,036 : INFO : EPOCH 4 - PROGRESS: at 61.77% examples, 52453 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:16:16,136 : INFO : EPOCH 4 - PROGRESS: at 62.33% examples, 52486 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:16:17,277 : INFO : EPOCH 4 - PROGRESS: at 62.96% examples, 52491 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:16:18,417 : INFO : EPOCH 4 - PROGRESS: at 63.57% examples, 52488 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:16:19,449 : INFO : EPOCH 4 - PROGRESS: at 63.98% examples, 52478 words/s, in_qsize

2019-11-12 12:17:24,391 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-12 12:17:24,392 : INFO : EPOCH - 4 : training on 12356836 raw words (9445544 effective words) took 179.5s, 52609 effective words/s
2019-11-12 12:17:25,619 : INFO : EPOCH 5 - PROGRESS: at 0.09% examples, 6463 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:17:26,826 : INFO : EPOCH 5 - PROGRESS: at 0.58% examples, 29753 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:17:28,053 : INFO : EPOCH 5 - PROGRESS: at 1.08% examples, 37140 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:17:29,303 : INFO : EPOCH 5 - PROGRESS: at 1.46% examples, 41141 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:17:30,563 : INFO : EPOCH 5 - PROGRESS: at 1.87% examples, 43212 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:17:31,685 : INFO : EPOCH 5 - PROGRESS: at 2.43% examples, 45285 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:17:32,826 : INFO : EPOCH 5 - PROGRESS: at 2.91% examples, 46692 words/s, in_qsize 15,

2019-11-12 12:18:44,008 : INFO : EPOCH 5 - PROGRESS: at 46.33% examples, 52912 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:18:45,077 : INFO : EPOCH 5 - PROGRESS: at 47.19% examples, 52955 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:18:46,262 : INFO : EPOCH 5 - PROGRESS: at 48.02% examples, 52920 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:18:47,440 : INFO : EPOCH 5 - PROGRESS: at 48.52% examples, 52943 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:18:48,691 : INFO : EPOCH 5 - PROGRESS: at 48.99% examples, 52916 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:18:49,940 : INFO : EPOCH 5 - PROGRESS: at 49.40% examples, 52889 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:18:51,097 : INFO : EPOCH 5 - PROGRESS: at 49.74% examples, 52916 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:18:52,100 : INFO : EPOCH 5 - PROGRESS: at 50.08% examples, 52942 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:18:53,179 : INFO : EPOCH 5 - PROGRESS: at 50.45% examples, 52923 words/s, in_qsize

2019-11-12 12:20:05,447 : INFO : EPOCH 5 - PROGRESS: at 92.06% examples, 53064 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:20:06,510 : INFO : EPOCH 5 - PROGRESS: at 92.68% examples, 53089 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:20:07,519 : INFO : EPOCH 5 - PROGRESS: at 93.11% examples, 53088 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:20:08,666 : INFO : EPOCH 5 - PROGRESS: at 93.63% examples, 53040 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:20:09,865 : INFO : EPOCH 5 - PROGRESS: at 94.14% examples, 53027 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:20:11,050 : INFO : EPOCH 5 - PROGRESS: at 94.58% examples, 53022 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:20:12,243 : INFO : EPOCH 5 - PROGRESS: at 95.06% examples, 53010 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:20:13,434 : INFO : EPOCH 5 - PROGRESS: at 95.48% examples, 52998 words/s, in_qsize 16, out_qsize 1
2019-11-12 12:20:14,576 : INFO : EPOCH 5 - PROGRESS: at 95.94% examples, 53006 words/s, in_qsize

(47229959, 61784180)

In [26]:
print("\n\n[INFO] Save the model")
embedding_model_ocr.save("./embedding_model_ocr.model")

2019-11-12 12:20:22,243 : INFO : saving Word2Vec object under ./embedding_model_ocr.model, separately None
2019-11-12 12:20:22,253 : INFO : storing np array 'vectors' to ./embedding_model_ocr.model.wv.vectors.npy




[INFO] Save the model


2019-11-12 12:20:25,920 : INFO : not storing attribute vectors_norm
2019-11-12 12:20:25,946 : INFO : storing np array 'syn1neg' to ./embedding_model_ocr.model.trainables.syn1neg.npy
2019-11-12 12:20:29,073 : INFO : not storing attribute cum_table
2019-11-12 12:20:32,531 : INFO : saved ./embedding_model_ocr.model


## Corrected

In [27]:
list_sentences = db_sentence["corrected_sentencizer_cleaned"].to_list()
print('#sentences: {}'.format(len(list_sentences)))

#sentences: 30509


In [28]:
flattened_list_sentences = [val for sublist in list_sentences for val in sublist]

In [29]:
flattened_list_sentences[0]

['from',
 'river',
 'crossing',
 'to',
 'end',
 'of',
 'trial',
 'splash',
 ':',
 'peler',
 'antill',
 'ploughed',
 'deep',
 'into',
 'paddy',
 "'s",
 'river',
 'in',
 'his',
 'chrysler',
 'plymouth',
 'during',
 'the',
 'elimination',
 'section',
 '.']

In [30]:
embedding_model_corrected.build_vocab(flattened_list_sentences)
embedding_model_corrected.train(flattened_list_sentences, 
                                total_examples=embedding_model_corrected.corpus_count,
                                epochs=w2v_args.epochs,  
                                compute_loss=w2v_args.compute_loss)

2019-11-12 12:20:33,558 : INFO : collecting all words and their counts
2019-11-12 12:20:33,559 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-12 12:20:33,718 : INFO : PROGRESS: at sentence #10000, processed 238920 words, keeping 21363 word types
2019-11-12 12:20:33,868 : INFO : PROGRESS: at sentence #20000, processed 462924 words, keeping 30432 word types
2019-11-12 12:20:34,035 : INFO : PROGRESS: at sentence #30000, processed 706176 words, keeping 39228 word types
2019-11-12 12:20:34,206 : INFO : PROGRESS: at sentence #40000, processed 967747 words, keeping 46390 word types
2019-11-12 12:20:34,325 : INFO : PROGRESS: at sentence #50000, processed 1206328 words, keeping 52319 word types
2019-11-12 12:20:34,447 : INFO : PROGRESS: at sentence #60000, processed 1454073 words, keeping 57131 word types
2019-11-12 12:20:34,580 : INFO : PROGRESS: at sentence #70000, processed 1709317 words, keeping 61561 word types
2019-11-12 12:20:34,699 : INFO : PROGRESS: 

2019-11-12 12:22:11,061 : INFO : EPOCH 1 - PROGRESS: at 15.98% examples, 74626 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:12,413 : INFO : EPOCH 1 - PROGRESS: at 17.15% examples, 74445 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:13,447 : INFO : EPOCH 1 - PROGRESS: at 17.99% examples, 74039 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:22:14,730 : INFO : EPOCH 1 - PROGRESS: at 19.13% examples, 74056 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:15,864 : INFO : EPOCH 1 - PROGRESS: at 19.88% examples, 73400 words/s, in_qsize 15, out_qsize 1
2019-11-12 12:22:16,981 : INFO : EPOCH 1 - PROGRESS: at 20.84% examples, 73726 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:18,065 : INFO : EPOCH 1 - PROGRESS: at 21.48% examples, 73009 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:19,189 : INFO : EPOCH 1 - PROGRESS: at 22.65% examples, 73815 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:22:20,202 : INFO : EPOCH 1 - PROGRESS: at 23.35% examples, 73362 words/s, in_qsize

2019-11-12 12:23:29,393 : INFO : EPOCH 1 - PROGRESS: at 88.97% examples, 77175 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:30,520 : INFO : EPOCH 1 - PROGRESS: at 89.82% examples, 77134 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:23:31,539 : INFO : EPOCH 1 - PROGRESS: at 90.75% examples, 77181 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:32,582 : INFO : EPOCH 1 - PROGRESS: at 91.72% examples, 77196 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:33,633 : INFO : EPOCH 1 - PROGRESS: at 92.73% examples, 77202 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:34,721 : INFO : EPOCH 1 - PROGRESS: at 93.69% examples, 77188 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:35,764 : INFO : EPOCH 1 - PROGRESS: at 94.64% examples, 77211 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:36,891 : INFO : EPOCH 1 - PROGRESS: at 95.63% examples, 77238 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:23:37,961 : INFO : EPOCH 1 - PROGRESS: at 96.82% examples, 77371 words/s, in_qsize

2019-11-12 12:24:41,273 : INFO : EPOCH 2 - PROGRESS: at 55.32% examples, 77822 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:42,332 : INFO : EPOCH 2 - PROGRESS: at 56.26% examples, 77714 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:43,361 : INFO : EPOCH 2 - PROGRESS: at 57.40% examples, 77874 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:44,442 : INFO : EPOCH 2 - PROGRESS: at 58.31% examples, 77757 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:45,458 : INFO : EPOCH 2 - PROGRESS: at 59.44% examples, 77945 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:46,597 : INFO : EPOCH 2 - PROGRESS: at 60.47% examples, 77784 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:24:47,605 : INFO : EPOCH 2 - PROGRESS: at 61.55% examples, 77963 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:24:48,716 : INFO : EPOCH 2 - PROGRESS: at 62.44% examples, 77814 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:24:49,733 : INFO : EPOCH 2 - PROGRESS: at 63.40% examples, 77879 words/s, in_qsize

2019-11-12 12:25:53,099 : INFO : EPOCH 3 - PROGRESS: at 21.71% examples, 77190 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:54,219 : INFO : EPOCH 3 - PROGRESS: at 22.72% examples, 77333 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:55,244 : INFO : EPOCH 3 - PROGRESS: at 23.69% examples, 77453 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:56,269 : INFO : EPOCH 3 - PROGRESS: at 24.78% examples, 77559 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:57,300 : INFO : EPOCH 3 - PROGRESS: at 25.75% examples, 77393 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:58,305 : INFO : EPOCH 3 - PROGRESS: at 26.72% examples, 77566 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:25:59,417 : INFO : EPOCH 3 - PROGRESS: at 27.69% examples, 77450 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:26:00,515 : INFO : EPOCH 3 - PROGRESS: at 28.75% examples, 77583 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:26:01,535 : INFO : EPOCH 3 - PROGRESS: at 29.61% examples, 77487 words/s, in_qsize

2019-11-12 12:27:10,055 : INFO : EPOCH 3 - PROGRESS: at 94.22% examples, 78284 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:27:11,104 : INFO : EPOCH 3 - PROGRESS: at 95.10% examples, 78223 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:27:12,186 : INFO : EPOCH 3 - PROGRESS: at 96.12% examples, 78272 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:27:13,199 : INFO : EPOCH 3 - PROGRESS: at 97.06% examples, 78239 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:27:14,289 : INFO : EPOCH 3 - PROGRESS: at 98.12% examples, 78285 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:27:15,304 : INFO : EPOCH 3 - PROGRESS: at 99.04% examples, 78260 words/s, in_qsize 14, out_qsize 0
2019-11-12 12:27:15,897 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-11-12 12:27:16,046 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-11-12 12:27:16,083 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-11-12 12:27:16,158 : INFO : worker thread fi

2019-11-12 12:28:21,909 : INFO : EPOCH 4 - PROGRESS: at 60.01% examples, 76766 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:22,933 : INFO : EPOCH 4 - PROGRESS: at 61.08% examples, 76869 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:23,967 : INFO : EPOCH 4 - PROGRESS: at 61.89% examples, 76699 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:25,021 : INFO : EPOCH 4 - PROGRESS: at 62.99% examples, 76943 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:26,101 : INFO : EPOCH 4 - PROGRESS: at 63.79% examples, 76747 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:28:27,228 : INFO : EPOCH 4 - PROGRESS: at 65.01% examples, 76880 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:28,310 : INFO : EPOCH 4 - PROGRESS: at 65.98% examples, 76872 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:28:29,544 : INFO : EPOCH 4 - PROGRESS: at 67.21% examples, 76797 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:28:30,682 : INFO : EPOCH 4 - PROGRESS: at 68.38% examples, 77005 words/s, in_qsize

2019-11-12 12:29:34,925 : INFO : EPOCH 5 - PROGRESS: at 27.46% examples, 77714 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:36,002 : INFO : EPOCH 5 - PROGRESS: at 28.31% examples, 77450 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:37,095 : INFO : EPOCH 5 - PROGRESS: at 29.39% examples, 77590 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:38,180 : INFO : EPOCH 5 - PROGRESS: at 30.15% examples, 77340 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:39,221 : INFO : EPOCH 5 - PROGRESS: at 31.26% examples, 77597 words/s, in_qsize 16, out_qsize 0
2019-11-12 12:29:40,334 : INFO : EPOCH 5 - PROGRESS: at 32.11% examples, 77322 words/s, in_qsize 14, out_qsize 1
2019-11-12 12:29:41,337 : INFO : EPOCH 5 - PROGRESS: at 33.26% examples, 77683 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:42,414 : INFO : EPOCH 5 - PROGRESS: at 34.19% examples, 77477 words/s, in_qsize 15, out_qsize 0
2019-11-12 12:29:43,459 : INFO : EPOCH 5 - PROGRESS: at 35.33% examples, 77705 words/s, in_qsize

2019-11-12 12:30:52,525 : INFO : EPOCH 5 - PROGRESS: at 99.46% examples, 77581 words/s, in_qsize 9, out_qsize 0
2019-11-12 12:30:52,607 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-11-12 12:30:52,691 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-11-12 12:30:52,777 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-11-12 12:30:52,878 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-11-12 12:30:52,890 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-11-12 12:30:52,922 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-12 12:30:52,988 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-12 12:30:53,123 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-12 12:30:53,124 : INFO : EPOCH - 5 : training on 12124087 raw words (8407524 effective words) took 108.2s, 77707 effective words/s
2019-11-12 12:30:53,1

(42046793, 60620435)

In [31]:
print("\n\n[INFO] Save the model")
embedding_model_ocr.save("./embedding_model_corrected.model")

2019-11-12 12:30:53,135 : INFO : saving Word2Vec object under ./embedding_model_corrected.model, separately None
2019-11-12 12:30:53,138 : INFO : storing np array 'vectors' to ./embedding_model_corrected.model.wv.vectors.npy




[INFO] Save the model


2019-11-12 12:30:56,405 : INFO : not storing attribute vectors_norm
2019-11-12 12:30:56,407 : INFO : storing np array 'syn1neg' to ./embedding_model_corrected.model.trainables.syn1neg.npy
2019-11-12 12:30:59,777 : INFO : not storing attribute cum_table
2019-11-12 12:31:03,140 : INFO : saved ./embedding_model_corrected.model
