# Language model analysis

In [1]:
import copy
import glob
import os
import pickle
import re
from argparse import Namespace
from time import time

import pandas as pd
import spacy

from gensim import logging
from gensim.models import FastText
from gensim.models import Word2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the spaCy pipeline named "en_core_web_lg"; `nlp` is the callable later
# applied to raw text inside dictionary_lookup().
nlp = spacy.load("en_core_web_lg")

2019-11-11 17:15:39,234 : INFO : Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2019-11-11 17:15:39,240 : INFO : Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


## Initialize a pre-trained language model

In [3]:
# model types
# w2v: word2vec
# ft: fasttext
model_type = "w2v"   
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model"

# Load the pre-trained model with the gensim class that matches `model_type`
# (comparison is case-insensitive).
if model_type.lower() in ["w2v", "word2vec"]:
    # Word2Vec
    embedding_model = Word2Vec.load(model_path)
elif model_type.lower() in ["ft", "fasttext"]:
    # FastText
    embedding_model = FastText.load(model_path)
else:
    # Fail with a clear message here rather than with a NameError on
    # `embedding_model` in the print() below.
    raise ValueError(
        "Unknown model_type {!r}; expected one of: w2v, word2vec, ft, fasttext".format(model_type)
    )
print(embedding_model)
# Keep an untouched copy so that every later update can start again from the
# weights as they were loaded from disk.
embedding_model_orig = copy.deepcopy(embedding_model)

2019-11-11 17:15:47,990 : INFO : loading Word2Vec object from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model
2019-11-11 17:15:49,144 : INFO : loading wv recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.* with mmap=None
2019-11-11 17:15:49,145 : INFO : loading vectors from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.vectors.npy with mmap=None
2019-11-11 17:15:49,515 : INFO : setting ignored attribute vectors_norm to None
2019-11-11 17:15:49,517 : INFO : loading vocabulary recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/m

Word2Vec(vocab=434049, size=300, alpha=0.03)


## Read DB

In [4]:
# Read the pickled DataFrame from the working directory and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
db_sentence = pd.read_pickle("./db_trove_sentencizer.pkl")
db_sentence.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,human_ocr_char_diff,str_similarity,str_length_humanText,str_length_ocrText,quality_band,use_human,corrected_sentencizer,corrected_dict_lookup,ocr_sentencizer,ocr_dict_lookup
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,74,0.847561,746,820,2,1,"[[FROM, RIVER, CROSSING, TO, END, OF, TRIAL, S...","[[], [], [], [], [], [], [], []]","[[FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I...","[[], [], [], [], [], [], [], [], [], []]"
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",11,0.964119,641,630,1,1,"[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]","[[Natural, Childbirth, Sir,-We, nurses, have, ...","[[], [], [], [], [], [], []]"
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,114,0.738901,946,832,3,1,"[[FIRST, CHURCH, SERVICE, Presbyterian, Annive...","[[], [], [], [], [], [], [], [], [], [], [], []]","[[FIRST, CHURCH, I, SERVICE, 1, Presbyterian, ...","[[], []]"
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...","""Bob"" Lulham's Fight Against Thallium Arthur ...",210,0.493898,2950,2740,4,1,"[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [...","[["", Bob, "", Lulham, 's, Fight, Against, Thall...","[[], [], [], [], [], [], [], [], [], [], [], [..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...","DIVORCE Before The Judge In Divorce, Mr. Justi...",98,0.894176,1219,1121,2,1,"[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], [], [], [], [], [], [], []]","[[DIVORCE, Before, The, Judge, In, Divorce, ,,...","[[], []]"


## Preprocess DB

In [5]:
"""
def cleanup(myrow, colname="corrected"):
    # remove all # and @§
    
    corpus = [re.sub(r'#', '', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [re.sub(r'@', '', element, flags=re.IGNORECASE) for element in corpus]
    
    # --- remove 2 or more .
    corpus = [re.sub('[.]{2,}', '.', element) for element in corpus]
    # --- add a space before and after a list of punctuations
    corpus = [re.sub(r"([.,!?:;\"\'])", r" \1 ", element) for element in corpus]
    # --- remove everything except:
    #corpus = [re.sub(r"([^a-zA-Z\-.:;,!?\d+]+)", r" ", element) for element in corpus]
    corpus = [re.sub(r"([^a-zA-Z\d+]+)", r" ", element) for element in corpus]
    # --- replace numbers with <NUM>
    corpus = [re.sub(r'\b\d+\b', '<NUM>', element) for element in corpus]
    corpus = [re.sub('--', '', element) for element in corpus]
    # --- normalize white spaces
    corpus = [re.sub('\s+', ' ', element) for element in corpus]
    
    # remove multiple spaces
    corpus = [re.sub(r'\s+', ' ', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [element.strip() for element in corpus]
    #corpus = [element.lower() for element in corpus]
    return corpus
"""

'\ndef cleanup(myrow, colname="corrected"):\n    # remove all # and @§\n    \n    corpus = [re.sub(r\'#\', \'\', element, flags=re.IGNORECASE) for element in corpus]\n    corpus = [re.sub(r\'@\', \'\', element, flags=re.IGNORECASE) for element in corpus]\n    \n    # --- remove 2 or more .\n    corpus = [re.sub(\'[.]{2,}\', \'.\', element) for element in corpus]\n    # --- add a space before and after a list of punctuations\n    corpus = [re.sub(r"([.,!?:;"\'])", r" \x01 ", element) for element in corpus]\n    # --- remove everything except:\n    #corpus = [re.sub(r"([^a-zA-Z\\-.:;,!?\\d+]+)", r" ", element) for element in corpus]\n    corpus = [re.sub(r"([^a-zA-Z\\d+]+)", r" ", element) for element in corpus]\n    # --- replace numbers with <NUM>\n    corpus = [re.sub(r\'\x08\\d+\x08\', \'<NUM>\', element) for element in corpus]\n    corpus = [re.sub(\'--\', \'\', element) for element in corpus]\n    # --- normalize white spaces\n    corpus = [re.sub(\'\\s+\', \' \', element) for elem

In [6]:
def cleanup(myrow, col_name):
    """Lower-case every token of the sentences stored in ``myrow[col_name]``.

    Parameters
    ----------
    myrow : mapping-like (e.g. a DataFrame row)
        Object indexable by ``col_name``.
    col_name : str
        Key whose value is an iterable of sentences, each sentence being an
        iterable of string tokens.

    Returns
    -------
    list of list of str
        A new nested list with the same sentence/token layout, every token
        passed through ``str.lower()``. The input is not modified.
    """
    return [[token.lower() for token in sent] for sent in myrow[col_name]]

In [7]:
# Lower-case the tokenised sentences of both text versions, row by row, and
# store each result in its own "*_cleaned" column.
for source_col, cleaned_col in (
    ("ocr_sentencizer", "ocr_sentencizer_cleaned"),
    ("corrected_sentencizer", "corrected_sentencizer_cleaned"),
):
    db_sentence[cleaned_col] = db_sentence.apply(cleanup, args=[source_col], axis=1)

## Update a pre-trained LM

In [8]:
# Arguments for Word2Vec.
# Only two options are active; both are forwarded to train() by the update cells:
#   epochs        number of training passes
#   compute_loss  (only for Word2Vec) if True, computes and stores the loss value,
#                 which can be retrieved using get_latest_training_loss()
w2v_args = Namespace(epochs=1, compute_loss=True)

# Options that are currently switched off (kept for reference; add them back as
# keyword arguments to the Namespace above to enable them):
#   size=100           Dimensionality of the word vectors.
#   alpha=0.03         The initial learning rate.
#   min_alpha=0.0007   Learning rate will linearly drop to min_alpha as training progresses.
#   sg=1               Training algorithm: skip-gram if sg=1, otherwise CBOW.
#   hs=0               If 1, hierarchical softmax will be used for model training. If set to 0,
#                      and negative is non-zero, negative sampling will be used.
#   negative=20        If > 0, negative sampling will be used; the int specifies how many
#                      "noise words" should be drawn (usually between 5-20). If set to 0,
#                      no negative sampling is used.
#   min_count=5        The model ignores all words with total frequency lower than this.
#   window=5           The maximum distance between the current and predicted word within a sentence.
#   sample=1e-3        The threshold for configuring which higher-frequency words are randomly
#                      downsampled, useful range is (0, 1e-5).
#   workers=8
#   cbow_mean=1        If 0, use the sum of the context word vectors. If 1, use the mean;
#                      only applies when CBOW is used.
#   null_word=0
#   trim_rule=None
#   sorted_vocab=1     If 1, sort the vocabulary by descending frequency before assigning word indices.
#   batch_words=10000  Target size (in words) for batches of examples passed to worker threads
#                      (and thus cython routines). Larger batches will be passed if individual
#                      texts are longer than 10000 words, but the standard cython code truncates
#                      to that maximum.
#   seed=1364          Seed for the random number generator.
# Only for FastText (compare to word2vec):
#   word_ngrams=1      If 1, enriches word vectors with subword (n-gram) information.
#                      If 0, this is equivalent to Word2Vec.
#   min_n=2            Minimum length of char n-grams to be used for training word representations.
#   max_n=15           Max length of char n-grams to be used for training word representations.
#                      Set max_n to be less than min_n to avoid char n-grams being used.
#   bucket=2000000     Character n-grams are hashed into a fixed number of buckets, in order to
#                      limit the memory usage of the model. This option specifies the number of
#                      buckets used by the model.

In [9]:
# Only if a new LM needs to be trained (from scratch)
# The constructor call below is disabled by being wrapped in a string literal,
# so this cell only evaluates to (and displays) that string.
# NOTE(review): apart from `epochs` and `compute_loss`, the attributes read here
# (size, alpha, min_alpha, ...) are commented out in the `w2v_args` cell, so
# re-enabling this code as-is would raise AttributeError until they are restored.
"""
w2v_model = Word2Vec(
                 size=w2v_args.size, 
                 alpha=w2v_args.alpha,
                 min_alpha=w2v_args.min_alpha, 
                 sg=w2v_args.sg, 
                 hs=w2v_args.hs, 
                 negative=w2v_args.negative, 
                 iter=w2v_args.epochs, 
                 min_count=w2v_args.min_count, 
                 window=w2v_args.window, 
                 sample=w2v_args.sample, 
                 workers=w2v_args.workers, 
                 cbow_mean=w2v_args.cbow_mean, 
                 null_word=w2v_args.null_word, 
                 trim_rule=w2v_args.trim_rule, 
                 sorted_vocab=w2v_args.sorted_vocab, 
                 batch_words=w2v_args.batch_words, 
                 seed=w2v_args.seed, 
                 compute_loss=w2v_args.compute_loss)
"""

'\nw2v_model = Word2Vec(\n                 size=w2v_args.size, \n                 alpha=w2v_args.alpha,\n                 min_alpha=w2v_args.min_alpha, \n                 sg=w2v_args.sg, \n                 hs=w2v_args.hs, \n                 negative=w2v_args.negative, \n                 iter=w2v_args.epochs, \n                 min_count=w2v_args.min_count, \n                 window=w2v_args.window, \n                 sample=w2v_args.sample, \n                 workers=w2v_args.workers, \n                 cbow_mean=w2v_args.cbow_mean, \n                 null_word=w2v_args.null_word, \n                 trim_rule=w2v_args.trim_rule, \n                 sorted_vocab=w2v_args.sorted_vocab, \n                 batch_words=w2v_args.batch_words, \n                 seed=w2v_args.seed, \n                 compute_loss=w2v_args.compute_loss)\n'

## Preprocess before creating/updating LM

In [None]:
"""
def preprocess4LM(myrow, col_name="ocrText_cleaned_tokenize"):
    txt = [token.lemma_ for token in nlp(myrow[col_name].lower())]
    return txt
"""

In [None]:
"""
db_sentence["ocrText_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["ocrText_cleaned"], axis=1)
db_sentence["corrected_cleaned_tokenize"] = db_sentence[0:10].apply(preprocess4LM, args=["corrected_cleaned"], axis=1)
"""

## OCR

In [19]:
# One entry per row of db_sentence; each entry is that row's list of
# tokenised, lower-cased sentences (as produced by cleanup()).
list_sentences = db_sentence["ocr_sentencizer_cleaned"].to_list()
# len() counts rows (documents) here, not sentences: the individual sentences
# only become list items after the flattening step in the next cell.
print('#documents: {}'.format(len(list_sentences)))

#sentences: 30509


In [21]:
# Concatenate the per-row sentence lists into one flat list of sentences.
flattened_list_sentences = []
for row_sentences in list_sentences:
    flattened_list_sentences.extend(row_sentences)

In [26]:
# Work on a fresh deep copy so that embedding_model_orig itself is never modified.
embedding_model_ocr = copy.deepcopy(embedding_model_orig)

In [28]:
# Override the `workers` setting stored on the copied model
# (presumably the number of training worker threads — confirm against gensim).
embedding_model_ocr.workers = 8

In [29]:
# Extend the vocabulary of the copied model with the OCR sentences, then
# continue training on those same sentences.
embedding_model_ocr.build_vocab(flattened_list_sentences, update=True)
# total_examples has to describe the corpus passed to train(). build_vocab()
# above refreshed corpus_count on *this* model, whereas `embedding_model`
# still carries the sentence count of the corpus it was originally trained on,
# which skews the progress reporting and the learning-rate schedule.
embedding_model_ocr.train(flattened_list_sentences,
                          total_examples=embedding_model_ocr.corpus_count,
                          epochs=w2v_args.epochs,
                          compute_loss=w2v_args.compute_loss)

2019-11-11 17:26:03,481 : INFO : collecting all words and their counts
2019-11-11 17:26:03,482 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-11 17:26:03,611 : INFO : PROGRESS: at sentence #10000, processed 525154 words, keeping 85319 word types
2019-11-11 17:26:03,683 : INFO : PROGRESS: at sentence #20000, processed 856174 words, keeping 117320 word types
2019-11-11 17:26:03,785 : INFO : PROGRESS: at sentence #30000, processed 1262312 words, keeping 157071 word types
2019-11-11 17:26:03,880 : INFO : PROGRESS: at sentence #40000, processed 1640602 words, keeping 191468 word types
2019-11-11 17:26:03,963 : INFO : PROGRESS: at sentence #50000, processed 1983065 words, keeping 219352 word types
2019-11-11 17:26:04,058 : INFO : PROGRESS: at sentence #60000, processed 2332908 words, keeping 244407 word types
2019-11-11 17:26:04,146 : INFO : PROGRESS: at sentence #70000, processed 2683098 words, keeping 272037 word types
2019-11-11 17:26:04,240 : INFO : PR

2019-11-11 17:26:45,216 : INFO : EPOCH 1 - PROGRESS: at 0.10% examples, 133840 words/s, in_qsize 15, out_qsize 0


KeyboardInterrupt: 

## Corrected

In [None]:
# One entry per row of db_sentence; each entry is that row's list of
# tokenised, lower-cased sentences (as produced by cleanup()).
list_sentences = db_sentence["corrected_sentencizer_cleaned"].to_list()
# len() counts rows (documents) here, not sentences: the individual sentences
# only become list items after the flattening step in the next cell.
print('#documents: {}'.format(len(list_sentences)))

In [None]:
# Concatenate the per-row sentence lists into one flat list of sentences.
flattened_list_sentences = []
for row_sentences in list_sentences:
    flattened_list_sentences.extend(row_sentences)

In [None]:
# Work on a fresh deep copy so that embedding_model_orig itself is never modified.
embedding_model_corrected = copy.deepcopy(embedding_model_orig)

In [None]:
# Configure the model that this section actually trains; the original line set
# `workers` on embedding_model_ocr (copied over from the OCR section).
embedding_model_corrected.workers = 8

In [None]:
# Extend the vocabulary of the copied model with the corrected sentences, then
# continue training on those same sentences.
embedding_model_corrected.build_vocab(flattened_list_sentences, update=True)
# total_examples has to describe the corpus passed to train(). build_vocab()
# above refreshed corpus_count on *this* model, whereas `embedding_model`
# still carries the sentence count of the corpus it was originally trained on,
# which skews the progress reporting and the learning-rate schedule.
embedding_model_corrected.train(flattened_list_sentences,
                                total_examples=embedding_model_corrected.corpus_count,
                                epochs=w2v_args.epochs,
                                compute_loss=w2v_args.compute_loss)

In [None]:
# Display the vocabulary of the model as loaded from disk.
# NOTE(review): this renders the entire vocabulary as cell output; consider
# showing only its size or a small sample.
embedding_model.wv.vocab

In [None]:
# NOTE(review): no visible cell updates `embedding_model` (training is applied
# to the deep copies embedding_model_ocr / embedding_model_corrected), so this
# query presumably returns the same neighbours as the embedding_model_orig
# query — confirm whether one of the updated copies was intended here.
embedding_model.wv.most_similar("man")

In [None]:
# Nearest neighbours of "man" in the untouched copy taken right after loading
# (the baseline to compare updated models against).
embedding_model_orig.wv.most_similar("man")

In [None]:
# Nearest neighbours of "oldham" in the model as loaded from disk.
# NOTE(review): as with the "man" query, an updated copy may have been intended.
embedding_model.wv.most_similar("oldham")

## Word error rates and dictionary lookup 

In [None]:
# Load a spacy model
nlp = spacy.load('en_core_web_lg')
# Collect every string known to the pipeline's vocabulary. A set (rather than a
# list) keeps the per-token `in spacy_dict` checks in dictionary_lookup()
# constant-time instead of a linear scan over the whole vocabulary.
spacy_dict = set(nlp.vocab.strings)

In [None]:
def dictionary_lookup(corpus_series, nlp_tool="spacy", use_lemma=False):
    """Split texts into sentences and flag each token as known/unknown to the vocabulary.

    Relies on two module-level names: ``nlp`` (the spaCy pipeline applied to
    each text) and ``spacy_dict`` (the collection of known vocabulary strings).

    Parameters
    ----------
    corpus_series : iterable of str
        Texts to process; each one is passed to ``nlp``.
    nlp_tool : str
        NLP backend. Only "spacy" is implemented.
    use_lemma : bool
        If True, emit and look up ``token.lemma_`` (punctuation and stop words
        are kept); otherwise use ``token.text``.

    Returns
    -------
    (fulltxt_sent_list, fulltxt_found_dict) : tuple of two lists
        Both have one entry per input text, each entry being a list of
        sentences. In the first, a sentence is the list of token strings
        (lemmas if ``use_lemma``). In the second, a sentence is a list of
        strings of the same length: ``str(len(token.text))`` when the looked-up
        string is in ``spacy_dict``, ``str(-len(token.text))`` otherwise.

    Raises
    ------
    ValueError
        If ``nlp_tool`` is not "spacy". Previously such a value ran the
        pipeline on every text and silently returned empty results.
    """
    if nlp_tool != "spacy":
        raise ValueError(
            "Unsupported nlp_tool {!r}; only 'spacy' is implemented".format(nlp_tool)
        )

    # Membership is tested once per token, so make sure it is a hash lookup
    # even when spacy_dict was built as a list.
    if isinstance(spacy_dict, (set, frozenset)):
        known_strings = spacy_dict
    else:
        known_strings = set(spacy_dict)

    fulltxt_sent_list = []
    fulltxt_found_dict = []
    for counter, corpus in enumerate(corpus_series):
        sent_list_tmp = []
        found_dict_tmp = []

        # Progress indicator: index of the text being processed.
        print(counter)
        corpus_spacy = nlp(corpus)
        for sentence in corpus_spacy.sents:
            # String emitted (and looked up) for each token of the sentence.
            txt = [token.lemma_ if use_lemma else token.text for token in sentence]
            # The reported length is always that of the surface form
            # (token.text), even when the lemma is what gets looked up.
            txt_dict = [
                str(len(token.text)) if str(key) in known_strings else str(-len(token.text))
                for token, key in zip(sentence, txt)
            ]
            sent_list_tmp.append(txt)
            found_dict_tmp.append(txt_dict)
        fulltxt_sent_list.append(sent_list_tmp)
        fulltxt_found_dict.append(found_dict_tmp)
    return fulltxt_sent_list, fulltxt_found_dict

In [None]:
# Keep only the first 20 rows for a quick trial run.
# NOTE(review): `db` is not assigned in any cell shown before this one (the
# frame loaded in this notebook is `db_sentence`), so this fails on a fresh
# kernel — confirm which frame is meant.
db = db[:20]

In [None]:
# Run the sentence split and vocabulary lookup over the "GS_cleaned" column.
# NOTE(review): no "GS_cleaned" column appears in the db_sentence preview shown
# in this notebook — confirm where this column is created.
fulltxt_sent_list, fulltxt_found_dict = dictionary_lookup(db["GS_cleaned"])

In [None]:
# Attach both dictionary_lookup() results to the frame, one entry per row:
# the token lists per sentence and the matching found/not-found markers.
db["sent_GS_cleaned"] = fulltxt_sent_list
db["dict_GS_cleaned"] = fulltxt_found_dict

In [None]:
# Inspect the per-row sentence token lists.
db["sent_GS_cleaned"]

In [None]:
# Inspect the per-row lookup markers (signed token lengths, stored as strings).
db["dict_GS_cleaned"]

In [None]:
# Display the frame including the two newly added columns.
db