# Language Model analysis

In [1]:
from argparse import Namespace

import copy
import glob
import numpy as np
import os
import pandas as pd
import pickle
import re
import spacy
from time import time

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Directories holding the saved Word2Vec models. The defaults are the original
# absolute locations; set LWM_BASE_MODEL_DIR / LWM_LM_DIR in the environment to
# run the notebook on another machine without editing this cell.
BASE_MODEL_DIR = os.environ.get(
    "LWM_BASE_MODEL_DIR",
    "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005",
)
LM_DIR = os.environ.get(
    "LWM_LM_DIR",
    "/Users/khosseini/myJobs/ATI/Projects/2019/lwm_ocr_assessment/LMs",
)

# embedding models, base model
model_path = os.path.join(BASE_MODEL_DIR, "w2v_words.model")
w2v = Word2Vec.load(model_path)

# OCR model, pretrained (file name carries the "w2v_005_" prefix)
model_path = os.path.join(LM_DIR, "w2v_005_embedding_model_ocr.model")
w2v_em_ocr = Word2Vec.load(model_path)

# corrected model, pretrained (file name carries the "w2v_005_" prefix)
model_path = os.path.join(LM_DIR, "w2v_005_embedding_model_corrected.model")
w2v_em_corr = Word2Vec.load(model_path)

# OCR model, pretrained (file name without the "w2v_005_" prefix)
model_path = os.path.join(LM_DIR, "embedding_model_ocr.model")
em_ocr = Word2Vec.load(model_path)

# corrected model, pretrained (file name without the "w2v_005_" prefix)
model_path = os.path.join(LM_DIR, "embedding_model_corrected.model")
em_corr = Word2Vec.load(model_path)

2019-11-14 11:28:34,715 : INFO : loading Word2Vec object from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model
2019-11-14 11:28:35,834 : INFO : loading wv recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.* with mmap=None
2019-11-14 11:28:35,835 : INFO : loading vectors from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.vectors.npy with mmap=None
2019-11-14 11:28:36,255 : INFO : setting ignored attribute vectors_norm to None
2019-11-14 11:28:36,256 : INFO : loading vocabulary recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/m

## Create list of words in the corrected set

In [3]:
# One [word, count] pair for every entry in the vocabulary of em_corr.
words_corrected = [
    [word, int(em_corr.wv.vocab[word].count)]
    for word in em_corr.wv.vocab
]

In [4]:
# Tabulate the [word, count] pairs; pandas is already imported as `pd` in the
# notebook's first cell, so no re-import is needed here.
pd_words = pd.DataFrame(words_corrected, columns=['vocab', 'count'])

In [6]:
# Order the vocabulary from the highest to the lowest count, report how many
# rows it has, and display the frame.
pd_words = pd_words.sort_values(by='count', ascending=False)
print("size: {}".format(pd_words.shape[0]))
pd_words

size: 179735


Unnamed: 0,vocab,count
57,",",860225
24,.,831221
21,the,736361
5,of,351214
110,and,291173
...,...,...
152166,instructeded,1
152167,nonfiling,1
152168,porformance,1
152155,hoiioiire,1


In [7]:
def found_neighbors(myrow, embedding, colname='vocab', topn=2):
    """Return the words most similar to one row's word under `embedding`.

    Parameters
    ----------
    myrow : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Row holding the query word under the key `colname`.
    embedding : object exposing ``wv.most_similar``
        Model queried for the neighbours.
    colname : str, default 'vocab'
        Key/column of `myrow` that contains the query word.
    topn : int, default 2
        Number of neighbours requested from ``most_similar``.

    Returns
    -------
    list
        Neighbour words in the order ``most_similar`` returned them
        (similarity scores are dropped). Empty when the lookup raises
        ``KeyError`` or yields no neighbours.

    Raises
    ------
    KeyError
        If `colname` is not present in `myrow`. The row lookup is done
        outside the ``try`` so a misspelled column name is not silently
        turned into an empty result.
    """
    word = myrow[colname]
    try:
        vocab_neigh = embedding.wv.most_similar([word], topn=topn)
    except KeyError:
        # The embedding could not resolve the word: report "no neighbours".
        return []
    # Keep only the word of each (word, similarity) pair. Plain unpacking also
    # handles an empty result, which 2-D array indexing would reject.
    return [neighbor for neighbor, _ in vocab_neigh]

In [8]:
# Alternative selection, kept for reference: num_rows rows taken at evenly
# spaced positions of pd_words instead of the first rows.
#num_rows = 20
#pd2search = pd_words.loc[pd_words.index[np.linspace(2, len(pd_words)-1, num_rows).astype(int)]]

# Work on the first 500 rows of pd_words. The explicit .copy() makes pd2search
# an independent frame, so the columns added to it in later cells are not
# written onto a slice of pd_words (which triggers SettingWithCopyWarning).
pd2search = pd_words[0:500].copy()

In [None]:
# The notebook's first cell binds `time` to the function (`from time import time`);
# this import rebinds it to the module so that `time.time()` works below.
import time

# Neighbours of every word in pd2search according to w2v_em_corr, stored as a
# new column; the elapsed seconds are printed.
t1 = time.time()
pd2search['w2v_em_corr_neigh'] = pd2search.apply(
    lambda row: found_neighbors(row, w2v_em_corr),
    axis=1,
)
print(time.time() - t1)

In [None]:
# The notebook's first cell binds `time` to the function (`from time import time`);
# this import rebinds it to the module so that `time.time()` works below.
import time

# Neighbours of every word in pd2search according to w2v_em_ocr, stored as a
# new column; the elapsed seconds are printed.
t1 = time.time()
pd2search['w2v_em_ocr_neigh'] = pd2search.apply(
    lambda row: found_neighbors(row, w2v_em_ocr),
    axis=1,
)
print(time.time() - t1)

In [None]:
# The notebook's first cell binds `time` to the function (`from time import time`);
# this import rebinds it to the module so that `time.time()` works below.
import time

# Neighbours of every word in pd2search according to em_corr, stored as a new
# column; the elapsed seconds are printed.
t1 = time.time()
pd2search['em_corr_neigh'] = pd2search.apply(
    lambda row: found_neighbors(row, em_corr),
    axis=1,
)
print(time.time() - t1)

In [None]:
# The notebook's first cell binds `time` to the function (`from time import time`);
# this import rebinds it to the module so that `time.time()` works below.
import time

# Neighbours of every word in pd2search according to em_ocr, stored as a new
# column; the elapsed seconds are printed.
t1 = time.time()
pd2search['em_ocr_neigh'] = pd2search.apply(
    lambda row: found_neighbors(row, em_ocr),
    axis=1,
)
print(time.time() - t1)

In [None]:
# Preview the first five rows, now including the four neighbour columns.
pd2search.head(n=5)

In [None]:
def jaccard_similarity_df(myrow, colname_1, colname_2, make_lowercase=True):
    """Jaccard similarity between two list-valued columns of one row.

    The two lists are treated as sets, so repeated entries (including those
    that only become equal after lowercasing) are counted once:
    ``|A & B| / |A | B|``.

    Parameters
    ----------
    myrow : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Row holding one list of strings under each of the two column names.
    colname_1, colname_2 : str
        Keys/columns of `myrow` whose lists are compared.
    make_lowercase : bool, default True
        Lowercase every entry before comparing.

    Returns
    -------
    float
        Value in [0, 1]. When both lists are empty the ratio is undefined;
        0.0 is returned, matching the result already produced when only one
        of the lists is empty.
    """
    items_1 = myrow[colname_1]
    items_2 = myrow[colname_2]
    if make_lowercase:
        items_1 = [x.lower() for x in items_1]
        items_2 = [x.lower() for x in items_2]
    set_1, set_2 = set(items_1), set(items_2)
    # Size the union from the sets themselves; using the list lengths would
    # over-count duplicated entries and deflate the score.
    union = len(set_1 | set_2)
    if union == 0:
        # Both lists empty: avoid dividing by zero.
        return 0.0
    return float(len(set_1 & set_2)) / union

In [None]:
# Row-wise Jaccard similarity (lowercased) between the neighbour lists found
# with w2v_em_corr and with w2v_em_ocr.
pd2search['jaccard_w2v_corr_w2v_ocr'] = pd2search.apply(
    lambda row: jaccard_similarity_df(row, 'w2v_em_corr_neigh', "w2v_em_ocr_neigh", True),
    axis=1,
)

In [None]:
# Row-wise Jaccard similarity (lowercased) between the neighbour lists found
# with em_corr and with em_ocr.
pd2search['jaccard_corr_ocr'] = pd2search.apply(
    lambda row: jaccard_similarity_df(row, 'em_corr_neigh', "em_ocr_neigh", True),
    axis=1,
)

In [None]:
# Preview the first five rows, now including the two Jaccard columns.
pd2search.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Scatter each word's count against the Jaccard similarity of its neighbour
# lists from w2v_em_corr and w2v_em_ocr, using an explicit figure/axes pair
# and labelled axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(pd2search['count'], pd2search['jaccard_w2v_corr_w2v_ocr'], 'ko', alpha=0.1)
# Only counts up to 20000 are shown; rows with larger counts fall outside the view.
ax.set_xlim(0, 20000)
ax.set_xlabel("count (em_corr vocabulary)")
ax.set_ylabel("Jaccard similarity of neighbours")
ax.set_title("w2v_em_corr vs. w2v_em_ocr neighbours")

# Mean similarity over all rows of pd2search (not only those within the x-limits).
print(pd2search['jaccard_w2v_corr_w2v_ocr'].mean())