# Language Model analysis

In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# embedding models, base model
#model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model"
model_path = "./LMs/embedding_model_scratch_corrected.model"
w2v_corrected = Word2Vec.load(model_path)

2019-11-20 19:51:23,147 : INFO : loading Word2Vec object from ./LMs/embedding_model_scratch_corrected.model
2019-11-20 19:51:24,385 : INFO : loading wv recursively from ./LMs/embedding_model_scratch_corrected.model.wv.* with mmap=None
2019-11-20 19:51:24,399 : INFO : loading vectors from ./LMs/embedding_model_scratch_corrected.model.wv.vectors.npy with mmap=None
2019-11-20 19:51:24,844 : INFO : setting ignored attribute vectors_norm to None
2019-11-20 19:51:24,852 : INFO : loading vocabulary recursively from ./LMs/embedding_model_scratch_corrected.model.vocabulary.* with mmap=None
2019-11-20 19:51:24,868 : INFO : loading trainables recursively from ./LMs/embedding_model_scratch_corrected.model.trainables.* with mmap=None
2019-11-20 19:51:24,870 : INFO : loading syn1neg from ./LMs/embedding_model_scratch_corrected.model.trainables.syn1neg.npy with mmap=None
2019-11-20 19:51:25,191 : INFO : setting ignored attribute cum_table to None
2019-11-20 19:51:25,269 : INFO : loaded ./LMs/embeddin

In [3]:
def found_neighbors(myrow, embedding, colname='vocab', topn=1):
    try:
        vocab_neigh = embedding.wv.most_similar([myrow['vocab']], topn=topn)
        return list(np.array(vocab_neigh)[:, 0])
    except KeyError:
        return []

In [4]:
def jaccard_similarity_df(myrow, colname_1, colname_2, make_lowercase=True):
    """
    Jaccard similarity between two documents (e.g., OCR and Human) on flattened list of words
    """
    list1 = myrow[colname_1]
    list2 = myrow[colname_2]
    if make_lowercase:
        list1 = [x.lower() for x in list1]
        list2 = [x.lower() for x in list2]
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [5]:
words_corrected = []
for item in w2v_corrected.wv.vocab:
    words_corrected.append([item, int(w2v_corrected.wv.vocab[item].count)])

In [6]:
pd_words = pd.DataFrame(words_corrected, columns=['vocab', 'count'])

In [7]:
pd_words = pd_words.sort_values(by=['count'], ascending=False)
print("size: {}".format(len(pd_words)))
pd_words.head()

size: 179735


Unnamed: 0,vocab,count
57,",",860225
24,.,831221
21,the,736361
5,of,351214
110,and,291173


In [12]:
pd2search = pd_words[0:1000]
pd2search

Unnamed: 0,vocab,count
57,",",860225
24,.,831221
21,the,736361
5,of,351214
110,and,291173
...,...,...
2483,matches,1142
8638,cape,1140
2883,matters,1139
265,prince,1139


# Quality bands 3, 4

## Create list of words and their frequencies in the corrected set

In [13]:
neigh_jaccard_bands_3_4 = []

w2v_em_ocr_qual_3_4 = Word2Vec.load('./LMs/w2v_005_EM_ocr_qual_3_4.model')
w2v_em_corr_qual_3_4 = Word2Vec.load('./LMs/w2v_005_EM_corr_qual_3_4.model')

for topn in [1, 2, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]:
    print("topn: {}".format(topn))
    t1 = time.time()
    
    pd2search = pd_words[0:1000]
    pd2search['w2v_em_corr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_corr_qual_3_4, 
                                                                               'vocab', 
                                                                               topn], axis=1)
    print("corr: {}".format(time.time() - t1))
    pd2search['w2v_em_ocr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_ocr_qual_3_4, 
                                                                             'vocab', 
                                                                              topn], axis=1)
    pd2search['jaccard_qual_3_4'] = \
        pd2search.apply(jaccard_similarity_df, args=['w2v_em_corr_qual_3_4', 
                                                     "w2v_em_ocr_qual_3_4", 
                                                     True], 
                        axis=1)
    
    neigh_jaccard_bands_3_4.append(
        [topn, 
         pd2search['jaccard_qual_3_4'].mean(), 
         pd2search['jaccard_qual_3_4'].std(), 
         0
        ])
    
    print("total: {}".format(time.time() - t1))

neigh_jaccard_bands_3_4 = np.array(neigh_jaccard_bands_3_4)

2019-11-20 19:56:34,324 : INFO : loading Word2Vec object from ./LMs/w2v_005_EM_ocr_qual_3_4.model
2019-11-20 19:56:37,538 : INFO : loading wv recursively from ./LMs/w2v_005_EM_ocr_qual_3_4.model.wv.* with mmap=None
2019-11-20 19:56:37,540 : INFO : loading vectors from ./LMs/w2v_005_EM_ocr_qual_3_4.model.wv.vectors.npy with mmap=None
2019-11-20 19:56:38,971 : INFO : setting ignored attribute vectors_norm to None
2019-11-20 19:56:38,993 : INFO : loading vocabulary recursively from ./LMs/w2v_005_EM_ocr_qual_3_4.model.vocabulary.* with mmap=None
2019-11-20 19:56:38,996 : INFO : loading trainables recursively from ./LMs/w2v_005_EM_ocr_qual_3_4.model.trainables.* with mmap=None
2019-11-20 19:56:38,997 : INFO : loading syn1neg from ./LMs/w2v_005_EM_ocr_qual_3_4.model.trainables.syn1neg.npy with mmap=None
2019-11-20 19:56:40,004 : INFO : setting ignored attribute cum_table to None
2019-11-20 19:56:40,015 : INFO : loaded ./LMs/w2v_005_EM_ocr_qual_3_4.model
2019-11-20 19:56:42,957 : INFO : loadi

topn: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
2019-11-20 19:57:37,729 : INFO : precomputing L2-norms of word weight vectors


corr: 45.38017201423645


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


total: 91.93899583816528
topn: 2
corr: 57.73870491981506
total: 111.64461183547974
topn: 5
corr: 36.83208179473877
total: 94.78911685943604
topn: 10
corr: 69.45237493515015
total: 140.63996291160583
topn: 50
corr: 82.52211117744446
total: 160.23693799972534
topn: 100
corr: 67.58274292945862
total: 124.30753517150879
topn: 500
corr: 72.02198934555054
total: 141.812833070755
topn: 1000
corr: 58.17880296707153
total: 135.97059512138367
topn: 5000
corr: 118.19587993621826
total: 257.04857087135315
topn: 10000
corr: 174.63330507278442
total: 388.95017409324646
topn: 50000
corr: 391.7289128303528
total: 806.5877659320831


In [14]:
np.save("neigh_jaccard_bands_3_4.npy", neigh_jaccard_bands_3_4)