# Language Model analysis

In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# embedding models, base model
#model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model"
model_path = "./LMs/embedding_model_scratch_corrected.model"
w2v_corrected = Word2Vec.load(model_path)

2019-11-21 10:58:57,626 : INFO : loading Word2Vec object from ./LMs/embedding_model_scratch_corrected.model
2019-11-21 10:58:58,094 : INFO : loading wv recursively from ./LMs/embedding_model_scratch_corrected.model.wv.* with mmap=None
2019-11-21 10:58:58,094 : INFO : loading vectors from ./LMs/embedding_model_scratch_corrected.model.wv.vectors.npy with mmap=None
2019-11-21 10:58:58,223 : INFO : setting ignored attribute vectors_norm to None
2019-11-21 10:58:58,225 : INFO : loading vocabulary recursively from ./LMs/embedding_model_scratch_corrected.model.vocabulary.* with mmap=None
2019-11-21 10:58:58,225 : INFO : loading trainables recursively from ./LMs/embedding_model_scratch_corrected.model.trainables.* with mmap=None
2019-11-21 10:58:58,226 : INFO : loading syn1neg from ./LMs/embedding_model_scratch_corrected.model.trainables.syn1neg.npy with mmap=None
2019-11-21 10:58:58,347 : INFO : setting ignored attribute cum_table to None
2019-11-21 10:58:58,348 : INFO : loaded ./LMs/embeddin

In [3]:
def found_neighbors(myrow, embedding, colname='vocab', topn=1):
    try:
        vocab_neigh = embedding.wv.most_similar([myrow['vocab']], topn=topn)
        return list(np.array(vocab_neigh)[:, 0])
    except KeyError:
        return []

In [4]:
def jaccard_similarity_df(myrow, colname_1, colname_2, num_items=False, make_lowercase=True):
    """
    Jaccard similarity between two documents (e.g., OCR and Human) on flattened list of words
    """
    if not num_items:
        list1 = myrow[colname_1]
        list2 = myrow[colname_2]
    else:
        list1 = myrow[colname_1][:num_items]
        list2 = myrow[colname_2][:num_items]
    if make_lowercase:
        list1 = [x.lower() for x in list1]
        list2 = [x.lower() for x in list2]
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [5]:
words_corrected = []
for item in w2v_corrected.wv.vocab:
    words_corrected.append([item, int(w2v_corrected.wv.vocab[item].count)])

In [6]:
pd_words = pd.DataFrame(words_corrected, columns=['vocab', 'count'])

In [7]:
pd_words = pd_words.sort_values(by=['count'], ascending=False)
print("size: {}".format(len(pd_words)))
pd_words.head()

size: 179735


Unnamed: 0,vocab,count
57,",",860225
24,.,831221
21,the,736361
5,of,351214
110,and,291173


In [16]:
pd2search = pd_words[100:151]
pd2search

Unnamed: 0,vocab,count
4058,captain,10023
189,1,10014
1673,any,9719
13,into,9699
827,we,9673
971,court,9670
491,o,9650
205,2,9648
933,men,9638
384,if,9424


# Quality bands 3, 4

## Create list of words and their frequencies in the corrected set

In [9]:
neigh_jaccard_bands_3_4 = []

for i_model in [0]:

    w2v_em_corr_qual_3_4 = Word2Vec.load('./LMs/w2v_005_EM_corr_qual_3_4.model')
    w2v_em_ocr_qual_3_4 = Word2Vec.load('./LMs/w2v_005_EM_ocr_qual_3_4.model')

    #for topn in [1, 2, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]:
    for topn in [50000]:
        print("topn: {}".format(topn))
        t1 = time.time()

        pd2search = pd_words[0:1000]
        pd2search['w2v_em_corr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_corr_qual_3_4, 
                                                                                   'vocab', 
                                                                                   topn], axis=1)
        print("corr: {}".format(time.time() - t1))
        pd2search['w2v_em_ocr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_ocr_qual_3_4,
                                                                                  'vocab', 
                                                                                  topn], axis=1)
        print("ocr: {}".format(time.time() - t1))

    mytopn_range = [1, 2, 5, 
                    10, 20, 50, 
                    100, 200, 500, 
                    1000, 2000, 5000, 
                    10000, 20000, 50000]
    for mytopn in mytopn_range:
        pd2search['jaccard_qual_3_4'] = \
            pd2search.apply(jaccard_similarity_df, 
                            args=['w2v_em_corr_qual_3_4', "w2v_em_ocr_qual_3_4", mytopn], 
                            axis=1)

        neigh_jaccard_bands_3_4.append(
            [mytopn, 
             pd2search['jaccard_qual_3_4'].mean(), 
             pd2search['jaccard_qual_3_4'].std(),
             i_model
            ])
    print("total: {}".format(time.time() - t1))
    #np.save("neigh_jaccard_bands_3_4.npy", np.array(neigh_jaccard_bands_3_4))

neigh_jaccard_bands_3_4 = np.array(neigh_jaccard_bands_3_4)

2019-11-21 10:58:59,143 : INFO : loading Word2Vec object from ./LMs/w2v_005_EM_corr_qual_3_4.model
2019-11-21 10:59:00,039 : INFO : loading wv recursively from ./LMs/w2v_005_EM_corr_qual_3_4.model.wv.* with mmap=None
2019-11-21 10:59:00,040 : INFO : loading vectors from ./LMs/w2v_005_EM_corr_qual_3_4.model.wv.vectors.npy with mmap=None
2019-11-21 10:59:00,331 : INFO : setting ignored attribute vectors_norm to None
2019-11-21 10:59:00,332 : INFO : loading vocabulary recursively from ./LMs/w2v_005_EM_corr_qual_3_4.model.vocabulary.* with mmap=None
2019-11-21 10:59:00,332 : INFO : loading trainables recursively from ./LMs/w2v_005_EM_corr_qual_3_4.model.trainables.* with mmap=None
2019-11-21 10:59:00,333 : INFO : loading syn1neg from ./LMs/w2v_005_EM_corr_qual_3_4.model.trainables.syn1neg.npy with mmap=None
2019-11-21 10:59:00,635 : INFO : setting ignored attribute cum_table to None
2019-11-21 10:59:00,636 : INFO : loaded ./LMs/w2v_005_EM_corr_qual_3_4.model
2019-11-21 10:59:01,748 : INFO 

topn: 50000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
2019-11-21 11:02:32,846 : INFO : precomputing L2-norms of word weight vectors


corr: 208.3446500301361


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ocr: 420.53262519836426
total: 472.8203191757202


In [10]:
np.save("neigh_jaccard_bands_3_4.npy", neigh_jaccard_bands_3_4)