# Language Model analysis

In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np

from gensim.models import Word2Vec
from gensim import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# embedding models, base model
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model"
w2v = Word2Vec.load(model_path)

# OCR model, quality 1, 2
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/lwm_ocr_assessment/LMs/w2v_005_EM_ocr_qual_1_2.model"
w2v_em_ocr_qual_1_2 = Word2Vec.load(model_path)

# corrected model, quality 1, 2
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/lwm_ocr_assessment/LMs/w2v_005_EM_corr_qual_1_2.model"
w2v_em_corr_qual_1_2 = Word2Vec.load(model_path)

# OCR model, quality 3, 4
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/lwm_ocr_assessment/LMs/w2v_005_EM_ocr_qual_3_4.model"
w2v_em_ocr_qual_3_4 = Word2Vec.load(model_path)

# corrected model, quality 3, 4
model_path = "/Users/khosseini/myJobs/ATI/Projects/2019/lwm_ocr_assessment/LMs/w2v_005_EM_corr_qual_3_4.model"
w2v_em_corr_qual_3_4 = Word2Vec.load(model_path)

2019-11-20 10:28:33,829 : INFO : loading Word2Vec object from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model
2019-11-20 10:28:35,867 : INFO : loading wv recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.* with mmap=None
2019-11-20 10:28:35,869 : INFO : loading vectors from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/models/all_books/w2v_005/w2v_words.model.wv.vectors.npy with mmap=None
2019-11-20 10:28:36,419 : INFO : setting ignored attribute vectors_norm to None
2019-11-20 10:28:36,421 : INFO : loading vocabulary recursively from /Users/khosseini/myJobs/ATI/Projects/2019/Living-with-Machines-code/language-lab-mro/lexicon_expansion/interactive_expansion/m

In [3]:
def found_neighbors(myrow, embedding, colname='vocab', topn=1):
    try:
        vocab_neigh = embedding.wv.most_similar([myrow['vocab']], topn=topn)
        return list(np.array(vocab_neigh)[:, 0])
    except KeyError:
        return []

In [4]:
def jaccard_similarity_df(myrow, colname_1, colname_2, make_lowercase=True):
    """
    Jaccard similarity between two documents (e.g., OCR and Human) on flattened list of words
    """
    list1 = myrow[colname_1]
    list2 = myrow[colname_2]
    if make_lowercase:
        list1 = [x.lower() for x in list1]
        list2 = [x.lower() for x in list2]
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

# Quality bands 3, 4

## Create list of words and their frequencies in the corrected set

In [None]:
words_corrected = []
for item in w2v_em_corr_qual_3_4.wv.vocab:
    words_corrected.append([item, int(w2v_em_corr_qual_3_4.wv.vocab[item].count)])

In [None]:
pd_words = pd.DataFrame(words_corrected, columns=['vocab', 'count'])

In [None]:
pd_words = pd_words.sort_values(by=['count'], ascending=False)
print("size: {}".format(len(pd_words)))
pd_words.head()

In [None]:
pd2search = pd_words[0:5000]
pd2search

In [None]:
neigh_jaccard_bands_3_4 = []

for topn in [1, 2, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]:
    print("topn: {}".format(topn))
    t1 = time.time()
    
    pd2search = pd_words[0:1000]
    pd2search['w2v_em_corr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_corr_qual_3_4, 
                                                                               'vocab', 
                                                                               topn], axis=1)
    print("corr: {}".format(time.time() - t1))
    pd2search['w2v_em_ocr_qual_3_4'] = pd2search.apply(found_neighbors, args=[w2v_em_ocr_qual_3_4, 
                                                                             'vocab', 
                                                                              topn], axis=1)
    pd2search['jaccard_qual_3_4'] = \
        pd2search.apply(jaccard_similarity_df, args=['w2v_em_corr_qual_3_4', 
                                                     "w2v_em_ocr_qual_3_4", 
                                                     True], 
                        axis=1)
    
    neigh_jaccard_bands_3_4.append(
        [topn, 
         pd2search['jaccard_qual_3_4'].mean(), 
         pd2search['jaccard_qual_3_4'].std()])
    
    print("total: {}".format(time.time() - t1))

neigh_jaccard_bands_3_4 = np.array(neigh_jaccard_bands_3_4)

# Quality bands 1, 2

## Create list of words and their frequencies in the corrected set

In [5]:
words_corrected = []
for item in w2v_em_corr_qual_1_2.wv.vocab:
    words_corrected.append([item, int(w2v_em_corr_qual_1_2.wv.vocab[item].count)])

In [6]:
pd_words = pd.DataFrame(words_corrected, columns=['vocab', 'count'])

In [7]:
pd_words = pd_words.sort_values(by=['count'], ascending=False)
print("size: {}".format(len(pd_words)))
pd_words.head()

size: 439314


Unnamed: 0,vocab,count
55,the,292602038
113,of,174510683
11,.,168101519
39,and,139819513
106,to,108989625


In [8]:
pd2search = pd_words[0:5000]
pd2search

Unnamed: 0,vocab,count
55,the,292602038
113,of,174510683
11,.,168101519
39,and,139819513
106,to,108989625
...,...,...
5006,subjected,69788
2045,occupies,69787
2339,gravely,69777
5444,attendants,69770


In [9]:
neigh_jaccard_bands_1_2 = []

for topn in [1, 2, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]:
    print("topn: {}".format(topn))
    t1 = time.time()
    
    pd2search = pd_words[0:1000]
    pd2search['w2v_em_corr_qual_1_2'] = pd2search.apply(found_neighbors, args=[w2v_em_corr_qual_1_2, 
                                                                               'vocab', 
                                                                               topn], axis=1)
    print("corr: {}".format(time.time() - t1))
    pd2search['w2v_em_ocr_qual_1_2'] = pd2search.apply(found_neighbors, args=[w2v_em_ocr_qual_1_2, 
                                                                             'vocab', 
                                                                              topn], axis=1)
    pd2search['jaccard_qual_1_2'] = \
        pd2search.apply(jaccard_similarity_df, args=['w2v_em_corr_qual_1_2', 
                                                     "w2v_em_ocr_qual_1_2", 
                                                     True], 
                        axis=1)
    
    neigh_jaccard_bands_1_2.append(
        [topn, 
         pd2search['jaccard_qual_1_2'].mean(), 
         pd2search['jaccard_qual_1_2'].std()])
    
    print("total: {}".format(time.time() - t1))

neigh_jaccard_bands_1_2 = np.array(neigh_jaccard_bands_1_2)

2019-11-20 10:29:02,489 : INFO : precomputing L2-norms of word weight vectors


topn: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
2019-11-20 10:29:54,745 : INFO : precomputing L2-norms of word weight vectors


corr: 52.2600359916687


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


total: 105.23348593711853
topn: 2
corr: 49.01822900772095
total: 98.79225397109985
topn: 5
corr: 48.63156270980835
total: 96.51067972183228
topn: 10
corr: 46.37088894844055
total: 98.53879714012146
topn: 50
corr: 43.56462907791138
total: 88.98045206069946
topn: 100
corr: 35.471516132354736
total: 72.67836213111877
topn: 500
corr: 38.1435272693634
total: 79.2555341720581
topn: 1000
corr: 39.5123016834259
total: 82.0018618106842
topn: 5000
corr: 63.327855825424194
total: 126.80228090286255
topn: 10000
corr: 77.40446305274963
total: 159.26662611961365
topn: 50000
corr: 249.6574649810791
total: 534.9824228286743


In [12]:
np.save("neigh_jaccard_bands_1_2.npy", neigh_jaccard_bands_1_2)

In [None]:
plt.figure(figsize=(10, 5))


plt.plot(neigh_jaccard_bands_1_2[:, 0], neigh_jaccard_bands_1_2[:, 1], 
         'k-o', alpha=1.0, 
         lw=4,
         label='Quality bands=1,2')

plt.plot(neigh_jaccard_bands_3_4[:, 0], neigh_jaccard_bands_3_4[:, 1], 
         'r-o', alpha=1.0, 
         lw=4,
         label='Quality bands=3,4')

plt.grid()
plt.xticks(size=20)
plt.yticks(size=20)
plt.xlabel("#neighbours", size=24)
plt.ylabel("Jaccard similarity", size=24)
plt.xscale("log")
plt.xlim(1.0, 100000)
plt.ylim(0.05, 1.0)

plt.legend(prop={'size': 20})
plt.show()
#plt.xlim(0, 20000)