# Evaluating W2V Model

In [8]:
import gensim
from gensim.models import Word2Vec
# Load the pre-trained Word2Vec model from disk.
# NOTE(review): the file name appears to encode the training settings — confirm.
model_name = 'models/wiki_iter=5_algorithm=skipgram_window=10_size=300_neg-samples=10.m'
model = Word2Vec.load(model_name)

## Word2Vec inference of doc as list of entities
We will use these methods throughout the notebook to infer a vector for a document, given the list of entities 
that represents the doc.
TODO: WEIGHTED SUM BEFORE AVERAGING

In [10]:
import numpy as np
def mean_of_vectors(vectors):
    """Return the element-wise arithmetic mean of a sequence of vectors.

    vectors: sequence (list, tuple or array) of equally-shaped vectors.

    Returns the mean vector, accumulated in float64, or an empty list when
    `vectors` is empty (which happens when no entity is recognized).
    """
    # Test emptiness by length: `vectors == []` only recognises an empty *list*
    # and turns into an element-wise comparison when `vectors` is a numpy array.
    if len(vectors) == 0:
        return []
    # Sum along the first axis in float64, then divide by the number of vectors.
    return np.sum(vectors, axis=0, dtype=np.float64) / len(vectors)

def infer_vector(entities, model, verbose = True):
    """Infer a document vector as the mean of the word vectors of its entities.

    entities: list of entities, our way of representing a single document.
        Each entity is lower-cased before being looked up.
    model: w2v model; vectors are read from `model.wv[word]` when the model
        has a `wv` attribute, otherwise from `model[word]`.
    verbose: when True, print the unknown/known entity counts if more than one
        entity was unknown.

    Returns whatever `mean_of_vectors` returns for the vectors of the known
    entities (an empty list when no entity is known).
    """
    # Prefer the keyed-vectors object when the model exposes one; fall back to
    # indexing the model itself.
    word_vectors = getattr(model, 'wv', model)
    unknown_words = 0
    # get word vector of each entity; ignores word if the model does not know it
    entities_vecs = []
    for e in entities:
        try:
            # make sure to lower case each word!
            entities_vecs.append(word_vectors[e.lower()])
        except (KeyError, AttributeError):
            # KeyError: word missing from the vocabulary; AttributeError: the
            # entity is not a string. Anything else is a real error and must
            # surface instead of being counted as an unknown word.
            unknown_words += 1
    # NOTE(review): the threshold is `> 1`, so a single unknown word is never
    # reported — confirm this is intended.
    if unknown_words > 1 and verbose:
        print("Number of unknown words: ", unknown_words, ", number of known words:", (len(entities)-unknown_words))
    return mean_of_vectors(entities_vecs)
    

## Triples of docs test
This test consists of presenting the model with triples of docs:
e.g. (facebook_doc1, facebook_doc2, space_doc)
the model correctly 'classifies' these triples if it's able to tell the difference between the 3rd and first two docs,
while also recognizing the similarity between the first couple.

Succeeding in this test means that, later on, with very high probability, the first 2 docs will be part of the same cluster, while the third one will instead be 'out' of that cluster.

In [30]:
import json
import sklearn.metrics.pairwise as sk

# load test file
# I also have another small set of docs from wikipedia, but they only contain abstract, not pre-processed entities
filename = '3-docs-test/test.json'
with open(filename, 'r') as test:
    test_data = json.load(test)

similarity_threshold = 0.61 # when can we define two docs as similar? (empirical-obtained value)

# format: [... ,[{}, {}, {}], [ {}, {}, {}] ...   ]

# let's try on flattened_entities too
# keep only the pre-extracted entities of every doc in each triple
test_data = [(a['result_entities'], b['result_entities'], c['result_entities'])
             for (a, b, c) in test_data]
print("Number of triples: ", len(test_data))
#print(test_data[:3])
correct = 0
for (a, b, c) in test_data:
    # infer vector from each document
    inferred_docs = [infer_vector(a, model), infer_vector(b, model), infer_vector(c, model)]

    # infer_vector gives back an empty list when none of a doc's entities is known.
    # Check the length: `==` between a numpy vector and `[]` is an element-wise
    # comparison, not an emptiness test.
    has_unknown_doc = False
    for ifd in inferred_docs:
        if len(ifd) == 0:
            print("Unknown doc found")
            has_unknown_doc = True
    if has_unknown_doc:
        # no similarity can be computed for this triple: it counts as a wrong guess
        continue

    # model.n_similarity does the same thing too
    # we have to make sure we're passing it LOWER-CASE words
    vec_a, vec_b, vec_c = inferred_docs
    # cosine_similarity returns a 1x1 matrix for a single pair; take the scalar
    sim_ab = sk.cosine_similarity([vec_a], [vec_b])[0][0]
    sim_ac = sk.cosine_similarity([vec_a], [vec_c])[0][0]
    sim_bc = sk.cosine_similarity([vec_b], [vec_c])[0][0]
    # guessed right: docA and docB are similar, docC is similar to neither
    if (sim_ab >= similarity_threshold
            and sim_ac < similarity_threshold
            and sim_bc < similarity_threshold):
        correct += 1
print("%s correct guesses over %s triples" %(correct, len(test_data)))
# guard against an empty test file
percentage = correct*100 / len(test_data) if test_data else 0.0
print(str(percentage))

Number of triples:  102
Number of unknown words:  5 , number of known words: 7
Number of unknown words:  3 , number of known words: 1
Number of unknown words:  2 , number of known words: 5
Number of unknown words:  17 , number of known words: 14
Number of unknown words:  5 , number of known words: 10
Number of unknown words:  7 , number of known words: 11
Number of unknown words:  11 , number of known words: 8
Number of unknown words:  9 , number of known words: 13
Number of unknown words:  5 , number of known words: 1
Number of unknown words:  8 , number of known words: 8
Number of unknown words:  19 , number of known words: 8
Number of unknown words:  29 , number of known words: 32
Number of unknown words:  20 , number of known words: 37
Number of unknown words:  16 , number of known words: 24
Number of unknown words:  15 , number of known words: 9
Number of unknown words:  5 , number of known words: 7
Number of unknown words:  7 , number of known words: 11
Number of unknown words:  10 , number of known words: 20
Number of unknown words:  29 , number of known words: 32
Number of unknown words:  11 , number of known words: 14
Number of unknown words:  7 , number of known words: 8
Number of unknown words:  8 , number of known words: 11
Number of unknown words:  13 , number of known words: 17
Number of unknown words:  9 , number of known words: 9
Number of unknown words:  2 , number of known words: 9
Number of unknown words:  10 , number of known words: 7
Number of unknown words:  6 , number of known words: 8
Number of unknown words:  6 , number of known words: 15
Number of unknown words:  7 , number of known words: 6
Number of unknown words:  7 , number of known words: 8
Number of unknown words:  10 , number of known words: 21
Number of unknown words:  5 , number of known words: 6
Number of unknown words:  7 , number of known words: 8
Number of unknown words:  18 , number of known words: 20
Number of unknown words:  7 , number of known words: 8
Number of unknown wo

## Model Online update approach
Re-train every time to make sure we have a representation for each word, even though the model only sees this word in a small number of contexts.

In [9]:
from gensim.utils import simple_preprocess as sp
import json

filename = '3-docs-test/test.json'
with open(filename, 'r') as test:
    test_data = json.load(test)
# re-train model using abstract
print("Lenght before update", len(model.wv.vocab))

test_abstract = []
for (a, b, c) in test_data:
    test_abstract.append(sp(a['title'].lower()+a['abstract'].lower()))
    test_abstract.append(sp(b['title'].lower()+b['abstract'].lower()))
    test_abstract.append(sp(c['title'].lower()+c['abstract'].lower()))

%time model.build_vocab(test_abstract, update=True)
%time model.train(test_abstract, total_examples=model.corpus_count, total_words=model.corpus_total_words, epochs=3)

print("Lenght after update", len(model.wv.vocab))

Lenght before update 733392
CPU times: user 4.04 s, sys: 5.55 s, total: 9.59 s
Wall time: 7.57 s
CPU times: user 1min 39s, sys: 757 ms, total: 1min 40s
Wall time: 58.3 s
Lenght after update 733502


In [13]:
## Re-do test
import json
import sklearn.metrics.pairwise as sk

# load test file
# I also have another small set of docs from wikipedia, but they only contain abstract, not pre-processed entities
filename = '3-docs-test/test.json'
with open(filename, 'r') as test:
    test_data = json.load(test)

similarity_threshold = 0.61 # when can we define two docs as similar? (empirical-obtained value)

# format: [... ,[{}, {}, {}], [ {}, {}, {}] ...   ]

# keep only the pre-extracted entities of every doc in each triple
test_data = [(a['result_entities'], b['result_entities'], c['result_entities'])
             for (a, b, c) in test_data]

correct = 0
for (a, b, c) in test_data:
    # infer vector from each document
    inferred_docs = [infer_vector(a, model, verbose=False), infer_vector(b, model, verbose = False),
                     infer_vector(c, model, verbose = False)]

    # infer_vector gives back an empty list when none of a doc's entities is known.
    # Check the length: `==` between a numpy vector and `[]` is an element-wise
    # comparison, not an emptiness test.
    has_unknown_doc = False
    for ifd in inferred_docs:
        if len(ifd) == 0:
            print("Unknown doc found")
            has_unknown_doc = True
    if has_unknown_doc:
        # no similarity can be computed for this triple: it counts as a wrong guess
        continue

    vec_a, vec_b, vec_c = inferred_docs
    # cosine_similarity returns a 1x1 matrix for a single pair; take the scalar
    sim_ab = sk.cosine_similarity([vec_a], [vec_b])[0][0]
    sim_ac = sk.cosine_similarity([vec_a], [vec_c])[0][0]
    sim_bc = sk.cosine_similarity([vec_b], [vec_c])[0][0]
    # guessed right: docA and docB are similar, docC is similar to neither
    if (sim_ab >= similarity_threshold
            and sim_ac < similarity_threshold
            and sim_bc < similarity_threshold):
        correct += 1
print("%s correct guesses over %s triples" %(correct, len(test_data)))
# guard against an empty test file
percentage = correct*100 / len(test_data) if test_data else 0.0
print(str(percentage))



Unknown doc found
Unknown doc found
Unknown doc found
Unknown doc found
Unknown doc found
Unknown doc found
38 correct guesses over 102 triples
37.254901960784316
