Task 1: Run several analogy-solving models with several different word representations on the benchmark analogy dataset and report your findings.

Focus on the following questions:

1. Is the choice of the analogy model important? Which representations work better with which analogy models?
2. Is the dimensionality of the representation important when using GloVe vectors?
3. What is the computational complexity of the analogy models given the pre-trained vectors?
4. What are the typical errors?

In [None]:
import logging

import numpy as np
from gensim import matutils, models

# Emit timestamped log records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [None]:
# Load the pre-trained word2vec model (Google News vectors) from disk.
# The published archive is named "GoogleNews-vectors-negative300.bin"
# (300 = vector dimensionality); the previous "...negative3000.bin" looked like a typo.
word2vec_model = models.word2vec.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# word2vec_model.vocab: 3,000,000 words for word2vec_model.
# Each word is represented as a vector with 300 components.
# word2vec_model.syn0: the embedding matrix of the model (one row per word).
# word2vec_model.syn0.shape: check the shape of this matrix.

# Normalize all vectors in this model to unit length, so that a plain dot
# product equals the cosine similarity (cheaper than dividing by norms each time).
# replace=True normalizes in place: syn0 and syn0norm then hold the same
# unit-length vectors, model[word] also returns a normalized vector, and no
# second copy of the matrix is kept in memory.
word2vec_model.init_sims(replace=True)

In [None]:
# Load the pre-trained GloVe vectors (50 dimensions) from disk.
# NOTE(review): load_word2vec_format reads the word2vec text format, whose
# first line is "<vocabulary size> <dimensions>"; the raw GloVe download has
# no such header -- confirm this file has been converted.
glove50d_model = models.word2vec.Word2Vec.load_word2vec_format('glove.6B.50d.txt')

# Normalize the vectors to unit length so that model.syn0norm is populated;
# the analogy functions compute cosine similarities from it.
glove50d_model.init_sims(replace=True)

In [None]:
def findAnalogy_model1(a, b, c, model):
    """
    Addition model: solve "a is to b as c is to d" for the word d.

    d = argmax(cos(d', c - a + b)) over all vocabulary words d' that are not
    one of the three input words and do not contain "_".

    a, b, c: the query words; each must be present in model.vocab.
    model: word-vector model exposing vocab (word -> entry with .index),
        index2word, model[word] vector lookup and syn0norm (the matrix of
        normalized word vectors, one row per word).

    Returns a tuple (word, similarity) for the best admissible candidate.

    Raises KeyError if an input word is not in the vocabulary and IndexError
    if no admissible candidate exists in the whole vocabulary.
    """
    all_words = set()
    for word in (a, b, c):
        if word not in model.vocab:
            raise KeyError("word '%s' not in vocabulary" % word)
        all_words.add(model.vocab[word].index)

    # Normalize b + c - a so that the dot products below are cosine similarities.
    mixed = model[b] + model[c] - model[a]
    norm = np.linalg.norm(mixed)
    if norm > 0:
        mixed = mixed / norm
    mixedNormVec = mixed.astype(np.float32)

    # Cosine similarity between every vocabulary word (d') and c - a + b.
    sims = np.dot(model.syn0norm, mixedNormVec)
    total = sims.shape[0]
    neg_sims = -sims  # ascending order of the negation == descending similarity

    # The best hits are often the input words themselves or "_" phrases.
    # Start with the 5 best candidates and widen the window until an
    # admissible word shows up, instead of failing when all 5 are filtered out.
    topn = 5
    while True:
        if topn >= total:
            best = np.argsort(neg_sims)
        else:
            # Partial selection of the topn best, then sort only those.
            head = np.argpartition(neg_sims, topn - 1)[:topn]
            best = head[np.argsort(neg_sims[head])]
        for sim in best:
            candidate = model.index2word[sim]
            if sim not in all_words and "_" not in candidate:
                return (candidate, float(sims[sim]))
        if topn >= total:
            raise IndexError("no candidate word left after filtering the input words and '_' phrases")
        topn *= 4


In [None]:
def findAnalogy_model2(a, b, c, model):
    """
    Multiplication model: solve "a is to b as c is to d" for the word d.

    d = argmax(cos(d', c) * cos(d', b) / (cos(d', a) + e)) over all vocabulary
    words d' that are not one of the three input words and do not contain "_".
    Each cosine is shifted from [-1, 1] to [0, 1] via (cos + 1) / 2 before it
    is used; e = 0.001 avoids division by zero.

    a, b, c: the query words; each must be present in model.vocab.
    model: word-vector model exposing vocab (word -> entry with .index),
        index2word and syn0norm (the matrix of normalized word vectors, one
        row per word).

    Returns a tuple (word, score) for the best admissible candidate.

    Raises KeyError if an input word is not in the vocabulary and IndexError
    if no admissible candidate exists in the whole vocabulary.
    """
    all_words = set()
    for word in (a, b, c):
        if word not in model.vocab:
            raise KeyError("word '%s' not in vocabulary" % word)
        all_words.add(model.vocab[word].index)

    # Take the query vectors from syn0norm rather than model[word], so each
    # dot product is a true cosine even when the model's raw vectors are not
    # unit length (the [0, 1] shift below relies on values in [-1, 1]).
    unit = model.syn0norm
    vec_a = unit[model.vocab[a].index]
    vec_b = unit[model.vocab[b].index]
    vec_c = unit[model.vocab[c].index]

    sims = (np.dot(unit, vec_c) + 1) / 2 * \
        (np.dot(unit, vec_b) + 1) / 2 / \
        ((np.dot(unit, vec_a) + 1) / 2 + 0.001)
    total = sims.shape[0]
    neg_sims = -sims  # ascending order of the negation == descending score

    # The best hits are often the input words themselves or "_" phrases.
    # Start with the 5 best candidates and widen the window until an
    # admissible word shows up, instead of failing when all 5 are filtered out.
    topn = 5
    while True:
        if topn >= total:
            best = np.argsort(neg_sims)
        else:
            # Partial selection of the topn best, then sort only those.
            head = np.argpartition(neg_sims, topn - 1)[:topn]
            best = head[np.argsort(neg_sims[head])]
        for sim in best:
            candidate = model.index2word[sim]
            if sim not in all_words and "_" not in candidate:
                return (candidate, float(sims[sim]))
        if topn >= total:
            raise IndexError("no candidate word left after filtering the input words and '_' phrases")
        topn *= 4

In [None]:
def recallOfModel(questions, r_model, a_model):
    """
    Compute the recall of an analogy model on an analogy question file.

    questions: path of the test file, in this case 'questions-words.txt'.
        Lines starting with ':' are section headers; every other line holds
        four whitespace-separated words "a b c expected".
    r_model: representation model of word vectors (word2vec, GloVe).
    a_model: analogy model (1, 2). 1 stands for findAnalogy_model1,
        2 stands for findAnalogy_model2.

    Returns the fraction of questions whose predicted word equals the expected
    word, rounded to 4 decimals (0.0 when the file contains no question).

    Raises ValueError for an unknown a_model. A KeyError raised by the analogy
    function for an out-of-vocabulary word is not caught here.
    """
    # Pick the solver once instead of duplicating the whole evaluation loop.
    if a_model == 1:
        solver = findAnalogy_model1
    elif a_model == 2:
        solver = findAnalogy_model2
    else:
        raise ValueError("invalid analogy model")

    count_correct = 0  # number of correctly answered questions
    count_total = 0  # total number of questions

    with open(questions, 'r') as ifile:
        for line in ifile:
            line_sp = line.split()
            # Skip section headers, blank lines and rows with fewer than four
            # words rather than crashing on them.
            if line.startswith(':') or len(line_sp) < 4:
                continue
            count_total += 1
            result_text = solver(line_sp[0], line_sp[1], line_sp[2], r_model)
            if result_text[0] == line_sp[3]:
                count_correct += 1

    if count_total == 0:
        # No question found: avoid a ZeroDivisionError.
        return 0.0
    recall = float(count_correct) / float(count_total)
    return float('%.4f' % recall)


In [None]:
# Report the recall of both analogy models on the word2vec vectors.
# print(...) with a single list argument prints the same text on Python 2 and 3.
print(["word2vec & addition model", recallOfModel('questions-words.txt', word2vec_model, 1)])
print(["word2vec & multiplication model", recallOfModel('questions-words.txt', word2vec_model, 2)])
