Task 1: Run several analogy-solving models with several different word representations on the analogy benchmark dataset and report your findings.

Focus on the following questions:

1. Is the choice of the analogy model important? Which representations work better with which analogy models?
2. Is the dimensionality of the representation important when using GloVe vectors?
3. What is the computational complexity of the analogy models given the pre-trained vectors?
4. What are the typical errors?

In [None]:
from gensim import models, matutils
import numpy as np
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the pre-trained word2vec vectors (binary word2vec format) from disk.
# NOTE(review): the publicly distributed GoogleNews file is normally named
# 'GoogleNews-vectors-negative300.bin' -- confirm that '3000' in this path is intended.
word2vec_model = models.word2vec.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative3000.bin', binary = True)

# Pre-processing: stack the unit-length vector of every vocabulary word into
# one 2-D matrix so that a single np.dot against a unit query vector yields
# the cosine similarity to every word: cos_sim(v1, v2) = np.dot(v1, v2).
#
# Rows are built in word2vec_model.index2word order, so row i belongs to
# index2word[i]; findAnalogy relies on this when it maps the best-scoring
# row indices back to words. Iterating word2vec_model.vocab instead would not
# guarantee that alignment. The result is materialised as an array (rather
# than left as a `map` object) so it can be reused across calls and also
# works on Python 3, where `map` is a one-shot iterator.
vec2compare = np.array(
    [matutils.unitvec(word2vec_model[word]) for word in word2vec_model.index2word]
)

def findAnalogy(a, b, c):
    """Solve the analogy "a is to b as c is to d" and return the best d.

    The answer is d = argmax over d' of cos(d', b + c - a), computed as a dot
    product between the unit-normalised query vector and the module-level
    ``vec2compare`` matrix of unit word vectors.

    Assumes row i of ``vec2compare`` is the vector of
    ``word2vec_model.index2word[i]``.

    Args:
        a: First word of the source pair.
        b: Second word of the source pair.
        c: First word of the target pair.

    Returns:
        A ``(word, similarity)`` tuple for the highest-scoring candidate that
        is not one of the three input words.

    Raises:
        KeyError: If any of ``a``, ``b`` or ``c`` is not in the vocabulary.
    """
    # Check the words themselves. Iterating `b + c` would walk the characters
    # of the concatenated string rather than the two words.
    for word in (b, c, a):
        if word not in word2vec_model.vocab:
            raise KeyError("word '%s' not in vocabulary" % word)

    mixedNormVec = matutils.unitvec(word2vec_model[b] + word2vec_model[c] - word2vec_model[a])

    # Both sides are unit vectors, so the dot product is the cosine similarity.
    sims = np.dot(vec2compare, mixedNormVec)
    # Take the top 5 so that at least two candidates survive after dropping
    # up to three input words.
    best = matutils.argsort(sims, topn=5, reverse=True)

    # Ignore words from the input. Compare the decoded word, not the integer
    # index, against the input words.
    inputWords = (a, b, c)
    result = []
    for idx in best:
        candidate = word2vec_model.index2word[idx]
        if candidate not in inputWords:
            result.append((candidate, float(sims[idx])))
    return result[0]