Task1: Run several analogy solving models with several different representations on the benchmarking analogy dataset and report your findings. 

Focus on the following questions:

1. Is the choice of the analogy model important? Which representations work better with which analogy models?
2. Is dimensionality of the representation important when using GloVe vectors?
3. What is the computational complexity of the analogy models given the pre-trained vectors?
4. What are the typical errors?
    
    One source of error is the test set itself. It says London is the capital of England, whereas it is actually the capital of the UK (Britain). About 200 false negatives are caused by this.
    Another error (although I haven't met it yet) is that a given word is not in the vocabulary, so it has no vector to represent it.

In [None]:
from gensim import models, matutils
import numpy as np
import smart_open
import os
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
def glove2word2vec(glove_filename):
    """
    Load a GloVe text file as a word2vec model with normalized vectors.

    The only difference between the word2vec text format and the GloVe
    format is that word2vec stores "<number of lines> <dimensions>" in the
    first line. A converted copy of the GloVe file (with that header added)
    is written next to the original and reused on later calls.

    glove_filename: path of the GloVe file; it is expected to end in "txt",
        e.g. "glove.6B.50d.txt". The converted copy is named
        "<name without 'txt'>word2vec.txt".

    Returns the loaded model after model.init_sims(replace=True).
    Raises ValueError if the GloVe file contains no lines.
    """
    def getFirstLineInfo(glove_filename):
        """
        Calculate the number of lines and dimensions of word vector
        """
        num_lines = 0
        dims = None
        # single pass; the handle is closed when the block exits
        with smart_open.smart_open(glove_filename, 'rb') as infile:
            for line in infile:
                if dims is None:
                    # read the dimensionality from the data itself instead of
                    # the file name: every field after the word is a component
                    dims = len(line.split()) - 1
                num_lines += 1
        if dims is None:
            raise ValueError("GloVe file '%s' is empty" % glove_filename)
        return num_lines, dims

    def addFirstLine(glove_filename, word2vec_filename, first_line_info):
        """
        Add information of number of lines and dimensions into the first line
        """
        # Write to a temporary name and rename at the end, so an interrupted
        # conversion never leaves a truncated file under the cached name.
        tmp_filename = word2vec_filename + '.tmp'
        with smart_open.smart_open(glove_filename, 'rb') as infile:
            with smart_open.smart_open(tmp_filename, 'wb') as outfile:
                # the stream is binary, so the header is encoded explicitly
                outfile.write((str(first_line_info) + '\n').encode('utf-8'))
                for line in infile:
                    outfile.write(line)
        os.rename(tmp_filename, word2vec_filename)
        return word2vec_filename

    word2vec_filename = glove_filename[:-3] + "word2vec.txt"
    if not os.path.isfile(word2vec_filename):
        num_lines, dims = getFirstLineInfo(glove_filename)
        first_line = "{} {}".format(num_lines, dims)
        addFirstLine(glove_filename, word2vec_filename, first_line)
    model = models.word2vec.Word2Vec.load_word2vec_format(word2vec_filename)

    # normalize all word vectors
    model.init_sims(replace = True)
    return model

In [None]:
def findAnalogy_model1(a, b, c, model):
    """
    addition model
    a to b is c to d. a, b, c, d are all words.
    d = argmax(cos(d', c-a+b)).

    a, b, c: words that must all be in model.vocab.
    model: word vector model providing vocab, index2word, syn0norm and
        item access by word. NOTE: the dot product below is only a cosine
        similarity if model.syn0norm holds unit-length vectors (presumably
        model.init_sims() has been called -- confirm at the call site).

    Returns a tuple (word, similarity) for the best candidate that is not
    one of a, b, c and does not contain "_".

    Raises KeyError if a, b or c is not in the vocabulary, and IndexError
    if no candidate in the whole vocabulary passes the filter.

    bonus:
        If you want to find the word most similar to a given word, you can use
        findAnalogy_model1("empty", "empty", "given_word", model)
        The first two "empty" can be replaced by any word, but these two
        must be exactly the same word.
    """
    all_words = set()
    for word in [a, b, c]:
        if word not in model.vocab:
            raise KeyError("word '%s' not in vocabulary" % word)
        all_words.add(model.vocab[word].index)
    # normalize the result of b + c - a. prepare for computing cosine similarity
    mixedNormVec = matutils.unitvec(model[b] + model[c] - model[a]).astype(np.float32)

    # calculate the cosine similarity between all words (d') and c-a+b
    sims = np.dot(model.syn0norm, mixedNormVec)

    # Start with the 5 best candidates: the input words themselves (and words
    # containing "_") may occupy the top ranks. If all of them are filtered
    # out, widen the window instead of failing on an empty result list.
    vocab_size = len(sims)
    topn = 5
    while True:
        best = matutils.argsort(sims, topn = topn, reverse=True)
        for sim in best:
            # ignore words from the input and words containing "_"
            if sim not in all_words and "_" not in model.index2word[sim]:
                return (model.index2word[sim], float(sims[sim]))
        if topn >= vocab_size:
            # IndexError is what the empty-result case has always raised
            raise IndexError("no analogy candidate left after filtering")
        topn *= 2


In [None]:
def findAnalogy_model2(a, b, c, model):
    """
    multiplication model
    a to b is c to d. d = argmax(cos(d',c)*cos(d',b)/(cos(d',a)+e))
    e = 0.001 to avoid division by zero

    Every cosine is first mapped from [-1, 1] to [0, 1] via (cos + 1) / 2,
    so the product and the quotient only involve non-negative values.

    a, b, c: words that must all be in model.vocab.
    model: word vector model providing vocab, index2word, syn0norm and
        item access by word. NOTE: the dot products are only cosines if the
        vectors in model.syn0norm and model[word] are unit length
        (presumably model.init_sims(replace=True) was called -- confirm at
        the call site).

    Returns a tuple (word, score) for the best candidate that is not one of
    a, b, c and does not contain "_".

    Raises KeyError if a, b or c is not in the vocabulary, and IndexError
    if no candidate in the whole vocabulary passes the filter.
    """
    all_words = set()
    for word in [a, b, c]:
        if word not in model.vocab:
            raise KeyError("word '%s' not in vocabulary" % word)
        all_words.add(model.vocab[word].index)

    # shifted similarities of every vocabulary word to c, b and a
    pos_c = (np.dot(model.syn0norm, model[c]) + 1) / 2
    pos_b = (np.dot(model.syn0norm, model[b]) + 1) / 2
    pos_a = (np.dot(model.syn0norm, model[a]) + 1) / 2
    sims = pos_c * pos_b / (pos_a + 0.001)

    # Start with the 5 best candidates: the input words themselves (and words
    # containing "_") may occupy the top ranks. If all of them are filtered
    # out, widen the window instead of failing on an empty result list.
    vocab_size = len(sims)
    topn = 5
    while True:
        best = matutils.argsort(sims, topn = topn, reverse=True)
        for sim in best:
            # ignore words from the input and words containing "_"
            if sim not in all_words and "_" not in model.index2word[sim]:
                return (model.index2word[sim], float(sims[sim]))
        if topn >= vocab_size:
            # IndexError is what the empty-result case has always raised
            raise IndexError("no analogy candidate left after filtering")
        topn *= 2

In [None]:
def recallOfAnalogyModel(questions, r_model, a_model):
    """
    Compute the fraction of analogy questions answered correctly.

    questions is the file path of the test file. In this case, 'questions-words.txt'.
        Lines starting with ':' are section headers and are skipped, as are
        blank lines; every other line holds four words "a b c d".
    r_model is representation model of word vector (word2vec, GloVe)
    a_model is analogy model (1, 2). 1 stands for findAnalogy_model1, 2 stands for findAnalogy_model2

    Returns the recall rounded to 4 decimal places, or 0.0 if the file
    contains no questions.

    Raises ValueError for an unknown a_model. Exceptions raised by the
    analogy model itself (e.g. KeyError for a word missing from the
    vocabulary) are not caught here.
    """
    # pick the solver once instead of duplicating the evaluation loop
    if a_model == 1:
        solve = findAnalogy_model1
    elif a_model == 2:
        solve = findAnalogy_model2
    else:
        raise ValueError("invalid analogy model")

    count_correct = 0 # counter for number of correct result
    count_total = 0 # count total number of questions

    with open(questions, 'r') as ifile:
        for line in ifile:
            line_sp = line.split()
            # skip blank lines and ':' section headers
            if not line_sp or line[0] == ':':
                continue
            count_total += 1
            result_text = solve(line_sp[0], line_sp[1], line_sp[2], r_model)
            if result_text[0] == line_sp[3]:
                count_correct += 1

    if count_total == 0:
        # no questions at all: avoid dividing by zero
        return 0.0
    recall = float(count_correct) / float(count_total)
    return float('%.4f'% recall)


In [None]:
# Load the pre-trained representation models. Whether all of them can be
# loaded at the same time depends on how much RAM is available.

# Load pre-trained word2vec model from disk
# (the published GoogleNews vectors are 300-dimensional and the file is named
# 'GoogleNews-vectors-negative300.bin')
word2vec_model = models.word2vec.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

# word2vec_model.vocab: 3000000  words for word2vec_model.
# each word is represented as a vector with 300 terms
# word2vec_model.syn0: matrix for the model
# word2vec_model.syn0.shape: check the shape of this matrix

# Normalize all vectors in this model.
# So we can use dot product to calculate cosine similarity which is more efficient
word2vec_model.init_sims(replace = True)
# The normalized vectors are stored in model.syn0 and model.syn0norm. These are the same.

# Load pre-trained glove models from disk
glove50d_model = glove2word2vec('glove.6B.50d.txt')
glove100d_model = glove2word2vec('glove.6B.100d.txt')
glove200d_model = glove2word2vec('glove.6B.200d.txt')
glove300d_model = glove2word2vec('glove.6B.300d.txt')


In [None]:
# Report the recall of every representation with both analogy models
# (1 = addition, 2 = multiplication), one "<representation> & <analogy> model
# <recall>" line per combination.
representations = [
    ("word2vec", word2vec_model),
    ("glove50d", glove50d_model),
    ("glove100d", glove100d_model),
    ("glove200d", glove200d_model),
    ("glove300d", glove300d_model),
]
analogy_models = [("addition", 1), ("multiplication", 2)]

for r_name, r_model in representations:
    for a_name, a_model in analogy_models:
        recall = recallOfAnalogyModel('questions-words.txt', r_model, a_model)
        # a single pre-formatted argument prints identically on Python 2 and 3
        print("%s & %s model %s" % (r_name, a_name, recall))