In [2]:
import numpy as np
from annoy import AnnoyIndex

In [6]:
class PreTrainedEmbedding:
    """Pre-trained word vectors with an Annoy index for nearest-neighbour lookup.

    Attributes:
        word_to_index: dict mapping a word to its row in ``word_vectors``.
        word_vectors: sequence of vectors, one per word.
        index_to_word: inverse of ``word_to_index``.
        index: ``AnnoyIndex`` (euclidean metric) built over all vectors.
    """

    def __init__(self, word_to_index, word_vector) -> None:
        """Store the vocabulary and build the nearest-neighbour index.

        Args:
            word_to_index: dict mapping each word to an integer row index.
            word_vector: non-empty sequence of equal-length numeric vectors,
                indexable by the integers in ``word_to_index``.
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vector
        self.index_to_word = {v: k for k, v in word_to_index.items()}
        # Dimensionality is taken from the first vector.
        self.index = AnnoyIndex(f=len(word_vector[0]),
                                metric="euclidean")
        for idx in self.word_to_index.values():
            self.index.add_item(idx, self.word_vectors[idx])

        self.index.build(n_trees=50)

    @classmethod
    def from_embedding_file(cls, embedding_file):
        """Build an instance from a text file of ``word v1 v2 ... vn`` lines.

        Fields are separated by single spaces. Lines with no vector
        components (e.g. blank lines) are skipped, and when a word appears
        more than once only its first vector is kept, so that every index in
        ``word_to_index`` always points at the matching row.

        Args:
            embedding_file: path to the embedding text file (read as UTF-8).

        Returns:
            A new instance of ``cls``.

        Raises:
            ValueError: if a vector component cannot be parsed as a float.
        """
        word_to_index = {}
        word_vector = []
        # Explicit encoding: do not depend on the platform's default locale.
        with open(embedding_file, "r", encoding="utf-8") as f:
            # Iterate lazily instead of loading every line up front.
            for line in f:
                # Split on a literal space only (not any whitespace) so that
                # tokens containing other Unicode whitespace stay intact.
                fields = line.rstrip("\r\n ").split(" ")
                if len(fields) < 2:
                    continue
                word = fields[0]
                if word in word_to_index:
                    continue
                vector = np.array([float(num) for num in fields[1:]])
                # Index by the number of stored vectors so the mapping and
                # the vector list can never drift apart.
                word_to_index[word] = len(word_vector)
                word_vector.append(vector)
        return cls(word_to_index, word_vector)

    def get_embedding(self, word):
        """Return the vector stored for ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """Return the ``n`` words the Annoy index reports as nearest to ``vector``.

        Args:
            vector: query vector with the same dimensionality as the index.
            n: number of neighbours to request.

        Returns:
            List of words in the order returned by the index.
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor]
                for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """Print candidates for ``word4`` in ``word1 : word2 :: word3 : word4``.

        The query point is ``vec(word3) + (vec(word2) - vec(word1))``. Four
        neighbours are requested and the three input words are filtered out,
        so between one and four lines are printed.

        Raises:
            KeyError: if any of the three words is not in the vocabulary.
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # Offset that carries word1 to word2, applied to word3.
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        closest_words = self.get_closest_to_vector(vec4, n=4)
        # The inputs themselves are usually among the nearest neighbours.
        excluded = (word1, word2, word3)
        closest_words = [word for word in closest_words
                         if word not in excluded]

        for word4 in closest_words:
            print(f"{word1} : {word2} :: {word3} : {word4}")

In [7]:
# Parse the GloVe text file and build the Annoy index (done in the constructor).
# The path is relative, so it resolves against the kernel's working directory.
embeddings = PreTrainedEmbedding.from_embedding_file("../data/glove/glove.6B.50d.txt")

In [8]:
# Pronoun analogy: man : he :: woman : ?
embeddings.compute_and_print_analogy('man', 'he', 'woman')

man : he :: woman : she
man : he :: woman : having
man : he :: woman : his


In [10]:
# Animal/offspring analogy: cat : kitten :: dog : ?
embeddings.compute_and_print_analogy('cat', 'kitten', 'dog')

cat : kitten :: dog : puppy
cat : kitten :: dog : proverbial
cat : kitten :: dog : crazed


In [13]:
# Verb analogy: talk : communicate :: read : ?
embeddings.compute_and_print_analogy('talk', 'communicate', 'read')

talk : communicate :: read : interpret
talk : communicate :: read : instructions
talk : communicate :: read : typing


In [16]:
# Colour/concept analogy: white : peace :: black : ?
# Probes which associations the vectors attach to colour words.
embeddings.compute_and_print_analogy('white', 'peace', 'black')

white : peace :: black : reconciliation
white : peace :: black : unity
white : peace :: black : independence


In [17]:
# Occupation analogy: man : doctor :: woman : ?
# NOTE(review): the recorded top result is 'nurse', which suggests the
# vectors carry gender stereotypes from the training corpus.
embeddings.compute_and_print_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse
man : doctor :: woman : pregnant
man : doctor :: woman : child


In [18]:
# Royalty analogy: man : king :: woman : ?
embeddings.compute_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : prince
man : king :: woman : elizabeth


In [19]:
# Superlative analogy: fast : fastest :: small : ?
embeddings.compute_and_print_analogy('fast', 'fastest', 'small')

fast : fastest :: small : smallest
fast : fastest :: small : smaller
fast : fastest :: small : larger
fast : fastest :: small : oldest
