# Inspect Embedding and Compare to GloVe


In [None]:
#imports 
import json
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix as get_dm

In [None]:
# functions to load embeddings from file

def load_embedding_from_json(file_path: str):
    """Load a word embedding stored as a JSON object.

    The file is read as UTF-8 and parsed with ``json.load``. Every value of
    the resulting mapping is converted with ``np.array``.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        A dict mapping each key of the JSON object to a numpy array built
        from its value.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_embedding = json.load(handle)
    # The JSON file stores vectors as plain lists; turn them into numpy arrays.
    return {token: np.array(values) for token, values in raw_embedding.items()}

def load_glove_embedding_from_file(file_path):
    """Load a GloVe-style text embedding into a dict of numpy vectors.

    Each line of the file is read as ``<word> <v1> <v2> ...`` with single
    spaces as separators.

    Args:
        file_path: path of the space-separated embedding file.

    Returns:
        A dict mapping each word (first column) to a 1-D numpy array holding
        the remaining columns of its line.
    """
    df = pd.read_csv(
        file_path,
        sep=" ",
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary tokens
        header=None,
        index_col=0,
        # Without this, pandas treats vocabulary entries such as "null",
        # "nan" or "NA" as missing values and replaces the word with NaN.
        keep_default_na=False,
    )
    # Pair every index label with its row directly; this avoids transposing
    # the whole frame and creating one Series per word.
    return dict(zip(df.index, df.to_numpy()))


In [None]:
# Load the OpenAPI embedding from its JSON file (word -> numpy vector).
api_embedding = load_embedding_from_json(
    file_path="saved_embeddings/open_api_embedding_5d_314_words.json"
)

In [None]:
# Load the GloVe embedding from its text file (word -> numpy vector).
glove_embedding = load_glove_embedding_from_file(
    file_path="saved_embeddings/glove.6B.50d.txt"
)

### Reduce GloVe Embedding to the Same Vocabulary as the OpenAPI Embedding

In [None]:
def extract_words_and_vectors_from_embedding(embedding: dict):
    """Split an embedding dict into parallel word and vector collections.

    Args:
        embedding: mapping of word -> vector.

    Returns:
        A tuple ``(words, vectors)``: ``words`` is the list of keys in the
        dict's iteration order and ``vectors`` is ``np.array`` of the values
        in the same order, so ``vectors[i]`` belongs to ``words[i]``.
    """
    words = list(embedding.keys())
    vectors = list(embedding.values())
    return words, np.array(vectors)

In [None]:
# Split the API embedding into its word list and the matching array of vectors.
api_embedding_words, api_embedding_vectors = extract_words_and_vectors_from_embedding(
    embedding=api_embedding
)

In [None]:
def reduce_embedding_to_words(embedding: dict, words: list):
    """Restrict an embedding to a given vocabulary.

    Args:
        embedding: mapping of word -> vector.
        words: the words to keep.

    Returns:
        A new dict holding ``embedding[w]`` for every ``w`` in ``words`` that
        is present in ``embedding``, in the order of ``words``. Words missing
        from ``embedding`` are skipped silently.
    """
    # An explicit membership test skips only words that are really missing;
    # unrelated errors are no longer swallowed the way a bare `except` did.
    return {w: embedding[w] for w in words if w in embedding}

In [None]:
# Keep only those GloVe entries whose word also appears in the API embedding.
glove_embedding = reduce_embedding_to_words(
    embedding=glove_embedding,
    words=api_embedding_words,
)

In [None]:
# Compare the vocabulary sizes of both embeddings.
print(f"Size own embedding: {len(api_embedding)}")
print(f"Size glove embedding: {len(glove_embedding)}")

In [None]:
# Split the reduced GloVe embedding into its word list and the matching array of vectors.
glove_embedding_words, glove_embedding_vectors = extract_words_and_vectors_from_embedding(
    embedding=glove_embedding
)

In [None]:
# Pairwise distances between all API vectors: entry [i][j] relates word i to word j.
api_embedding_distance_matrix = get_dm(
    x=api_embedding_vectors,
    y=api_embedding_vectors,
)

In [None]:
# Pairwise distances between all GloVe vectors: entry [i][j] relates word i to word j.
glove_embedding_distance_matrix = get_dm(
    x=glove_embedding_vectors,
    y=glove_embedding_vectors,
)

# Closest words 

In [None]:
def get_nearest_words(word: str, distance_matrix, words: list):
    """Rank every word in ``words`` by its distance to ``word``.

    Args:
        word: the query word.
        distance_matrix: indexable so that ``distance_matrix[i][j]`` is the
            distance between ``words[i]`` and ``words[j]``.
        words: the vocabulary, in the same order as the matrix rows.

    Returns:
        A list of ``[word, other_word, distance]`` entries sorted by ascending
        distance; the entry for ``word`` itself is included. If ``word`` is
        not in ``words``, an error message is printed and that message string
        is returned instead of a list.
    """
    # Position of the first occurrence of the query word, or -1 if absent.
    position = next(
        (idx for idx, candidate in enumerate(words) if candidate == word),
        -1,
    )
    if position == -1:
        msg = f"error: word {word} is not embedded"
        print(msg)
        return msg

    row = distance_matrix[position]
    # One [query, other_word, distance] entry per column of the query's row.
    pairs = [[word, words[idx], distance] for idx, distance in enumerate(row)]

    return sorted(pairs, key=lambda entry: entry[2])

<h2>Check Word</h2>

In [None]:
# Word whose nearest neighbours are inspected in the cells below.
word_to_check = "owner"
# How many entries of each sorted neighbour list to display.
number_to_show = 10

In [None]:
# Rank all words of the OpenAPI embedding by their distance to the inspected word.
nearest_words_api_embedding = get_nearest_words(
    word=word_to_check,
    distance_matrix=api_embedding_distance_matrix,
    words=api_embedding_words,
)

print("nearest words in api embedding are: ")
# Last expression of the cell, so the notebook displays the leading entries.
nearest_words_api_embedding[:number_to_show]

In [None]:
# Rank all words of the reduced GloVe embedding by their distance to the inspected word.
nearest_words_glove_embedding = get_nearest_words(
    word=word_to_check,
    distance_matrix=glove_embedding_distance_matrix,
    words=glove_embedding_words,
)

print("nearest words in glove embedding are: ")
# Last expression of the cell, so the notebook displays the leading entries.
nearest_words_glove_embedding[:number_to_show]

# Show closest words to every word

In [None]:
# Print the closest words to every API word, side by side for both embeddings.

number_closest_words = 5

# Words without a GloVe vector were dropped when the GloVe embedding was
# reduced, so membership is checked up front. Otherwise get_nearest_words
# returns an error string, and slicing that string prints garbage.
glove_vocabulary = set(glove_embedding_words)


def _closest_neighbours(word, distance_matrix, words, count):
    """Return the `count` words nearest to `word`, excluding `word` itself."""
    ranked = get_nearest_words(word, distance_matrix, words)
    # Filter the query word by name rather than dropping position 0: another
    # word at distance 0 may sort ahead of the query word.
    neighbours = [entry[1] for entry in ranked if entry[1] != word]
    return neighbours[:count]


for word in api_embedding_words:
    closest_api_embedding = _closest_neighbours(
        word, api_embedding_distance_matrix, api_embedding_words, number_closest_words
    )

    if word in glove_vocabulary:
        closest_glove_embedding = _closest_neighbours(
            word, glove_embedding_distance_matrix, glove_embedding_words, number_closest_words
        )
    else:
        closest_glove_embedding = "<word not in GloVe vocabulary>"

    print("word: "+word)
    print("closest API embedding:   "+str(closest_api_embedding))
    print("closest GloVe embedding: "+str(closest_glove_embedding))
    print()
    