# Inspect Embedding and Compare to GloVe


In [27]:
#imports 
import json
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix as get_dm

In [28]:
# functions to load embeddings from file

def load_embedding_from_json(file_path:str):
    """Read a word embedding stored as JSON and return it as a dict of arrays.

    The file is parsed with ``json.load`` and must contain an object whose
    values are the vectors; each vector is converted to a NumPy array.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_embedding = json.load(handle)
    # JSON only knows lists, so turn every vector into an np array
    return {token: np.array(values) for token, values in raw_embedding.items()}

def load_glove_embedding_from_file(file_path):
    """Load a GloVe text file into a dict mapping each word to its vector.

    Every line of the file is read as one row: the first space-separated
    field becomes the word, the remaining fields its vector components.

    Args:
        file_path: path to the space-separated embedding file.

    Returns:
        dict mapping each word to a 1-D NumPy array holding its vector.
        If a word occurs more than once, the last occurrence wins.
    """
    df = pd.read_csv(
        file_path,
        sep=" ",
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
        header=None,
        index_col=0,
        # With the default NA handling pandas converts words such as "null",
        # "nan" or "NA" into a NaN index label, so they would silently
        # disappear from the vocabulary.
        keep_default_na=False,
    )
    # Pair each index label with its row directly; this avoids transposing
    # the whole frame just to iterate over rows.
    return dict(zip(df.index, df.to_numpy()))


In [29]:
# Load the OpenAPI embedding from its JSON file; the loader returns a dict
# whose values are the word vectors as np arrays.
api_embedding = load_embedding_from_json("saved_embeddings/open_api_embedding_5d_314_words.json")

In [30]:
# Load the GloVe embedding from its text file as a dict of word -> vector.
glove_embedding = load_glove_embedding_from_file("saved_embeddings/glove.6B.50d.txt")

### Reduce GloVe Embedding to the Same Vocabulary as the OpenAPI Embedding

In [31]:
def extract_words_and_vectors_from_embedding(embedding: dict):
    """Split an embedding dict into a word list and an array of its vectors.

    Returns a tuple ``(words, vectors)``. Both follow the dict's iteration
    order, so ``vectors[i]`` is the vector stored under ``words[i]``.
    """
    tokens = list(embedding.keys())
    stacked = np.array(list(embedding.values()))
    return tokens, stacked

In [32]:
# Split the API embedding into its word list and the matching array of vectors.
api_embedding_words, api_embedding_vectors = extract_words_and_vectors_from_embedding(api_embedding)

In [33]:
def reduce_embedding_to_words(embedding: dict, words: list):
    """Return the part of ``embedding`` restricted to the given words.

    Args:
        embedding: dict mapping word -> vector.
        words: words to keep; words that are not in ``embedding`` are skipped.

    Returns:
        A new dict containing ``word -> embedding[word]`` for every word of
        ``words`` found in ``embedding``, in the order of ``words``. The
        vectors are the same objects as in ``embedding`` (not copied).
    """
    # Explicit membership test instead of a bare ``except: pass``: only
    # missing words are skipped, while real errors (e.g. passing something
    # that is not a dict) are no longer silently swallowed.
    return {w: embedding[w] for w in words if w in embedding}

In [34]:
# Reduce the GloVe embedding to the words of the API embedding. This rebinds
# glove_embedding to the reduced dict; API words that GloVe does not contain
# are skipped.
glove_embedding = reduce_embedding_to_words(glove_embedding, api_embedding_words)

In [35]:
# Report how many words each embedding holds after the reduction.
print(f"Size own embedding: {len(api_embedding)}")
print(f"Size glove embedding: {len(glove_embedding)}")

Size own embedding: 314
Size glove embedding: 311


In [36]:
# Split the reduced GloVe embedding into its word list and the matching array of vectors.
glove_embedding_words, glove_embedding_vectors = extract_words_and_vectors_from_embedding(glove_embedding)

In [37]:
# Pairwise distances between all API vectors (scipy's distance_matrix with its
# default settings); row/column i belongs to api_embedding_words[i].
api_embedding_distance_matrix = get_dm(api_embedding_vectors,api_embedding_vectors)

In [38]:
# Pairwise distances between all reduced GloVe vectors (scipy's distance_matrix
# with its default settings); row/column i belongs to glove_embedding_words[i].
glove_embedding_distance_matrix = get_dm(glove_embedding_vectors, glove_embedding_vectors)

# Closest words 

In [39]:
def get_nearest_words(word:str, distance_matrix, words: list):
    """List every word together with its distance to ``word``, closest first.

    ``distance_matrix[i][j]`` is taken as the distance between ``words[i]``
    and ``words[j]``.

    Returns a list of ``[word, other_word, distance]`` entries sorted by
    increasing distance. If ``word`` is not contained in ``words``, an error
    message is printed and that message string is returned instead of a list.
    """
    # position of the first occurrence of the query word, -1 if it is absent
    row_index = next((pos for pos, candidate in enumerate(words) if candidate == word), -1)
    if row_index < 0:
        msg = "error: word "+str(word)+" is not embedded"
        print(msg)
        return msg

    # one [word, other_word, distance] entry per column of the query word's row
    row = distance_matrix[row_index]
    pairs = [[word, words[col], row[col]] for col, _ in enumerate(row)]
    pairs.sort(key=lambda entry: entry[2])
    return pairs

<h2>Check Word </h2>

In [40]:
# Word whose nearest neighbours are looked up in both embeddings below.
word_to_check = "owner"
# Number of entries of each sorted neighbour list to display.
number_to_show = 10

In [41]:
# Nearest words in the OpenAPI embedding: all words paired with their distance
# to word_to_check, sorted by increasing distance.
nearest_words_api_embedding = get_nearest_words(word_to_check, api_embedding_distance_matrix, api_embedding_words)

print("nearest words in api embedding are: ")
# Display the first number_to_show entries as the cell output.
nearest_words_api_embedding[:number_to_show]

nearest words in api embedding are: 


[['owner', 'owner', 0.0],
 ['owner', 'file', 1.1714972951960434],
 ['owner', 'event', 1.4067387042686557],
 ['owner', 'settings', 1.4684077833552351],
 ['owner', 'events', 1.4998481322402601],
 ['owner', 'default', 1.5868387233692043],
 ['owner', 'user', 1.657490680034519],
 ['owner', 'attributes', 1.7983961547964487],
 ['owner', 'links', 1.8745989960425573],
 ['owner', 'self', 1.9856773481001821]]

In [42]:
# Nearest words in the reduced GloVe embedding: all words paired with their
# distance to word_to_check, sorted by increasing distance.
nearest_words_glove_embedding = get_nearest_words(word_to_check, glove_embedding_distance_matrix, glove_embedding_words)

print("nearest words in glove embedding are: ")
# Display the first number_to_show entries as the cell output.
nearest_words_glove_embedding[:number_to_show]

nearest words in glove embedding are: 


[['owner', 'owner', 0.0],
 ['owner', 'name', 4.256417422285671],
 ['owner', 'parent', 4.347656112098677],
 ['owner', 'client', 4.378706726525105],
 ['owner', 'instance', 4.475563292100783],
 ['owner', 'managed', 4.487743598502036],
 ['owner', 'brand', 4.53856056674856],
 ['owner', 'exception', 4.555435206668623],
 ['owner', 'has', 4.645052051693285],
 ['owner', 'merchant', 4.684459211227589]]

# Show closest words to every word

In [43]:
# Print the closest words to every API word in both embeddings.

number_closest_words = 5

for word in api_embedding_words:
    nearest_api = get_nearest_words(word, api_embedding_distance_matrix, api_embedding_words)
    # get_nearest_words returns an error string (and prints it) when the word
    # is not embedded. Slicing that string would yield a meaningless fragment,
    # so such words get an empty neighbour list instead.
    if isinstance(nearest_api, str):
        closest_api_embedding = []
    else:
        # skip entry 0: it is the word itself at distance 0
        closest_api_embedding = [pair[1] for pair in nearest_api[1:number_closest_words + 1]]

    # Some API words are missing from the reduced GloVe vocabulary.
    nearest_glove = get_nearest_words(word, glove_embedding_distance_matrix, glove_embedding_words)
    if isinstance(nearest_glove, str):
        closest_glove_embedding = []
    else:
        closest_glove_embedding = [pair[1] for pair in nearest_glove[1:number_closest_words + 1]]

    print("word: "+word)
    print("closest API embedding:   "+str(closest_api_embedding))
    print("closest GloVe embedding: "+str(closest_glove_embedding))
    print()
    

word: action
closest API embedding:   ['client', 'request', 'policy', 'account', 'from']
closest GloVe embedding: ['response', 'result', 'set', 'to', 'charge']

word: account
closest API embedding:   ['client', 'email', 'action', 'message', 'errors']
closest GloVe embedding: ['accounts', 'instance', 'value', 'expense', 'claims']

word: id
closest API embedding:   ['name', 'group', 'type', 'code', 'error']
closest GloVe embedding: ['tags', 'card', 'info', 'addresses', 'url']

word: request
closest API embedding:   ['client', 'action', 'update', 'settings', 'account']
closest GloVe embedding: ['charge', 'to', 'instance', 'allowed', 'exception']

word: user
closest API embedding:   ['created', 'default', 'size', 'parent', 'url']
closest GloVe embedding: ['users', 'web', 'addresses', 'available', 'client']

word: resource
closest API embedding:   ['version', 'api', 'properties', 'providers', 'subscription']
closest GloVe embedding: ['resources', 'source', 'availability', 'fields', 'reposit