In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Parsing Embeddings File into Hashmap

In [108]:
'''
Using Stanford's pretrained GloVe embeddings.
The dataset won't be uploaded to GitHub because it is too big.
https://nlp.stanford.edu/projects/glove/
'''

# Build the {word: embedding} lookup table from the GloVe text file.
# Each line is split on whitespace: the first field is used as the key and
# the remaining fields are converted to a float32 vector.
embeddings = {}
with open("datasets/glove.6B/glove.6B.300d.txt", mode="r", encoding="utf-8") as glove_file:
    for fields in map(str.split, glove_file):
        token = fields[0]
        embeddings[token] = np.array(fields[1:], dtype="float32")

### Defining metrics

In [112]:
def cosine_similarity(word1, word2, embeddings):
    '''
    cosine of the angle between the embedding vectors of two words,
    computed as dot(a, b) / (||a|| * ||b||),
    the higher the score, the more the two vectors point in the same direction

    word1, word2: keys looked up in `embeddings`
    embeddings: mapping of word -> embedding vector
    returns 0.0 when the product of the two norms is zero (e.g. an all-zero vector),
    because the angle is undefined there and the division would otherwise yield nan
    a lookup of a word that is missing from `embeddings` propagates the mapping's own error
    '''
    embed1 = embeddings[word1]
    embed2 = embeddings[word2]
    norm_product = np.linalg.norm(embed1) * np.linalg.norm(embed2)
    if norm_product == 0:
        # a zero-length vector has no direction; avoid 0/0 -> nan and the RuntimeWarning
        return 0.0
    return np.dot(embed1, embed2) / norm_product

def chebyshev_distance(word1, word2, embeddings):
    '''
    https://towardsdatascience.com/17-types-of-similarity-and-dissimilarity-measures-used-in-data-science-3eb914d2681
    dissimilarity measure between the embedding vectors of two words,
    the distance is the largest absolute difference between matching coordinates,
    lower is better, and a score of 0 means the two vectors are identical

    word1, word2: keys looked up in `embeddings`
    embeddings: mapping of word -> embedding vector
    '''
    vec_a, vec_b = embeddings[word1], embeddings[word2]
    coordinate_gaps = np.abs(vec_a - vec_b)
    return coordinate_gaps.max()


In [124]:
# Ask the user for the two words to compare.
# input() already returns a str, so no extra conversion is needed.
# The glove.6B vocabulary is uncased, so the words are lower-cased (and stray
# surrounding whitespace removed) to match the keys of the embeddings table.
word1 = input("Input word 1").strip().lower()
word2 = input("Input word 2").strip().lower()

### Computing similarity and dissimilarity scores

In [125]:
# Report both metrics for the chosen word pair, each rounded to two decimals.
# Each message is built (and its metric computed) right before it is printed.
similarity_report = f"({word1}, {word2}), has a cosine similarity score of {cosine_similarity(word1, word2, embeddings):.2f}"
print(similarity_report)

distance_report = f"({word1}, {word2}), has a chebyshev distance of {chebyshev_distance(word1, word2, embeddings):.2f}"
print(distance_report)


(dog, cat), has a cosine similarity score of 0.68
(dog, cat), has a chebyshev distance of 0.93
