In [1]:
import torch
import torchtext.vocab as vocab



In [2]:
# Load the pre-trained GloVe '6B' vectors with 100 dimensions per word.
# The recorded output below shows an 862MB archive being fetched into .vector_cache/.
glove = vocab.GloVe(name='6B', dim=100)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.42MB/s]                           
100%|█████████▉| 399999/400000 [00:27<00:00, 14600.46it/s]


In [3]:
# Shape of the embedding matrix: (number of words, embedding dimension)
glove.vectors.shape

torch.Size([400000, 100])

In [4]:
# getting the embedding vector
def get_embedding_vector(word):
  """Look up the GloVe embedding for `word`.

  Args:
    word: Token to look up; it must be a key of `glove.stoi`.

  Returns:
    The row of `glove.vectors` at the index that `glove.stoi` assigns to `word`.

  Raises:
    KeyError: presumably, when `word` is not in `glove.stoi`
      (assumes `stoi` behaves like a plain dict -- TODO confirm).
  """
  return glove.vectors[glove.stoi[word]]

In [6]:
# Sanity check: a single word maps to one vector whose length equals the embedding dimension
get_embedding_vector('chess').shape

torch.Size([100])

In [10]:
def get_closest_words_from_word(word, max_n=5):
  """Return the vocabulary words whose embeddings lie nearest to that of `word`.

  Args:
    word: Query token; it is resolved through `get_embedding_vector`.
    max_n: Maximum number of neighbours to return.

  Returns:
    A list of `(word, distance)` tuples sorted by ascending distance. The query
    word itself is part of the vocabulary, so it is included in the result.
  """
  # Delegate to the embedding-based search instead of duplicating its loop,
  # so both entry points always rank neighbours the same way.
  return get_closest_words_from_emb(get_embedding_vector(word), max_n=max_n)

In [12]:
def get_closest_words_from_emb(word_emb, max_n=5):
  """Return the vocabulary words whose embeddings lie nearest to `word_emb`.

  Args:
    word_emb: Query vector; it must broadcast against the rows of `glove.vectors`.
    max_n: Maximum number of neighbours to return.

  Returns:
    A list of `(word, distance)` tuples sorted by ascending Euclidean (L2)
    distance, where each distance is a Python float.
  """
  # One broadcasted subtraction and a row-wise L2 norm replace a Python-level
  # loop that issued a separate torch.dist call for every vocabulary entry.
  # Assumes row i of glove.vectors belongs to glove.itos[i] -- TODO confirm.
  distances = torch.norm(glove.vectors - word_emb, p=2, dim=1)
  closest = torch.argsort(distances)[:max_n]

  return [(glove.itos[i], distances[i].item()) for i in closest.tolist()]

In [11]:
# Nearest neighbours of 'chess'; the query word itself comes first at distance 0
get_closest_words_from_word('chess')

[('chess', 0.0),
 ('backgammon', 4.379469394683838),
 ('grandmasters', 4.56368350982666),
 ('grandmaster', 4.613785743713379),
 ('scrabble', 4.677640438079834)]

In [13]:
# Word analogies

def get_word_analogy(word1, word2, word3, max_n=5):
  """Solve the analogy `word1 - word2 + word3` in embedding space.

  Args:
    word1: Token whose embedding is the starting point.
    word2: Token whose embedding is subtracted.
    word3: Token whose embedding is added.
    max_n: Maximum number of candidate words to return.

  Returns:
    The result of `get_closest_words_from_emb` for the combined vector. The
    input words are not filtered out, so they may appear among the candidates.
  """
  # w1 - w2 + w3 --> w4
  word4_emb = (
    get_embedding_vector(word1)
    - get_embedding_vector(word2)
    + get_embedding_vector(word3)
  )

  # Forward max_n: it was previously accepted but ignored, so callers always
  # received the search function's default number of results.
  return get_closest_words_from_emb(word4_emb, max_n=max_n)

In [15]:
# king - man + woman: the input word 'king' is not filtered out, so it ranks ahead of 'queen'
get_word_analogy('king', 'man', 'woman')

[('king', 3.364067792892456),
 ('queen', 4.081079006195068),
 ('monarch', 4.642907619476318),
 ('throne', 4.905500411987305),
 ('elizabeth', 4.921558856964111)]