In [4]:
import numpy as np
import scipy.spatial

# Read embeddings from file.
def read_embeddings(file_name='/content/glove.6B.100d.txt', expected_dim=100):
    """Load word vectors from a whitespace-separated text file.

    Every usable line holds a word followed by exactly ``expected_dim``
    numeric components. Lines with any other number of fields (blank lines,
    truncated lines, words that themselves contain whitespace) are skipped
    silently, so the result can hold fewer entries than the file has lines.

    Args:
        file_name: Path of the embeddings file. Defaults to the path that
            was previously hard-coded in this function.
        expected_dim: Number of components each vector must have
            (100 by default, matching 'glove.6B.100d.txt').

    Returns:
        dict mapping each word (str) to a 1-D float32 numpy array of
        length ``expected_dim``. If a word occurs on several valid lines,
        the last one wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a line with the right number of fields contains a
            component that cannot be parsed as a float.
    """
    embeddings = {}
    # The context manager closes the file even if parsing raises part-way.
    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Check the field count BEFORE converting to floats: a line with
            # extra fields usually has non-numeric tokens after the first
            # one, and converting it would raise instead of being skipped.
            if len(values) != expected_dim + 1:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Read %s embeddings.' % len(embeddings))
    return embeddings

In [2]:
def print_n_closest(embeddings, vec0, n):
    """Print the ``n`` words whose vectors are closest to ``vec0``.

    Closeness is the cosine distance computed by
    ``scipy.spatial.distance.cosine``; smaller means more similar.

    Args:
        embeddings: dict mapping word (str) to its vector.
        vec0: Query vector, compared against every vector in ``embeddings``.
        n: Number of nearest words to print.

    Returns:
        None. One line per word is written to stdout in the form
        ``word: d.ddd``, ordered by increasing distance.
    """
    # Keep (distance, word) pairs instead of a dict keyed by distance: with a
    # distance-keyed dict, two words at exactly the same distance overwrite
    # each other and one of them silently disappears from the ranking.
    word_distances = [
        (scipy.spatial.distance.cosine(vec1, vec0), word)
        for (word, vec1) in embeddings.items()
    ]
    # Sort on the distance only; the sort is stable, so tied words keep the
    # iteration order of `embeddings`.
    word_distances.sort(key=lambda pair: pair[0])
    for distance, word in word_distances[:n]:
        print(word + ': %6.3f' % distance)

In [5]:
embeddings = read_embeddings()

# Print the three nearest neighbours of each probe word. A word may be absent
# because read_embeddings() skips lines that do not have the expected number
# of fields, so membership is checked before the lookup.
for lookup_word in ('hello', 'precisely', 'dog'):
    print('\nWords closest to ' + lookup_word)
    if lookup_word not in embeddings:
        print(f"Word '{lookup_word}' not found or had incorrect dimensions.")
        continue
    print_n_closest(embeddings, embeddings[lookup_word], 3)

Read 149780 embeddings.

Words closest to hello
hello:  0.000
goodbye:  0.209
hey:  0.283

Words closest to precisely
precisely:  0.000
exactly:  0.147
accurately:  0.293

Words closest to dog
dog:  0.000
cat:  0.120
dogs:  0.166


In [6]:
# Nearest neighbours of 'king' on its own, for comparison with the
# vector-arithmetic query below. The lookup is guarded because a word can be
# missing from `embeddings` (malformed lines are skipped while loading).
lookup_word = 'king'
print('\nWords closest to ' + lookup_word)
if lookup_word in embeddings:
    print_n_closest(embeddings, embeddings[lookup_word], 3)
else:
    print(f"Word '{lookup_word}' not found or had incorrect dimensions.")

# Vector arithmetic: king - man + woman. All three operands must be present
# before the combined vector can be built.
lookup_word = '(king - man + woman)'
print('\nWords closest to ' + lookup_word)
missing_words = [w for w in ('king', 'man', 'woman') if w not in embeddings]
if missing_words:
    print(f"Cannot evaluate {lookup_word}: missing words {missing_words}.")
else:
    vec = embeddings['king'] - embeddings['man'] + embeddings['woman']
    print_n_closest(embeddings, vec, 3)


Words closest to king
king:  0.000
prince:  0.232
queen:  0.249

Words closest to (king - man + woman)
king:  0.145
queen:  0.217
monarch:  0.307


In [7]:
# Nearest neighbours of the two plain words first. Lookups are guarded
# because a word can be missing from `embeddings` (malformed lines are
# skipped while loading).
for lookup_word in ('sweden', 'madrid'):
    print('\nWords closest to ' + lookup_word)
    if lookup_word in embeddings:
        print_n_closest(embeddings, embeddings[lookup_word], 3)
    else:
        print(f"Word '{lookup_word}' not found or had incorrect dimensions.")

# Vector arithmetic: madrid - spain + sweden. All three operands must be
# present before the combined vector can be built.
lookup_word = '(madrid - spain + sweden)'
print('\nWords closest to ' + lookup_word)
missing_words = [w for w in ('madrid', 'spain', 'sweden') if w not in embeddings]
if missing_words:
    print(f"Cannot evaluate {lookup_word}: missing words {missing_words}.")
else:
    vec = embeddings['madrid'] - embeddings['spain'] + embeddings['sweden']
    print_n_closest(embeddings, vec, 3)


Words closest to sweden
sweden:  0.000
denmark:  0.138
norway:  0.193

Words closest to madrid
madrid:  0.000
barcelona:  0.157
valencia:  0.197

Words closest to (madrid - spain + sweden)
stockholm:  0.271
sweden:  0.300
copenhagen:  0.305
