# Word Embeddings from word2vec and GloVe

### GloVe WORD EMBEDDINGS
The pre-trained GloVe 6B word embeddings are downloaded from http://nlp.stanford.edu/data/glove.6B.zip. The archive contains one text file per vector size (50, 100, 200 and 300 dimensions).

In [1]:
# Download the zip file into the current working directory.
# -c resumes a partial download and does nothing when the file is already
# complete, so re-running this cell does not fetch the 822 MB archive again
# or leave a duplicate copy (glove.6B.zip.1) behind.
!wget -c http://nlp.stanford.edu/data/glove.6B.zip

--2021-11-08 18:34:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-11-08 18:34:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-11-08 18:34:18--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [2]:
# Extract the downloaded archive into ./data.
# -p: create the folder only if it is missing, so re-running the cell does not fail.
!mkdir -p data
# -n: never overwrite files that were already extracted, so a re-run neither
# prompts for confirmation nor repeats the work.
# NOTE: the absolute path assumes the notebook's working directory is /content
# (where the download cell saved the archive).
!unzip -n /content/glove.6B.zip -d data

Archive:  /content/glove.6B.zip
  inflating: data/glove.6B.50d.txt   
  inflating: data/glove.6B.100d.txt  
  inflating: data/glove.6B.200d.txt  
  inflating: data/glove.6B.300d.txt  


In [3]:
# Libraries
import numpy as np
import scipy.spatial

In [7]:
# Read embeddings from file
def read_embeddings(file_path: str) -> dict:
  """Read word embeddings from a whitespace-separated text file.

  Each non-blank line is expected to hold a word followed by the components
  of its vector, e.g. ``cat 0.12 -0.40 ...``. Blank lines are skipped.

  Args:
    file_path: The full path to the file to be read (decoded as UTF-8).

  Returns:
    A dict mapping each word to its vector as a float32 NumPy array.
    If a word occurs on several lines, the last occurrence wins.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If a vector component cannot be parsed as a float.
  """
  embeddings = {}
  # The context manager closes the file even if a line fails to parse.
  with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
      values = line.split()  # Split on whitespace.
      if not values:
        # Blank line (e.g. a trailing newline at the end of the file).
        continue
      word = values[0]  # First token is the word itself.
      # Remaining tokens are the vector components.
      embeddings[word] = np.asarray(values[1:], dtype='float32')
  print(f"Read {len(embeddings)} embeddings.")
  return embeddings

In [18]:
def print_n_closest(embeddings, vec0, n):
  """Print the closest (in vector space) n words to a given vector (vec0).

  Closeness is measured with cosine distance (scipy.spatial.distance.cosine),
  so smaller values mean more similar directions.

  Args:
    embeddings: Mapping from word to its embedding vector.
    vec0: The query vector; must have the same length as the embedding vectors.
    n: Number of words to print.

  Prints one line per word in the form ``word: distance``, nearest first.
  """
  # Keep (distance, word) pairs in a list rather than a dict keyed by distance:
  # with a dict, two words at exactly the same distance would overwrite each
  # other and one of them would silently disappear from the results.
  word_distances = [
      (scipy.spatial.distance.cosine(vec1, vec0), word)
      for (word, vec1) in embeddings.items()
  ]
  # Sort on the distance only. The sort is stable, so words that tie keep the
  # order in which they appear in `embeddings`.
  word_distances.sort(key=lambda pair: pair[0])
  # Print words sorted by distance.
  for (distance, word) in word_distances[:n]:
    print(f"{word}: {distance:6.3f}")

In [16]:
# Read in the embeddings as a dict mapping word -> vector.
# The file name indicates the 100-dimensional GloVe 6B set; the other sizes
# extracted into data/ can be substituted by changing the path.
embeddings = read_embeddings('/content/data/glove.6B.100d.txt')

Read 400000 embeddings.


In [19]:
# Show the three nearest neighbours for a few everyday words.
# (Each word's own vector is part of the search space, so it is listed too.)
for lookup_word in ('hello', 'precisely', 'dog'):
  print('\nWords closest to ' + lookup_word)
  print_n_closest(embeddings, embeddings[lookup_word], 3)


Words closest to hello
hello:  0.000
goodbye:  0.209
hey:  0.283

Words closest to precisely
precisely:  0.000
exactly:  0.147
accurately:  0.293

Words closest to dog
dog:  0.000
cat:  0.120
dogs:  0.166


In [20]:
# Baseline: the nearest neighbours of 'king' on its own.
lookup_word = 'king'
print('\nWords closest to ' + lookup_word)
print_n_closest(embeddings, embeddings[lookup_word], 3)

# king - man + woman = ?
# Word analogy by vector arithmetic: take the vector for 'king', subtract
# 'man' and add 'woman', then look up which words lie nearest to the result.
# print_n_closest searches every word, including the ones used to build the
# query vector, so those words can appear in the output as well.
lookup_word = '(king - man + woman)'
print('\nWords closest to ' + lookup_word)
vec = embeddings['king'] - embeddings['man'] + embeddings['woman']
print_n_closest(embeddings, vec, 3)


Words closest to king
king:  0.000
prince:  0.232
queen:  0.249

Words closest to (king - man + woman)
king:  0.145
queen:  0.217
monarch:  0.307


In [22]:
# Neighbours of the country and of its capital, for reference.
for lookup_word in ('uganda', 'kampala'):
  print('\nWords closest to ' + lookup_word)
  print_n_closest(embeddings, embeddings[lookup_word], 3)

# Capital-of analogy: madrid is to spain as ? is to uganda.
lookup_word = '(madrid - spain + uganda)'
print('\nWords closest to ' + lookup_word)
vec = embeddings['madrid'] - embeddings['spain'] + embeddings['uganda']
print_n_closest(embeddings, vec, 3)


Words closest to uganda
uganda:  0.000
tanzania:  0.141
rwanda:  0.170

Words closest to kampala
kampala:  0.000
nairobi:  0.235
kigali:  0.302

Words closest to (madrid - spain + uganda)
kampala:  0.235
uganda:  0.291
ugandan:  0.330
