In [None]:
# Analysis and DataFrames
import numpy as np
import pandas as pd

# Tensorflow
import tensorflow as tf

# Visualization
import matplotlib.pyplot as plt

# Misc.
import re
import os
import time
import random

In [None]:
def load_glove_embedding(directory, filename):
  """Load word vectors from a whitespace-separated text file.

  Every usable line is split on whitespace: the first token becomes the
  dictionary key and the remaining tokens are converted to a float32 array.

  Args:
    directory: Folder that contains the embedding file.
    filename: Name of the embedding file inside ``directory``.

  Returns:
    dict mapping each word (str) to a 1-D ``float32`` numpy array.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If a vector component cannot be converted to float32.
  """
  embeddings_index = {}
  # The context manager closes the handle even when parsing a line raises.
  with open(os.path.join(directory, filename), encoding = 'utf8') as f:
    for line in f:
      # A line of at most one character cannot hold a token plus a vector.
      if len(line) <= 1:
        continue
      values = line.split()
      # Whitespace-only lines split to an empty list; skip them rather than
      # failing with IndexError on values[0].
      if not values:
        continue
      word = values[0]
      coefs = np.asarray(values[1:], dtype = 'float32')
      embeddings_index[word] = coefs

  return embeddings_index

In [None]:
import os
import urllib.request

# Remote location of the GloVe archive and the local filename it is saved under.
GLOVE_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'
GLOVE_ZIP = 'pretrained_glove.zip'

# Skip the network fetch when the archive is already on disk, so re-running
# the notebook does not download it again.
if not os.path.exists(GLOVE_ZIP):
  # Download under a temporary name and rename only after the transfer
  # finished; an interrupted download then never leaves a truncated file
  # under the final name.
  partial_path = GLOVE_ZIP + '.part'
  urllib.request.urlretrieve(GLOVE_URL, partial_path)
  os.replace(partial_path, GLOVE_ZIP)

In [None]:
!apt install unzip

!unzip  pretrained_glove.zip

In [None]:
# Create the embedding
embeddings_index = load_glove_embedding('./', 'glove.6B.300d.txt')

# Verify the load by inspecting one entry. The printed labels are built from
# the same word that is looked up, so label and value cannot disagree.
probe_word = 'gradient'
probe_vector = embeddings_index[probe_word]
print('Words found:', len(embeddings_index))
print('Embedding shape for the word "{}" :'.format(probe_word), probe_vector.shape)
print('Embedding type for the word "{}" :'.format(probe_word), type(probe_vector))

In [None]:
# Grab the embedded word
target_emb = embeddings_index['gradient']

# Euclidean norm of the target vector, computed once because it is the same
# for every comparison below
target_emb_norm = np.linalg.norm(target_emb)

# Cosine similarity between the target and every stored vector
similarity = []
for word, word_emb in embeddings_index.items():
  # Only vectors with the target's length can be compared with it
  if len(word_emb) != len(target_emb):
    continue
  norm_product = np.linalg.norm(word_emb) * target_emb_norm
  # Cosine similarity is undefined for a zero-length vector: the division
  # would produce NaN, and NaN values make the sort order below unreliable
  if norm_product == 0:
    continue
  cos_sim = np.dot(target_emb, word_emb) / norm_product
  similarity.append((word, cos_sim))

# Display the 10 highest-scoring (word, similarity) pairs; the target word is
# itself a key of embeddings_index, so it is part of the ranking
sorted(similarity, key = lambda x: x[1], reverse=True)[:10]

In [None]:
# Identify the words that we want
words_list = ['man', 'woman', 'king', 'queen', 'emperor', 'empress', 'earl', 'countess']

# Check vocabulary membership up front so the error names every missing word
# at once instead of stopping at the first failed lookup
missing_words = [word for word in words_list if word not in embeddings_index]
if missing_words:
  raise KeyError('Word in embedding might not be found: {}'.format(missing_words))

# Pull the embedding of each word, in the same order as words_list
ex_emb = [embeddings_index[word] for word in words_list]

In [None]:
# Factor the selected word vectors with a singular value decomposition
svd_factors = np.linalg.svd(ex_emb)
U, S, Vt = svd_factors

# Report the shape of U, then the singular values
print(U.shape)
print(S)

In [None]:
import matplotlib.pyplot as plt

# Place every word at its coordinates in the first two columns of U
fig, ax = plt.subplots()
for i, word in enumerate(words_list):
  # One scatter call per word, as a separate plotted series
  ax.scatter(U[i,0], U[i,1])
  ax.text(U[i,0], U[i,1], word)

# Title and axis labels let the figure be read without the surrounding code
ax.set_title('Word embeddings along the first two columns of U')
ax.set_xlabel('U[:, 0]')
ax.set_ylabel('U[:, 1]')
plt.show()