In [262]:
import os
import numpy as np

# Dimensionality of each word vector; used below to size the per-sentence
# embedding matrices, so it must match the GloVe file that gets loaded
# (`glove.6B.100d.txt`).
embedding_dim = 100

# The two example sentences whose similarity is computed at the end of the notebook.
sen_1 = 'Obama speaks to the media in illinois'
sen_2 = 'the president speaks the press in chicago'

Head to `https://nlp.stanford.edu/projects/glove/` (where you can learn more about the GloVe algorithm) and download the pre-computed 
embeddings trained on the 2014 English Wikipedia. It's an 822MB zip file named `glove.6B.zip`, containing 100-dimensional embedding vectors for 
400,000 words (or non-word tokens). Unzip it.

### Pre-process the embeddings


Let's parse the un-zipped file (it's a `txt` file) to build an index mapping words (as strings) to their vector representation (as number 
vectors).

In [263]:

# Directory holding the un-zipped GloVe files. Set the GLOVE_DIR environment
# variable to point somewhere else; the original location stays the default.
glove_dir = os.environ.get('GLOVE_DIR', 'G:/Task/')

# Maps each token (str) to its embedding vector (1-D float32 numpy array).
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Each line is: <token> <coef_1> <coef_2> ... <coef_n>
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

This section introduces Word Rotator's Distance (WRD), a text-similarity algorithm that uses the Wasserstein distance (also known as Earth Mover's Distance or bulldozer distance) to directly compare two sets of word vectors of different, variable lengths.

In [264]:
import time
from scipy.optimize import linprog

def EMD(p, q, D):
    """Wasserstein distance (earth mover's distance) between two discrete distributions.

    Solves the optimal-transport linear program

        minimise   sum_ij T[i, j] * D[i, j]
        subject to sum_j T[i, j] = p[i],  sum_i T[i, j] = q[j],  T >= 0

    Parameters
    ----------
    p : 1-D array-like of length n
        Mass of each source point.
    q : 1-D array-like of length m
        Mass of each target point. `p` and `q` are expected to carry the same
        total mass; otherwise the program is not the transport problem above.
    D : array-like of shape (n, m)
        Cost of moving one unit of mass from source i to target j.

    Returns
    -------
    float
        The minimal total transport cost.

    Raises
    ------
    ValueError
        If the shape of `D` is not (len(p), len(q)).
    RuntimeError
        If the linear-programming solver does not find an optimal solution.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    D = np.asarray(D, dtype=float)
    n, m = len(p), len(q)
    if D.shape != (n, m):
        # A transposed or mis-sized D would otherwise yield wrong constraints
        # without any error.
        raise ValueError(
            "D must have shape (len(p), len(q)) = {}, got {}".format((n, m), D.shape)
        )

    # The transport plan T is flattened row-major, so T[i, j] lives at i * m + j.
    # Row-sum constraints: row i of this matrix selects T[i, :].
    row_sums = np.kron(np.eye(n), np.ones((1, m)))
    # Column-sum constraints: row j of this matrix selects T[:, j].
    col_sums = np.kron(np.ones((1, n)), np.eye(m))
    A_eq = np.vstack([row_sums, col_sums])
    b_eq = np.concatenate([p, q])

    # With equal total mass the last equality is implied by the others, so it
    # is dropped to keep the constraint matrix full rank.
    result = linprog(D.reshape(-1), A_eq=A_eq[:-1], b_eq=b_eq[:-1])
    if not result.success:
        raise RuntimeError(
            "EMD linear program failed (status {}): {}".format(result.status, result.message)
        )
    return result.fun

def WRD(x, y): # word rotator distance
    """Word rotator similarity (1 - word rotator distance) of two sets of vectors.

    Each vector is split into its norm and its direction. The normalised norms
    are used as probability masses, and the cosine distance between directions
    is used as the transport cost; the distance is the resulting EMD.

    Parameters
    ----------
    x : array-like of shape (n, d)
        Word vectors of the first text, one row per word.
    y : array-like of shape (m, d)
        Word vectors of the second text, one row per word.

    Returns
    -------
    float
        1 - WRD(x, y); larger values mean more similar texts.

    Raises
    ------
    ValueError
        If either input contains no non-zero vector.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Determine the norm and direction of the word vectors
    x_norm = (x**2).sum(axis=1, keepdims=True)**0.5
    y_norm = (y**2).sum(axis=1, keepdims=True)**0.5

    # A zero vector has no direction (0/0 would poison the cost matrix with
    # NaN) and carries zero mass, so dropping it leaves the distance unchanged.
    # NaN norms compare unequal to zero, so invalid rows are not hidden here.
    x_keep = x_norm[:, 0] != 0
    y_keep = y_norm[:, 0] != 0
    if not x_keep.any() or not y_keep.any():
        raise ValueError("WRD needs at least one non-zero vector in each input")
    x, x_norm = x[x_keep], x_norm[x_keep]
    y, y_norm = y[y_keep], y_norm[y_keep]

    # Masses: each word's share of the total norm.
    p = x_norm[:, 0] / x_norm.sum()
    q = y_norm[:, 0] / y_norm.sum()
    # Cost: cosine distance between every pair of unit-length directions.
    D = 1 - np.dot(x / x_norm, (y / y_norm).T)
    wrd = EMD(p, q, D)

    return 1 - wrd # word rotator similarity



Now let's build an embedding matrix for each sentence, where each row holds the `embedding_dim`-dimensional GloVe vector of one word of that sentence, in order.

In [265]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

def _embed_sentence(sentence):
    """Tokenise `sentence` and stack the GloVe vectors of its known words.

    Tokens missing from `embeddings_index` are skipped (with a warning) rather
    than being written into the matrix as NaN rows.

    Returns
    -------
    tokens : list of str
        All tokens of the sentence, including out-of-vocabulary ones.
    matrix : numpy.ndarray of shape (num_known_tokens, embedding_dim), float64
        One row per in-vocabulary token, in sentence order.

    Raises
    ------
    ValueError
        If no token has an embedding, or the stored vectors do not have
        `embedding_dim` components.
    """
    import warnings

    tokens = text_to_word_sequence(sentence)
    missing = [tok for tok in tokens if tok not in embeddings_index]
    if missing:
        warnings.warn("No embedding for token(s) {}; skipping them".format(missing))

    known = [embeddings_index[tok] for tok in tokens if tok in embeddings_index]
    if not known:
        raise ValueError("No token of {!r} has an embedding".format(sentence))

    # float64 matches the dtype of the np.zeros buffer used previously.
    matrix = np.asarray(known, dtype=np.float64)
    if matrix.shape[1] != embedding_dim:
        raise ValueError(
            "Expected {}-dimensional embeddings, got {}".format(embedding_dim, matrix.shape[1])
        )
    return tokens, matrix


vectors_sen1, embedding_vector_sen1 = _embed_sentence(sen_1)
vectors_sen2, embedding_vector_sen2 = _embed_sentence(sen_2)

similarity = WRD(embedding_vector_sen1, embedding_vector_sen2)


In [266]:
# Report the word rotator similarity (1 - WRD) of the two example sentences;
# larger values mean the sentences are more similar.
print('word_rotator_similarity is', similarity)

word_rotator_similarity is 0.8120568375979997
