In [1]:
import numpy as np
from keras.layers.embeddings import Embedding
from keras.layers import LSTM, Input
from keras.models import Model

MAX_WORDS = 50

def GetGlove(glove_file):
    with open(glove_file, 'r') as file:
        words = set()
        word_to_vec, word_to_index, index_to_word = {},{},{}

        for line in file:
            line = line.strip().split()
            words.add(line[0])
            word_to_vec[line[0]] = np.array(line[1:], dtype='float64')
    
    for x,y in enumerate(sorted(words)):
        word_to_index[y] = x
        index_to_word[x] = y
        
    return {'word_to_vec': word_to_vec, 'word_to_index': word_to_index, 'index_to_word': index_to_word}

def QuestionIndices(Q, word_to_vec, word_to_index):
    x_index = np.zeros((len(Q), MAX_WORDS)) 

    for i in range(len(Q)):
        words = Q[i].lower().strip().split()
        j = 0
        for w in words:
            x_index[i,j] = word_to_index[w]
            j += 1
    
    return x_index
    
def EmbeddingLayer(word_to_vec, word_to_index):

    EMBED_DIM = word_to_vec['the'].shape[0]
    VOCAB_SIZE = len(word_to_index)+1

    embed_matrix = np.zeros((VOCAB_SIZE, EMBED_DIM))
    for word, index in word_to_index.items():
        embed_matrix[index] = word_to_vec[word]
    
    embedded_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, trainable = False, input_length=MAX_WORDS)
    embedded_layer.build((None, ))
    embedded_layer.set_weights([embed_matrix])
    
    return embedded_layer

def Encode(word_to_vec, word_to_index):
    input = Input((50, ), dtype='int32')
    
    embedding_layer = EmbeddingLayer(word_to_vec, word_to_index)
    embeddings = embedding_layer(input)
    x = LSTM(128, return_sequences=False)(embeddings)
    
    model = Model(inputs=input, outputs=x)
    
    return model

Using TensorFlow backend.


In [4]:
print('glove')
params = GetGlove('glove.6B.100d.txt')
word_to_vec, word_to_index, index_to_word = params['word_to_vec'],params['word_to_index'],params['index_to_word']
   
print('indices')
ind = QuestionIndices(['Which is the tallest building'],word_to_vec,word_to_index)
print(ind)
   
print('model')
model = Encode(word_to_vec,word_to_index)
print(model.predict(ind))


glove
indices
[[386474. 192973. 357266. 352267.  86371.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.]]
model
[[-0.06840204  0.4617101   0.3132516  -0.38414174 -0.42119378 -0.23517838
   0.05085213  0.38185617  0.40476155 -0.508157    0.25222638 -0.45245972
   0.2276089   0.02648647  0.12201608 -0.05308213 -0.27630755 -0.19566384
   0.3021578   0.04783177  0.1570822   0.25458294 -0.09061718  0.24194878
   0.3956943   0.2668201  -0.02871352  0.0446302  -0.31203675 -0.43226755
  -0.10929769  0.05444723  0.0776777   0.3943209   0.16269517  0.2933461
   0.09048925  0.0637419  -0.5112531  -0.17375949  0.37025145  0.3163457
   0.23845947 -0.24136646  0.2347535  -0.39858133 

In [12]:
class WordEmbeddings:
    """
    Class to load and handle the GloVe Word Embeddinfs
    """

    def __init__(self):
        self.vocabulary = set()
        self.word_to_vec = {}
        self.word_to_index = {}
        self.index_to_word = {}

    def load_glove(self, path):
        """
        Loads a pretrained GloVe model
        Expects a path to a GloVe pretrained word embeddings file
        """

        with open(path, 'r') as file:
            for line in file:
                line = line.strip().split()
                self.vocabulary.add(line[0])
                self.word_to_vec[line[0]] = np.array(line[1:], dtype='float64')
    
            for x,y in enumerate(sorted(self.vocabulary)):
                self.word_to_index[y] = x
                self.index_to_word[x] = y
        
    def in_vocab(self, word):
        """
        Checks if a word is present in the vocabulary
        """
        return (word in self.vocabulary)

    def autocorrect(self, wrong_word):
        """
        Attempts to map a wrongly spelt word to the closest one present in the vocabulary.
        THIS IS NOT COSINE SIMILARITY. THIS IS AUTOCORRECT.
        """
        closest_ratio = 0.0
        closest_word = None
        for word in self.vocabulary:
            if fuzz.ratio(word,wrong_word) > closest_ratio :
                closest_word = word
                closest_ratio = fuzz.ratio(word,wrong_word)
        return closest_word

    def similarity(self, word_1, word_2):
        """
        Returns the cosine similarity of word_1 and word_2
        """
        
        assert (self.in_vocab(word_1) and self.in_vocab(word_2))

        u = self.word_to_vec[word_1]
        v = self.word_to_vec[word_2]

        dot = np.sum(u * v)
        norm_u = np.sqrt(np.sum(u ** 2))
        norm_v = np.sqrt(np.sum(v ** 2))
        cosine_similarity = dot / (norm_u * norm_v)

        return cosine_similarity


In [13]:
glove = WordEmbeddings()

In [14]:
glove.load_glove('glove.6B.100d.txt')

In [15]:
glove.similarity("not", "n\'t")

0.9010410922621748

In [16]:
len(glove.word_to_vec)

400001

In [22]:
glove.autocorrect('Mumbai')

'mumbai'

In [18]:
from fuzzywuzzy import fuzz, StringMatcher