In [2]:
import numpy as np
import pandas as pd
import h5py
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential, Model
from keras.layers import Dropout, Dense, Activation
from keras.layers import LSTM, Bidirectional, Input
from keras.layers import concatenate

Using TensorFlow backend.


In [3]:
import string
import pandas as pd
import numpy as np
!pip install python-levenshtein
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz, StringMatcher

class WordEmbeddings:
    """
    Module to load and handle the GloVe Word Embeddings
    """

    def __init__(self):
        self.vocabulary = set()
        self.word_to_vec = {}
        self.word_to_index = {}
        self.index_to_word = {}
    
    def load_glove(self, path):
        """
        Loads a pretrained GloVe model
        Expects a path to a GloVe pretrained word embeddings file
        """

        with open(path, 'r') as file:
            for line in file:              
                line = line.strip().split()
                self.vocabulary.add(line[0])
                self.word_to_vec[line[0]] = np.array(line[1:], dtype='float64')
    
            for x,y in enumerate(sorted(self.vocabulary)):
                self.word_to_index[y] = x+1
                self.index_to_word[x+1] = y
        
        self.EMBEDDING_DIM = len(self.word_to_vec['the'])
        print(len(self.word_to_vec))
        print(self.EMBEDDING_DIM)
        
    def get_matrix(self):
        embedding_matrix = np.zeros((len(self.word_to_index) + 1, self.EMBEDDING_DIM))
        for word, i in self.word_to_index.items():
            embedding_vector = self.word_to_vec[word]
            if(embedding_vector is not None):
                embedding_matrix[i] = embedding_vector
        print(embedding_matrix)
        return embedding_matrix

    def in_vocab(self, word):
        """
        Checks if a word is present in the vocabulary
        """
        return (word in self.vocabulary)

    def autocorrect(self, wrong_word):
        """
        Attempts to map a wrongly spelt word to the closest one present in the vocabulary.
        THIS IS NOT COSINE SIMILARITY. THIS IS AUTOCORRECT.
        """

        if self.in_vocab(wrong_word):
            return wrong_word

        closest_ratio = 0.0
        closest_word = None
        for word in self.vocabulary:
            if fuzz.ratio(word,wrong_word) > closest_ratio :
                closest_word = word
                closest_ratio = fuzz.ratio(word,wrong_word)
        return closest_word

    def similarity(self, word_1, word_2):
        """
        Returns the cosine similarity of word_1 and word_2
        """
        
        assert (self.in_vocab(word_1) and self.in_vocab(word_2))

        u = self.word_to_vec[word_1]
        v = self.word_to_vec[word_2]

        dot = np.sum(u * v)
        norm_u = np.sqrt(np.sum(u ** 2))
        norm_v = np.sqrt(np.sum(v ** 2))
        cosine_similarity = dot / (norm_u * norm_v)

        return cosine_similarity




In [4]:
glove_embeddings = WordEmbeddings()

In [5]:
path = 'glove.6B.100d.txt'

In [6]:
glove_embeddings.load_glove(path)

400001
100


In [7]:
matrix = glove_embeddings.get_matrix()

[[ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.38472    0.49351    0.49096   ...  0.026263   0.39052    0.52217  ]
 [ 0.22657    0.64651    0.84828   ...  0.54712    0.7697     0.35075  ]
 ...
 [ 0.14335    0.557     -0.68806   ...  0.10501   -0.49575    0.39039  ]
 [-0.036419  -0.63433   -0.26185   ...  0.25043    0.21037    0.75933  ]
 [ 0.32008    0.21479   -0.036466  ...  0.088318   0.11623    0.0020262]]


In [8]:
context_input = Input(shape=(None,),dtype='int32', name='context_input')
x= Embedding(input_dim=400002, output_dim=100, weights=[matrix], trainable=False)(context_input)
hidden_layer = Bidirectional(LSTM(128, return_state=False, return_sequences=True),merge_mode='concat')(x)
drop_1 = Dropout(0.5)(hidden_layer)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:
ques_input = Input(shape=(None, ), dtype='int32', name='context_input')
x= Embedding(input_dim=400002, output_dim=100, weights=[matrix], 
               trainable=False)(context_input)
hidden_layer = Bidirectional(LSTM(128, return_state=False, return_sequences=True),merge_mode='concat')(x)
drop_2 = Dropout(0.5)(hidden_layer)

In [10]:
with h5py.File('context.h5', 'r') as hf:
    context_array = hf['context'][:]
with h5py.File('questions.h5', 'r') as hf:
    question_array = hf['questions'][:]
with h5py.File('begin.h5', 'r') as hf:
    begin_span = hf['begin'][:]
with h5py.File('end.h5', 'r') as hf:
    end_span = hf['end'][:]

In [11]:
max_span_begin = np.amax(begin_span)
max_span_end = np.amax(end_span)
batch = 100
# slice of data to be used as one epoch training on full data is expensive
slce = 100
merge_layer = concatenate([drop_1, drop_2], axis=1)
biLSTM = Bidirectional(LSTM(128, implementation=2), merge_mode='mul')(merge_layer)
drop_3 =  Dropout(0.5)(biLSTM)
softmax_1 = Dense(max_span_begin, activation='softmax')(biLSTM)
softmax_2 = Dense(max_span_end, activation='softmax')(biLSTM)

model = Model(inputs=[context_input, ques_input], outputs=[softmax_1, softmax_2])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    40000200    context_input[0][0]              
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    40000200    context_input[0][0]              
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, None, 256)    234496      embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectio

In [None]:
model_history = model.fit([context_array[:slce], question_array[:slce]],
                        [begin_span[:slce], end_span[:slce]], verbose=2,
                         batch_size=batch, epochs=100)