In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional
from keras.layers import Dropout, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
import datetime
from IPython.core.interactiveshell import InteractiveShell

In [None]:
class Attention(Layer):
    """Attention pooling over the step axis of a 3D tensor.

    Every timestep is scored with a learned vector ``W`` (plus an optional
    per-step bias ``b``), the scores go through ``tanh`` and ``exp``, are
    divided by their per-sample sum, and the input is summed over the step
    axis using those weights.

    Input shape:  3D tensor whose second axis has length ``step_dim``
        (``call`` reshapes the scores to ``(-1, step_dim)``).
    Output shape: ``(input_shape[0], input_shape[-1])``.

    Args:
        step_dim: Length of the step (second) axis of the input.
        W_regularizer: Regularizer identifier/instance for ``W`` (optional).
        b_regularizer: Regularizer identifier/instance for ``b`` (optional).
        W_constraint: Constraint identifier/instance for ``W`` (optional).
        b_constraint: Constraint identifier/instance for ``b`` (optional).
        bias: Whether to add the per-step bias to the scores.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first so that it cannot overwrite the
        # attributes configured below (notably `supports_masking`).
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by `build` once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        assert len(input_shape) == 3

        # One weight per feature: projects each timestep to a scalar score.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias per timestep.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is propagated past this layer."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the step axis.

        Args:
            x: 3D input tensor.
            mask: Optional mask; when given, the attention weights are
                multiplied by it before normalisation.
        """
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten the leading axes, project onto W, then fold the scores back
        # into one row of `step_dim` scores per sample.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Epsilon keeps the division finite if the (masked) weights sum to 0.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], input_shape[-1])``."""
        # Derived from the argument rather than `self.features_dim`, which is
        # still 0 until `build` has run.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
import glob
cleaned_text_path  = "*/cleaned-text/"
# Manuscript folders are named "1" .. "64".
NUM_MANUSCRIPTS = 64

entries = []
filenames = []
for manuscript_number in range(1, NUM_MANUSCRIPTS + 1):
    manuscript_id = str(manuscript_number)
    pattern = cleaned_text_path + manuscript_id + "/*.cleaned.txt"
    # glob returns files in arbitrary, filesystem-dependent order; sorting
    # keeps the row order (and therefore every row index) reproducible.
    for file_name in sorted(glob.glob(pattern)):
        print(file_name)
        filenames.append(file_name)
        # Context manager so each file handle is closed right after reading.
        with open(file_name, encoding='utf-8') as text_file:
            page_text = text_file.read()
        entries.append({
            "content": page_text,
            "label": manuscript_id,
            "filepaths": file_name,
            "manuscriptID": manuscript_id,
        })
# `filenames` is positionally aligned with the rows of `df`.
filenames = np.array(filenames)
df = pd.DataFrame(entries)
df

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Length (in tokens) that every document sequence is padded to.
MAX_SEQUENCE_LENGTH = 500
# Size of each word-embedding vector.
EMBEDDING_DIM = 100

# The backslash in the filter set is written as "\\" so the literal holds the
# same characters as before without relying on the invalid escape "\]".
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                      lower=True, oov_token="UNK")

# Build the vocabulary from every loaded document.
tokenizer.fit_on_texts(df['content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
## Code with attention
model = Sequential()
# Use the configured embedding size instead of repeating the literal 100.
model.add(Embedding(MAX_NB_WORDS,
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH))

model.add(SpatialDropout1D(0.3))
# return_sequences=True keeps one output per timestep for the attention layer.
model.add(Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Attention(MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.5))
# 64 output units: one per manuscript folder ("1" .. "64") loaded above.
model.add(Dense(64, activation='softmax'))
# 'weighted_categorical_crossentropy' is not a built-in Keras loss identifier;
# the standard loss for one-hot targets is 'categorical_crossentropy'.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.000001), metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Turn every document into a list of token ids, then pad each list at the end
# up to MAX_SEQUENCE_LENGTH.
token_id_sequences = tokenizer.texts_to_sequences(df['content'].values)
X = pad_sequences(token_id_sequences, padding="post", maxlen=MAX_SEQUENCE_LENGTH)

# One indicator column per distinct manuscript label.
label_values = df['label'].values
Y = pd.get_dummies(label_values)

In [None]:
from keras.models import Model, load_model

# Feature extractor: same inputs as `model`, but it stops at the second-to-last
# layer (the Dropout placed just before the final Dense classifier).
new_model = Model(model.inputs, model.layers[-2].output)
# No compile() call: this model is only used for predict(), which needs neither
# a loss nor an optimizer, and the previous loss name
# 'weighted_categorical_crossentropy' is not a built-in Keras loss.
new_model.summary()
# One feature vector per document, in the same row order as `X`.
X_feature = new_model.predict(X)

In [None]:
# Number of neighbours retrieved per query.  Deliberately not called `K`:
# that name is the Keras backend alias (`from keras import backend as K`)
# used inside `Attention.call`, and rebinding it to an int breaks the layer
# whenever it is traced again.
TOP_K = 10
start_time = datetime.datetime.now().timestamp()

total_accuracy = 0
Y_pred_list = []

count = 0

# Pages per manuscript, computed once instead of once per query.
pages_per_manuscript = df.groupby('manuscriptID')['filepaths'].count()

# NOTE(review): every query file is also one of the documents behind
# X_feature, so a query's own vector is among the candidates - confirm this
# is intended for the reported accuracy.
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for key, image_path in df['filepaths'].items():
    # Get the predicted feature vector for the given page
    with open(image_path, encoding='utf-8') as query_file:
        content = query_file.read()
    X_Q = tokenizer.texts_to_sequences([content])
    X_Q = pad_sequences(X_Q, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    pred_feature_vec = new_model.predict(X_Q)

    # Cosine similarity between the query and every stored feature vector
    similarity_array = cosine_similarity(pred_feature_vec, X_feature)[0]

    # Positions of the TOP_K most similar documents, best match first
    indices = similarity_array.argsort()[-TOP_K:][::-1]

    true_ID = df['manuscriptID'].loc[key]
    total_pages = pages_per_manuscript.loc[true_ID]
    # argsort yields positions, so index positionally with .iloc
    predicted_arr = df['manuscriptID'].iloc[indices].values

    Y_pred_list.append(predicted_arr[0])

    # Number of correct predictions out of TOP_K
    found = np.count_nonzero(predicted_arr == true_ID)
    # A manuscript with fewer than TOP_K pages can contribute at most
    # `total_pages` hits, so normalise by the smaller of the two.
    total_accuracy += found / min(TOP_K, total_pages)

    count += 1

    if count % 100 == 0:
        print("Done ", count)
        print("Accuracy so far %g %%" % (total_accuracy/count * 100))

end_time = datetime.datetime.now().timestamp()
total_retrieval_time = end_time - start_time  # In seconds
print("Total retrieval time %g seconds" % total_retrieval_time)

# `count` equals the number of rows processed; guard against an empty corpus.
mean_accuracy = total_accuracy / count if count else 0.0
print("\nThe mean accuracy for top %d images is %g %%" % (TOP_K, mean_accuracy*100))

In [None]:
query_text_file  = "*/cleaned-text/2/DSC00009.JPG.txt.cleaned.txt"
# open() does not expand wildcards, so resolve the pattern with glob first
# (a path without wildcards simply matches itself when the file exists).
query_matches = sorted(glob.glob(query_text_file))
if not query_matches:
    raise FileNotFoundError("No file matches %r" % query_text_file)
# Context manager so the file handle is closed after reading.
with open(query_matches[0], encoding='utf-8') as query_file:
    content = query_file.read()

In [None]:
# Encode the single query document with the fitted tokenizer and pad it at the
# end to MAX_SEQUENCE_LENGTH.
query_token_ids = tokenizer.texts_to_sequences([content])
X_Q = pad_sequences(query_token_ids, padding="post", maxlen=MAX_SEQUENCE_LENGTH)

# Show the shape of the one encoded query row.
X_Q[0].shape


In [None]:
# Time one query end to end: feature extraction, similarity scoring, ranking.
start_time = datetime.datetime.now().timestamp()

output = new_model.predict(X_Q)
query_similarities = cosine_similarity(output, X_feature)
similarity_array = query_similarities[0]

# Positions of the 10 highest similarities, best match first.
descending_order = np.argsort(similarity_array)[::-1]
indices = descending_order[:10]

end_time = datetime.datetime.now().timestamp()

# Elapsed wall-clock time in seconds.
total_retrieval_time = end_time - start_time
total_retrieval_time

In [None]:
# File paths of the retrieved documents, best match first.
retrieved_files = filenames[indices]
print(retrieved_files)

# Similarity scores of the 10 best matches, highest first.
top_positions = np.argsort(similarity_array)[::-1][:10]
similarity_array[top_positions]