In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras 
import tensorflow as tf 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional
from keras.layers import Dropout, GRU, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell

In [None]:
# Glob-pattern prefix for the cleaned text files; the loading cells append a
# numeric sub-folder and "/*.cleaned.txt" to it.
cleaned_text_path  = "*/cleaned-text/"
# Number of output classes: width of the softmax layers and of the one-hot
# label vectors built in this notebook.
OUTPUT_LENGTH=64 # Size of Author List

In [None]:
import glob

# Load every cleaned text file, one numbered sub-folder per class.
# Labels are the 1-based folder numbers stored as strings.
entries = []
for i in range(OUTPUT_LENGTH):
    for file_name in glob.glob(cleaned_text_path+str(i+1)+"/*.cleaned.txt"):
        # Context manager so each file handle is closed instead of leaked.
        with open(file_name, encoding='utf-8') as text_file:
            content = text_file.read()
        entries.append({"content": content, "label": str(i+1), "file_path": file_name})
    print(i)

In [None]:
len(entries)

In [None]:
df = pd.DataFrame(entries)
df.head(1)

In [None]:
df.shape

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of tokens per document; sequences are padded to this length.
MAX_SEQUENCE_LENGTH = 500
# Intended size of each word-embedding vector.
EMBEDDING_DIM = 100

# Raw string: the filter list contains a literal backslash, and "\]" inside a
# normal string literal is an invalid escape sequence (a warning today, an
# error in future Python versions). The characters filtered are unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True, oov_token="UNK")

tokenizer.fit_on_texts(df['content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
class Attention(Layer):
    """Attention pooling over the time axis of a rank-3 input.

    Every timestep is scored by a dot product with a learned vector ``W``
    (plus an optional per-timestep bias ``b``). The scores go through
    ``tanh`` and ``exp`` and are normalised over the time axis; the layer
    returns the weighted sum of the timesteps, shaped ``(batch, features)``.
    """
    
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the layer configuration.

        Args:
            step_dim: Number of timesteps; ``call`` reshapes the scores to
                ``(-1, step_dim)``.
            W_regularizer, b_regularizer: Identifiers resolved through
                ``regularizers.get`` for ``W`` and ``b``.
            W_constraint, b_constraint: Identifiers resolved through
                ``constraints.get`` for ``W`` and ``b``.
            bias: Whether to create and add the bias ``b``.
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is set from the input shape in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per timestep)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape =(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape =(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch*steps, features), project each timestep onto W,
        # then fold the scores back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weights of masked timesteps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the time axis; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast across the features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features)``: the time axis is summed away."""
        return input_shape[0],  self.features_dim

In [None]:
#### Textual model code with attention
textual_model = Sequential()
textual_model.add(Embedding(MAX_NB_WORDS, 
                    100,  
                    input_length=MAX_SEQUENCE_LENGTH, input_shape=(500, )))

textual_model.add(SpatialDropout1D(0.3))
textual_model.add(Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
textual_model.add(Attention(MAX_SEQUENCE_LENGTH))
textual_model.add(Dropout(0.5))
textual_model.add(BatchNormalization())

textual_model.add(Dense(OUTPUT_LENGTH, activation='softmax'))
textual_model.compile(loss='weighted_categorical_crossentropy', optimizer=Adam(learning_rate=0.000001), metrics=['accuracy'])
textual_model.load_weights("manuscript_lstm.hdf5")
print(textual_model.summary())

In [None]:
# Model is otherwise imported only further down the notebook; import it here
# so this cell runs on a fresh kernel.
from keras.models import Model

# Text feature extractor: layers[-4] is the Attention layer (the last three
# layers are Dropout, BatchNormalization and the softmax Dense).
textual_model_int = Model(inputs=textual_model.input, outputs=textual_model.layers[-4].output)

In [None]:
# VGG Stuff
import keras, os
from keras import Input
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
import numpy as np
from keras.applications.vgg19 import VGG19
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.utils import class_weight
from keras.optimizers import Adam
from keras.models import Model
from sklearn.metrics import confusion_matrix

In [None]:
# Generate the Visual model with all layers (with top)
def build_model_local():
    """Build the visual classifier: a VGG19 backbone plus a dense head.

    The backbone uses ImageNet weights without its original top, and accepts
    images of any spatial size with 3 channels. Global average pooling feeds
    three sigmoid dense layers (1024, 1024, 512) and a final softmax over
    ``OUTPUT_LENGTH`` classes.

    Returns:
        The uncompiled Keras ``Model`` mapping the backbone input to the
        class probabilities.
    """
    backbone = VGG19(weights='imagenet',
                     include_top=False, input_shape=(None, None, 3))
    features = GlobalAveragePooling2D()(backbone.output)
    # Dense head, created in the same order as before so stored weights
    # still line up with the layers.
    for width in (1024, 1024, 512):
        features = Dense(width, activation='sigmoid')(features)
    predictions = Dense(OUTPUT_LENGTH, activation='softmax')(features)

    return Model(inputs=backbone.input, outputs=predictions)


# Build the visual classifier, compile it and restore its weights from disk.
visual_model = build_model_local()
# NOTE(review): 'weighted_categorical_crossentropy' is not defined anywhere in
# the visible notebook -- confirm it resolves to a registered loss.
visual_model.compile(optimizer=Adam(), loss='weighted_categorical_crossentropy', metrics=['accuracy'])
visual_model.load_weights("manuscript_visual.h5")

In [None]:
# Image feature extractor: layers[-2] is the 512-unit dense layer that sits
# directly before the softmax output of build_model_local().
visual_model_int = Model(inputs=visual_model.input, outputs=visual_model.layers[-2].output)

In [None]:
visual_model.summary()

In [None]:
import random

# One record per page image. The matching text path is the image path with
# "arabic" replaced by "cleaned-text" and ".txt.cleaned.txt" appended.
# Labels here are 0-based integers (the folder number minus one).
images = []
for i in range(OUTPUT_LENGTH):
    # sorted(): glob order depends on the filesystem, so pin it before shuffling.
    for f_n in sorted(glob.glob("*/arabic/" + str(i + 1) + '/*')):
        images.append({
            "image_path":f_n,
            "text_path":f_n.replace("arabic","cleaned-text")+".txt.cleaned.txt",
            "label":i
        })

# Seeded shuffle so the train/validation/test split is the same on every run.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(images)
images_train = images[:7000]
images_validation = images[7000:8000]
images_test = images[8000:]

In [None]:
import glob

In [None]:
from random import shuffle
# NOTE(review): this cell rebuilds `images` and the three splits, replacing
# whatever the previous listing cell produced. `file_path` is not assigned
# anywhere in the visible notebook, so on a fresh kernel this presumably
# raises NameError -- confirm where it should come from, or drop the cell.
images = []

for i in range(64):
  # Only files ending in ".JPG" are collected; the text file is expected
  # next to the image, with ".txt.cleaned.txt" appended to the image name.
  for f_n in glob.glob(file_path + "/" + str(i + 1) + "/*.JPG"):
    images.append({
        "image_path": f_n,
        "text_path": f_n + ".txt.cleaned.txt",
        "label": i
    })

# Unseeded shuffle: the split differs on every run.
shuffle(images)
images_train = images[:7000]
images_validation = images[7000:8000]
images_test = images[8000:]

In [None]:
images_test[0]

In [None]:
len(images_test)

In [None]:
df.columns

In [None]:
len(images)

In [None]:
len(images_test)

In [None]:
from keras.preprocessing.image import load_img

# Load an image from file
def get_input(path):
    """Load the image at ``path`` resized to 224x224 and return it as a NumPy array."""
    return np.array(load_img(path, target_size=(224, 224)))

In [None]:
class My_Generator(keras.utils.Sequence):
    """Batch generator yielding ``([images, token_sequences], one_hot_labels)``.

    Each record in ``img_txt_objs`` is a dict with the keys ``image_path``,
    ``text_path`` and ``label`` (an integer class index).
    """

    def __init__(self, img_txt_objs , batch_size):
        """Keep the list of records and the number of records per batch."""
        self.img_txt_objs = img_txt_objs
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches; the last one may be smaller than ``batch_size``."""
        return int(np.ceil(len(self.img_txt_objs) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            ``([image_batch, sequence_batch], label_batch)`` as NumPy arrays.
        """
        batch_objs = self.img_txt_objs[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_input_1 = []
        batch_input_2 = []
        batch_output = []

        for obj in batch_objs:

            input1 = get_input(obj["image_path"])

            # Context manager so the file handle is closed for every sample.
            with open(obj["text_path"], encoding='utf-8') as text_file:
                content = text_file.read()
            input2 = tokenizer.texts_to_sequences([content.strip()])
            # pad_sequences returns shape (1, MAX_SEQUENCE_LENGTH); flatten it
            # with the shared constant rather than a hard-coded 500.
            input2 = pad_sequences(input2, maxlen=MAX_SEQUENCE_LENGTH, padding="post").reshape(MAX_SEQUENCE_LENGTH)

            one_hot = np.zeros(OUTPUT_LENGTH)
            one_hot[obj["label"]] = 1

            batch_input_1.append(input1)
            batch_input_2.append(input2)
            batch_output.append(one_hot)

        return [np.array(batch_input_1), np.array(batch_input_2) ], np.array(batch_output)

test_image_gen = My_Generator(images_test, 32)

In [None]:
images_test

In [None]:
# NOTE(review): the wildcard import hides where Concatenate (and any other
# layer name) comes from; consider importing the needed layers explicitly.
from keras.layers import *
from keras import optimizers
# Features-level fusion model

# Two inputs: a 224x224 RGB image and a sequence of 500 token ids.
input_1 = Input(shape=(224, 224, 3))
input_2 =  Input(shape=(500,))

# Run each input through its intermediate feature extractor.
x1 = visual_model_int(input_1)
x2 = textual_model_int(input_2)

# Join both feature vectors along the last axis, then classify.
x = Concatenate(axis=-1)([x1,x2])
x = Dropout(0.5)(x)
x = BatchNormalization()(x)

out = Dense(OUTPUT_LENGTH, activation="softmax")(x)
model = Model([input_1, input_2], out)

# NOTE(review): 'weighted_categorical_crossentropy' is not defined anywhere in
# the visible notebook -- confirm it resolves to a registered loss.
model.compile(loss='weighted_categorical_crossentropy', optimizer=Adam(learning_rate=0.000001), metrics=['accuracy'])

model.summary()

In [None]:
# `batch_size` is only assigned in the following cell, so read the batch size
# from the test generator instead; this keeps the cell runnable in order.
len(images_test) / test_image_gen.batch_size

In [None]:
filepath="weights_fusion_manuscript_v2.hdf5"
batch_size= 32
model.load_weights(filepath)

In [None]:
# `probabilities` was never assigned in the notebook. Compute the fusion
# model's class probabilities over the whole test generator; it walks
# images_test in order, so rows line up with the labels taken from that list.
probabilities = model.predict(test_image_gen)
y_pred = np.argmax(probabilities, axis=1)
len(y_pred)

In [None]:
y_true = [i['label'] for i in images_test]
len(y_true)

In [None]:
y_pred, y_true

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,recall_score,precision_score,average_precision_score

confusion_matrix(y_true, y_pred)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

In [None]:
accuracy_score(y_true, y_pred)

In [None]:
precision_score(y_true, y_pred, average='weighted')

In [None]:
recall_score(y_true, y_pred,average='weighted')

In [None]:
f1_score(y_true, y_pred, average='weighted')

In [None]:
X_feature = model.predict_generator(test_image_gen, steps=20)

In [None]:
import glob

# Reload the corpus with retrieval metadata. The path is stored under both
# "filepaths" (used by the retrieval loop) and "file_path" (the column the
# first version of `df` had and that get_dataset() looks up), so rebuilding
# `df` here does not break cells written against the earlier frame.
entries = []
filenames =[]
for i in range(OUTPUT_LENGTH):
    for file_name in glob.glob(cleaned_text_path+str(i+1)+"/*.cleaned.txt"):
        print(file_name)
        filenames.append(file_name)
        # Context manager so each file handle is closed instead of leaked.
        with open(file_name, encoding='utf-8') as text_file:
            content = text_file.read()
        entries.append({
            "content":content,
            "label":str(i+1),
            "filepaths":file_name,
            "file_path":file_name,
            "manuscriptID":str(i+1),
        })
filenames = np.array(filenames)
df = pd.DataFrame(entries)
df

In [None]:
df.head(1)['filepaths'][0]

In [None]:
import datetime
# Imported here because the notebook otherwise imports it only further down.
from sklearn.metrics.pairwise import cosine_similarity

# Number of neighbours to retrieve. Named TOP_K rather than K so the Keras
# backend alias `K`, which Attention.call relies on, is not overwritten.
TOP_K = 10
start_time = datetime.datetime.now().timestamp()

total_accuracy = 0
Y_pred_list = []

count = 0

# NOTE(review): `model` is the two-input fusion model at this point, but only
# the token sequence is passed to predict(); and X_feature covers just the
# batches predicted from the test generator while `indices` is used to look up
# rows of `df`. Confirm which model and which feature matrix were intended.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for key,image_path in df['filepaths'].items():
    #Get the predicted feature vector for the given text file
    with open(image_path, encoding='utf-8') as text_file:
        content = text_file.read()
    X_Q = tokenizer.texts_to_sequences([content])
    X_Q = pad_sequences(X_Q, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    pred_feature_vec =  model.predict(X_Q)

    #Find the cosine similarity array based on all the feature vectors
    #stored in X_feature
    similarity_array = cosine_similarity(pred_feature_vec,X_feature)[0]

    #Get top K indices, most similar first
    indices = similarity_array.argsort()[-TOP_K:][::-1]

    true_ID = df['manuscriptID'].loc[key]
    total_pages = df[ df['manuscriptID'] == true_ID]['filepaths'].count()
    predicted_arr = df['manuscriptID'].loc[indices].values

    Y_pred_list.append(predicted_arr[0])

    #Number of correct predictions out of K
    found = np.count_nonzero(predicted_arr == true_ID)
    if total_pages >= TOP_K:
        total_accuracy +=  found/TOP_K
    else:
        #A manuscript with fewer than K pages can never fill all K slots
        total_accuracy += found/total_pages

    count += 1

    if count % 100 == 0:
        print("Done ", count)
        print("Accuracy so far %g %%" % (total_accuracy/count * 100))

end_time =  datetime.datetime.now().timestamp()
total_retrieval_time = end_time - start_time #In Seconds
print("Total retrieval time %g seconds" % total_retrieval_time)

mean_accuracy = total_accuracy/df.shape[0]
print("\nThe mean accuracy for top %d images is %g %%" % (TOP_K, mean_accuracy*100))

In [None]:
query_text_file  = "*/cleaned-text/2/DSC00009.JPG.txt.cleaned.txt"
content = open(query_text_file, encoding='utf-8').read()

In [None]:
# Time a single retrieval: embed the query, compare it with X_feature and
# keep the 20 most similar rows.
# NOTE(review): `new_model` is not defined anywhere in the visible notebook,
# and X_Q is whatever an earlier cell left behind rather than a sequence built
# from the `content` read just above -- confirm both before relying on this.
start_time = datetime.datetime.now().timestamp()
output = new_model.predict(X_Q) 
similarity_array = cosine_similarity(output, X_feature)[0]
 
# Indices of the 20 highest similarities, most similar first.
indices = similarity_array.argsort()[-20:][::-1]
end_time =  datetime.datetime.now().timestamp()

total_retrieval_time = end_time - start_time #In Seconds
total_retrieval_time

In [None]:
print(filenames[indices])
similarity_array[similarity_array.argsort()[-20:][::-1]]

In [None]:
# NOTE(review): test_image_gen yields [image_batch, sequence_batch] as inputs,
# while textual_model is a single-input Sequential model -- confirm this call
# works as intended; a text-only generator may be needed.
text_model_preds = textual_model.predict_generator(generator=test_image_gen, steps=20)

In [None]:
test_images_list = list(test_image_gen)

In [None]:
test_images_list[0]

In [None]:
df.head(1)

In [None]:
# Images_test

def get_dataset(images_obj):
  """Materialise images, padded token sequences and one-hot labels in memory.

  Args:
    images_obj: Iterable of dicts with the keys ``image_path``, ``text_path``
      and ``label`` (an integer class index).

  Returns:
    Three parallel lists: image arrays, token-id sequences of length
    ``MAX_SEQUENCE_LENGTH``, and one-hot label vectors of length
    ``OUTPUT_LENGTH``.

  Raises:
    KeyError: If a record's ``text_path`` is not present in ``df``.
  """
  # `df` is built twice in this notebook with different names for the path
  # column ('file_path' first, 'filepaths' later); accept either.
  path_column = 'file_path' if 'file_path' in df.columns else 'filepaths'

  # Index the texts once instead of scanning the whole frame for every page.
  content_by_path = {}
  for text_path, text in zip(df[path_column], df['content']):
    content_by_path.setdefault(text_path, text)  # first match wins, as before

  image_test_data = []
  content_test_data = []
  output_test_data = []

  for index, obj in enumerate(images_obj):

    input1 = get_input(obj["image_path"])

    content = content_by_path[obj["text_path"]]
    input2 = tokenizer.texts_to_sequences([content.strip()])
    # pad_sequences returns shape (1, MAX_SEQUENCE_LENGTH); flatten it with
    # the shared constant rather than a hard-coded 500.
    input2 = pad_sequences(input2, maxlen=MAX_SEQUENCE_LENGTH, padding="post").reshape(MAX_SEQUENCE_LENGTH)

    one_hot = np.zeros(OUTPUT_LENGTH)
    one_hot[obj["label"]] = 1

    # Progress marker every 100 records.
    if index % 100 == 0:
      print(index)

    image_test_data.append(input1)
    content_test_data.append(input2)
    output_test_data.append(one_hot)

  return image_test_data, content_test_data, output_test_data

In [None]:
images_train_data, content_train_data, output_train_data = get_dataset(images_train)

In [None]:
images_validation_data, content_validation_data, output_validation_data = get_dataset(images_validation)

In [None]:
image_test_data, content_test_data, output_test_data = get_dataset(images_test)

In [None]:
len(images_validation_data), len(content_validation_data), len(output_validation_data)

In [None]:
len(image_test_data), len(content_test_data), len(output_test_data)

In [None]:
def get_tanh_normalized_score(input_value_preds):
  """Map scores into (0, 1) with a tanh normalisation applied per column.

  Each column (axis 0 = samples) is centred on its mean and scaled by its
  standard deviation, then passed through
  ``0.5 * (tanh(0.5 * z) + 1)``.

  Args:
    input_value_preds: Array-like of scores; statistics are taken over axis 0.

  Returns:
    Array of the same shape with values in (0, 1). A column whose standard
    deviation is exactly zero maps to 0.5 instead of NaN.
  """
  scores = np.asarray(input_value_preds)
  mean = np.mean(scores, axis=0)
  std = np.std(scores, axis=0)
  # A constant column has zero spread; dividing by it would give NaN. Its
  # centred values are all 0, so any non-zero divisor yields the neutral 0.5.
  safe_std = np.where(std == 0, 1.0, std)
  score = 0.5*(np.tanh(0.5*(scores - mean) / safe_std) + 1)

  return score

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,recall_score,precision_score,average_precision_score
from sklearn.metrics import classification_report

In [None]:
def get_correction_score(y_true_one_hot, y_pred_probs, is_one_hot=False):
  """Print a classification report for the given labels and predictions.

  Args:
    y_true_one_hot: True labels; one-hot rows when ``is_one_hot`` is True,
      otherwise class indices used as given.
    y_pred_probs: Predictions; per-class score rows when ``is_one_hot`` is
      True, otherwise class indices used as given.
    is_one_hot: When True, both arguments are reduced with ``argmax`` over
      axis 1 before scoring.

  Prints the predictions, the true labels, the confusion matrix, the
  classification report, accuracy, and weighted precision/recall/F1.
  Returns nothing.
  """
  y_pred, y_true = y_pred_probs, y_true_one_hot
  if is_one_hot:
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_true_one_hot, axis=1)

  for labels in (y_pred, y_true):
    print(labels)

  print(confusion_matrix(y_true, y_pred))

  print(classification_report(y_true, y_pred))

  print('accuracy = ',accuracy_score(y_true, y_pred))
  weighted_metrics = (
    ('precision = ', precision_score),
    ('recall = ', recall_score),
    ('f1 score = ', f1_score),
  )
  for caption, metric in weighted_metrics:
    print(caption, metric(y_true, y_pred, average='weighted'))

In [None]:
text_data_probs = textual_model.predict(np.array(content_test_data))

In [None]:
image_data_probs = visual_model.predict(np.array(image_test_data))

In [None]:
text_train_data_probs = textual_model.predict(np.array(content_train_data))

In [None]:
image_train_data_probs = visual_model.predict(np.array(images_train_data))

In [None]:
text_train_score = get_tanh_normalized_score(text_train_data_probs)
image_train_score = get_tanh_normalized_score(image_train_data_probs)

In [None]:
train_sum_rule_score = np.sum([text_train_score, image_train_score], axis=0)

In [None]:
text_test_score = get_tanh_normalized_score(text_data_probs)
image_test_score = get_tanh_normalized_score(image_data_probs)

In [None]:
test_sum_rule_score = np.sum([text_test_score, image_test_score], axis=0)

In [None]:
train_output = np.argmax(output_train_data, axis=1)
test_output = np.argmax(output_test_data, axis=1)

In [None]:
from keras.layers import *
from keras import optimizers
# Score-level fusion model
# NOTE(review): this cell cannot run as written. test_sum_rule_score is the
# NumPy array produced by np.sum(...) in an earlier cell, so calling it like a
# layer raises TypeError; x1/x2 are score arrays, not tensors derived from
# input_1/input_2. It would also rebind `model`, replacing the feature-level
# fusion model. Confirm the intended score-fusion design before using it.

input_1 = Input(shape=(224, 224, 3))
input_2 =  Input(shape=(500,))

x1 = text_test_score
x2 = image_test_score

x = test_sum_rule_score([x1,x2])
x = Dropout(0.5)(x)
x = BatchNormalization()(x)

out = Dense(OUTPUT_LENGTH, activation="softmax")(x)
model = Model([input_1, input_2], out)

model.compile(loss='weighted_categorical_crossentropy', optimizer=Adam(learning_rate=0.000001), metrics=['accuracy'])

model.summary()

In [None]:
# NOTE(review): neither `clf` nor `test_min_rule_score` is defined anywhere in
# the visible notebook (only the sum-rule scores are computed) -- confirm
# which classifier and which fused scores were meant here.
get_correction_score(test_output, clf.predict(test_min_rule_score))

In [None]:
## Scores of text_data_probs, and score of image_data_probs

In [None]:
get_correction_score(output_test_data, text_data_probs, True)

In [None]:
get_correction_score(output_test_data, image_data_probs, True)

In [None]:
## Similarity scores fusion

In [None]:
train_text_features = textual_model_int.predict(np.array(content_train_data))
train_image_features = visual_model_int.predict(np.array(images_train_data))

test_text_features = textual_model_int.predict(np.array(content_test_data))
test_image_features = visual_model_int.predict(np.array(image_test_data))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
images_test_df = pd.DataFrame(images_test)
images_train_df = pd.DataFrame(images_train)

In [None]:
text_similarity = cosine_similarity(test_text_features, train_text_features)
image_similarity = cosine_similarity(test_image_features, train_image_features)

In [None]:
text_similarity.shape

In [None]:
text_df = pd.DataFrame(text_similarity)
image_df = pd.DataFrame(image_similarity)

In [None]:
fused_similarity = text_similarity + image_similarity

In [None]:
fused_df = pd.DataFrame(fused_similarity, index=images_test_df['image_path'], columns=images_train_df['image_path'])

In [None]:
# Precision@TOP_K of the fused similarity: for every test page, the share of
# its TOP_K most similar training pages that come from the same class folder.
acc = 0
# Named TOP_K rather than K so the Keras backend alias `K`, which
# Attention.call relies on, is not overwritten.
TOP_K = 10
count = 0

for each_index in range(fused_df.shape[0]):
  # Fetch the row once; its name is the test image path and its index holds
  # the training image paths.
  row = fused_df.iloc[each_index]
  # The class is the folder name directly above the file in the path.
  index_label = row.name.rsplit('/', 2)[1]
  # sort_values() is kept ahead of nlargest() so tie-breaking is unchanged.
  labels = [path.rsplit('/', 2)[1] for path in row.sort_values().nlargest(TOP_K).index]
  found_nums = np.count_nonzero(np.array(labels) == index_label)
  acc += (found_nums / TOP_K)
  count += 1
  if count % 100 == 0:
      print("Done ", count)
      print("Accuracy so far %g %%" % (acc/count * 100))

mean_accuracy = acc/fused_df.shape[0]
print("\nThe mean accuracy for top %d images is %g %%" % (TOP_K, mean_accuracy*100))

In [None]:
acc /= fused_df.shape[0]

In [None]:
acc