In [None]:
import pickle
import numpy as np
from aux_functions import get_n_images, get_cross_model
from sklearn.model_selection import KFold
from keras import optimizers
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import h5py

## Experiment selection

In [None]:
# Label describing where the embeddings come from; reported in the message below.
embeddingsType = "scratch"

# Corpus the experiment runs on. It is interpolated into the database and
# vocabulary file names and passed to get_n_images() in the following cells.
# Swap the two lines below to run on the other corpus.
corpus = "scigraph"
#corpus = "semantic-scholar"

print("Selected the " + corpus + " corpus with embeddings from " + embeddingsType)

## Input files and parameters

In [None]:
# --- Hyper-parameters -------------------------------------------------------
batchSize = 32              # samples per batch drawn by the dataset generator
epochs = 4                  # epochs passed to fit_generator for every fold
dim = 300                   # forwarded to get_cross_model
max_sequence_length = 1000  # forwarded to get_cross_model

# --- Corpus-dependent inputs ------------------------------------------------
# Number of samples; the cross-validation split is computed over this many items.
n_images = get_n_images(corpus)
# Pickled token vocabulary for the selected corpus.
vocabulary_tokens = './vocabularies/cross-vocab-tokens-'+corpus+'.pkl'
# HDF5 database read by the generator ("captions_tokens", "images", "labels").
h5path ='./databases/cross-corpus-'+corpus+'.h5'

## Dictionaries

In [None]:
# Load the pickled token vocabulary for the selected corpus.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load vocabulary
# files that come from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(vocabulary_tokens, 'rb') as fdict:
    word_index_tokens = pickle.load(fdict)
print("Found " + str(len(word_index_tokens)) + " unique tokens")

## Dataset generator

In [None]:
def generator (h5path, indexes,batchSize, shuffle):
    """Endlessly yield ``([captions, images], labels)`` batches from an HDF5 file.

    Parameters
    ----------
    h5path : str
        Path of the HDF5 database. It must contain the datasets
        ``"captions_tokens"``, ``"images"`` and ``"labels"``.
    indexes : sequence of int
        Row indexes to draw from. The caller's sequence is never modified.
    batchSize : int
        Maximum number of rows per batch; the last batch of a pass may be
        smaller when ``len(indexes)`` is not a multiple of ``batchSize``.
    shuffle : bool
        When true, the order of the indexes is reshuffled at the start of
        every pass over the data.

    Yields
    ------
    tuple
        ``([bx1, bx2], by)`` where ``bx1``/``bx2``/``by`` are the rows of
        ``captions_tokens``/``images``/``labels`` selected for the batch.

    Raises
    ------
    ValueError
        If ``indexes`` is empty (otherwise the loop would spin forever
        without ever yielding).
    """
    # Private copy: shuffling/sorting must not leak into the caller's array
    # (e.g. the train/test index arrays produced by KFold).
    order = np.array(indexes)
    if len(order) == 0:
        raise ValueError("generator() needs at least one index")

    # The context manager closes the file when the generator is closed or
    # garbage-collected, instead of leaking one open handle per call.
    with h5py.File(h5path, "r") as db:
        while True:
            if shuffle:
                np.random.shuffle(order)
            for start in range(0, len(order), batchSize):
                # h5py fancy indexing expects indexes in increasing order;
                # np.sort returns a new array and leaves `order` untouched.
                batch_indexes = np.sort(order[start:start + batchSize])

                bx1 = db["captions_tokens"][batch_indexes,:]
                bx2 = db["images"][batch_indexes,:,:,:]
                by = db["labels"][batch_indexes,:]

                yield ([bx1, bx2], by)

## Training model (cross-validation)

In [None]:
# 10-fold cross-validation: a fresh model is built, trained and evaluated for
# every fold, and the weighted precision/recall/F1 of each fold are collected
# so that their mean and standard deviation can be reported at the end.
kfold = KFold(n_splits=10, shuffle=True)

precisions = []
recalls = []
f1s = []

print ("Number of images: "+str(n_images))
print ("Number of classes: 2\n")
# KFold only needs the number of samples, hence the placeholder list.
for fold, (train, test) in enumerate(kfold.split([None] * n_images), start=1):
    print("FOLD: " +str(fold))
    print("Training with "+ str(len(train))+ " samples and evaluating with "+str(len(test)))

    model = get_cross_model (word_index_tokens, max_sequence_length, dim)

    adam = optimizers.Adam(lr=1e-4, decay=1e-5)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['categorical_accuracy'])

    model.fit_generator(
        generator(h5path, train, batchSize, shuffle=True),
        epochs=epochs,
        steps_per_epoch=len(train)//batchSize,
        validation_data=generator(h5path, test, batchSize, shuffle=False),
        validation_steps=len(test)//batchSize,
    )

    # Ground-truth labels of the whole test fold. The context manager closes
    # the file even if the read fails.
    with h5py.File(h5path, "r") as db:
        labels_test = db["labels"][test,:]

    # Round the number of prediction steps UP so the final, smaller batch is
    # predicted too. With floor division the last len(test) % batchSize
    # samples of every fold were silently excluded from the metrics.
    eval_steps = (len(test) + batchSize - 1) // batchSize
    pred = model.predict_generator(
        generator(h5path, test, batchSize=batchSize, shuffle=False),
        steps=eval_steps,
    )
    # Binarise the scores at 0.5 so they can be compared with the label matrix
    # (presumably one-hot, given the categorical cross-entropy loss -- confirm).
    pred = (pred >= 0.5).astype(pred.dtype)

    print(classification_report(labels_test, pred, digits=4))
    precisions.append(precision_score(labels_test, pred, average="weighted"))
    recalls.append(recall_score(labels_test, pred, average="weighted"))
    f1s.append(f1_score(labels_test, pred, average="weighted"))

print("Precision: %.4f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.2f)" % (np.mean(f1s), np.std(f1s)))