In [1]:
import pickle
import numpy as np
from aux_functions import get_cross_dim, get_n_images, get_cross_model
from sklearn.model_selection import KFold
from keras import optimizers
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import h5py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Experiment selection

In [2]:
# Corpus used for this run.
# The other value listed in the original notebook is "semantic-scholar".
corpus = "scigraph"

# Source of the embeddings used for this run.
# Other values listed in the original notebook:
#   "scratch", "holE", "wikipedia", "fastText"
embeddingsType = "semantic-scholar"

print("Selected the " + corpus + " corpus with embeddings from " + embeddingsType)

Selected the scigraph corpus with embeddings from semantic-scholar


## Input files and parameters

In [3]:
# Fixed hyper-parameters of the experiment.
max_sequence_length = 1000  # forwarded to get_cross_model
batchSize = 32              # samples per batch drawn by the generator
epochs = 4                  # training epochs per fold

# Input files; their names are derived from the selected corpus / embeddings.
h5path = './databases/cross-corpus-' + corpus + '.h5'
vocabulary_tokens = './vocabularies/cross-vocab-tokens-' + corpus + '.pkl'
vocabulary_syncons = './vocabularies/cross-vocab-syncons-' + corpus + '.pkl'
embeddingsFile = "./embeddings/cross-embeddings-" + embeddingsType + ".tsv"

# Values looked up through the project helpers in aux_functions.
dim = get_cross_dim(embeddingsType)  # embedding vector size used below
n_images = get_n_images(corpus)      # number of samples split by KFold

## Dictionaries

In [4]:
# Load the token and syncon vocabularies (objects supporting len() and,
# further down, .items() yielding (word, row index) pairs).
#
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load vocabulary pickles that come from a trusted pipeline.
#
# The `with` statement already closes the file on exit, so no explicit
# close() call is needed afterwards.
with open(vocabulary_tokens, 'rb') as fdict:
    word_index_tokens = pickle.load(fdict)
print("Found " + str(len(word_index_tokens)) + " unique tokens")

with open(vocabulary_syncons, 'rb') as fdict:
    word_index_syncons = pickle.load(fdict)
print("Found " + str(len(word_index_syncons)) + " unique syncons")

Found 102167 unique tokens
Found 46577 unique syncons


## Loading Embeddings

In [5]:
# Parse the embeddings file into two lookups (word -> float32 vector):
#   * lines starting with "en#"  -> syncon embeddings
#   * lines starting with "lem_" -> ignored
#   * every other line           -> token embeddings
# Nothing is loaded when embeddingsType is "scratch".
if embeddingsType != "scratch":
    embeddings_index_tokens = {}
    embeddings_index_syncons = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(embeddingsFile, "r", encoding="utf-8", errors="surrogatepass") as file:
        for line in file:
            values = line.split()
            # Keep only rows made of one key followed by exactly `dim` numbers.
            # Checking the length before touching values[0] also skips blank
            # lines, which would otherwise raise IndexError.
            if len(values) != dim + 1:
                continue
            if line.startswith("lem_"):
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if line.startswith("en#"):
                embeddings_index_syncons[word] = vector
            else:
                embeddings_index_tokens[word] = vector
    print('Found %s token vectors.' % len(embeddings_index_tokens))
    print('Found %s syncon vectors.' % len(embeddings_index_syncons))

# Build the token embedding matrix: one row per vocabulary index plus one
# extra row. Rows without a pre-trained vector stay all-zero.
# NOTE(review): the extra row presumably accounts for a reserved index 0
# (padding) - confirm against how the vocabulary was built.
count = 0
embedding_matrix_tokens = np.zeros((len(word_index_tokens) + 1, dim))
if embeddingsType != "scratch":
    for word, i in word_index_tokens.items():
        embedding_vector_tokens = embeddings_index_tokens.get(word)
        if embedding_vector_tokens is not None:
            embedding_matrix_tokens[i] = embedding_vector_tokens
            count = count + 1
    print("\nNumber of non-zero vectors in the tokens embedding matrix: " + str(count))

# Same procedure for the syncon vocabulary.
count = 0
embedding_matrix_syncons = np.zeros((len(word_index_syncons) + 1, dim))
if embeddingsType != "scratch":
    for word, i in word_index_syncons.items():
        embedding_vector_syncons = embeddings_index_syncons.get(word)
        if embedding_vector_syncons is not None:
            embedding_matrix_syncons[i] = embedding_vector_syncons
            count = count + 1
    print("Number of non-zero vectors in the syncons embedding matrix: " + str(count))

Found 156167 token vectors.
Found 82554 syncon vectors.

Number of non-zero vectors in the tokens embedding matrix: 48305
Number of non-zero vectors in the syncons embedding matrix: 42109


## Dataset generator

In [6]:
def generator(h5path, indexes, batchSize, shuffle):
    """Endlessly yield ``(inputs, labels)`` batches read from an HDF5 file.

    The list of inputs depends on the module-level ``embeddingsType``:

    * ``"scratch"``  -> ``[tokens, images]``
    * ``"fastText"`` -> ``[tokens, tokens, images]``
    * ``"holE"``, ``"wikipedia"``, ``"semantic-scholar"``
      -> ``[tokens, tokens, syncons, images]``

    Parameters
    ----------
    h5path : str
        Path of the HDF5 file. It must contain the datasets
        ``captions_tokens``, ``images`` and ``labels`` (plus
        ``captions_syncons`` for the syncon-based embeddings).
    indexes : sequence of int
        Row indexes of the samples to serve. The sequence is copied and is
        never modified.
    batchSize : int
        Maximum number of samples per batch (the last batch of a pass may be
        smaller).
    shuffle : bool
        When true, the sample order is reshuffled at the start of every pass.

    Yields
    ------
    tuple
        ``(list_of_input_arrays, labels_array)`` for one batch.

    Raises
    ------
    ValueError
        If ``indexes`` is empty or ``embeddingsType`` is not one of the
        supported values. Because this is a generator, the error surfaces on
        the first ``next()`` call instead of looping forever without yielding.
    """
    # Work on a private copy: shuffling and sorting must not alter the
    # caller's fold indexes.
    order = np.array(indexes)
    if len(order) == 0:
        raise ValueError("generator() needs at least one index")

    mode = embeddingsType
    if mode not in ("scratch", "fastText", "holE", "wikipedia", "semantic-scholar"):
        raise ValueError("Unsupported embeddingsType: " + repr(mode))

    # The file is closed when the generator is closed or garbage-collected.
    with h5py.File(h5path, "r") as db:
        while True:
            if shuffle:
                np.random.shuffle(order)
            for start in range(0, len(order), batchSize):
                # h5py fancy indexing expects indexes in increasing order;
                # np.sort returns a new array instead of sorting in place.
                batch_indexes = np.sort(order[start:start + batchSize])

                # Read each dataset once per batch; the token matrix is fed
                # to two model inputs where the model has two token branches.
                tokens = db["captions_tokens"][batch_indexes, :]
                images = db["images"][batch_indexes, :, :, :]
                by = db["labels"][batch_indexes, :]

                if mode == "scratch":
                    yield ([tokens, images], by)
                elif mode == "fastText":
                    yield ([tokens, tokens, images], by)
                else:
                    syncons = db["captions_syncons"][batch_indexes, :]
                    yield ([tokens, tokens, syncons, images], by)

## Training model (cross-validation)

In [7]:
# 10-fold cross-validation. For every fold a fresh model is built, trained on
# the training split and scored on the held-out split; weighted precision,
# recall and F1 are collected and summarised after the loop.
# NOTE(review): KFold is created without random_state, so the splits differ
# between runs - consider fixing a seed if results must be reproducible.
kfold = KFold(n_splits=10, shuffle=True)

precisions = []
recalls = []
f1s = []

fold = 1
print ("Number of images: "+str(n_images))
print ("Number of classes: 2\n")
for train, test in kfold.split([None] * n_images):
    print("FOLD: " +str(fold))
    print("Training with "+ str(len(train))+ " samples and evaluating with "+str(len(test)))

    # Only whole batches are consumed, so the trailing partial batch of each
    # split is dropped from training, validation and prediction.
    train_steps = len(train)//batchSize
    test_steps = len(test)//batchSize
    n_evaluated = batchSize*test_steps

    model = get_cross_model (embeddingsType, word_index_tokens, word_index_syncons, embedding_matrix_tokens, embedding_matrix_syncons, max_sequence_length, dim)

    adam = optimizers.Adam(lr=1e-4, decay=1e-5)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['categorical_accuracy'])

    model.fit_generator(
        generator(h5path, train, batchSize, shuffle=True),
        epochs=epochs,
        steps_per_epoch=train_steps,
        validation_data=generator(h5path, test, batchSize, shuffle=False),
        validation_steps=test_steps)

    # Context manager closes the HDF5 file even if the read fails.
    with h5py.File(h5path, "r") as db:
        labels_test = db["labels"][test,:]
    # Keep only the labels of the samples that predict_generator will score.
    labels_eval = labels_test[0:n_evaluated]

    pred = model.predict_generator(generator(h5path, test, batchSize=batchSize, shuffle=False), steps=test_steps)
    # Binarise the model outputs at 0.5 before computing the metrics.
    pred[pred >= 0.5] = 1
    pred[pred < 0.5] = 0

    print(classification_report(labels_eval, pred, digits=4))
    precisions.append(precision_score(labels_eval, pred, average="weighted"))
    recalls.append(recall_score(labels_eval, pred, average="weighted"))
    f1s.append(f1_score(labels_eval, pred, average="weighted"))
    fold = fold +1
print("Precision: %.4f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.2f)" % (np.mean(f1s), np.std(f1s)))

Number of images: 953492
Number of classes: 2

FOLD: 1
Training with 858142 samples and evaluating with 95350


  num_elements)


Epoch 1/4


OSError: Unable to open file (Unable to open file: name = './databases/cross-corpus-semantic-scholar.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)