In [1]:
import pickle
import numpy as np
from aux_functions import get_cat_dim, get_n_images, get_cat_captions_model, get_cat_figures_model
from sklearn.model_selection import KFold
from keras import optimizers
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import h5py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Captions

## Experiment selection

In [2]:
# Experiment switch for the caption classifier: exactly one of the two
# assignments below should be active. The value is read later by
# get_cat_dim, the batch generator and get_cat_captions_model.
weights = "cross"
# weights = "cross-vecsi"

print("Selected the " + weights + " weights for categorizing captions")

Selected the cross weights for categorizing captions


## Input files and parameters

In [3]:
# --- Input files -----------------------------------------------------------
# HDF5 corpus read by the caption generator and the evaluation step.
h5path = "./databases/cat-corpus-captions.h5"
# Pickled vocabularies loaded in the "Dictionaries" cell.
vocabulary_tokens = './vocabularies/cross-vocab-tokens-scigraph.pkl'
vocabulary_syncons = './vocabularies/cross-vocab-syncons-scigraph.pkl'

# --- Model / training parameters -------------------------------------------
max_sequence_length = 1000
# Dimension associated with the selected weights (resolved by the project helper).
dim = get_cat_dim(weights)
batchSize = 128
# Number of samples handed to KFold.split; presumably the number of rows in
# the HDF5 corpus -- TODO confirm against the database.
n_images = 82396
epochs = 6

## Dictionaries

In [4]:
# Load the token and syncon vocabularies used to build the caption model.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only point vocabulary_tokens / vocabulary_syncons at trusted files.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open(vocabulary_tokens, 'rb') as fdict:
    word_index_tokens = pickle.load(fdict)
print("Found " + str(len(word_index_tokens)) + " unique tokens")

with open(vocabulary_syncons, 'rb') as fdict:
    word_index_syncons = pickle.load(fdict)
print("Found " + str(len(word_index_syncons)) + " unique syncons")

Found 102167 unique tokens
Found 46577 unique syncons


## Dataset generator

In [5]:
def generator (h5path, indexes,batchSize, shuffle):
    """Endlessly yield (inputs, labels) caption batches read from an HDF5 file.

    The notebook-level ``weights`` variable selects the batch layout:

    * ``"cross"``       -> ``(tokens, labels)``
    * ``"cross-vecsi"`` -> ``([tokens, tokens, syncons], labels)``

    Parameters
    ----------
    h5path : str
        Path of the HDF5 file holding the ``captions_tokens``,
        ``captions_syncons`` and ``labels`` datasets.
    indexes : sequence of int
        Row indexes to draw from. A private copy is taken, so the caller's
        sequence is never reordered.
    batchSize : int
        Maximum number of rows per batch; the last batch of every pass may
        be smaller.
    shuffle : bool
        When true, the visiting order is reshuffled before every pass.

    Raises
    ------
    ValueError
        If ``indexes`` is empty, ``batchSize`` is not positive, or ``weights``
        holds an unsupported value. Being a generator, the error surfaces on
        the first ``next()`` call. Previously these cases made the loop spin
        forever without yielding.
    """
    if batchSize < 1:
        raise ValueError("batchSize must be a positive integer")

    # Work on a copy: shuffling/sorting must not leak into the caller's array
    # (KFold indexes are reused later to fetch the evaluation labels).
    order = np.array(indexes)
    if len(order) == 0:
        raise ValueError("indexes must contain at least one element")

    # The context manager closes the HDF5 handle when the generator is
    # closed or garbage-collected.
    with h5py.File(h5path, "r") as db:
        while True:
            if shuffle:
                np.random.shuffle(order)
            for start in range(0, len(order), batchSize):
                # np.sort returns a sorted copy; the rows are requested in
                # increasing order as in the original code.
                batch_indexes = np.sort(order[start:start + batchSize])

                if weights == "cross":
                    bx = db["captions_tokens"][batch_indexes, :]
                    by = db["labels"][batch_indexes, :]
                    yield (bx, by)
                elif weights == "cross-vecsi":
                    # The token rows feed the first two model inputs, so they
                    # are read once and reused instead of hitting disk twice.
                    tokens = db["captions_tokens"][batch_indexes, :]
                    syncons = db["captions_syncons"][batch_indexes, :]
                    by = db["labels"][batch_indexes, :]
                    yield ([tokens, tokens, syncons], by)
                else:
                    raise ValueError("Unsupported weights: " + repr(weights))

## Training model (cross-validation)

In [None]:
# 10-fold cross-validation of the caption classifier.
# For every fold a fresh model is built, trained on the training indexes and
# scored on the held-out indexes; weighted precision/recall/F1 are collected
# per fold and summarised (mean +/- std) after the loop.
kfold = KFold(n_splits=10, shuffle=True)

precisions = []
recalls = []
f1s = []

print ("Number of images: "+str(n_images))
print ("Number of classes: 5\n")
# KFold.split only needs the number of samples, so a placeholder array suffices.
for fold, (train, test) in enumerate(kfold.split(np.zeros(n_images)), start=1):
    print("FOLD: " +str(fold))
    print("Training with "+ str(len(train))+ " samples and evaluating with "+str(len(test)))

    # Only whole batches are consumed, so the trailing partial batch of each
    # split is skipped and the evaluation labels are truncated to match.
    train_steps = len(train) // batchSize
    test_steps = len(test) // batchSize
    n_eval = batchSize * test_steps

    model = get_cat_captions_model(weights, word_index_tokens, word_index_syncons, max_sequence_length, dim)
    model.compile(loss="categorical_crossentropy", optimizer='rmsprop', metrics=['categorical_accuracy'])

    # NOTE: the held-out fold is also used as validation data during fitting.
    model.fit_generator(
        generator(h5path, train, batchSize, shuffle=True),
        epochs=epochs,
        steps_per_epoch=train_steps,
        validation_data=generator(h5path, test, batchSize, shuffle=False),
        validation_steps=test_steps
    )

    # Ground-truth labels of the held-out fold; the context manager closes
    # the file even if the read fails.
    with h5py.File(h5path, "r") as db:
        labels_test = db["labels"][test, :]
    labels_eval = labels_test[0:n_eval]

    pred = model.predict_generator(generator(h5path, test, batchSize=batchSize, shuffle=False), steps=test_steps)

    # Turn the per-class scores into one-hot predictions (argmax per row).
    maximos = np.argmax(pred, axis=1)
    predNew = np.zeros(np.shape(pred))
    predNew[np.arange(len(predNew)), maximos] = 1

    print(classification_report(labels_eval, predNew, digits=4))
    precisions.append(precision_score(labels_eval, predNew, average="weighted"))
    recalls.append(recall_score(labels_eval, predNew, average="weighted"))
    f1s.append(f1_score(labels_eval, predNew, average="weighted"))

print("Precision: %.4f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.2f)" % (np.mean(f1s), np.std(f1s)))

# Figures

## Experiment selection

In [2]:
# Experiment switch for the figure classifier: exactly one of the two
# assignments below should be active. The value is passed later to
# get_cat_figures_model.
weights = "cross"
# weights = "cross-vecsi"

print("Selected the " + weights + " weights for categorizing figures")

Selected the cross weights for categorizing figures


## Input files and parameters

In [3]:
# HDF5 corpus read by the figure generator and the evaluation step.
h5path = "./databases/cat-corpus-figures.h5"

# Training parameters for the figure classifier.
batchSize = 32
# Number of samples handed to KFold.split; presumably the number of rows in
# the HDF5 corpus -- TODO confirm against the database.
n_images = 82396
epochs = 5

## Dataset generator

In [4]:
def generator (h5path, indexes,batchSize, shuffle):
    """Endlessly yield ``(images, labels)`` batches read from an HDF5 file.

    Parameters
    ----------
    h5path : str
        Path of the HDF5 file holding the ``images`` and ``labels`` datasets.
    indexes : sequence of int
        Row indexes to draw from. A private copy is taken, so the caller's
        sequence is never reordered.
    batchSize : int
        Maximum number of rows per batch; the last batch of every pass may
        be smaller.
    shuffle : bool
        When true, the visiting order is reshuffled before every pass.

    Raises
    ------
    ValueError
        If ``indexes`` is empty or ``batchSize`` is not positive. Being a
        generator, the error surfaces on the first ``next()`` call.
        Previously these cases made the loop spin forever without yielding.
    """
    if batchSize < 1:
        raise ValueError("batchSize must be a positive integer")

    # Work on a copy: shuffling/sorting must not leak into the caller's array
    # (KFold indexes are reused later to fetch the evaluation labels).
    order = np.array(indexes)
    if len(order) == 0:
        raise ValueError("indexes must contain at least one element")

    # The context manager closes the HDF5 handle when the generator is
    # closed or garbage-collected.
    with h5py.File(h5path, "r") as db:
        while True:
            if shuffle:
                np.random.shuffle(order)
            for start in range(0, len(order), batchSize):
                # np.sort returns a sorted copy; the rows are requested in
                # increasing order as in the original code.
                batch_indexes = np.sort(order[start:start + batchSize])

                bx = db["images"][batch_indexes, :, :, :]
                by = db["labels"][batch_indexes, :]

                yield (bx, by)

## Training model (cross-validation)

In [None]:
# 10-fold cross-validation of the figure classifier.
# For every fold a fresh model is built, trained on the training indexes and
# scored on the held-out indexes; weighted precision/recall/F1 are collected
# per fold and summarised (mean +/- std) after the loop.
kfold = KFold(n_splits=10, shuffle=True)

precisions = []
recalls = []
f1s = []

print ("Number of images: "+str(n_images))
print ("Number of classes: 5\n")
# KFold.split only needs the number of samples, so a placeholder array suffices.
for fold, (train, test) in enumerate(kfold.split(np.zeros(n_images)), start=1):
    print("FOLD: " +str(fold))
    print("Training with "+ str(len(train))+ " samples and evaluating with "+str(len(test)))

    # Only whole batches are consumed, so the trailing partial batch of each
    # split is skipped and the evaluation labels are truncated to match.
    train_steps = len(train) // batchSize
    test_steps = len(test) // batchSize
    n_eval = batchSize * test_steps

    model = get_cat_figures_model(weights)

    adam = optimizers.Adam(lr=1e-4, decay=1e-5)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['categorical_accuracy'])

    # NOTE: the held-out fold is also used as validation data during fitting.
    model.fit_generator(
        generator(h5path, train, batchSize, shuffle=True),
        epochs=epochs,
        steps_per_epoch=train_steps,
        validation_data=generator(h5path, test, batchSize, shuffle=False),
        validation_steps=test_steps
    )

    # Ground-truth labels of the held-out fold; the context manager closes
    # the file even if the read fails.
    with h5py.File(h5path, "r") as db:
        labels_test = db["labels"][test, :]
    labels_eval = labels_test[0:n_eval]

    pred = model.predict_generator(generator(h5path, test, batchSize=batchSize, shuffle=False), steps=test_steps)

    # Turn the per-class scores into one-hot predictions (argmax per row).
    maximos = np.argmax(pred, axis=1)
    predNew = np.zeros(np.shape(pred))
    predNew[np.arange(len(predNew)), maximos] = 1

    print(classification_report(labels_eval, predNew, digits=4))
    precisions.append(precision_score(labels_eval, predNew, average="weighted"))
    recalls.append(recall_score(labels_eval, predNew, average="weighted"))
    f1s.append(f1_score(labels_eval, predNew, average="weighted"))

print("Precision: %.4f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.2f)" % (np.mean(f1s), np.std(f1s)))