In [None]:
import cv2
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Input
import keras.backend.tensorflow_backend as KTF
from keras import backend as K
from keras import objectives
import tensorflow as tf
import numpy as np
from functools import partial
import json
import os
from scipy import stats
from math import sqrt, inf, isnan
import hashlib
import multiprocessing
from multiprocessing import Pool
from multiprocessing import Process
import time
from subprocess import Popen, PIPE
import tempfile
import subprocess
import shlex
import scipy.io
import random
import sys

sys.path.append("../../utilities")
sys.path.append("../regionProposal")
from imageHandler import getImagePath, readImage, squareResizeImage, getCropImage, parseOrnament
from EvaluationHelper import findBestIou, getProposalsIou
from log import log_progress
from regionProposal import processSelectiveSearch

# Expose the top-level tensorflow module under tf.python.control_flow_ops.
# NOTE(review): presumably a workaround for an old Keras/TensorFlow version mismatch - confirm it is still needed.
tf.python.control_flow_ops = tf
multManager = multiprocessing.Manager()

# Absolute, machine-specific locations of the outputs, extraction results and page images.
outputFolder = '/scratch/fcjunker/output/'
jsonsExtractionPath = '/mnt/cluster-nas/florian/extractionResults/'
imagesPath = '/mnt/Ornaments_IMG/'

# Resolved against the current working directory ('test.py' is only a placeholder file name).
currentFolder = os.path.dirname(os.path.realpath('test.py'))
script_dirname = currentFolder+'/../regionProposal/matlabEdgeBoxes/'
annotatedPagesJsonPath = currentFolder+'/../../data/annotatedPages.json'
annotatedPagesSortedJsonPath = currentFolder+'/../../data/annotatedPagesSorted.json'

# Total number of annotated pages used for testing and for training.
sizes = {
    'testSize': 3000,
    'trainSize': 4000
    }

# Number of worker processes used for proposal generation and IoU scoring.
numProcess = 24

# Always print arrays in full. The threshold must be a number: the string 'nan'
# is rejected by recent numpy versions, sys.maxsize is the documented way to
# disable summarization.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Project-local helper; it is imported in this cell rather than with the other imports.
from gpuConfigs import configureGpu
"""
    Configure GPU usage for Keras with Tensorflow.
"""
# NOTE(review): presumably restricts this kernel to GPU '0' and caps its memory
# usage at 20% - confirm against the gpuConfigs module.
configureGpu(gpuId='0', mode='limit', limit=0.2)

In [None]:
def createModel(verbose=True):
    """
    Create and compile the Keras classifier that sits on top of the VGG16 features.

    Its input shape is (1, 7, 7, 512), i.e. the features of a single image as
    expected from VGG16 (without top) for the 244x244 crops used in this notebook.
    Its output is a real value between 0 and 1 (sigmoid unit).

    verbose: when True, print the layer summary of the model.

    Returns the compiled model (binary cross-entropy loss, adadelta optimizer,
    accuracy metric).
    """

    model = Sequential()
    model.add(Flatten(input_shape=(1, 7, 7, 512)))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

    if verbose:
        # summary() prints the table itself and returns None, so wrapping it
        # in print() only adds a stray "None" line.
        model.summary()

    return model

def trainModel(model, trainingSet, testingSet, iouThreshold=0.5, maxNegExamplePerPage=2, verbose=2, nb_epoch=15):
    """
    Fit the given model on batches generated from the training set, validating
    on batches generated from the testing set.

    Both sets are fed through generateClassifierSample with the given IoU
    threshold and number of negative examples per page. The number of samples
    per epoch (and per validation run) comes from countGeneratedProposals, which
    is called with the dataset only, so these counts do not take iouThreshold or
    maxNegExamplePerPage into account.
    """

    fitOptions = {
        'samples_per_epoch': countGeneratedProposals(trainingSet),
        'validation_data': generateClassifierSample(iouThreshold, maxNegExamplePerPage, testingSet),
        'nb_val_samples': countGeneratedProposals(testingSet),
        'max_q_size': 3,
        'nb_epoch': nb_epoch,
        'verbose': verbose,
    }

    model.fit_generator(
        generateClassifierSample(iouThreshold, maxNegExamplePerPage, trainingSet),
        **fitOptions)

def generateClassifierSample(iouThreshold, numNegProposalsPerPage, annotatedPages):
    """
    Endless generator yielding (input, output) batches for the ornament classifier.

    Each pass over 'annotatedPages' visits, for every page group (key), a random
    selection of pages: as many as there are pages with ornaments. For each page
    it emits all positive proposals (IoU above 'iouThreshold') and, on average,
    at most 'numNegProposalsPerPage' negative ones; negatives a page could not
    provide are carried over to the following pages. Input is the VGG16 feature
    tensor of the 244x244 resized crop, output is the boolean label.

    Batches of 128 samples are yielded while a pass is running. At the end of a
    pass the negatives still missing are made up for by repeating the last
    sample, and the remaining samples are yielded as a final, shorter or longer
    batch.

    Raises ValueError if a complete pass does not produce a single sample.
    """

    inputData = []
    outputData = []
    batchSize = 128

    while True:
        adjustNumNegProposals = 0
        lastSample = None
        numPagesPerGroup = len(annotatedPages['annotatedPagesWithOrnament'])
        for key in annotatedPages:
            perm = np.random.permutation(len(annotatedPages[key]))
            # A group holding fewer pages than the ornament group contributes
            # all of its pages instead of indexing past the permutation.
            for index in range(min(numPagesPerGroup, len(perm))):
                annotatedPage = annotatedPages[key][perm[index]]
                pagePath = getImagePath(imagesPath+"bookm-"+annotatedPage['bookId']+"/"+annotatedPage['pageId'])
                fullImage = readImage(pagePath)

                ornaments = list(map(parseOrnament, annotatedPage['ornaments']))
                numPosProposals = np.sum(np.array(annotatedPage['proposalsScores']) > iouThreshold)
                numNegProposals = numNegProposalsPerPage+adjustNumNegProposals
                extractedOrnaments = getRandomProposals(annotatedPage, threshold=iouThreshold, cut=numNegProposals)
                # Negatives this page could not provide are requested from the next pages.
                adjustNumNegProposals = numNegProposals - len(extractedOrnaments) + numPosProposals

                for extractedOrnamentStr in extractedOrnaments:
                    extractedOrnament = parseOrnament(extractedOrnamentStr)
                    im = squareResizeImage(getCropImage(extractedOrnament, fullImage), 244)

                    lastSample = (extractFeatures(im),
                                  findBestIou(ornaments, extractedOrnamentStr) > iouThreshold)
                    inputData.append(lastSample[0])
                    outputData.append(lastSample[1])

                    if len(inputData) >= batchSize:
                        yield (np.asarray(inputData), np.asarray(outputData))
                        inputData = []
                        outputData = []

        if lastSample is None:
            # Nothing to pad with and nothing to yield: fail loudly instead of
            # spinning forever or feeding empty batches to the model.
            raise ValueError('generateClassifierSample: no proposal found in the given annotated pages')

        # Pad the pass with copies of the last sample so that the number of
        # generated samples stays the same from one pass to the next. The label
        # is the same boolean used for every other sample, not the raw IoU.
        while adjustNumNegProposals > 0:
            adjustNumNegProposals -= 1
            inputData.append(lastSample[0])
            outputData.append(lastSample[1])

        if inputData:
            yield (np.asarray(inputData), np.asarray(outputData))
            # Start the next pass with empty buffers, otherwise the samples just
            # yielded would be yielded again.
            inputData = []
            outputData = []
        
def getRandomProposals(annotatedPage, threshold=0.5, cut=2):
    """
    Return, in random order, all positive proposals of the page and at most
    'cut' randomly chosen negative ones.

    annotatedPage: dict with 'proposals' and the matching 'proposalsScores'.
    threshold: a proposal is positive when its score is strictly above it.
    cut: maximum number of negative proposals kept; values below 0 count as 0.

    Returns a numpy array of proposals.
    """

    isPositive = np.array(annotatedPage['proposalsScores']) > threshold
    proposals = np.array(annotatedPage['proposals'])

    negatives = proposals[~isPositive]
    # A negative 'cut' would otherwise slice from the end and keep almost all negatives.
    keptNegatives = negatives[np.random.permutation(len(negatives))][:max(cut, 0)]

    res = np.concatenate((keptNegatives, proposals[isPositive]), axis=0)
    # random.shuffle swaps rows through views and can duplicate or lose rows of
    # a multi-dimensional array; np.random.shuffle shuffles along the first axis safely.
    np.random.shuffle(res)
    return res
        
def countGeneratedProposals(dataset, threshold=0.5, numNegProposalsPerPage=2):
    """
    Count the number of proposals generated in one pass (typically one epoch).

    The count is the number of positive proposals (score strictly above
    'threshold') of the pages with ornaments, plus 'numNegProposalsPerPage'
    negative proposals for each of the pages visited in a pass. The factor 2
    accounts for the two page groups, each contributing as many pages as there
    are pages with ornaments; with the defaults this is the historical
    "4 per page with ornament".

    dataset: dict holding the list of pages under 'annotatedPagesWithOrnament'.
    threshold, numNegProposalsPerPage: should match the values given to the
    sample generator; the defaults reproduce the previous hard-coded behavior.

    Returns the count as a plain int.
    """

    pagesWithOrnament = dataset['annotatedPagesWithOrnament']
    numPositives = sum(
        int(np.sum(np.array(annotatedPage['proposalsScores']) > threshold))
        for annotatedPage in pagesWithOrnament)

    return numPositives + 2 * numNegProposalsPerPage * len(pagesWithOrnament)
    
def getProposals(annotatedPage, proposalsFolderPath):
    """
    Read the proposals of one page from disk.
    Note it assumes that proposals from median filter and selective search are
    already written on disk.

    annotatedPage: dict with the 'bookId' and 'pageId' of the page.
    proposalsFolderPath: folder path ending with a path separator. It must
    contain 'median/<bookId>.json' and '<bookId>-<pageId>.json'.

    Returns the selective search candidates followed by the median filter
    segments of the page, or an empty list (after printing a hint) when the
    'median/' folder does not exist.
    """

    # Read median filter proposals
    medianFolderPath = "{0}median/".format(proposalsFolderPath)
    if not os.path.isdir(medianFolderPath):
        print("Please write proposals from median filter in the following folder before proceding:")
        print(medianFolderPath)
        return []

    proposalMedianJsonPath = "{0}{1}.json".format(medianFolderPath, annotatedPage['bookId'])
    with open(proposalMedianJsonPath) as proposalMedianFile:
        pages = json.load(proposalMedianFile)['pages']

    # No break on purpose: if a page id appears several times, the last entry wins.
    medianProposals = []
    for currentPage in pages:
        if annotatedPage['pageId'] == currentPage['id']:
            medianProposals = currentPage['segments']

    # Read selective search proposals
    # NOTE(review): the file is read from proposalsFolderPath itself, not from a
    # 'selectiveSearch/' subfolder (a path to that subfolder used to be computed
    # here but was never used) - confirm where processSelectiveSearch writes.
    proposalSelSearchJsonPath = '{0}{1}-{2}.json'.format(proposalsFolderPath,
                                                         annotatedPage['bookId'], annotatedPage['pageId'])
    with open(proposalSelSearchJsonPath) as proposalSelSearchFile:
        candidates = json.load(proposalSelSearchFile)['candidates']

    return candidates + medianProposals

def genProposals(annotatedPages, proposalsFolderPath):
    """
    Generate proposals with selective search.

    Writes the list of pages to process ('bookId' and 'pageId' of every page of
    every group in 'annotatedPages') to '<proposalsFolderPath>genProposals.json',
    then runs processSelectiveSearch on that file with 'proposalsFolderPath' as
    second argument and 'numProcess' processes.

    proposalsFolderPath: existing folder path ending with a path separator.
    """

    pagesToProcessJsonPath = '{}genProposals.json'.format(proposalsFolderPath)

    pagesToProcess = [
        {'bookId': annotatedPage['bookId'], 'pageId': annotatedPage['pageId']}
        for key in annotatedPages
        for annotatedPage in annotatedPages[key]
    ]

    # The context manager closes the file even if serialization or writing fails.
    with open(pagesToProcessJsonPath, "w") as jsonFile:
        jsonFile.write(json.dumps({'annotatedPages': pagesToProcess}, indent=4, sort_keys=True))

    print("Generating proposals")
    processSelectiveSearch(pagesToProcessJsonPath, proposalsFolderPath, resize=500, scale=500, sigma=0.9,
                           minSize=100, numProcess=numProcess)
    print("Proposals generation finished")

def extractFeaturesVGG16(modelVgg16, x):
    """
    Run one image through the given VGG16 model and return its prediction.

    modelVgg16: model exposing predict(), called on a batch of one image.
    x: image as an array with 2 dimensions (gray levels, converted to RGB
    first) or 3 dimensions.

    Returns the output of modelVgg16.predict for the preprocessed image.
    Raises ValueError when the image has neither 2 nor 3 dimensions.
    """
    if len(x.shape) == 2:
        x = cv2.cvtColor(x,cv2.COLOR_GRAY2RGB)

    if len(x.shape) != 3:
        # Fail here with the offending shape instead of printing it and letting
        # the model fail later on a batch of the wrong rank.
        raise ValueError('Error, image with shape: {}'.format(x.shape))

    x = x.astype('float')
    # The model works on batches: add the leading batch dimension.
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return modelVgg16.predict(x)

def preparedata(size, frontData = True):
    """
    Select a subset of the module-level 'annotatedPages' and count its proposals.

    size: dict giving, for each key of 'annotatedPages', the number of pages to keep.
    frontData: keep the first pages of each list when True, the last ones when False.

    Returns a tuple (dataset, numExamplePerPage, numExample), all dicts keyed
    like 'annotatedPages': the selected pages, an array with the number of
    proposals of each selected page, and the total number of proposals.
    """
    dataset = {}
    numExamplePerPage = {}
    numExample = {}
    for key in annotatedPages:
        pages = annotatedPages[key]
        if frontData:
            dataset[key] = pages[:size[key]]
        else:
            # pages[-0:] would return the whole list instead of nothing, so the
            # start index is computed explicitly.
            firstKept = max(len(pages) - size[key], 0)
            dataset[key] = pages[firstKept:]

        numExamplePerPage[key] = np.asarray([len(page['proposals']) for page in dataset[key]])
        numExample[key] = np.sum(numExamplePerPage[key])

    return dataset, numExamplePerPage, numExample

In [None]:
"""
    Create a keras model to extract VGG 16 features from 244x244x3 images
"""
# VGG16 without its classification head, used as a fixed feature extractor.
modelVgg16 = VGG16(weights='imagenet', include_top=False)
extractFeatures = partial(extractFeaturesVGG16, modelVgg16)

# Warm-up call on a dummy image. Only the shape of the result is displayed:
# leaving the call as the last expression would dump the whole feature tensor
# into the cell output.
warmupFeatures = extractFeatures(np.ones((244, 244, 3)))
warmupFeatures.shape

In [None]:
"""
    Read pages annotation from disk
"""
# Read the file through a context manager so that the handle is closed right away.
with open(annotatedPagesSortedJsonPath) as annotatedPagesFile:
    annotatedPagesJson = annotatedPagesFile.read()
annotatedPages = json.loads(annotatedPagesJson)['annotatedPages']
numPagesWithOrnament = len(annotatedPages['annotatedPagesWithOrnament'])
numPagesWithoutOrnament = len(annotatedPages['annotatedPagesWithoutOrnament'])
numPages = numPagesWithOrnament+numPagesWithoutOrnament

# Split the requested test and train sizes between the two page groups in
# proportion to the number of pages each group holds.
testSize = {
    'annotatedPagesWithOrnament': round(numPagesWithOrnament/numPages * sizes['testSize']),
    'annotatedPagesWithoutOrnament': round(numPagesWithoutOrnament/numPages * sizes['testSize'])
}
trainSize = {
    'annotatedPagesWithOrnament': round(numPagesWithOrnament/numPages * sizes['trainSize']),
    'annotatedPagesWithoutOrnament': round(numPagesWithoutOrnament/numPages * sizes['trainSize'])
}

In [None]:
"""
    Generate, write and read proposals for each page
"""
# Folder receiving the generated proposals, located inside the output folder.
proposalsFolderPath = outputFolder + 'proposals/'
genProposals(annotatedPages, proposalsFolderPath)

# Attach to every annotated page the proposals read back from disk.
print('Read regions proposed')
for key in annotatedPages:
    pagesOfGroup = annotatedPages[key]
    for annotatedPage in log_progress(pagesOfGroup, 1):
        annotatedPage['proposals'] = getProposals(annotatedPage, proposalsFolderPath)

In [None]:
"""
    Rank each proposal (IoU)
"""
# Score the proposals of every page in parallel and store the scores on the page.
workers = Pool(processes=numProcess, maxtasksperchild=100)
try:
    for key in annotatedPages:
        dataset = annotatedPages[key]
        # imap keeps the order of 'dataset', so the 1-based position given by
        # enumerate identifies the page each result belongs to.
        scoredPages = log_progress(enumerate(workers.imap(getProposalsIou, dataset), 1), 1, len(dataset))
        for pageNumber, proposalsIou in scoredPages:
            dataset[pageNumber - 1]['proposalsScores'] = proposalsIou

    workers.close()
finally:
    # Tear the pool down even when scoring fails, so that no worker process
    # is left running in the kernel.
    workers.terminate()
    workers.join()

In [None]:
"""
    Train the classifier model
"""
# The testing pages are taken from the end of each page list, the training
# pages from the beginning.
testingSet, numTestingExamplePerPage, numTestingExample = preparedata(size=testSize, frontData=False)
trainingSet, numTrainingExamplePerPage, numTrainingExample = preparedata(size=trainSize, frontData=True)

# Build the classifier (printing its summary) and train it with the default settings.
specializedNetwork = createModel(verbose=True)
trainModel(model=specializedNetwork, trainingSet=trainingSet, testingSet=testingSet)