In [3]:
!pip install tensorflow-gpu==1.15.0



In [4]:
!pip install wget
!pip install pysaliency

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=3179dacf782c8a4b2823659248777149452daa950ae609686d6b070990a4d6d3
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting pysaliency
  Downloading pysaliency-0.2.21.tar.gz (156 kB)
[K     |████████████████████████████████| 156 kB 2.4 MB/s 
[?25hCollecting boltons
  Downloading boltons-21.0.0-py2.py3-none-any.whl (193 kB)
[K     |████████████████████████████████| 193 kB 32.8 MB/s 
[?25hCollecting deprecation
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting piexif
  Downloading piexif-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting schema
  Downloading schema-0.7.5-py2.py3-none-any.whl

In [5]:
'''This file downloads COCO-Search18 dataset, target images,
 and VGG16 pretrained weights on ImageNet.'''

import os
import wget
import zipfile
import gdown
import argparse

def unzip(zip_path, extract_path):
    """Unpacks every member of a zip archive and then deletes the archive.
    args:
        zip_path (str): location of the zip archive; the file is removed
                        once all of its members have been extracted
        extract_path (str): directory that receives the extracted members
    """

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_path)

    # The archive is only needed for extraction, so it is removed afterwards.
    os.remove(zip_path)


def download_cocosearch(data_path):
    """Downloads the COCOSearch18 dataset. The dataset
       contains the image stimuli and label/annotation files.
       The target images, the target bounding box annotations and the
       VGG16 weights archive are downloaded as well.
    Args:
        data_path (str): Defines the path where the dataset will be
                         downloaded and extracted to. It may be given
                         with or without a trailing path separator.
    """

    print(">> Downloading COCOSearch18 dataset...", end="", flush=True)

    os.makedirs(data_path, exist_ok=True)

    # Archives served over plain HTTP(S); each one is extracted into
    # data_path and removed afterwards (see unzip).
    urls = ['http://vision.cs.stonybrook.edu/~cvlab_download/COCOSearch18-images-TP.zip',
            'https://saliency.tuebingen.ai/data/coco_search18_TP.zip']

    for url in urls:
        filename = wget.download(url, data_path)
        unzip(filename, data_path)

    # Downloading the target images. An explicit file path is passed so the
    # archive always lands at <data_path>/targets.zip, whether or not
    # data_path ends with a path separator.
    url = "https://drive.google.com/uc?export=download&id=1vEzgF54LPK2adlI7DdlXWGkYV76L-jjK"
    targets_zip = os.path.join(data_path, 'targets.zip')
    gdown.download(url, targets_zip, quiet=False)
    unzip(targets_zip, data_path)

    # Downloading target object bounding box annotation
    url = "https://drive.google.com/uc?id=1OkpX_Md-lFwCo5TB_cq0Qxoe4oEB8eKG"
    output_path = os.path.join(data_path, 'bbox_annos.npy')
    gdown.download(url, output_path, quiet=False)

    # Downloading the VGG16 weights archive into <data_path>/weights
    url = "https://drive.google.com/u/0/uc?export=download&confirm=ATmP&id=1ff0va472Xs1bvidCwRlW3Ctf7Hbyyn7p"
    weights_path = os.path.join(data_path, 'weights')
    os.makedirs(weights_path, exist_ok=True)
    weights_zip = os.path.join(weights_path, 'vgg16_hybrid.zip')
    gdown.download(url, weights_zip, quiet=False)
    unzip(weights_zip, weights_path)

    print("done!", flush=True)
    return

if __name__ == '__main__':

    # Download and extract everything into '/content/'; the preprocessing
    # cell further down reads the dataset from '/content'.
    download_cocosearch('/content/')

>> Downloading COCOSearch18 dataset...

Downloading...
From: https://drive.google.com/uc?export=download&id=1vEzgF54LPK2adlI7DdlXWGkYV76L-jjK
To: /content/targets.zip
100%|██████████| 14.7M/14.7M [00:00<00:00, 42.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1OkpX_Md-lFwCo5TB_cq0Qxoe4oEB8eKG
To: /content/bbox_annos.npy
100%|██████████| 127k/127k [00:00<00:00, 24.5MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?export=download&confirm=ATmP&id=1ff0va472Xs1bvidCwRlW3Ctf7Hbyyn7p
To: /content/weights/vgg16_hybrid.zip
100%|██████████| 519M/519M [00:08<00:00, 64.5MB/s]


done!


In [6]:
import numpy as np
from os.path import join
from itertools import groupby
import argparse
import os
import json
import random
import cv2


def GaussianMask(sizex, sizey, sigma=11, center=None, fix=1):
    """Renders a single fixation as a 2D Gaussian bump on a
        sizey x sizex grid. This function is adopted from
        https://github.com/takyamamoto/Fixation-Densitymap repository.
    args:
        sizex (int): mask width
        sizey (int): mask height
        sigma (int): gaussian std
        center (tuple): gaussian mean as (x, y); the grid center
                        (sizex // 2, sizey // 2) is used when None
        fix (int or float): gaussian max
    returns:
        gaussian mask of shape (sizey, sizex); an all-zero mask when
        either coordinate of center is NaN
    """

    if center is None:
        mean_x, mean_y = sizex // 2, sizey // 2
    elif np.isnan(center[0]) or np.isnan(center[1]):
        # An undefined fixation contributes nothing to the map.
        return np.zeros((sizey, sizex))
    else:
        mean_x, mean_y = center[0], center[1]

    grid_x, grid_y = np.meshgrid(np.arange(0, sizex, 1, float),
                                 np.arange(0, sizey, 1, float))
    squared_dist = (grid_x - mean_x) ** 2 + (grid_y - mean_y) ** 2

    return fix * np.exp(-0.5 * squared_dist / sigma ** 2)


def preprocess_fixations(phase,
                         task_img_pair,
                         trajs,
                         im_h,
                         im_w,
                         bbox,
                         sigma,
                         dldir,
                         datadir,
                         truncate_num=-1):
    """Processes fixation data and creates fixation maps.
        For every task-image pair the fixations of all matching trials are
        accumulated into a Gaussian-blurred fixation map, the search image
        is resized to (im_w, im_h) and the five sample target images of the
        task are resized to 64x64. Everything is written below datadir.
        When phase is 'train', 18 randomly chosen pairs per task are held
        out as the test set: for those the blurred map is also stored as
        .npy, an unblurred fixation map is saved (png and .npy) for saliency
        metric computation, and a copy of the stimulus with the target bbox
        drawn on it is saved for results visualization. The remaining pairs
        go to the train set together with horizontally flipped copies
        (augmentation). Any other phase writes to the valid set.
    Args:
        phase (str): train or valid set (test set is separated from train set)
        task_img_pair (list): a list of task-image pairs, each formatted
                              as "<task>_<image file name>"
        trajs (list): a list of all trials; each trial is a dict with the
                      keys 'task', 'name', 'X' and 'Y'. The first fixation
                      of every processed trial is overwritten in place.
        im_h (int): resize search images to this height
        im_w (int): resize search images to this width
        bbox  (dict): target object bbox for each task-image pair as
                      [top left x position, top left y position, width, height]
        sigma (int): sigma for Gaussian blurring function
        dldir (str): directory of downloaded data
        datadir (str): directory to save the preprocessed train/val/test sets
        truncate_num (int): maximum number of fixations to be processed from
                            each trial; values below 1 process all of them
    Raises:
        ValueError: raised by random.sample when phase is 'train' and a task
                    has fewer than 18 task-image pairs.
    """
    n_targets = 5                    # sample target images stored per task
    n_test_per_task = 18             # pairs per task held out as the test set
    target_size = (64, 64)           # size of the saved target images
    screen_w, screen_h = 1680, 1050  # fixations outside this range are dropped

    def _off_screen(x, y):
        # True for out of boundary fixations.
        return x < 0 or y < 0 or x > screen_w or y > screen_h

    def _scale_to_255(fix_map):
        # Divides by the maximum and stretches to 0-255. The division is
        # skipped for an all-zero map (no usable fixation), which would
        # otherwise turn into NaN.
        peak = np.amax(fix_map)
        if peak > 0:
            fix_map = fix_map / peak
        return fix_map * 255

    def _out(folder, name):
        return os.path.join(datadir, folder, name)

    # Hold out the test pairs. Pairs are grouped by task in a dict (first
    # appearance order), so the split does not depend on the pairs of one
    # task being contiguous in task_img_pair.
    test_pairs = set()
    if phase == 'train':
        pairs_by_task = {}
        for pair in task_img_pair:
            pairs_by_task.setdefault(pair.split('_')[0], []).append(pair)
        for members in pairs_by_task.values():
            test_pairs.update(random.sample(members, n_test_per_task))

    # Range of all on-screen fixations, used to rescale the coordinates.
    min_fix_x = min_fix_y = 100000
    max_fix_x = max_fix_y = -100000
    for traj in trajs:
        for x, y in zip(traj['X'], traj['Y']):
            if _off_screen(x, y):
                continue
            min_fix_x = min(min_fix_x, x)
            max_fix_x = max(max_fix_x, x)
            min_fix_y = min(min_fix_y, y)
            max_fix_y = max(max_fix_y, y)

    # Index the trials once by their task-image key instead of scanning the
    # whole trial list again for every pair.
    trajs_by_pair = {}
    for traj in trajs:
        trajs_by_pair.setdefault(traj['task'] + '_' + traj['name'], []).append(traj)

    for task_img in task_img_pair:

        heatmap = np.zeros((im_h, im_w), np.float32)
        heatmap_unblurred = np.zeros((im_h, im_w), np.float32)

        # bbox = [top left x position, top left y position, width, height].
        x1, y1, w_image, h_image = (bbox[task_img][k] for k in range(4))

        for traj in trajs_by_pair.get(task_img, ()):

            # first fixations are fixed at the screen center; the entry is
            # overwritten in place and index 0 is skipped in the loop below
            traj['X'][0], traj['Y'][0] = im_w / 2, im_h / 2

            traj_len = len(traj['X'])
            if truncate_num >= 1:
                traj_len = min(truncate_num, traj_len)

            for i in range(1, traj_len):
                x, y = traj['X'][i], traj['Y'][i]
                # remove out of boundary fixations
                if _off_screen(x, y):
                    continue

                # NOTE(review): the rescaling divides by the maximum rather
                # than by (max - min); kept unchanged so the generated maps
                # stay the same - confirm whether this is intended.
                fix_x = ((x - min_fix_x) / max_fix_x) * im_w
                fix_y = ((y - min_fix_y) / max_fix_y) * im_h

                # To mask the target, skip fixations with
                # x1 <= fix_x <= x1 + w_image and y1 <= fix_y <= y1 + h_image.
                heatmap += GaussianMask(im_w, im_h, sigma, (fix_x, fix_y))

                # A fixation scaled exactly onto the right/bottom border
                # would index one past the end of the map, so clamp it.
                row = min(int(fix_y), im_h - 1)
                col = min(int(fix_x), im_w - 1)
                heatmap_unblurred[row, col] = 1

        # Normalization
        heatmap_np = _scale_to_255(heatmap)
        heatmap = heatmap_np.astype("uint8")

        heatmap_unblurred_np = _scale_to_255(heatmap_unblurred)
        heatmap_unblurred = heatmap_unblurred_np.astype("uint8")

        name_parts = task_img.split('_')
        task_name = str(name_parts[0])
        source = os.path.join(dldir, 'images', task_name, str(name_parts[1]))
        img = cv2.imread(source)
        img_resized = cv2.resize(img, (im_w, im_h), interpolation=cv2.INTER_AREA)

        targets = []
        for k in range(n_targets):
            target = cv2.imread(os.path.join(dldir, 'targets', '%s_%d.png' % (task_name, k)))
            targets.append(cv2.resize(target, target_size, interpolation=cv2.INTER_AREA))

        if phase != 'train':
            split = 'valid'
        elif task_img in test_pairs:
            split = 'test'
        else:
            split = 'train'

        file_name = str(task_img)

        cv2.imwrite(_out('stimuli/' + split, file_name), img_resized)
        cv2.imwrite(_out('saliencymap/' + split, file_name), heatmap)
        for k, target in enumerate(targets):
            cv2.imwrite(_out('target_%d/%s' % (k, split), file_name), target)

        if split == 'test':
            npy_name = os.path.splitext(file_name)[0] + '.npy'

            with open(_out('saliencymap/test', npy_name), "wb") as file:
                np.save(file, heatmap_np)

            # stimulus with the target bbox drawn on it (for visualization)
            img_target_frame = cv2.rectangle(img_resized.copy(), (x1, y1),
                                             (x1 + w_image, y1 + h_image), (0, 255, 0), 2)
            cv2.imwrite(_out('stimuli/test_targ_bbox', file_name), img_target_frame)

            cv2.imwrite(_out('saliencymap/test_unblur', file_name), heatmap_unblurred)
            with open(_out('saliencymap/test_unblur', npy_name), "wb") as file:
                np.save(file, heatmap_unblurred_np)

        elif split == 'train':
            # augmentation: horizontally flipped copies of every training file
            flip_name = str(task_img.split('.')[0]) + '_flip.' + str(task_img.split('.')[1])

            cv2.imwrite(_out('stimuli/train', flip_name), cv2.flip(img_resized, 1))
            cv2.imwrite(_out('saliencymap/train', flip_name), cv2.flip(heatmap, 1))
            for k, target in enumerate(targets):
                cv2.imwrite(_out('target_%d/train' % k, flip_name), cv2.flip(target, 1))

    return

def process_data(trajs_train,
                 trajs_valid,
                 target_annos,
                 sigma,
                 dldir,
                 datadir):
    """creates task-image pairs for training and validation sets
        then calls preprocess_fixations func to
        create fixation maps and train-test-valid split.
        Search images are processed at a fixed size of 512x320
        (width x height) and all fixations of a trial are used.
    args:
        trajs_train (list): a list of all trials in the original dataset training split
        trajs_valid (list): a list of all trials in the original dataset validation split
        target_annos (dict):  contains target object bbox for each task-image pair
        sigma (int): sigma for Gaussian blurring function
        dldir (str): directory of downloaded data
        datadir (str): directory to save the preprocessed train/val/test sets
    """

    im_w = 512
    im_h = 320

    # The training trials are processed first (the test set is split off
    # from them inside preprocess_fixations), then the validation trials.
    for phase, trajs in (('train', trajs_train), ('valid', trajs_valid)):

        # Unique "<task>_<image name>" keys of this split. To process a
        # single category only (e.g. 'tv'), restrict the list comprehension
        # to the trials whose traj['task'] equals that category.
        task_img_pair = np.unique(
            [traj['task'] + '_' + traj['name'] for traj in trajs])

        preprocess_fixations(
            phase,
            task_img_pair,
            trajs,
            im_h,
            im_w,
            target_annos,
            sigma,
            dldir,
            datadir,
            truncate_num=-1)

    return


if __name__ == '__main__':

    # The directory to save images along with their fixation maps.
    datadir = os.path.join('.', 'cocosearch/')
    split_names = ('train', 'valid', 'test')

    sl_map = os.path.join(datadir, 'saliencymap')
    tr_sl_map, v_sl_map, te_sl_map = (os.path.join(sl_map, s) for s in split_names)
    te_unbur_sl_map = os.path.join(sl_map, 'test_unblur')

    stimuli = os.path.join(datadir, 'stimuli')
    tr_stimuli, v_stimuli, te_stimuli = (os.path.join(stimuli, s) for s in split_names)
    targ_t_stimuli = os.path.join(stimuli, 'test_targ_bbox')

    target_0 = os.path.join(datadir, 'target_0')
    tr_target_0, v_target_0, te_target_0 = (os.path.join(target_0, s) for s in split_names)

    target_1 = os.path.join(datadir, 'target_1')
    tr_target_1, v_target_1, te_target_1 = (os.path.join(target_1, s) for s in split_names)

    target_2 = os.path.join(datadir, 'target_2')
    tr_target_2, v_target_2, te_target_2 = (os.path.join(target_2, s) for s in split_names)

    target_3 = os.path.join(datadir, 'target_3')
    tr_target_3, v_target_3, te_target_3 = (os.path.join(target_3, s) for s in split_names)

    target_4 = os.path.join(datadir, 'target_4')
    tr_target_4, v_target_4, te_target_4 = (os.path.join(target_4, s) for s in split_names)

    # os.makedirs creates missing parent directories as well, so only the
    # leaf directories have to be listed.
    output_dirs = [
        tr_sl_map, v_sl_map, te_sl_map, te_unbur_sl_map,
        tr_stimuli, v_stimuli, te_stimuli, targ_t_stimuli,
        tr_target_0, v_target_0, te_target_0,
        tr_target_1, v_target_1, te_target_1,
        tr_target_2, v_target_2, te_target_2,
        tr_target_3, v_target_3, te_target_3,
        tr_target_4, v_target_4, te_target_4,
    ]
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)

    dataset_root = '/content'

    # bounding box of the target object (for search efficiency evaluation)
    # NOTE(review): allow_pickle=True unpickles the downloaded file, which can
    # execute arbitrary code - only load bbox_annos.npy from a trusted source.
    bbox_annos = np.load(join(dataset_root, 'bbox_annos.npy'),
                         allow_pickle=True).item()

    # load ground-truth human scanpaths

    with open(join(dataset_root,
                   'coco_search18_fixations_TP_train_split1.json')) as json_file:
        human_scanpaths_train = json.load(json_file)

    with open(join(dataset_root,
                   'coco_search18_fixations_TP_validation_split1.json')) as json_file:
        human_scanpaths_valid = json.load(json_file)

    # exclude incorrect scanpaths
    human_scanpaths_train = [
        scanpath for scanpath in human_scanpaths_train if scanpath['correct'] == 1]
    human_scanpaths_valid = [
        scanpath for scanpath in human_scanpaths_valid if scanpath['correct'] == 1]

    # The test set is drawn with random.sample inside preprocess_fixations.
    # A fixed seed makes the train/test split identical on every run, so
    # re-running this cell cannot leave files of a previous, different split
    # in the train and test directories.
    random.seed(0)

    sigma = 11
    # process fixation data
    process_data(human_scanpaths_train, human_scanpaths_valid, bbox_annos,
                 sigma, dataset_root, datadir)

    # number of files written per split
    train = next(os.walk(tr_stimuli))[2]
    print(len(train))

    valid = next(os.walk(v_stimuli))[2]
    print(len(valid))

    test = next(os.walk(te_stimuli))[2]
    print(len(test))

3652
324
324


In [7]:
!rm -r './images/'

In [8]:
class config:
  """Static settings shared by the training and data-loading code:
  PARAMS holds the general training parameters and DIMS the input
  image sizes. To reproduce the results from the paper, these values
  should not be changed.
  """

  # General training parameters: the maximum number of training epochs,
  # the batch size, and the learning rate for the ADAM optimization
  # method. The device can be either "cpu" or "gpu", which then optimizes
  # the model accordingly after training or uses the correct version for
  # inference when testing.
  PARAMS = dict(
      n_epochs=7,
      batch_size=1,
      n_training_steps=10000,
      learning_rate=1e-5,
      learning_power=0.5,
      momentum=0.9,
      device="gpu",
  )

  # The predefined input image sizes for the search and target images.
  # They must be divisible by 8 due to the model's downsampling operations.
  DIMS = dict(
      image_size_cocosearch=(320, 512),
      image_target_size_cocosearch=(64, 64),
  )

In [9]:
import os
import sys
import numpy as np
import tensorflow as tf
import random 

class COCOSEARCH:

    """This class represents the preprocessed COCO-Search18 training and
       validation data stored below <data_path>/cocosearch. Stimuli and
       target images are loaded at the sizes defined in config.DIMS
       (320x512, height by width, for the search images). One of the five
       available sample target sets (target_0 ... target_4) is picked at
       random for the training data and one for the validation data. The
       pick is made once per instance in __init__, so a different set is
       only chosen when a new instance is created.

    Attributes:
        n_train: Number of training instances found on disk.
        n_valid: Number of validation instances found on disk.
    """
    n_train = 0
    n_valid = 0

    def __init__(self, data_path):
        """Counts the available instances and fixes the data directories.

        Args:
            data_path (str): Directory that contains the "cocosearch"
                             folder, with or without a trailing separator.
        """

        def _subdir(relative):
            # os.path.join keeps working when data_path has no trailing slash.
            return os.path.join(data_path, "cocosearch/" + relative)

        self._dir_stimuli_train = _subdir("stimuli/train")
        self._dir_stimuli_valid = _subdir("stimuli/valid")

        self._dir_saliency_train = _subdir("saliencymap/train")
        self._dir_saliency_valid = _subdir("saliencymap/valid")

        # The counts are stored on the class, not on the instance.
        type(self).n_train = len(next(os.walk(self._dir_stimuli_train))[2])
        type(self).n_valid = len(next(os.walk(self._dir_stimuli_valid))[2])

        self._stimuli_size = config.DIMS["image_size_cocosearch"]
        self._target_size = config.DIMS["image_target_size_cocosearch"]

        # Random sample target set for each split (train is drawn first).
        targ_ind_train = str(random.randint(0, 4))
        targ_ind_valid = str(random.randint(0, 4))

        self._dir_target_train = _subdir("target_" + targ_ind_train + "/train")
        self._dir_target_valid = _subdir("target_" + targ_ind_valid + "/valid")

    def _load_split(self, dir_stimuli, dir_saliency, dir_target, n_expected, shuffle):
        """Lists the files of one split, checks them and builds its dataset."""

        list_x = _get_file_list(dir_stimuli)
        list_y = _get_file_list(dir_saliency)
        list_z = _get_file_list(dir_target)

        _check_consistency(zip(list_x, list_y, list_z), n_expected)

        return _fetch_dataset((list_x, list_y, list_z),
                              self._stimuli_size, self._target_size, shuffle)

    def load_data(self):
        """Builds the training and validation datasets.

        Returns:
            tuple: A tuple that consists of dataset objects holding the
                   training and validation set instances respectively.
                   Only the training set is shuffled.
        """

        train_set = self._load_split(self._dir_stimuli_train,
                                     self._dir_saliency_train,
                                     self._dir_target_train,
                                     self.n_train, True)

        valid_set = self._load_split(self._dir_stimuli_valid,
                                     self._dir_saliency_valid,
                                     self._dir_target_valid,
                                     self.n_valid, False)

        return (train_set, valid_set)


class TEST:

    """This class represents test set instances used for inference through
       a trained network. The instances are read from
       <data_path>/cocosearch and loaded at the sizes defined in
       config.DIMS. One of the five sample target sets (target_0 ...
       target_4) is picked at random once per instance.

    Attributes:
        n_test: Number of test instances found on disk.
    """
    n_test = 0

    def __init__(self, dataset, data_path):
        """Counts the test instances and fixes the data directories.

        Args:
            dataset (str): Name of the dataset; accepted for interface
                           compatibility but not used here.
            data_path (str): Directory that contains the "cocosearch"
                             folder, with or without a trailing separator.
        """

        def _subdir(relative):
            # os.path.join keeps working when data_path has no trailing slash.
            return os.path.join(data_path, "cocosearch/" + relative)

        self._dir_stimuli_test = _subdir("stimuli/test")
        self._dir_saliency_test = _subdir("saliencymap/test")

        # The count is stored on the class, not on the instance.
        type(self).n_test = len(next(os.walk(self._dir_stimuli_test))[2])

        self._stimuli_size = config.DIMS["image_size_cocosearch"]
        self._target_size = config.DIMS["image_target_size_cocosearch"]

        targ_ind_test = str(random.randint(0, 4))
        self._dir_target_test = _subdir("target_" + targ_ind_test + "/test")

    def load_data(self):
        """Builds the test dataset.

        Returns:
            object: A dataset object that holds all test set instances.
                    It is not shuffled and uses a batch size of 1
                    (online=True).
        """

        test_list_x = _get_file_list(self._dir_stimuli_test)
        test_list_y = _get_file_list(self._dir_saliency_test)
        test_list_z = _get_file_list(self._dir_target_test)

        _check_consistency(zip(test_list_x, test_list_y, test_list_z), self.n_test)

        test_set = _fetch_dataset((test_list_x, test_list_y, test_list_z),
                                  self._stimuli_size, self._target_size,
                                  False, online=True)

        return test_set


def get_dataset_iterator(phase, dataset, data_path):

    """
    Entry point to make an initializable dataset iterator for either
       training or testing a model by calling the respective dataset class.
    Args:
        phase (str): Holds the current phase, which can be "train" or "test".
        dataset (str): Denotes the dataset to be used during training; its
                       upper-cased name must be a dataset class defined in
                       this module (e.g. "cocosearch" -> COCOSEARCH).
        data_path (str): Points to the directory where training or testing
                         data instances are stored.
    Returns:
        For "train": a tuple (next_element, train_init_op, valid_init_op).
        For "test": a tuple (next_element, init_op).
        next_element yields the next batch of the iterator; the init ops
        point the iterator at the respective dataset.
    Raises:
        ValueError: If phase is neither "train" nor "test".
    """

    # Fail loudly instead of silently returning None for an unknown phase.
    if phase not in ("train", "test"):
        raise ValueError(
            'phase must be "train" or "test", got %r' % (phase,))

    if phase == "train":

        # The dataset class is looked up by name in the current module.
        current_module = sys.modules[__name__]
        dataset_class = getattr(current_module, dataset.upper())(data_path)
        train_set, valid_set = dataset_class.load_data()

        # One iterator serves both sets; which one it reads from is decided
        # by running the matching initializer.
        iterator = tf.data.Iterator.from_structure(train_set.output_types,
                                                   train_set.output_shapes)
        next_element = iterator.get_next()

        train_init_op = iterator.make_initializer(train_set)
        valid_init_op = iterator.make_initializer(valid_set)

        return next_element, train_init_op, valid_init_op

    test_class = TEST(dataset, data_path)
    test_set = test_class.load_data()

    iterator = tf.data.Iterator.from_structure(test_set.output_types,
                                               test_set.output_shapes)
    next_element = iterator.get_next()

    init_op = iterator.make_initializer(test_set)

    return next_element, init_op


def postprocess_saliency_map(saliency_map, target_size):
    """Scales a single saliency map to the 0-255 range, resizes it to
       target_size via _resize_image, and encodes the rounded result as a
       grayscale jpeg suitable for saving to disk.
    Args:
        saliency_map (tensor, float32): 3D tensor that holds the values of a
                                        saliency map in the range from 0 to 1.
        target_size (tensor, int32): 1D tensor that specifies the size that
                                     is handed to _resize_image.
    Returns:
        tuple: The jpeg-encoded saliency map (quality 100) and the saliency
               map scaled by 255 as it was before resizing and rounding.
    """

    scaled_map = saliency_map * 255.0

    # NOTE(review): the meaning of the third argument (True) is defined by
    # _resize_image, which is not part of this block - confirm there.
    resized_map = _resize_image(scaled_map, target_size, True)

    quantized_map = tf.cast(tf.round(resized_map), tf.uint8)
    encoded_map = tf.image.encode_jpeg(quantized_map, "grayscale", 100)

    return encoded_map, scaled_map


def _fetch_dataset(files, stimuli_size, target_size, shuffle, online=False):
    """Builds the input pipeline: the file paths are sliced into instances,
       optionally shuffled, decoded and resized in parallel, batched, and
       finally prefetched.
    Args:
        files (sequence): The collection of path lists that is sliced into
                          dataset elements. The length of its first entry
                          is used as the shuffle buffer size.
        stimuli_size (tuple, int): A tuple that specifies the size to which
                                   the search images and saliency maps will
                                   be reshaped.
        target_size (tuple, int): A tuple that specifies the size to which
                                  the target image will be reshaped.
        shuffle (bool): Determines whether the dataset will be shuffled or not.
        online (bool, optional): If True the batch size is forced to 1,
                                 otherwise config.PARAMS["batch_size"] is
                                 used. Defaults to False.
    Returns:
        object: A dataset object that yields the batched and prefetched
                output of the parsing function.
    """

    dataset = tf.data.Dataset.from_tensor_slices(files)

    if shuffle:
        # a buffer as large as the dataset mixes all instances
        n_instances = len(files[0])
        dataset = dataset.shuffle(n_instances)

    dataset = dataset.map(
        lambda *paths: _parse_function(paths, stimuli_size, target_size),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if online:
        batch_size = 1
    else:
        batch_size = config.PARAMS["batch_size"]

    return dataset.batch(batch_size).prefetch(5)


def _parse_function(files, stimuli_size, target_size):
    """Reads and decodes every file of one data instance (jpeg or png) and
       resizes it according to its position in the tuple.
    Args:
        files (tuple, str): A tuple with the paths that belong to one data
                            instance. Position 0 and 2 are decoded with 3
                            channels, every other position with 1 channel.
                            Position 0 and 1 are resized to the stimuli size,
                            position 2 to the target size.
        stimuli_size (tuple, int): A tuple that specifies the size to which
                                   the stimuli/saliency map will be reshaped.
        target_size (tuple, int): A tuple that specifies the size to which
                                  the target will be reshaped.
    Returns:
        list: The processed images in input order, followed by the height
              and width of the file that was decoded last, followed by the
              tuple of file paths.
    """

    image_list = []

    for position, path in enumerate(files):

        raw_bytes = tf.read_file(path)

        # stimuli (0) and targets (2) are color images, the rest grayscale
        n_channels = 3 if position in (0, 2) else 1

        image = tf.cond(tf.image.is_jpeg(raw_bytes),
                        lambda: tf.image.decode_jpeg(raw_bytes,
                                                     channels=n_channels),
                        lambda: tf.image.decode_png(raw_bytes,
                                                    channels=n_channels))

        # NOTE(review): this is overwritten in every iteration, so the
        # returned size belongs to the last file of the tuple - confirm
        # that this is the size the callers expect.
        original_size = tf.shape(image)[:2]

        if position in (0, 1):  # stimuli and saliency maps
            image = _resize_image(image, stimuli_size)
        elif position == 2:  # target images
            image = _resize_image(image, target_size)

        image_list.append(image)

    image_list.extend([original_size, files])

    return image_list


def _resize_image(image, target_size, overfull=False):
    """Rescales an image while preserving its aspect ratio. Area
       interpolation is used when at least one side shrinks, bicubic
       interpolation otherwise. No padding or cropping happens here.
    Args:
        image (tensor): A tensor with the values of a single image whose
                        first two dimensions are height and width.
        target_size (tuple, int): A tuple that specifies the size to which
                                  the data will be resized.
        overfull (bool, optional): If True the larger of both scaling ratios
                                   is applied, so the result is larger or
                                   equal to the target size. If False the
                                   smaller ratio is applied. Defaults to
                                   False.
    Returns:
        tensor, float32: The resized image without a batch dimension, with
                         its values clipped to the range from 0 to 255.
    .. seealso:: The reasoning for using either area or bicubic interpolation
                 methods is based on the OpenCV documentation recommendations.
                 [https://bit.ly/2XAavw0]
    """

    source_size = tf.shape(image)[:2]

    ratio_height = target_size[0] / source_size[0]
    ratio_width = target_size[1] / source_size[1]

    if overfull:
        scale = tf.maximum(ratio_height, ratio_width)
    else:
        scale = tf.minimum(ratio_height, ratio_width)

    scaled_size = tf.cast(source_size, tf.float64) * scale
    scaled_size = tf.cast(tf.round(scaled_size), tf.int32)

    # the comparison itself is the boolean predicate needed below
    shrinking = tf.logical_or(source_size[0] > scaled_size[0],
                              source_size[1] > scaled_size[1])

    # the resize operations work on batches, hence the temporary dimension
    batched = tf.expand_dims(image, 0)

    resized = tf.cond(shrinking,
                      lambda: tf.image.resize_area(batched, scaled_size,
                                                   align_corners=True),
                      lambda: tf.image.resize_bicubic(batched, scaled_size,
                                                      align_corners=True))

    return tf.clip_by_value(resized[0], 0.0, 255.0)


def _get_file_list(data_path):
    """This function detects all image files within the specified parent
       directory for either training or testing. The path content cannot
       be empty, otherwise an error occurs.
    Args:
        data_path (str): Points to the directory where training or testing
                         data instances are stored.
    Returns:
        list, str: A sorted list that holds the paths to all file instances.
    """

    data_list = []

    if os.path.isfile(data_path):
        data_list.append(data_path)
    else:
        for subdir, dirs, files in os.walk(data_path):
            for file in files:
                if file.lower().endswith((".png", ".jpg", ".jpeg")):
                    data_list.append(os.path.join(subdir, file))

    data_list.sort()

    if not data_list:
        raise FileNotFoundError("No data was found")

    return data_list


def _check_consistency(zipped_file_lists, n_total_files):
    """A consistency check that makes sure all files could successfully be
       found and stimuli names correspond to the ones of ground truth maps.
    Args:
        zipped_file_lists (tuple, str): A tuple of train and valid path names.
        n_total_files (int): The total number of files expected in the list.
    """

    assert len(list(zipped_file_lists)) == n_total_files, "Files are missing"

    for file_tuple in zipped_file_lists:
        file_names = [os.path.basename(entry) for entry in list(file_tuple)]
        file_names = [os.path.splitext(entry)[0] for entry in file_names]
        file_names = [entry.replace("_fixMap", "") for entry in file_names]
        file_names = [entry.replace("_fixPts", "") for entry in file_names]

        assert len(set(file_names)) == 1, "File name mismatch"

In [10]:
import tensorflow as tf

class loss:
  """Collection of the loss functions used for training the model."""

  @staticmethod
  def kld(y_true, y_pred, eps=1e-7):
      """This function computes the Kullback-Leibler divergence between ground
        truth saliency maps and their predictions. Values are first divided by
        their sum for each image to yield a distribution that adds to 1.
      Args:
          y_true (tensor, float32): A 4d tensor that holds the ground truth
                                    saliency maps with values between 0 and 255.
          y_pred (tensor, float32): A 4d tensor that holds the predicted saliency
                                    maps with values between 0 and 1.
          eps (scalar, float, optional): A small factor to avoid numerical
                                        instabilities. Defaults to 1e-7.
      Returns:
          tensor, float32: A 1D tensor that holds one divergence value per
                           image. The values are not averaged over the batch.
      """

      # keepdims retains the reduced axes so the sums broadcast per image
      sum_per_image = tf.reduce_sum(y_true, axis=(1, 2, 3), keepdims=True)
      y_true /= eps + sum_per_image

      sum_per_image = tf.reduce_sum(y_pred, axis=(1, 2, 3), keepdims=True)
      y_pred /= eps + sum_per_image

      divergence = y_true * tf.log(eps + y_true / (eps + y_pred))
      divergence = tf.reduce_sum(divergence, axis=(1, 2, 3))

      return divergence



In [11]:
import os
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.tools.graph_transforms import TransformGraph


class MSINET:
    """The class representing the MSI-Net based on the VGG16 model. It
       implements a definition of the computational graph, as well as
       functions related to network training.
    """

    def __init__(self):
        self._output = None
        self._mapping = {}

        if config.PARAMS["device"] == "gpu":
            self._data_format = "channels_first"
            self._channel_axis = 1
            self._dims_axis = (2, 3)
        elif config.PARAMS["device"] == "cpu":
            self._data_format = "channels_last"
            self._channel_axis = 3
            self._dims_axis = (1, 2)

    def _encoder(self, images):
        """Feature extractor with the 13 convolutional layers of VGG16 that
           are arranged in five blocks. The last two pooling stages use a
           stride of 1 and the convolutions of the fifth block are dilated
           at a rate of 2, so the resolution is not reduced any further
           after the third pooling stage. The outputs of the last three
           pooling stages are concatenated along the channel axis and
           stored in self._output; nothing is returned.
        Args:
            images (tensor, float32): A 4D tensor that holds the RGB image
                                      batches used as input to the network,
                                      with the channels on the last axis.
        """

        def conv(inputs, filters, name, dilation=1):
            # 3x3 convolution with ReLU; variables are reused across calls
            return tf.layers.conv2d(inputs, filters, 3,
                                    padding="same",
                                    activation=tf.nn.relu,
                                    dilation_rate=dilation,
                                    data_format=self._data_format,
                                    name=name, reuse=tf.AUTO_REUSE)

        def pool(inputs, strides, padding="valid"):
            # 2x2 max pooling
            return tf.layers.max_pooling2d(inputs, 2, strides,
                                           padding=padding,
                                           data_format=self._data_format)

        # NOTE(review): these per-channel means are commonly quoted in
        # B, G, R order while the input is described as RGB - confirm that
        # the pretrained weights expect exactly this subtraction.
        channel_means = tf.constant([103.939, 116.779, 123.68])
        channel_means = tf.reshape(channel_means, [1, 1, 1, 3])

        features = images - channel_means

        if self._data_format == "channels_first":
            features = tf.transpose(features, (0, 3, 1, 2))

        # block 1
        features = conv(features, 64, "conv1/conv1_1")
        features = conv(features, 64, "conv1/conv1_2")
        features = pool(features, 2)

        # block 2
        features = conv(features, 128, "conv2/conv2_1")
        features = conv(features, 128, "conv2/conv2_2")
        features = pool(features, 2)

        # block 3 (dropout after this block is disabled)
        features = conv(features, 256, "conv3/conv3_1")
        features = conv(features, 256, "conv3/conv3_2")
        features = conv(features, 256, "conv3/conv3_3")
        pooled_block3 = pool(features, 2)

        # block 4 (dropout after this block is disabled)
        features = conv(pooled_block3, 512, "conv4/conv4_1")
        features = conv(features, 512, "conv4/conv4_2")
        features = conv(features, 512, "conv4/conv4_3")
        pooled_block4 = pool(features, 1, padding="same")

        # block 5, dilated (dropout after this block is disabled)
        features = conv(pooled_block4, 512, "conv5/conv5_1", dilation=2)
        features = conv(features, 512, "conv5/conv5_2", dilation=2)
        features = conv(features, 512, "conv5/conv5_3", dilation=2)
        pooled_block5 = pool(features, 1, padding="same")

        self._output = tf.concat([pooled_block3, pooled_block4, pooled_block5],
                                 axis=self._channel_axis, name='concat')

    def _aspp(self, features):
        """The ASPP module processes the features in parallel with a 1x1
           convolution and three 3x3 convolutions dilated at rates of 4, 8,
           and 12. A fifth branch averages the features over the spatial
           axes, applies a 1x1 convolution, and resizes the result back to
           the spatial size of the input. All branches are concatenated
           along the channel axis and fused by a final 1x1 convolution. The
           result is stored in self._output; nothing is returned.
        Args:
            features (tensor, float32): A 4D tensor that holds the features
                                        produced by the encoder module.
        """

        def conv(inputs, kernel_size, name, dilation=1, padding="same"):
            # convolution with 256 filters and ReLU; variables are reused
            return tf.layers.conv2d(inputs, 256, kernel_size,
                                    padding=padding,
                                    activation=tf.nn.relu,
                                    dilation_rate=dilation,
                                    data_format=self._data_format,
                                    name=name, reuse=tf.AUTO_REUSE)

        branches = [conv(features, 1, "aspp/conv1_1")]

        dilated_branches = (("aspp/conv1_2", 4),
                            ("aspp/conv1_3", 8),
                            ("aspp/conv1_4", 12))

        for layer_name, rate in dilated_branches:
            branches.append(conv(features, 3, layer_name, dilation=rate))

        # global context: one averaged value per channel and image
        global_context = tf.reduce_mean(features,
                                        axis=self._dims_axis,
                                        keepdims=True)

        global_context = conv(global_context, 1, "aspp/conv1_5",
                              padding="valid")

        # a factor of 1 resizes the context to the size of the features
        global_context = self._upsample(global_context,
                                        tf.shape(features), 1)

        branches.append(global_context)

        context = tf.concat(branches, axis=self._channel_axis)

        self._output = conv(context, 1, "aspp/conv2")

    def _decoder(self, features):
        """The decoder runs 3 upsampling blocks, each of which doubles the
           spatial size via bilinear upsampling and then applies a 3x3
           convolution with ReLU (128, 64, and 32 filters). A final 3x3
           convolution without activation produces a single channel. The
           result is stored channels-last in self._output; nothing is
           returned.
        Args:
            features (tensor, float32): A 4D tensor that holds the features
                                        that are decoded.
        """

        block_specs = ((128, "decoder/conv1"),
                       (64, "decoder/conv2"),
                       (32, "decoder/conv3"))

        activations = features

        for n_filters, layer_name in block_specs:
            upsampled = self._upsample(activations,
                                       tf.shape(activations), 2)

            activations = tf.layers.conv2d(upsampled, n_filters, 3,
                                           padding="same",
                                           activation=tf.nn.relu,
                                           data_format=self._data_format,
                                           name=layer_name,
                                           reuse=tf.AUTO_REUSE)

        # NOTE(review): unlike the layers above, this one is created without
        # reuse=tf.AUTO_REUSE - confirm this is intended in case the decoder
        # is ever built more than once in the same graph.
        saliency = tf.layers.conv2d(activations, 1, 3,
                                    padding="same",
                                    data_format=self._data_format,
                                    name="decoder/conv4")

        if self._data_format == "channels_first":
            # hand the maps over with the channel on the last axis
            saliency = tf.transpose(saliency, (0, 2, 3, 1))

        self._output = saliency

    def _overlay(self, feature1, feature2):
        """This function convolves the features extracted from the stimuli
            and target streams. The features of the target image serve as
            the convolution filter that is slid over the stimulus features.
            The filter is replicated once per channel, so the number of
            output channels equals the number of input channels. The layout
            of both inputs follows the data format of the model. The result
            is stored in self._output; nothing is returned.
        Args:
            feature1 (tensor, float32): A 4D tensor that contains the features of stimuli.
            feature2 (tensor): A 4D tensor that contains the features of a
                               single target (batch dimension of size 1). Its
                               channel dimension must be statically known.
        """

        channels_first = self._data_format == "channels_first"

        n_channels = (feature2.get_shape())[self._channel_axis]

        # drop the batch dimension of the single target
        kernel = tf.squeeze(feature2, axis=0)

        if channels_first:
            # (channels, height, width) -> (height, width, channels)
            kernel = tf.transpose(kernel, [1, 2, 0])

        # filters are laid out as (height, width, in_channels, out_channels)
        kernel = tf.expand_dims(kernel, axis=3)
        kernel = tf.concat(n_channels * [kernel], axis=3)
        kernel = tf.cast(kernel, dtype=tf.float32)

        overlay_output = tf.nn.conv2d(feature1, kernel,
                                      strides=[1, 1, 1, 1],
                                      padding='SAME',
                                      data_format='NCHW' if channels_first else 'NHWC',
                                      name="overlay")

        self._output = overlay_output

    def _upsample(self, stack, shape, factor):
        """This function resizes the input to a desired shape via the
           bilinear upsampling method.
        Args:
            stack (tensor, float32): A 4D tensor with the function input.
            shape (tensor, int32): A 1D tensor with the reference shape.
            factor (scalar, int): An integer denoting the upsampling factor.
        Returns:
            tensor, float32: A 4D tensor that holds the activations after
                             bilinear upsampling of the input.
        """

        if self._data_format == "channels_first":
            stack = tf.transpose(stack, (0, 2, 3, 1))

        stack = tf.image.resize_bilinear(stack, (shape[self._dims_axis[0]] * factor,
                                                 shape[self._dims_axis[1]] * factor))

        if self._data_format == "channels_first":
            stack = tf.transpose(stack, (0, 3, 1, 2))

        return stack

    def _normalize(self, maps, eps=1e-7):
        """This function normalizes the output values to a range
           between 0 and 1 per saliency map. The result is named "output"
           in the graph and stored in self._output; nothing is returned.
        Args:
            maps (tensor, float32): A 4D tensor that holds the model output.
            eps (scalar, float, optional): A small factor to avoid numerical
                                           instabilities. Defaults to 1e-7.
        """

        # keepdims retains the reduced axes so the values broadcast per map
        min_per_image = tf.reduce_min(maps, axis=(1, 2, 3), keepdims=True)
        maps -= min_per_image

        max_per_image = tf.reduce_max(maps, axis=(1, 2, 3), keepdims=True)
        maps = tf.divide(maps, eps + max_per_image, name="output")

        self._output = maps

    def _pretraining(self):
        """The first 26 variables of the model here are based on the VGG16
           network. Therefore, their names are matched to the ones of the
           pretrained VGG16 checkpoint for correct initialization.
        """

        for var in tf.global_variables()[:26]:
            key = var.name.split("/", 1)[1]
            key = key.replace("kernel:0", "weights")
            key = key.replace("bias:0", "biases")
            self._mapping[key] = var

    def forward(self, stimuli):
        """Public method to forward RGB images through the feature 
            extraction parts of the network.
        Args:
            images (tensor, float32): A 4D tensor that holds the values of the
                                      raw input images.
        Returns:
            tensor, float32: A 4D tensor that holds the values of the
                             extracted features from ASPP module.
        """

        self._encoder(stimuli)
        self._aspp(self._output)

        return self._output

    def one_stream(self, stimuli_features):
        """creates the output of the one stream network, which is the
            predicted fixation density map.
        Args:
            stimuli_features (tensor, float32): A 4D tensor containing the features of stimuli.
        Returns:
            tensor, float32: A 4D tensor that holds the values of the
                             predicted saliency maps.
        """
        self._decoder(self._output)
        self._normalize(self._output)

        return self._output

    def output_stream(self, stimuli_features, target_features):
        """creates the output of the two stream network, which is the
            predicted fixation density map.
        Args:
            stimuli_features (tensor, float32): A 4D tensor containing the features of stimuli.
            target_features (tensor, float32): A 4D tensor containing the features of target.
        Returns:
            tensor, float32: A 4D tensor that holds the values of the
                             predicted saliency maps.
        """
        self._overlay(stimuli_features, target_features)
        self._decoder(self._output)
        self._normalize(self._output)

        return self._output

    def train(self, ground_truth, predicted_maps, learning_rate):
        """Public method to define the loss function and optimization
           algorithm for training the model.
        Args:
            ground_truth (tensor, float32): A 4D tensor with the ground truth.
            predicted_maps (tensor, float32): A 4D tensor with the predictions.
            learning_rate (scalar, float): Defines the learning rate.
        Returns:
            object: The optimizer element used to train the model.
            tensor, float32: A 0D tensor that holds the averaged error.
        """

        error = loss.kld(ground_truth, predicted_maps)

        global_step = tf.train.get_or_create_global_step()

        adjusted_global_step = global_step

        base_learning_rate = learning_rate

        '''learning_rate = tf.train.polynomial_decay(
            base_learning_rate,
            adjusted_global_step,
            config.PARAMS["n_training_steps"],
            end_learning_rate=0,
            power=config.PARAMS["learning_power"])'''

        optimizer = tf.train.AdamOptimizer(learning_rate)              
        #optimizer = tf.train.MomentumOptimizer(learning_rate, config.PARAMS["momentum"])
        optimizer = optimizer.minimize(error)  
        return optimizer, error

    def save(self, saver, sess, dataset, path, device):
        """This saves a model checkpoint to disk and creates
           the folder if it doesn't exist yet.
        Args:
            saver (object): An object for saving the model.
            sess (object): The current TF training session.
            path (str): The path used for saving the model.
            device (str): Represents either "cpu" or "gpu".
        """

        os.makedirs(path, exist_ok=True)

        saver.save(sess, path + "model_%s_%s.ckpt" % (dataset, device),
                   write_meta_graph=False, write_state=False)

    def restore(self, sess, dataset, paths, device):
        """This function allows continued training from a prior checkpoint and
           training from scratch with the pretrained VGG16 weights. In case the
           dataset is either CAT2000 or MIT1003, a prior checkpoint based on
           the SALICON dataset is required.
        Args:
            sess (object): The current TF training session.
            dataset ([type]): The dataset used for training.
            paths (dict, str): A dictionary with all path elements.
            device (str): Represents either "cpu" or "gpu".
        Returns:
            object: A saver object for saving the model.
        """

        model_name = "model_%s_%s" % (dataset, device)
        salicon_name = "model_salicon_%s" % device
        vgg16_name = "vgg16_hybrid"

        ext1 = ".ckpt.data-00000-of-00001"
        ext2 = ".ckpt.index"

        saver = tf.train.Saver()

        if os.path.isfile(paths["latest"] + model_name + ext1) and \
                os.path.isfile(paths["latest"] + model_name + ext2):
            saver.restore(sess, paths["latest"] + model_name + ".ckpt")
        else:
            self._pretraining()
            loader = tf.train.Saver(self._mapping)
            loader.restore(sess, paths["weights"] + vgg16_name + ".ckpt")

        return saver

    def optimize(self, sess, dataset, path, device):
        """The checkpoint of the given dataset and device is frozen into a
           single graph file, which is then optimized for inference with a
           set of graph transforms and written back to disk under the same
           name. The intermediate text graph is deleted again.
        Args:
            sess (object): The current TF training session.
            dataset (str): The dataset name that is part of the file names.
            path (str): The path used for saving the model. The file names
                        are appended directly, so it must end with a
                        separator.
            device (str): Represents either "cpu" or "gpu".
        .. seealso:: https://bit.ly/2VBBdqQ and https://bit.ly/2W7YqBa
        """

        model_name = "model_%s_%s" % (dataset, device)
        model_path = path + model_name

        text_graph_file = model_path + ".pbtxt"
        frozen_graph_file = model_path + ".pb"

        tf.train.write_graph(sess.graph.as_graph_def(),
                             path, model_name + ".pbtxt")

        freeze_graph.freeze_graph(text_graph_file,       # input graph
                                  "",                    # no saver file
                                  False,                 # graph is text
                                  model_path + ".ckpt",  # checkpoint
                                  "output",              # output node
                                  "save/restore_all",
                                  "save/Const:0",
                                  frozen_graph_file,     # frozen result
                                  True,                  # clear devices
                                  "")                    # no initializers

        os.remove(text_graph_file)

        frozen_graph_def = tf.GraphDef()

        with tf.gfile.Open(frozen_graph_file, "rb") as file:
            frozen_graph_def.ParseFromString(file.read())

        input_names = ["input"]
        output_names = ["output"]

        transforms = ["remove_nodes(op=Identity)",
                      "merge_duplicate_nodes",
                      "strip_unused_nodes",
                      "fold_constants(ignore_errors=true)"]

        optimized_graph_def = TransformGraph(frozen_graph_def,
                                             input_names,
                                             output_names,
                                             transforms)

        # overwrites the frozen graph with its optimized binary version
        tf.train.write_graph(optimized_graph_def,
                             logdir=path,
                             as_text=False,
                             name=model_name + ".pb")


In [12]:
import os
import time
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import tensorflow as tf
import numpy as np

class History:
    """This class represents the training history of a model. It can load the
       prior history when training continues, keeps track of the training and
       validation error, and finally plots them as a curve after each epoch.

    Attributes:
        train_history (list): Mean training error of every recorded epoch.
        valid_history (list): Mean validation error of every recorded epoch.
        prior_epochs (int): Number of epochs found on disk at construction.
    """

    def __init__(self, n_train_batches, n_valid_batches,
                 dataset, path, device):
        """
        Args:
            n_train_batches (int): Batches per training epoch (divisor of the
                                   accumulated training error).
            n_valid_batches (int): Batches per validation pass.
            dataset (str): Dataset name; part of the history file names.
            path (str): Directory prefix (concatenated as-is, so it should end
                        with a path separator) for the history files.
            device (str): Device name; part of the history file names.
        """
        self.train_history = []
        self.valid_history = []

        # Public because callers read it; refreshed by _get_prior_history().
        self.prior_epochs = 0

        # Running sums of the per-batch errors of the current epoch.
        self._train_error = 0
        self._valid_error = 0

        self._n_train_batches = n_train_batches
        self._n_valid_batches = n_valid_batches

        self._path = path
        self._id = (dataset, device)

        self._get_prior_history()

    def _history_file(self, prefix, extension="txt"):
        """Returns the path of the file "<prefix>_<dataset>_<device>.<ext>"."""
        dataset, device = self._id
        return "%s%s_%s_%s.%s" % (self._path, prefix, dataset, device,
                                  extension)

    @staticmethod
    def _read_errors(file_path):
        """Reads one float per line; a missing file yields an empty list."""
        if not os.path.isfile(file_path):
            return []

        with open(file_path, "r") as file:
            # Blank lines (e.g. a trailing newline added by an editor) are
            # skipped instead of crashing float().
            return [float(line) for line in file if line.strip()]

    def _get_prior_history(self):
        """Loads errors of earlier runs, if their history files exist."""
        self.train_history.extend(
            self._read_errors(self._history_file("train")))
        self.valid_history.extend(
            self._read_errors(self._history_file("valid")))

        self.prior_epochs = len(self.train_history)

    def update_train_step(self, train_error):
        """Adds the error of one training batch to the running sum."""
        self._train_error += train_error

    def update_valid_step(self, valid_error):
        """Adds the error of one validation batch to the running sum."""
        self._valid_error += valid_error

    def get_mean_train_error(self, reset=True):
        """Returns the accumulated training error divided by the number of
           training batches; clears the running sum when `reset` is true.
        """
        mean_train_error = self._train_error / self._n_train_batches

        if reset:
            self._train_error = 0

        return mean_train_error

    def get_mean_valid_error(self, reset=True):
        """Returns the accumulated validation error divided by the number of
           validation batches; clears the running sum when `reset` is true.
        """
        mean_valid_error = self._valid_error / self._n_valid_batches

        if reset:
            self._valid_error = 0

        return mean_valid_error

    def save_history(self):
        """Appends the current mean errors to the history (in memory and on
           disk) and redraws the learning curve once more than one epoch has
           been recorded. The running sums are left untouched.
        """
        self.train_history.append(self.get_mean_train_error(False))
        self.valid_history.append(self.get_mean_valid_error(False))

        os.makedirs(self._path, exist_ok=True)

        with open(self._history_file("train"), "a") as file:
            file.write("%f\n" % self.train_history[-1])

        with open(self._history_file("valid"), "a") as file:
            file.write("%f\n" % self.valid_history[-1])

        if len(self.train_history) > 1:
            self._plot_curve()

    def _plot_curve(self):
        """Saves the train/valid error curves as "curve_<dataset>_<device>.png"."""
        fig, axes = plt.subplots()

        try:
            curves = (("train", self.train_history),
                      ("valid", self.valid_history))

            for label, errors in curves:
                # Every curve gets its own x-range so that history files of
                # different lengths cannot break the plot.
                x_range = np.arange(1, len(errors) + 1)
                axes.plot(x_range, errors, label=label, linewidth=2)

            axes.legend()
            axes.set_xlabel("epochs")
            axes.set_ylabel("error")

            # One tick per epoch.
            axes.xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

            fig.savefig(self._history_file("curve", "png"))
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)


class Progbar:
    """A terminal progress bar for training. During an epoch it shows the
       processed instances together with an estimated time of arrival, and
       at the end of an epoch it prints the mean training and validation
       loss along with the time each phase took.
    """

    def __init__(self, n_train_data, n_train_batches,
                 batch_size, n_epochs, prior_epochs):
        self._n_train_data = n_train_data
        self._n_train_batches = n_train_batches
        self._batch_size = batch_size

        # Epoch counters are kept as zero-padded strings, ready for printing.
        self._target_epoch = str(n_epochs + prior_epochs).zfill(2)
        self._current_epoch = str(prior_epochs + 1).zfill(2)

        self._train_time = 0
        self._valid_time = 0

        self._start_time = time.time()

    def _flush(self):
        """Restarts the timers and advances the epoch counter by one."""
        self._train_time = 0
        self._valid_time = 0

        self._start_time = time.time()

        self._current_epoch = str(int(self._current_epoch) + 1).zfill(2)

    def update_train_step(self, current_batch):
        """Redraws the progress line after the zero-based batch
           `current_batch` has been processed.
        """
        batches_done = current_batch + 1

        self._train_time = time.time() - self._start_time
        seconds_per_batch = self._train_time / batches_done

        # Remaining time, extrapolated from the average batch duration so far.
        batches_left = self._n_train_batches - batches_done
        eta = str(timedelta(seconds=np.ceil(batches_left * seconds_per_batch)))

        bar = "=" * (20 * batches_done // self._n_train_batches)

        # The last batch may be smaller, so cap at the dataset size.
        instances_seen = np.clip(batches_done * self._batch_size,
                                 0, self._n_train_data)

        fraction = "%i/%i" % (instances_seen, self._n_train_data)

        progbar_output = "Epoch %s/%s [%-20s] %s (ETA: %s)" % (
            self._current_epoch, self._target_epoch, bar, fraction, eta)

        # "\r" returns to the line start so the next update overwrites it.
        print(progbar_output, end="\r", flush=True)

    def update_valid_step(self):
        """Records the time spent on validation so far in this epoch."""
        elapsed = time.time() - self._start_time
        self._valid_time = elapsed - self._train_time

    def write_summary(self, mean_train_loss, mean_valid_loss):
        """Prints the epoch summary and prepares the bar for the next epoch."""

        def as_clock(seconds):
            return str(timedelta(seconds=np.ceil(seconds)))

        train_output = "\n\tTrain loss: %.6f (%s)" % (
            mean_train_loss, as_clock(self._train_time))
        valid_output = "\tValid loss: %.6f (%s)" % (
            mean_valid_loss, as_clock(self._valid_time))

        for summary_line in (train_output, valid_output):
            print(summary_line, flush=True)

        self._flush()






In [15]:
import argparse
import os
import numpy as np
import tensorflow as tf
import random
import sys
import cv2

# Seed every random number generator used below (Python's `random`, NumPy and
# TensorFlow) with the same value so that repeated runs are comparable.
seed_value = 32
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here does not change hashing in the already running process; it is only
# inherited by processes started from this one.
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_random_seed(seed_value)

# Reduce TensorFlow's console output: the environment variable addresses the
# native (C++) logger, the second call restricts the Python logger to errors.
# NOTE(review): TensorFlow is imported before the variable is set - confirm
# that the native logger still honors it at this point.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
tf.logging.set_verbosity(tf.logging.ERROR)

def define_paths(current_path):
    """Builds the directory layout used for the data, the pretrained
       weights, and everything produced by training or testing a model.
    Args:
        current_path (str): The base path; all other paths are created by
                            appending to it, so it should end with "/".
    Returns:
        dict: A dictionary with the keys "data", "history", "images",
              "best", "latest", and "weights".
    """

    results_path = current_path + "results/"
    ckpts_path = results_path + "ckpts/"

    return {
        # the dataset is expected directly below the base path
        "data": current_path,
        "history": results_path + "history/",
        "images": results_path + "images/",
        "best": ckpts_path + "best/",
        "latest": ckpts_path + "latest/",
        "weights": current_path + "weights/"
    }


def train_model(dataset, paths, device):
    """The main function for executing network training. It loads the specified
       dataset iterator, saliency model, and helper classes. Training is then
       performed in a new session by iterating over all batches for a number of
       epochs. After validation on an independent set, the model is saved and
       the training history is updated. Whenever the latest validation error
       is the lowest one recorded, the model is additionally saved and
       optimized as the best model.
    Args:
        dataset (str): Denotes the dataset to be used during training. Its
                       upper-case form must name a module-level object that
                       provides `n_train` and `n_valid`.
        paths (dict, str): A dictionary with all path elements.
        device (str): Represents either "cpu" or "gpu".
    """

    iterator = get_dataset_iterator("train", dataset, paths["data"])

    next_element, train_init_op, valid_init_op = iterator

    input_images, ground_truths, input_targets = next_element[:3]
    # presumably 8-bit maps that are rescaled to [0, 1] - confirm in the loader
    ground_truths = tf.divide(ground_truths, 255)

    # Named placeholders that default to the iterator output. The frozen graph
    # used for testing is fed through the names "input" and "input_2".
    input_plhd = tf.placeholder_with_default(input_images,
                                             (None, None, None, 3),
                                             name="input")

    input_target_img = tf.placeholder_with_default(input_targets,
                                                   (None, None, None, 3),
                                                   name="input_2")

    msinet = MSINET()

    # The same forward pass is applied to the stimulus and the target image.
    feature_map_stimuli = msinet.forward(input_plhd)

    feature_map_target = msinet.forward(input_target_img)

    predicted_maps = msinet.output_stream(feature_map_stimuli,
                                          feature_map_target)

    # uncomment if you want to test with one stream network
    # predicted_maps = msinet.one_stream(feature_map_stimuli)

    optimizer, loss = msinet.train(ground_truths, predicted_maps,
                                   config.PARAMS["learning_rate"])

    # The dataset sizes are looked up on the object named like the dataset.
    curr_module = sys.modules[__name__]

    n_train_data = getattr(curr_module, dataset.upper()).n_train
    n_valid_data = getattr(curr_module, dataset.upper()).n_valid

    print(n_train_data)
    print(n_valid_data)

    n_train_batches = int(np.ceil(n_train_data / config.PARAMS["batch_size"]))
    n_valid_batches = int(np.ceil(n_valid_data / config.PARAMS["batch_size"]))

    history = History(n_train_batches,
                      n_valid_batches,
                      dataset,
                      paths["history"],
                      device)

    progbar = Progbar(n_train_data,
                      n_train_batches,
                      config.PARAMS["batch_size"],
                      config.PARAMS["n_epochs"],
                      history.prior_epochs)

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        saver = msinet.restore(sess, dataset, paths, device)

        # The writer is only used to export the graph for TensorBoard. Nothing
        # else is logged through it, so it is closed right away; this flushes
        # the event file and releases the file handle.
        writer = tf.summary.FileWriter('./tflogs', sess.graph)
        writer.close()

        print(">> Start training on %s..." % dataset.upper())

        for epoch in range(config.PARAMS["n_epochs"]):
            sess.run(train_init_op)

            for batch in range(n_train_batches):

                _, error = sess.run([optimizer, loss])

                history.update_train_step(error)
                progbar.update_train_step(batch)

            sess.run(valid_init_op)

            for batch in range(n_valid_batches):
                # Only the loss is evaluated, i.e. no optimizer step is run.
                error = sess.run(loss)

                history.update_valid_step(error)
                progbar.update_valid_step()

            msinet.save(saver, sess, dataset, paths["latest"], device)

            # save_history() must run before the means are fetched below,
            # because fetching them resets the accumulated errors.
            history.save_history()

            progbar.write_summary(history.get_mean_train_error(),
                                  history.get_mean_valid_error())

            if history.valid_history[-1] == min(history.valid_history):
                msinet.save(saver, sess, dataset, paths["best"], device)
                msinet.optimize(sess, dataset, paths["best"], device)

                print("\tBest model!", flush=True)


def test_model(dataset, paths, device):
    """The main function for executing network testing. It loads the specified
       dataset iterator and the optimized (frozen) saliency model, runs the
       model on every test instance, and stores each prediction both as a
       JPEG image and as a NumPy array in the images folder.
       Testing only works for models trained on the same device as specified in
       the config file.
    Args:
        dataset (str): Denotes the dataset that was used during training.
        paths (dict, str): A dictionary with all path elements.
        device (str): Represents either "cpu" or "gpu".
    Raises:
        FileNotFoundError: If no optimized model exists in paths["best"].
    """

    iterator = get_dataset_iterator("test", dataset, paths["data"])

    next_element, init_op = iterator

    # Only the stimulus, the target image, and the file path are needed here.
    input_images, _, input_targets, _, file_path = next_element

    original_shape = (320, 512)

    model_name = "model_%s_%s.pb" % (dataset, device)
    model_file = paths["best"] + model_name

    # Fail early with a clear message; importing an empty graph definition
    # would otherwise fail with an unrelated looking TensorFlow error.
    if not os.path.isfile(model_file):
        raise FileNotFoundError("No optimized model found at %s; "
                                "train the model first." % model_file)

    graph_def = tf.GraphDef()

    with tf.gfile.Open(model_file, "rb") as file:
        graph_def.ParseFromString(file.read())

    # Feed the iterator outputs into the two named inputs of the frozen graph.
    predicted_maps = tf.import_graph_def(graph_def,
                                         input_map={"input": input_images,
                                                    "input_2": input_targets},
                                         return_elements=["output:0"])

    # import_graph_def returns a one-element list; drop that leading axis.
    predicted_maps = tf.squeeze(predicted_maps, axis=0)
    jpeg, npy = postprocess_saliency_map(predicted_maps[0], original_shape)

    print(">> Start testing with %s %s model..." % (dataset.upper(), device))

    os.makedirs(paths["images"], exist_ok=True)

    with tf.Session() as sess:
        sess.run(init_op)

        while True:
            try:
                output_file_jpeg, output_file_npy, path = sess.run(
                    [jpeg, npy, file_path])

            except tf.errors.OutOfRangeError:
                # The iterator is exhausted: every test instance was processed.
                break

            path = path[0][0].decode("utf-8")

            # Results are named after the stimulus, without its directory
            # and original extension.
            filename = os.path.basename(path)
            filename = os.path.splitext(filename)[0]
            filename_jpeg = filename + ".jpg"
            filename_npy = filename + ".npy"

            with open(paths["images"] + filename_jpeg, "wb") as file:
                file.write(output_file_jpeg)

            with open(paths["images"] + filename_npy, "wb") as file:
                np.save(file, output_file_npy)
               

def _jet_overlay(image, heatmap, threshold, alpha):
    """Colors `heatmap` with the jet color map and blends it onto `image`
       wherever the heatmap exceeds `threshold`.
    Args:
        image (ndarray): The stimulus as returned by cv2.imread.
        heatmap (ndarray): The density map as returned by cv2.imread; must
                           have the same shape as `image`.
        threshold (int): Heatmap values up to this value keep the image pixel.
        alpha (float): Weight of the colored overlay in the final blend.
    Returns:
        ndarray: The blended uint8 image.
    Raises:
        ValueError: If `heatmap` and `image` differ in shape.
    """

    if heatmap.shape != image.shape:
        raise ValueError("heatmap shape %s does not match image shape %s"
                         % (heatmap.shape, image.shape))

    heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

    # 1 where the density is weak (image pixel is kept), 0 where it is strong
    # (pixel is replaced by the jet color).
    keep_image = np.where(heatmap <= threshold, 1, 0)

    merged = image * keep_image + heatmap_color * (1 - keep_image)
    merged = merged.astype("uint8")

    return cv2.addWeighted(image, 1 - alpha, merged, alpha, 0)


def jet_map(paths, threshold=30, alpha=0.5):

    """creates the jet map of predicted fixation density maps
       for the test data, and the same visualization for the ground truth.
       Files that cannot be read are reported and skipped.
    Args:
        paths (dict, str): paths to the test search stimuli and results folder
        threshold (int): threshold used to generate a mask of predicted density maps.
        alpha (float): weight for overlaying the jet color map and the original image
    """

    test_data_path = paths['data'] + 'cocosearch/stimuli/test_targ_bbox/'
    prediction_path = paths['images']
    output_dir = paths['images'] + 'images_jet/'
    gnd_dir = paths['data'] + 'cocosearch/saliencymap/test/'
    gnd_output_dir = paths['images'] + 'groundtruth_jet/'

    os.makedirs(gnd_output_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    for subdir, dirs, files in os.walk(test_data_path):
        for file in files:
            if not file.lower().endswith((".png", ".jpg", ".jpeg")):
                continue

            # Join with the walked directory so that stimuli located in
            # sub-directories are found as well.
            img = cv2.imread(os.path.join(subdir, file))

            # cv2.imread signals an unreadable or missing file by returning
            # None instead of raising.
            if img is None:
                print(">> Skipping %s: stimulus could not be read" % file,
                      flush=True)
                continue

            ##predicted Saliency
            heatmap = cv2.imread(prediction_path + file)

            if heatmap is None:
                print(">> Skipping prediction overlay for %s: "
                      "no readable prediction" % file, flush=True)
            else:
                overlay = _jet_overlay(img, heatmap, threshold, alpha)
                cv2.imwrite(output_dir + file, overlay)

            ##Groundtruth saliency
            heatmap_gnd = cv2.imread(gnd_dir + file)

            if heatmap_gnd is None:
                print(">> Skipping ground truth overlay for %s: "
                      "no readable ground truth" % file, flush=True)
            else:
                overlay_gnd = _jet_overlay(img, heatmap_gnd, threshold, alpha)
                cv2.imwrite(gnd_output_dir + file, overlay_gnd)

def main():

    """Runs the complete pipeline on the COCO-Search18 dataset with fixed
       settings: the path variables are created, then the model is trained,
       tested, and finally the jet maps of the test results are generated.
    """

    dataset = 'cocosearch'

    paths = define_paths('/content/')

    tf.reset_default_graph()

    train_model(dataset, paths, config.PARAMS["device"])

    test_model(dataset, paths, config.PARAMS["device"])

    # 30 is the threshold used to generate the jet maps
    jet_map(paths, 30, alpha=0.5)

if __name__ == "__main__":
    main()


>> Start testing with COCOSEARCH gpu model...


In [16]:
"""This script caluclates the saliency metrics. 
The code is borrowed from the
https://github.com/tarunsharma1/saliency_metrics repository with
slight modifications. """

import numpy as np
import random
import math

def normalize_map(s_map):
    """Rescales a saliency map linearly so that its minimum becomes 0 and
    its maximum becomes 1 (as done in the MIT code).

    A constant map has a zero value range and therefore yields NaN values.
    """
    lowest = np.min(s_map)
    value_range = (np.max(s_map) - lowest) * 1.0
    return (s_map - lowest) / value_range

def discretize_gt(gt):
    """Divides the ground truth by 255, so a value of 255 becomes 1.0.

    A warning is issued on every call as a reminder that this is only a
    simple form of discretization.
    """
    from warnings import warn

    warn('can improve the way GT is discretized')
    return gt / 255

def auc_judd(s_map, gt):
    """Computes the AUC-Judd score of a saliency map.

    The saliency values found at fixated pixels serve as thresholds. For each
    threshold the true positive rate (fixated pixels at or above it) and the
    false positive rate (non-fixated pixels at or above it) are computed, and
    the area under the resulting curve is returned.

    Args:
        s_map (ndarray): 2D saliency map, normalized so that its maximum is 1.
        gt (ndarray): 2D fixation map with the same shape, with fixations
                      marked as 255 (it is passed through discretize_gt).
    Returns:
        float: The area under the curve; 0.5 if `gt` has no fixations.
    Raises:
        AssertionError: If there are fixations and the maximum of the
                        discretized `gt` or of `s_map` is not exactly 1.
    """
    # ground truth is discrete, s_map is continuous and normalized
    gt = discretize_gt(gt)

    # num fixations is no. of salience map values at gt > 0
    num_fixations = np.sum(gt)
    num_pixels = np.shape(gt)[0] * np.shape(gt)[1]

    # thresholds are calculated from the salience map, only at places where
    # fixations are present; np.unique returns them sorted without duplicates
    thresholds = np.unique(s_map[gt > 0])

    if thresholds.size > 0:
        # The inputs do not change inside the loop, so one check suffices.
        assert np.max(gt) == 1.0, 'something is wrong with ground truth..not discretized properly max value > 1.0'
        assert np.max(s_map) == 1.0, 'something is wrong with salience map..not normalized properly max value > 1.0'

    area = [(0.0, 0.0)]
    for thresh in thresholds:
        # in the salience map, keep only those pixels with values above threshold
        above = s_map >= thresh

        # a kept pixel (1) plus a fixation (1) gives 2
        num_overlap = np.count_nonzero(np.add(above, gt) == 2)
        tp = num_overlap / (num_fixations * 1.0)

        # total number of pixels > threshold - number of pixels that overlap
        # with gt / total number of non fixated pixels
        # this becomes nan when gt is full of fixations..this won't happen
        fp = (np.count_nonzero(above) - num_overlap) / (num_pixels - num_fixations)

        area.append((round(tp, 4), round(fp, 4)))

    area.append((1.0, 1.0))

    # order the curve points by true positive rate before integrating
    area.sort(key=lambda point: point[0])
    tp_list = [point[0] for point in area]
    fp_list = [point[1] for point in area]
    return np.trapz(np.array(tp_list), np.array(fp_list))

def auc_borji(s_map, gt, splits=100, stepsize=0.1):
    """Computes the AUC-Borji score of a saliency map.

    True positives are fixated pixels whose saliency reaches a threshold.
    False positives are estimated from saliency values at randomly drawn
    pixels (as many as there are fixations). The area under the curve is
    averaged over `splits` random draws.

    Args:
        s_map (ndarray): 2D saliency map, expected to be normalized to [0, 1].
        gt (ndarray): 2D fixation map with fixations marked as 255.
        splits (int): Number of random draws to average over.
        stepsize (float): Unused; the thresholds are fixed to 0.1 ... 0.9.
                          Kept so that existing calls remain valid.
    Returns:
        float: The mean area under the curve over all splits.
    """
    gt = discretize_gt(gt)
    num_fixations = np.sum(gt)

    height = s_map.shape[0]
    num_pixels = s_map.shape[0] * s_map.shape[1]

    # The indices are drawn one by one, split after split, so that a seeded
    # run consumes the random stream exactly as before.
    random_numbers = [[np.random.randint(num_pixels)
                       for _ in range(int(num_fixations))]
                      for _ in range(splits)]

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # The true positive rate depends only on s_map and gt, not on the random
    # draw, so it is computed once instead of once per split.
    tp_rates = []
    for thresh in thresholds:
        # a kept pixel (1) plus a fixation (1) gives 2
        num_overlap = np.count_nonzero(np.add(s_map >= thresh, gt) == 2)
        tp_rates.append(round(num_overlap / (num_fixations * 1.0), 4))

    aucs = []
    # for each split, calculate auc
    for split_indices in random_numbers:
        flat = np.asarray(split_indices, dtype=int)

        # Decode the flat index into (row, column). The row is shifted by -1,
        # where -1 wraps around to the last row, so every pixel stays equally
        # likely.
        r_sal_map = s_map[flat % height - 1, flat // height]

        area = [(0.0, 0.0)]
        for thresh, tp in zip(thresholds, tp_rates):
            # number of values in r_sal_map, above the threshold, divided by
            # num of random locations = num of fixations
            fp = np.count_nonzero(r_sal_map > thresh) / (num_fixations * 1.0)

            area.append((tp, round(fp, 4)))

        area.append((1.0, 1.0))
        area.sort(key=lambda point: point[0])
        tp_list = [point[0] for point in area]
        fp_list = [point[1] for point in area]

        aucs.append(np.trapz(np.array(tp_list), np.array(fp_list)))

    return np.mean(aucs)

def auc_shuff(s_map, gt, other_map, splits=100, stepsize=0.1):
    """Computes the shuffled AUC (sAUC) score of a saliency map.

    True positives are fixated pixels whose saliency reaches a threshold.
    False positives are estimated from the saliency values at fixation
    locations of other images (`other_map`). In every split a random subset
    of these locations is used, containing as many locations as there are
    fixations in `gt` (or all of them, if there are fewer).

    Args:
        s_map (ndarray): 2D saliency map, expected to be normalized to [0, 1].
        gt (ndarray): 2D fixation map with fixations marked as 255.
        other_map (ndarray): 2D binary map (values 0 and 1) of the same shape
                             holding fixations of other images.
        splits (int): Number of random subsets to average over.
        stepsize (float): Unused; the thresholds are fixed to 0.1 ... 0.9.
                          Kept so that existing calls remain valid.
    Returns:
        float: The mean area under the curve over all splits.
    Raises:
        AssertionError: If `other_map` contains values other than 0 and 1.
    """
    gt = discretize_gt(gt)
    num_fixations = np.sum(gt)

    other_rows, other_cols = np.where(other_map == 1.0)
    ind = len(other_rows)
    assert ind == np.sum(other_map), 'something is wrong in auc shuffle'

    # Saliency at the fixation locations of the other images. The values are
    # read at the (row, column) positions directly; a round trip through a
    # flat index is not needed.
    other_values = s_map[other_rows, other_cols]

    # Number of negative samples per split.
    num_fixations_other = int(min(ind, num_fixations))

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # The true positive rate does not depend on the random subset, so it is
    # computed once instead of once per split.
    tp_rates = []
    for thresh in thresholds:
        # a kept pixel (1) plus a fixation (1) gives 2
        num_overlap = np.count_nonzero(np.add(s_map >= thresh, gt) == 2)
        tp_rates.append(round(num_overlap / (num_fixations * 1.0), 4))

    # One permutation per split, drawn up front.
    permutations = [np.random.permutation(ind) for _ in range(splits)]

    aucs = []
    # for each split, calculate auc
    for permutation in permutations:
        r_sal_map = other_values[permutation[:num_fixations_other]]

        area = [(0.0, 0.0)]
        for thresh, tp in zip(thresholds, tp_rates):
            if num_fixations_other > 0:
                # share of the sampled negatives above the threshold
                fp = np.count_nonzero(r_sal_map > thresh) / (num_fixations_other * 1.0)
            else:
                # without negatives there cannot be a false positive
                fp = 0.0

            area.append((tp, round(fp, 4)))

        area.append((1.0, 1.0))
        area.sort(key=lambda point: point[0])
        tp_list = [point[0] for point in area]
        fp_list = [point[1] for point in area]

        aucs.append(np.trapz(np.array(tp_list), np.array(fp_list)))

    return np.mean(aucs)


def nss(s_map, gt):
    """Computes the Normalized Scanpath Saliency (NSS).

    The saliency map is standardized to zero mean and unit standard deviation;
    the score is the mean of the standardized values at the fixated pixels
    (where the discretized ground truth equals 1).
    """
    gt = discretize_gt(gt)

    standardized = (s_map - np.mean(s_map)) / np.std(s_map)

    fix_rows, fix_cols = np.where(gt == 1)
    return np.mean(standardized[fix_rows, fix_cols])


def infogain(s_map, gt, baseline_map):
    """Computes the information gain of a saliency map over a baseline map.

    Both maps are divided by their sum; the score is the mean difference of
    their base-2 logarithms at the fixated pixels (where the discretized
    ground truth equals 1).
    """
    gt = discretize_gt(gt)

    # assuming s_map and baseline_map are normalized
    eps = 2.2204e-16

    s_map = s_map / (np.sum(s_map) * 1.0)
    baseline_map = baseline_map / (np.sum(baseline_map) * 1.0)

    # for all places where gt=1, calculate info gain
    fix_rows, fix_cols = np.where(gt == 1.0)

    # eps keeps the logarithm finite where a map is zero
    model_bits = np.log2(eps + s_map[fix_rows, fix_cols])
    baseline_bits = np.log2(eps + baseline_map[fix_rows, fix_cols])

    return np.mean(model_bits - baseline_bits)


def similarity(s_map, gt):
    """Computes the similarity (histogram intersection) of two maps.

    Both maps are divided by their sum; the score is the sum of the
    element-wise minimum of the two maps over all pixels where the
    ground truth is positive.

    Args:
        s_map (ndarray): Saliency map.
        gt (ndarray): Ground truth density map of the same shape.
    Returns:
        float: The similarity score; 0.0 if `gt` has no positive pixel.
    """
    # here gt is not discretized nor normalized
    s_map = s_map / (np.sum(s_map) * 1.0)
    gt = gt / (np.sum(gt) * 1.0)

    # One vectorized pass replaces a Python loop over every positive pixel.
    # Only the summation order differs, i.e. results agree up to rounding.
    support = gt > 0
    return np.sum(np.minimum(gt[support], s_map[support]))


def cc(s_map, gt):
    """Computes the linear correlation coefficient between a saliency map
    and a ground truth map, after standardizing both to zero mean and unit
    standard deviation.
    """
    a = (s_map - np.mean(s_map)) / np.std(s_map)
    b = (gt - np.mean(gt)) / np.std(gt)

    covariance = (a * b).sum()
    scale = math.sqrt((a * a).sum() * (b * b).sum())

    return covariance / scale


def kld(s_map, gt):
    """Computes the Kullback-Leibler divergence of the saliency map from the
    ground truth, after dividing both maps by their sum.
    """
    # eps guards against a division by zero and against log(0)
    eps = 2.2204e-16

    predicted = s_map / (np.sum(s_map) * 1.0)
    target = gt / (np.sum(gt) * 1.0)

    ratio = target / (predicted + eps)

    return np.sum(target * np.log(eps + ratio))


def calculate_metrics(y_pred, y_true, y_true_binary, bl_fixmap, bl_salmap):
    """Computes all saliency metrics for one prediction.

    Axes of length one are removed from every input first.

    Args:
        y_pred: Predicted saliency map.
        y_true: Ground truth density map.
        y_true_binary: Ground truth fixation map.
        bl_fixmap: Fixation map of other images, used for the shuffled AUC.
        bl_salmap: Baseline density map, used for the information gain.
    Returns:
        tuple: (KLD, CC, SIM, NSS, AUC-Judd, information gain, shuffled AUC,
                AUC-Borji), in this order.
    """
    y_pred, y_true, y_true_binary, bl_fixmap, bl_salmap = (
        np.squeeze(array)
        for array in (y_pred, y_true, y_true_binary, bl_fixmap, bl_salmap))

    cc_score = cc(y_pred, y_true)

    # min-max normalized versions, used by some of the metrics below
    pred_norm = normalize_map(y_pred)
    true_norm = normalize_map(y_true)
    baseline_norm = normalize_map(bl_salmap)

    kld_score = kld(y_pred, y_true)
    infogain_score = infogain(pred_norm, y_true_binary, baseline_norm)
    sim_score = similarity(pred_norm, true_norm)
    nss_score = nss(y_pred, y_true_binary)
    judd_score = auc_judd(pred_norm, y_true_binary)

    # These two consume random numbers; their order is kept so that seeded
    # runs stay reproducible.
    borji_score = auc_borji(pred_norm, y_true_binary)
    shuffled_score = auc_shuff(pred_norm, y_true_binary, bl_fixmap)

    return (kld_score, cc_score, sim_score, nss_score,
            judd_score, infogain_score, shuffled_score, borji_score)

In [None]:
import pysaliency
from pysaliency.baseline_utils import BaselineModel, GoldModel
import argparse
import os
import json
import numpy as np
import cv2
import tensorflow as tf
import random
import glob
import shutil
from pandas import DataFrame
#tf.compat.v1.enable_eager_execution()

def _load_map_for(directory, image_name):
    """Loads "<directory>/<stem of image_name>.npy"."""
    stem = os.path.splitext(image_name)[0]
    # NOTE(security): allow_pickle=True lets np.load unpickle objects, which
    # can execute code. Only point this at files you created yourself.
    return np.load(os.path.join(directory, stem + '.npy'), allow_pickle=True)


def _recreate_dir(directory):
    """Creates `directory`, removing it with all its content first if needed."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def _symlink_jpgs(source_dir, link_dir):
    """Links every "*.jpg" file of `source_dir` into `link_dir`."""
    for item in glob.glob(os.path.join(source_dir, '*.jpg')):
        # A relative link target would be resolved relative to `link_dir`
        # and dangle, hence the absolute path.
        os.symlink(os.path.abspath(item),
                   os.path.join(link_dir, os.path.basename(item)))


def compute_saliency_metrics(data_path, use_pysaliency, csv_path):
    """computes the saliency metrics for test set.

    For every test stimulus the prediction is compared with the ground
    truth. The mean of every metric is printed and appended to
    "<csv_path>sal_metrics.xlsx"; the per-image scores, sorted by score, are
    written to "<csv_path>ordering_test_images_based_on_score.json".

    Args:
        data_path(str): The path to where the dataset is stored.
        use_pysaliency(bool): Whether to use pysaliency library to measure
                              saliency metrics.
        csv_path (str): The path to where a csv file containing
                        the computed saliency metrics should be stored.
    Raises:
        FileNotFoundError: If no test stimulus is found.
        ValueError: If a directory holds a single stimulus, because the
                    baseline maps are built from the other stimuli.
    """
    image_extensions = (".png", ".jpg", ".jpeg")
    map_shape = (320, 512)
    # number of other images whose fixations form the baseline fixation map
    n_baseline_images = 1

    test_stimuli_path = data_path + "cocosearch/stimuli/test"
    dir_saliency_test = data_path + "cocosearch/saliencymap/test"
    dir_saliency_test_unblur = data_path + "cocosearch/saliencymap/test_unblur"
    dir_results = data_path + 'results/images'

    test_list = []
    for subdir, dirs, files in os.walk(test_stimuli_path):
        for file in files:
            if file.lower().endswith(image_extensions):
                test_list.append(os.path.join(subdir, file))

    dir_stimuli_test = pysaliency.FileStimuli(test_list)

    if use_pysaliency:
        # pysaliency reads saliency maps from directories of images, so the
        # JPEG files are linked into two scratch directories.
        dir_saliency_test_img = os.path.join(
            data_path + "cocosearch/saliencymap", 'pysaliency_test_sal_img')
        dir_results_test_img = os.path.join(dir_results,
                                            'pysaliency_result_img')

        _recreate_dir(dir_saliency_test_img)
        _recreate_dir(dir_results_test_img)

        _symlink_jpgs(dir_saliency_test, dir_saliency_test_img)
        _symlink_jpgs(dir_results, dir_results_test_img)

        # The scan paths are only needed for pysaliency.
        # NOTE(review): this is the *train* split although the test stimuli
        # are evaluated - confirm that this file is the intended one.
        with open(data_path + 'coco_search18_fixations_TP_train_split1.json') as json_file:
            human_scanpaths_train = json.load(json_file)

        # Extent of all fixations that lie on the 1680 x 1050 screen.
        min_fix_x = 100000
        max_fix_x = -100000
        min_fix_y = 100000
        max_fix_y = -100000

        for traj in human_scanpaths_train:
            for fix_x, fix_y in zip(traj['X'], traj['Y']):
                if fix_x < 0 or fix_y < 0 or fix_x > 1680 or fix_y > 1050:
                    continue

                min_fix_x = min(min_fix_x, fix_x)
                max_fix_x = max(max_fix_x, fix_x)
                min_fix_y = min(min_fix_y, fix_y)
                max_fix_y = max(max_fix_y, fix_y)

    xs = []
    ys = []
    ts = []
    ns = []
    train_subjects = []

    # Same order as the tuple returned by calculate_metrics().
    metric_names = ('kl', 'cc', 'sim', 'nss', 'auc', 'infog', 'sauc',
                    'auc_borji')
    scores = {name: [] for name in metric_names}
    stimuli_names = []

    for subdir, dirs, files in os.walk(test_stimuli_path):
        # Only images are stimuli; this also keeps the index `n` in line
        # with `test_list` for a flat stimuli directory.
        stimuli = [name for name in files
                   if name.lower().endswith(image_extensions)]

        if not stimuli:
            continue

        if len(stimuli) < 2:
            raise ValueError("At least two test stimuli are needed in %s to "
                             "build the baseline maps." % subdir)

        # Sum of all density maps, computed once. The baseline of a stimulus
        # is this sum minus its own map, which avoids loading all other maps
        # again for every stimulus (results agree up to float rounding).
        total_salmap = np.zeros(map_shape)
        for name in stimuli:
            total_salmap = total_salmap + _load_map_for(dir_saliency_test,
                                                        name)

        for n, stimulus in enumerate(stimuli):
            gnd_map = _load_map_for(dir_saliency_test, stimulus)
            gnd_bin_map = _load_map_for(dir_saliency_test_unblur, stimulus)
            pred_map = _load_map_for(dir_results, stimulus)

            base_line_salmap = total_salmap - gnd_map
            base_line_salmap /= np.max(base_line_salmap)

            random_ind = random.sample(range(0, len(stimuli)),
                                       n_baseline_images)
            base_line_fixmap = np.zeros(map_shape)
            for i in random_ind:
                # Draw again until a different image is hit; this ends
                # because there are at least two stimuli.
                while stimuli[i] == stimulus:
                    i = random.randint(0, len(stimuli) - 1)

                base_line_fixmap = base_line_fixmap + _load_map_for(
                    dir_saliency_test_unblur, stimuli[i]) / 255

            # Overlapping fixations still count as a single fixation.
            base_line_fixmap[base_line_fixmap > 1.0] = 1.0

            results = calculate_metrics(pred_map, gnd_map, gnd_bin_map,
                                        base_line_fixmap, base_line_salmap)

            stimuli_names.append(stimulus)
            for name, value in zip(metric_names, results):
                scores[name].append(value)

            if use_pysaliency:
                for traj in human_scanpaths_train:
                    if ((traj['task'] + '_' + traj['name']) == stimulus and traj['correct'] == 1):
                        subject_name = traj['subject']

                        xs_ = []
                        ys_ = []
                        durations_ = []

                        for ind in range(len(traj['X'])):
                            if 1680 >= traj['X'][ind] >= 0 and 1050 >= traj['Y'][ind] >= 0:
                                # NOTE(review): raw coordinates are compared
                                # with the already rescaled ones, and the
                                # rescaling divides by the maximum instead of
                                # the value range. Left unchanged - confirm
                                # the intended behavior.
                                if not (traj['X'][ind] in xs_ and traj['Y'][ind] in ys_):
                                    xs_.append(((traj['X'][ind] - min_fix_x) / max_fix_x) * (512))
                                    ys_.append(((traj['Y'][ind] - min_fix_y) / max_fix_y) * (320))
                                    durations_.append(traj['T'][ind])

                        # Onset of each fixation: the summed durations of
                        # all fixations before it.
                        time_l_ = durations_[:-1]
                        time_l_.insert(0, 0)
                        time_l_array_ = np.array(time_l_)
                        ts_ = [np.sum(time_l_array_[0:ind]) for ind in range(1, len(time_l_array_) + 1)]

                        xs.append(xs_)
                        ys.append(ys_)
                        ts.append(ts_)
                        ns.append(n)
                        train_subjects.append(subject_name)

    if not stimuli_names:
        raise FileNotFoundError("No test stimuli found in %s"
                                % test_stimuli_path)

    if use_pysaliency:

        fixations = pysaliency.FixationTrains.from_fixation_trains(xs, ys, ts, ns, train_subjects, attributes=False, scanpath_attributes=None)

        my_model = pysaliency.SaliencyMapModelFromDirectory(dir_stimuli_test, dir_results_test_img)

        ground_truth = pysaliency.SaliencyMapModelFromDirectory(dir_stimuli_test, dir_saliency_test_img)

        # Prefixed names, so that the metric functions `nss` and `cc` of this
        # notebook are not shadowed inside this function.
        pysal_auc = my_model.AUC(dir_stimuli_test, fixations)
        pysal_sauc = my_model.sAUC(dir_stimuli_test, fixations)
        pysal_nss = my_model.NSS(dir_stimuli_test, fixations)
        pysal_cc = my_model.CC(dir_stimuli_test, ground_truth)
        pysal_sim = my_model.SIM(dir_stimuli_test, ground_truth)

        pysal_img_kld = my_model.image_based_kl_divergence(dir_stimuli_test, ground_truth)

        print('metrics computed by pysaliency:')
        print('AUC:', pysal_auc)
        print('sAUC:', pysal_sauc)
        print('NSS:', pysal_nss)
        print('image_KLD:', pysal_img_kld)
        print('CC:', pysal_cc)
        print('SIM:', pysal_sim)

    means = {name: sum(values) / len(values)
             for name, values in scores.items()}

    print('locally computed:')
    print('AUC:', means['auc'])
    print('AUC Borji:', means['auc_borji'])
    print('SAUC:', means['sauc'])
    print('NSS:', means['nss'])
    print('image_KLD:', means['kl'])  # this should be same as our loss function
    print('CC:', means['cc'])
    print('SIM:', means['sim'])
    print('INFO GAIN:', means['infog'])

    # Per-image scores, sorted ascending by score.
    sample = {}
    for name in metric_names:
        sample[name] = sorted(zip(stimuli_names, scores[name]),
                              key=lambda pair: pair[1])

    # `csv_path` is used as a prefix; make sure its directory exists.
    output_dir = os.path.dirname(csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # NOTE: the dictionary is stored as its string representation (a single
    # JSON string). Kept as is so that existing readers keep working.
    with open(csv_path + 'ordering_test_images_based_on_score.json', 'w') as fp:
        json.dump(str(sample), fp)

    df = DataFrame({'AUC': [means['auc']],
                    'AUC Borji': [means['auc_borji']],
                    'SAUC': [means['sauc']],
                    'NSS': [means['nss']],
                    'image_KLD': [means['kl']],
                    'CC': [means['cc']],
                    'SIM': [means['sim']],
                    'INFO GAIN': [means['infog']]})

    # NOTE: despite the ".xlsx" extension this file contains CSV text. The
    # name is kept so that existing result files continue to be appended to.
    output_path = csv_path + 'sal_metrics.xlsx'
    df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

def str2bool(v):
    """Converts a textual yes/no value into a boolean.

    Booleans are returned unchanged. Strings are compared case-insensitively
    against the accepted spellings of true and false.

    Raises:
        argparse.ArgumentTypeError: If the string is not a known spelling.
    """
    if isinstance(v, bool):
        return v

    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True

    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False

    raise argparse.ArgumentTypeError('Boolean value expected.')

def main():

    """Computes the saliency metrics of the test results with fixed
       settings: the dataset is expected in the current directory, the
       outputs are written to ./results/, and pysaliency is not used.
    """

    compute_saliency_metrics('./', False, './results/')


if __name__ == "__main__":
    main()



In [None]:
from google.colab import files

!zip -r '/content/results_th30.zip' './results/images_jet'
files.download('results_th30.zip') 


zip error: Nothing to do! (try: zip -r /content/results_th30.zip . -i ./results/images_jet)


FileNotFoundError: ignored