In [1]:
import sys
sys.path.append('..')
import utils.matterport_utils as matterport_utils
from models.model_misc import compose_image_meta
from utils.flow import read_flo_file
from dataIO.a2d_dataset import A2DDataset
from cfg.config import Config

import os.path as osp
import os
import numpy as np
import random
import skimage
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Root of the A2D release. The default is the path this notebook was originally
# run with; set the A2D_DATASET_DIR environment variable to point the notebook
# at a copy of the dataset on another machine.
A2D_DATASET_DIR = os.environ.get(
    'A2D_DATASET_DIR', '/vision/u/jingweij/Datasets/A2D/Release/')

dataset = A2DDataset(split='train', dataset_dir=A2D_DATASET_DIR)
dataset.prepare()

In [3]:
def preprocess_image(dataset, video_id, frame_id, config, use_mini_mask=True):
    """Load one video frame and bundle it with its flow and, if present, labels.

    The frame is resized according to `config`. The flow is returned exactly
    as `load_flow` read it.
    NOTE(review): the flow does not go through the same resize/padding as the
    image -- confirm that downstream code expects this.

    dataset: provides image_dir, flow_dir, anno_dir, num_actor_classes and
        num_action_classes, plus the lookup tables used by load_mask.
    video_id: name of the video's sub-directory inside the dataset dirs.
    frame_id: integer frame number; file names are built with '%05d'.
    config: provides IMAGE_MIN_DIM, IMAGE_MAX_DIM, IMAGE_PADDING and
        MINI_MASK_SHAPE.
    use_mini_mask: If True, masks are passed through
        matterport_utils.minimize_mask with config.MINI_MASK_SHAPE. If False,
        the resized full-size masks are returned.

    Returns a dict with the keys:
    image: the frame after matterport_utils.resize_image.
    flow: the flow as loaded from the '.flo' file.
    image_path: path the frame was read from.
    image_meta: result of compose_image_meta (image id is always -1).
    mask: [height, width, instance_count]. Only present when an annotation
        file exists for this frame.
    bbox: [instance_count, (y1, x1, y2, x2, actor_class_id, action_class_id)].
        Only present when an annotation file exists for this frame.
    """
    frame_dirs = (dataset.image_dir, dataset.flow_dir, dataset.anno_dir)
    image_path = osp.join(frame_dirs[0], video_id, '%05d.png' % frame_id)
    flow_path = osp.join(frame_dirs[1], video_id, '%05d.flo' % frame_id)
    anno_path = osp.join(frame_dirs[2], video_id, '%05d.png' % frame_id)
    has_anno = osp.exists(anno_path)

    # Read the raw inputs; remember the original shape before resizing.
    raw_image = load_image(image_path)
    flow = load_flow(flow_path)
    original_shape = raw_image.shape
    image, window, scale, padding = matterport_utils.resize_image(
        raw_image,
        min_dim=config.IMAGE_MIN_DIM,
        max_dim=config.IMAGE_MAX_DIM,
        padding=config.IMAGE_PADDING)

    # Every actor / action class is marked active for this dataset.
    actor_flags = np.ones([dataset.num_actor_classes], dtype=np.int32)
    action_flags = np.ones([dataset.num_action_classes], dtype=np.int32)
    # -1 is a placeholder image id: compose_image_meta requires one, but
    # frames are not identified by an integer id here.
    image_meta = compose_image_meta(-1, original_shape, window, actor_flags, action_flags)

    sample = {
        'image': image,
        'flow': flow,
        'image_path': image_path,
        'image_meta': image_meta,
    }
    # Unlabeled frame: nothing more to add.
    if not has_anno:
        return sample

    mask, actor_class_ids, action_class_ids = load_mask(dataset, anno_path)
    # Apply the image's scale and padding to the masks as well.
    mask = matterport_utils.resize_mask(mask, scale, padding)
    # Boxes are derived from the resized masks, one row (y1, x1, y2, x2) per
    # instance; the two class ids are appended as extra columns.
    boxes = matterport_utils.extract_bboxes(mask)
    actor_column = actor_class_ids[:, np.newaxis]
    action_column = action_class_ids[:, np.newaxis]
    bbox = np.hstack([boxes, actor_column, action_column])
    # Optionally shrink the masks to reduce memory usage.
    if use_mini_mask:
        mask = matterport_utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE)

    sample['mask'] = mask
    sample['bbox'] = bbox
    return sample

In [4]:
def load_image(image_path):
    """Load the specified image and return a [H,W,3] Numpy array.

    Grayscale images are converted to RGB, and the alpha channel of a
    4-channel (RGBA) image is dropped.
    """
    # Import the submodules explicitly: a bare `import skimage` does not
    # guarantee that `skimage.io` and `skimage.color` are loaded.
    import skimage.color
    import skimage.io

    # Load image
    image = skimage.io.imread(image_path)
    # If grayscale. Convert to RGB for consistency.
    if image.ndim != 3:
        image = skimage.color.gray2rgb(image)
    # If RGBA. Drop the alpha channel so callers really get 3 channels.
    if image.shape[-1] == 4:
        image = image[..., :3]
    return image

In [5]:
def load_mask(dataset, anno_path):
    """Decode a colour-coded annotation image into per-instance masks and labels.

    Each distinct non-black colour in the annotation is treated as one
    instance. The colour is packed into a single integer code
    (R * 10**6 + G * 10**3 + B) and looked up in the dataset's tables.

    dataset: provides the dicts color_to_actor_class_name and
        color_to_action_class_name, keyed by the packed colour code.
        NOTE(review): the tables are named *_class_name but their values are
        returned as class ids -- confirm what they actually hold.
    anno_path: path of the annotation image.

    Returns:
    mask: [height, width, instance_count] float array, 1 where the pixel
        belongs to the instance and 0 elsewhere.
    actor_class_ids: [instance_count] looked-up actor value per instance.
    action_class_ids: [instance_count] looked-up action value per instance.
    Instances are ordered by ascending colour code.
    """
    # Make sure the io submodule is loaded; a bare `import skimage` does not
    # guarantee it.
    import skimage.io

    anno = skimage.io.imread(anno_path).astype(np.int64)
    # Collapse the three colour channels into one integer code per pixel.
    anno = anno[:, :, 0] * 10**6 + anno[:, :, 1] * 10**3 + anno[:, :, 2]

    # Drop the background code (0) by value. Slicing off the first unique
    # value instead would discard a real instance whenever the frame contains
    # no background pixel at all.
    codes = np.unique(anno)
    color_codes = codes[codes != 0].tolist()
    num_instances = len(color_codes)

    mask = np.empty([anno.shape[0], anno.shape[1], num_instances])
    for i, code in enumerate(color_codes):
        mask[:, :, i] = anno == code
    actor_class_ids = np.array(
        [dataset.color_to_actor_class_name[code] for code in color_codes])
    action_class_ids = np.array(
        [dataset.color_to_action_class_name[code] for code in color_codes])
    return mask, actor_class_ids, action_class_ids

In [6]:
def load_flow(flow_path):
    """Read the optical-flow file at `flow_path`.

    Thin wrapper around `read_flo_file`; its result is returned unchanged.
    """
    flow = read_flo_file(flow_path)
    return flow

In [7]:
class A2DConfig(Config):
    """Configuration for working with the A2D dataset.

    Subclasses the base Config class and overrides only the values listed
    here; everything else keeps the base-class setting.
    """
    # Label identifying this configuration
    NAME = "a2d"

    # Class counts; each one reserves an extra slot for background.
    NUM_ACTOR_CLASSES = 1 + 7  # background + 7 actors
    NUM_ACTION_CLASSES = 1 + 9  # background + 9 actions

    # Limits for the small and the large image side. Both are 256, which is
    # kept small for speed.
    IMAGE_MIN_DIM = IMAGE_MAX_DIM = 256


# Instantiate the configuration and print all of its values.
config = A2DConfig()
config.display()


Configurations:
BACKBONE_SHAPES                [[64 64]
 [32 32]
 [16 16]
 [ 8  8]
 [ 4  4]]
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     2
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
GPU_COUNT                      1
IMAGES_PER_GPU                 2
IMAGE_MAX_DIM                  256
IMAGE_MIN_DIM                  256
IMAGE_PADDING                  True
IMAGE_SHAPE                    [256 256   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.002
MASK_POOL_SIZE                 14
MASK_SHAPE                     [28, 28]
MAX_GT_INSTANCES               100
MEAN_PIXEL                     [123.7 116.8 103.9]
MINI_MASK_SHAPE                (56, 56)
NAME                           a2d
NUM_ACTION_CLASSES             10
NUM_ACTOR_CLASSES              8
POOL_SIZE                      7
POST_NMS_ROIS_INFERENCE        1000
POST_NMS_

In [8]:
# Every entry of the image directory is treated as one video id.
all_video_ids = os.listdir(dataset.image_dir)
all_video_ids.sort()

In [9]:
# Pick a random video, then a random frame number inside it. Frame numbers
# are taken to run from 1 to the number of entries in the video's directory.
video_id = np.random.choice(all_video_ids)
frame_id = 1 + np.random.choice(
    len(os.listdir(osp.join(dataset.image_dir, video_id))))
# To inspect one fixed frame instead, uncomment:
#video_id = 'tB2-L7hyGKo'
#frame_id = 30

In [10]:
# Show which frame was drawn.
(video_id, frame_id)

('VZKTqH9gB0s', 211)

In [11]:
%%time
for i in range(100):
    video_id = np.random.choice(all_video_ids)
    frame_id = np.random.choice(len(os.listdir(osp.join(dataset.image_dir, video_id)))) + 1
    dic = preprocess_image(dataset, video_id, frame_id, config, use_mini_mask=True)
    with open('../tmp/%03d.pkl' % i, 'wb') as f:
        pickle.dump(dic, f)

CPU times: user 1.23 s, sys: 650 ms, total: 1.88 s
Wall time: 19.3 s


In [12]:
%%time
li = []
for j in range(2):
    for i in range(100):
        with open('../tmp/%03d.pkl' % i, 'rb') as f:
            li.append(pickle.load(f))

CPU times: user 130 ms, sys: 284 ms, total: 413 ms
Wall time: 1.75 s


In [13]:
# Source frame of every loaded sample.
[sample['image_path'] for sample in li]

['/vision/u/jingweij/Datasets/A2D/Release/Images/hsBHfZkpdAU/00002.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/FbVBQ6EstIc/00014.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/2xSLG0gm_og/00099.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/OuU52au3ts8/00060.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/YX1Ya8UA25c/00091.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/0RBrmdUTPkE/00236.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/3DQA-gR25aU/00009.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/MKkk74T94Os/00011.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/laAlYz7-TvI/00027.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/qeYwsddRGEY/00084.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/V2BmmY784So/00025.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/Eg6obDE4juY/00195.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/eWoGfahOyJ4/00061.png',
 '/vision/u/jingweij/Datasets/A2D/Release/Images/Mg

In [14]:
# Samples that carry a 'mask' (i.e. had an annotation file) show their image
# path; all others show only their position in `li`.
[(sample['image_path'] if 'mask' in sample else idx)
 for idx, sample in enumerate(li)]

[0,
 1,
 2,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/OuU52au3ts8/00060.png',
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/cjGbqaFdSMw/00100.png',
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/7M_HU6Gtckk/00050.png',
 48,
 49,
 50,
 51,
 52,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/CqNITUm3WA8/00024.png',
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/j3_XpeMbjDc/00010.png',
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 '/vision/u/jingweij/Datasets/A2D/Release/Images/OuU52au3ts8/00060.png',
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
