In [1]:
import json
import os
import random
import time
from typing import Any, Dict, List, Optional

import numpy as np
from pycocotools.coco import COCO
from tqdm import tqdm

root_path = os.path.join('..', '..')
full_annotation_file = os.path.join(root_path, 'data', 'full_2017_bboxes.json')
novel_classes_file = os.path.join(root_path, 'data', 'novel_class_ids.json')
base_classes_file = os.path.join(root_path, 'data', 'base_class_ids.json')

coco_dset = COCO(full_annotation_file)
coco_json = json.load(open(full_annotation_file, 'r'))
with open(novel_classes_file, 'r') as f:
    novel_classes = json.load(f)['novel_cat_ids']
with open(base_classes_file, 'r') as f:
    base_classes = json.load(f)['base_cat_ids']

loading annotations into memory...
Done (t=3.60s)
creating index...
index created!


In [2]:
BASE_SET_RATIOS = (80, 10, 10)
BASE_SET_SMALL_RATIOS = (8, 1, 1)

Sample images

In [3]:
def get_train_val_test_annots_and_imgs(small=False):
    base_cat_to_imgs = {k: v for k, v in coco_dset.catToImgs.items() if k in base_classes}

    train_set_base_imgs = []
    val_set_base_imgs   = []
    test_set_base_imgs  = []

    for base_cat in base_cat_to_imgs:
        imgs = set(base_cat_to_imgs[base_cat])
        
        n_sample_train  = int((BASE_SET_RATIOS[0] if not small else BASE_SET_SMALL_RATIOS[0]) / 100 * len(imgs))
        n_sample_val    = int((BASE_SET_RATIOS[1] if not small else BASE_SET_SMALL_RATIOS[1]) / 100 * len(imgs))
        n_sample_test   = int((BASE_SET_RATIOS[2] if not small else BASE_SET_SMALL_RATIOS[2]) / 100 * len(imgs))

        train_imgs  = set(random.sample(sorted(imgs), n_sample_train))
        imgs        = imgs.difference(train_imgs)
        val_imgs    = set(random.sample(sorted(imgs), n_sample_val))
        if not small:
            test_imgs = imgs.difference(val_imgs)
        else:
            imgs = imgs.difference(val_imgs)
            test_imgs = set(random.sample(sorted(imgs), n_sample_test))

        train_set_base_imgs.extend(train_imgs)
        val_set_base_imgs.extend(val_imgs)
        test_set_base_imgs.extend(test_imgs)

    print("Base images: \ntrain, val,  set")
    print(len(train_set_base_imgs), len(val_set_base_imgs), len(test_set_base_imgs))

    train_set = coco_dset.loadAnns(coco_dset.getAnnIds(imgIds=train_set_base_imgs))
    val_set   = coco_dset.loadAnns(coco_dset.getAnnIds(imgIds=val_set_base_imgs))
    test_set  = coco_dset.loadAnns(coco_dset.getAnnIds(imgIds=test_set_base_imgs))

    print()
    print("Base annotations: \ntrain, val,  set")
    print(len(train_set), len(val_set), len(test_set))

    return (train_set, val_set, test_set), \
        (coco_dset.loadImgs(train_set_base_imgs), 
         coco_dset.loadImgs(val_set_base_imgs), 
         coco_dset.loadImgs(test_set_base_imgs))

In [4]:
print("LARGE DATASET")

(train_annots, val_annots, test_annots), (train_imgs, val_imgs, test_imgs) = \
    get_train_val_test_annots_and_imgs(small=False)

print('\n\n')
print("SMALL DATASET")

(small_train_annots, small_val_annots, small_test_annots), \
    (small_train_imgs, small_val_imgs, small_test_imgs) = \
        get_train_val_test_annots_and_imgs(small=True)

LARGE DATASET
Base images: 
train, val,  set
170553 21216 21543

Base annotations: 
train, val,  set
223793 27873 28247



SMALL DATASET
Base images: 
train, val,  set
16950 2013 2013

Base annotations: 
train, val,  set
22386 2637 2673


Save datasets into coco format

In [5]:
coco_format = {
    'info': coco_json['info'],
    'licenses': coco_json['licenses'],
    'categories': [c for c in coco_json['categories'] if c['id'] in base_classes],
    'images': [],
    'annotations': []
}

In [6]:
coco_format['images'] = train_imgs
coco_format['annotations'] = train_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'base_train.json'), 'w') as f:
    json.dump(coco_format, f)

coco_format['images'] = val_imgs
coco_format['annotations'] = val_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'base_val.json'), 'w') as f:
    json.dump(coco_format, f)

coco_format['images'] = test_imgs
coco_format['annotations'] = test_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'base_test.json'), 'w') as f:
    json.dump(coco_format, f)



coco_format['images'] = small_train_imgs
coco_format['annotations'] = small_train_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'small_base_train.json'), 'w') as f:
    json.dump(coco_format, f)

coco_format['images'] = small_val_imgs
coco_format['annotations'] = small_val_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'small_base_val.json'), 'w') as f:
    json.dump(coco_format, f)

coco_format['images'] = small_test_imgs
coco_format['annotations'] = small_test_annots
with open(os.path.join(root_path, 'data', 'base_dset', 'small_base_test.json'), 'w') as f:
    json.dump(coco_format, f)