# Getting the Labels Ready

To look for:

* **CHOOSE requires action** 

* CONSTRUCT
* CHECK
* SAVE

## Imports

In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import pandas as pd
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [2]:
# https://stackoverflow.com/questions/18035595/powersets-in-python-using-itertools
from itertools import chain, combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

In [3]:
def reverse_one_to_many_dict(one_to_many_dict):
    one_to_one_dict = {}
    
    for key, arr in one_to_many_dict.items():
        for a in arr:
            assert a not in one_to_one_dict, "Trying to replace existing key: {}".format(a)
            one_to_one_dict[a] = key
            
    return one_to_one_dict


# sample_dict_good = {
#     'a': [1, 2, 3],
#     'b': [4, 5, 6]
# }
# print(reverse_one_to_many_dict(sample_dict_good)) # all good

# sample_dict_bad = {
#     'a': [1, 2, 3],
#     'b': [3, 5, 6]
# }
# print(reverse_one_to_many_dict(sample_dict_bad)) # supposed to fail with 3

## CHOOSE: Images and Annotations Directories

In [4]:
dataDir='..'
dataType='train2017'
annFile='{}/annotations/instances_{}.json'.format(dataDir, dataType)

In [5]:
# initialize COCO api for instance annotations
coco=COCO(annFile)

loading annotations into memory...
Done (t=7.11s)
creating index...
index created!


**Note**, `cat` and `cats` stand for **category** and **categories** NOT for cat that is like a dog, but a cat.

In [6]:
# display COCO categories and supercategories
cats = coco.loadCats(coco.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n\n{}\n'.format(' '.join(nms)))

nms = set([cat['supercategory'] for cat in cats])
print('COCO supercategories: \n\t{}'.format(' '.join(nms)))

COCO categories: 

person bicycle car motorcycle airplane bus train truck boat traffic light fire hydrant stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe backpack umbrella handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair couch potted plant bed dining table toilet tv laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator book clock vase scissors teddy bear hair drier toothbrush

COCO supercategories: 
	electronic food sports accessory furniture kitchen outdoor appliance indoor person animal vehicle


## CHOOSE: categories of Interest

In [7]:
CATEGORIES_OF_INTEREST = sorted(['person', 'dog', 'cat', 'bird'])
CATEGORIES_OF_INTEREST

['bird', 'cat', 'dog', 'person']

## CONSTRUCT a mapping from the categories of interest to image IDs

In [8]:
%%time

catNms_to_imgIds = {}
for include_cats_names in powerset(CATEGORIES_OF_INTEREST):
    include_cats_names = tuple(sorted(include_cats_names)) # sort to have the same keys every time
    
    # include_cats_names should be a tuple (immutable) so that we can use it as a key to a dictionary
    exclude_cats_names = set(CATEGORIES_OF_INTEREST) - set(include_cats_names)
    exclude_cats_names = tuple(sorted(exclude_cats_names)) # sort to have the same keys every time
    
    catIds = coco.getCatIds(catNms=include_cats_names)
    exclCatIDs = coco.getCatIds(catNms=exclude_cats_names)
    
    imgIds = coco.getImgIds(catIds=catIds, exclCatIds=exclCatIDs)

    catNms_to_imgIds[include_cats_names] = imgIds

# correct the Ids for the images which do not have any of the categories in the CATEGORIES_OF_INTEREST
# correction needed because
#    include_cats_names = () results in
#    catIds = All Coco categories but the CATEGORIES_OF_INTEREST
#    There are no images with all those categories present
catNms_to_imgIds[()] = coco.getImgIds(exclCatIds=coco.getCatIds(CATEGORIES_OF_INTEREST))


# correct the Ids for the images with all of the CATEGORIES_OF_INTEREST present
# correction needed because
#    exclude_cats_names = () results in
#    exclCatIds = All Coco categories but the CATEGORIES_OF_INTEREST
#    We exclude all the images with any of the exclusion categories present, which is likely to exclude all the images
catNms_to_imgIds[tuple(sorted(CATEGORIES_OF_INTEREST))] = coco.getImgIds(catIds=coco.getCatIds(CATEGORIES_OF_INTEREST))

CPU times: user 480 ms, sys: 3.73 ms, total: 484 ms
Wall time: 483 ms


## CHECK that the IDs in disjoint sets are a partition of the original set of images

1. $\cup s_i = S$, where $S$ - original set of IDs
2. $ s_i \cap s_j = \emptyset, \forall i \neq j$ - indexes of different categories

### CHECK that the sets created are disjoint

2. $ s_i \cap s_j = \emptyset, \forall i \neq j$ - indexes of different categories

In [9]:
%%time

for key1 in catNms_to_imgIds:
    for key2 in catNms_to_imgIds:
        if key1 == key2:
            pass
        else:
            s1 = set(catNms_to_imgIds[key1])
            s2 = set(catNms_to_imgIds[key2])
            inters = s1.intersection(s2)
            
            # # for debugging
            # print(key1, len(s1))
            # print(key2, len(s2))
            # print('Intersection:', inters, '\n')
            
            assert len(inters) == 0, "The subsets have to be disjoint"

CPU times: user 145 ms, sys: 6.29 ms, total: 151 ms
Wall time: 150 ms


### Calculate how many images are in each disjoint category

In [10]:
catNms_to_ImgIdsNum = {key: len(value) for (key, value) in catNms_to_imgIds.items()}
catNms_to_ImgIdsNum

{(): 46223,
 ('bird',): 2351,
 ('cat',): 3199,
 ('dog',): 2153,
 ('person',): 60669,
 ('bird', 'cat'): 63,
 ('bird', 'dog'): 24,
 ('bird', 'person'): 753,
 ('cat', 'dog'): 155,
 ('cat', 'person'): 641,
 ('dog', 'person'): 1964,
 ('bird', 'cat', 'dog'): 4,
 ('bird', 'cat', 'person'): 3,
 ('bird', 'dog', 'person'): 36,
 ('cat', 'dog', 'person'): 46,
 ('bird', 'cat', 'dog', 'person'): 3}

In [11]:
summary = {
    'category': [],
    'total': []
}
for key in catNms_to_ImgIdsNum:
    summary['category'].append(key)
    summary['total'].append(catNms_to_ImgIdsNum[key])
    
summary_df = pd.DataFrame(summary)


assert summary_df['total'].sum() == len(set(coco.getImgIds())), "Some images were missed"
summary_df

Unnamed: 0,category,total
0,(),46223
1,"(bird,)",2351
2,"(cat,)",3199
3,"(dog,)",2153
4,"(person,)",60669
5,"(bird, cat)",63
6,"(bird, dog)",24
7,"(bird, person)",753
8,"(cat, dog)",155
9,"(cat, person)",641


### CHECK that the union adds up to the original set

1. $\cup s_i = S$, where $S$ - original set of IDs

In [12]:
all_ids = set(coco.getImgIds())
len(all_ids)

my_all_ids = set().union(*[catNms_to_imgIds[key] for key in catNms_to_imgIds])
len(my_all_ids)


# all ids we have now should be at least the subset of the original ones, uncomment the next line to see it work
# my_all_ids.add(1) 
assert my_all_ids.issubset(all_ids), "Got some new ids that were not present in Coco"


# all ids from the original set have to be contained in the union of the disjoint sets of ids
assert all_ids.issubset(my_all_ids), "We are missing some of the images from the original set"


# which pictures have we missed?
print("IDs missed:", all_ids - my_all_ids)
for sample_id in list(all_ids - my_all_ids):
    print(sample_id)
    
    sample_img_info = coco.loadImgs(sample_id)[0]

    print('Fields available:', list(sample_img_info.keys()), '\n')
    for key, value in sample_img_info.items():
        print(key, (13-len(key))*' ', ':', value)
        
    print('\n', '-'*80, '\n')



IDs missed: set()


## CONSTRUCT a mapping from Categories to Labels

`catNms_to_labels`

In [13]:
CATEGORIES_OF_INTEREST

['bird', 'cat', 'dog', 'person']

In [14]:
catNms_to_labels = {}

for cat_name in catNms_to_imgIds:
    multilabel = []
    for c in CATEGORIES_OF_INTEREST:
        multilabel.append(int(c in set(cat_name)))
    
    catNms_to_labels[cat_name] = multilabel

catNms_to_labels

{(): [0, 0, 0, 0],
 ('bird',): [1, 0, 0, 0],
 ('cat',): [0, 1, 0, 0],
 ('dog',): [0, 0, 1, 0],
 ('person',): [0, 0, 0, 1],
 ('bird', 'cat'): [1, 1, 0, 0],
 ('bird', 'dog'): [1, 0, 1, 0],
 ('bird', 'person'): [1, 0, 0, 1],
 ('cat', 'dog'): [0, 1, 1, 0],
 ('cat', 'person'): [0, 1, 0, 1],
 ('dog', 'person'): [0, 0, 1, 1],
 ('bird', 'cat', 'dog'): [1, 1, 1, 0],
 ('bird', 'cat', 'person'): [1, 1, 0, 1],
 ('bird', 'dog', 'person'): [1, 0, 1, 1],
 ('cat', 'dog', 'person'): [0, 1, 1, 1],
 ('bird', 'cat', 'dog', 'person'): [1, 1, 1, 1]}

## CONSTRUCT a mapping from Images to Disjoint Categories

`imgIds_to_cats`

In [15]:
imgIds_to_catNms = reverse_one_to_many_dict(catNms_to_imgIds)

# for key in np.random.choice(list(imgIds_to_catNms.keys()), 1000):
#     print(key, imgIds_to_catNms[key])

## CONSTRUCT a mapping from Images to Multi-Labels

`imgIds_to_labels`

In [16]:
imgIds_to_labels = {}

coco.getImgIds()
for imgId in coco.getImgIds():
    catNm = imgIds_to_catNms[imgId]
    imgIds_to_labels[imgId] = catNms_to_labels[catNm]
    
imgIds_to_labels

{391895: [0, 0, 0, 1],
 522418: [0, 0, 0, 1],
 184613: [0, 0, 0, 1],
 318219: [0, 0, 0, 1],
 554625: [0, 0, 0, 1],
 574769: [0, 1, 0, 1],
 60623: [0, 0, 0, 1],
 309022: [0, 0, 0, 0],
 5802: [0, 0, 0, 1],
 222564: [0, 0, 0, 1],
 118113: [0, 0, 0, 0],
 193271: [0, 0, 0, 0],
 224736: [0, 0, 0, 0],
 483108: [0, 0, 0, 1],
 403013: [0, 0, 0, 0],
 374628: [0, 0, 0, 0],
 328757: [0, 0, 0, 1],
 384213: [0, 0, 0, 0],
 293802: [0, 0, 0, 1],
 86408: [0, 0, 0, 0],
 372938: [0, 0, 0, 1],
 386164: [0, 0, 0, 0],
 223648: [0, 0, 0, 0],
 204805: [0, 0, 0, 1],
 113588: [0, 0, 0, 1],
 384553: [0, 0, 0, 1],
 337264: [0, 0, 0, 1],
 368402: [0, 0, 0, 1],
 12448: [0, 0, 0, 1],
 79841: [0, 0, 0, 1],
 515289: [1, 0, 0, 1],
 562150: [0, 1, 0, 1],
 542145: [0, 0, 0, 0],
 412151: [0, 0, 0, 1],
 579003: [0, 0, 0, 1],
 540186: [0, 0, 0, 0],
 242611: [0, 0, 0, 0],
 51191: [0, 0, 0, 0],
 269105: [0, 0, 0, 0],
 294832: [0, 0, 0, 0],
 462565: [0, 0, 0, 1],
 144941: [0, 0, 0, 0],
 173350: [0, 0, 1, 0],
 60760: [0, 0, 0, 

In [17]:
len(imgIds_to_labels)

118287

### SAVE: `imgIds_to_labels` in a json file

In [18]:
import json

with open('../my_annotations/imgIds_to_labels.json', 'w') as f:
    json.dump(imgIds_to_labels, f, sort_keys=True)

In [19]:
with open('../my_annotations/imgIds_to_labels.json', 'r') as f:
    imgIds_to_labels = json.load(f)

### CHECK: that the numbers for generic categories are the same

In [20]:
imgIds_to_labels_df = pd.DataFrame(imgIds_to_labels).T
imgIds_to_labels_df

Unnamed: 0,0,1,2,3
9,0,0,0,0
25,0,0,0,0
30,0,0,0,0
34,0,0,0,0
36,0,0,0,1
...,...,...,...,...
581906,0,0,0,0
581909,0,0,0,0
581913,0,0,0,0
581921,0,0,0,1


In [21]:
for i, name in enumerate(CATEGORIES_OF_INTEREST):
    print(i, name)

0 bird
1 cat
2 dog
3 person


In [22]:
imgIds_to_labels_df.sum(axis=0)

0     3237
1     4114
2     4385
3    64115
dtype: int64

In [23]:
for i, catNm in enumerate(sorted(CATEGORIES_OF_INTEREST)):
    catId = coco.getCatIds(catNms=[catNm])
    imgIds = coco.getImgIds(catIds=catId)
    print(i, len(imgIds))
    
    assert len(imgIds) == imgIds_to_labels_df.sum(axis=0)[i], \
        "Numbers for primary category ({}: '{}') do not match".format(i, CATEGORIES_OF_INTEREST[i])

0 3237
1 4114
2 4385
3 64115
