In [43]:
import pickle
import json
from pathlib import Path
from collections import Counter, defaultdict

In [44]:
# Every input and output of this notebook lives in one annotations directory;
# edit this single constant to point the notebook at a different location.
ANNOTATIONS_DIR = Path("D:/ml/code/datasets/coco/annotations")

# Input: the instances_val2017 annotation JSON parsed further down.
coco_path = ANNOTATIONS_DIR / "instances_val2017.json"
# Outputs: pickle files written by later cells (unique ids / id counts per image).
unique_ids_path = ANNOTATIONS_DIR / "unique_ids.pkl"
ids_num_path = ANNOTATIONS_DIR / "ids_num.pkl"

In [45]:
# Fail fast with a readable message when the annotation file is missing.
# is_file() also rejects a directory at that path, which exists() would accept
# even though the later open() call could not read it.
assert coco_path.is_file(), f"{coco_path} does not exist or is not a file"

# Load the COCO annotation JSON

In [31]:
# Read the whole annotation file as UTF-8 text and parse it in one step,
# then show the top-level sections of the resulting dict.
data = json.loads(coco_path.read_text(encoding="utf-8"))
data.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [32]:
# Category names in the order they appear in the JSON "categories" section.
# The loop variable gets its own name so it no longer shadows the result list.
categories = [category["name"] for category in data["categories"]]
categories

['person',
 'bicycle',
 'car',
 'motorcycle',
 'airplane',
 'bus',
 'train',
 'truck',
 'boat',
 'traffic light',
 'fire hydrant',
 'stop sign',
 'parking meter',
 'bench',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe',
 'backpack',
 'umbrella',
 'handbag',
 'tie',
 'suitcase',
 'frisbee',
 'skis',
 'snowboard',
 'sports ball',
 'kite',
 'baseball bat',
 'baseball glove',
 'skateboard',
 'surfboard',
 'tennis racket',
 'bottle',
 'wine glass',
 'cup',
 'fork',
 'knife',
 'spoon',
 'bowl',
 'banana',
 'apple',
 'sandwich',
 'orange',
 'broccoli',
 'carrot',
 'hot dog',
 'pizza',
 'donut',
 'cake',
 'chair',
 'couch',
 'potted plant',
 'bed',
 'dining table',
 'toilet',
 'tv',
 'laptop',
 'mouse',
 'remote',
 'keyboard',
 'cell phone',
 'microwave',
 'oven',
 'toaster',
 'sink',
 'refrigerator',
 'book',
 'clock',
 'vase',
 'scissors',
 'teddy bear',
 'hair drier',
 'toothbrush']

In [33]:
# Map each image id to its file name without the extension (the file stem).
id2file_name = {
    image["id"]: Path(image["file_name"]).stem
    for image in data["images"]
}
id2file_name

{397133: '000000397133',
 37777: '000000037777',
 252219: '000000252219',
 87038: '000000087038',
 174482: '000000174482',
 403385: '000000403385',
 6818: '000000006818',
 480985: '000000480985',
 458054: '000000458054',
 331352: '000000331352',
 296649: '000000296649',
 386912: '000000386912',
 502136: '000000502136',
 491497: '000000491497',
 184791: '000000184791',
 348881: '000000348881',
 289393: '000000289393',
 522713: '000000522713',
 181666: '000000181666',
 17627: '000000017627',
 143931: '000000143931',
 303818: '000000303818',
 463730: '000000463730',
 460347: '000000460347',
 322864: '000000322864',
 226111: '000000226111',
 153299: '000000153299',
 308394: '000000308394',
 456496: '000000456496',
 58636: '000000058636',
 41888: '000000041888',
 184321: '000000184321',
 565778: '000000565778',
 297343: '000000297343',
 336587: '000000336587',
 122745: '000000122745',
 219578: '000000219578',
 555705: '000000555705',
 443303: '000000443303',
 500663: '000000500663',
 418281

In [34]:
# Annotation records from the parsed JSON; later cells read "image_id" and
# "category_id" from each record. The bare len() displays how many there are.
annotations = data["annotations"]
len(annotations)

36781

In [35]:
# Peek at the first annotation: its image id, then its category id, one per line.
print(annotations[0]["image_id"], annotations[0]["category_id"], sep="\n")

289343
18


In [36]:
# Group category ids by image file stem, keeping duplicates and annotation order.
# NOTE: this rebinds `data`, which held the parsed JSON until now. Because the new
# value is a defaultdict(list), re-running an earlier cell that reads
# data["images"] or data["categories"] silently yields an empty list instead of
# failing -- reload the JSON before re-running those cells.
data = defaultdict(list)
for annotation in annotations:
    file_stem = id2file_name[annotation["image_id"]]
    data[file_stem].append(annotation["category_id"])
data

defaultdict(list,
            {'000000289343': [18, 1, 15, 2],
             '000000061471': [18, 44, 70],
             '000000472375': [18, 4, 47, 47],
             '000000520301': [18],
             '000000579321': [18, 1],
             '000000494869': [18, 1, 50, 51, 51, 44, 51, 51, 1],
             '000000554002': [18, 1, 1, 1, 1, 1, 1, 1, 31, 31, 1, 62, 1, 1],
             '000000078823': [18, 64, 3, 3, 3],
             '000000419974': [18,
              44,
              44,
              62,
              67,
              67,
              1,
              64,
              64,
              64,
              64,
              48,
              49,
              49,
              15,
              44,
              46,
              64,
              1,
              47,
              49,
              49,
              48,
              64,
              79],
             '000000404484': [18, 64, 72, 1, 88],
             '000000329219': [18,
              1,
              49,
 

# Get the sorted unique category ids per image

In [37]:
# For every image, collapse its category ids to a sorted, duplicate-free list.
new_data = {
    file_stem: sorted(set(category_ids))
    for file_stem, category_ids in data.items()
}
new_data

{'000000289343': [1, 2, 15, 18],
 '000000061471': [18, 44, 70],
 '000000472375': [4, 18, 47],
 '000000520301': [18],
 '000000579321': [1, 18],
 '000000494869': [1, 18, 44, 50, 51],
 '000000554002': [1, 18, 31, 62],
 '000000078823': [3, 18, 64],
 '000000419974': [1, 15, 18, 44, 46, 47, 48, 49, 62, 64, 67, 79],
 '000000404484': [1, 18, 64, 72, 88],
 '000000329219': [1, 18, 47, 48, 49, 50, 81],
 '000000068078': [18, 44, 70, 81],
 '000000170893': [18, 70],
 '000000065485': [3, 18],
 '000000498286': [3, 8, 18],
 '000000424162': [1, 2, 3, 8, 18, 27],
 '000000061108': [2, 3, 15, 18],
 '000000067213': [1, 3, 18],
 '000000365207': [3, 15, 18],
 '000000131273': [18],
 '000000279278': [1, 2, 18, 31, 41, 47],
 '000000482100': [64, 70],
 '000000540502': [50, 51, 53, 55, 62, 64, 67, 78, 79, 80, 81, 82, 86],
 '000000127182': [49, 64, 78, 79, 81, 82, 86],
 '000000565776': [47, 50, 51, 53, 64, 79, 81, 82, 86],
 '000000575970': [47, 51, 52, 53, 62, 64, 67, 78, 79, 81, 82, 86],
 '000000462614': [62, 64, 

In [38]:
# Persist the current `new_data` mapping to the unique-ids pickle file.
with unique_ids_path.open(mode="wb") as f:
    pickle.dump(new_data, f)

In [39]:
# Round-trip check: read the pickle back and print what was stored.
# The file is the one this notebook writes itself; do not point this at
# untrusted pickles, since unpickling can execute arbitrary code.
print(pickle.loads(unique_ids_path.read_bytes()))

{'000000289343': [1, 2, 15, 18], '000000061471': [18, 44, 70], '000000472375': [4, 18, 47], '000000520301': [18], '000000579321': [1, 18], '000000494869': [1, 18, 44, 50, 51], '000000554002': [1, 18, 31, 62], '000000078823': [3, 18, 64], '000000419974': [1, 15, 18, 44, 46, 47, 48, 49, 62, 64, 67, 79], '000000404484': [1, 18, 64, 72, 88], '000000329219': [1, 18, 47, 48, 49, 50, 81], '000000068078': [18, 44, 70, 81], '000000170893': [18, 70], '000000065485': [3, 18], '000000498286': [3, 8, 18], '000000424162': [1, 2, 3, 8, 18, 27], '000000061108': [2, 3, 15, 18], '000000067213': [1, 3, 18], '000000365207': [3, 15, 18], '000000131273': [18], '000000279278': [1, 2, 18, 31, 41, 47], '000000482100': [64, 70], '000000540502': [50, 51, 53, 55, 62, 64, 67, 78, 79, 80, 81, 82, 86], '000000127182': [49, 64, 78, 79, 81, 82, 86], '000000565776': [47, 50, 51, 53, 64, 79, 81, 82, 86], '000000575970': [47, 51, 52, 53, 62, 64, 67, 78, 79, 81, 82, 86], '000000462614': [62, 64, 70, 81], '000000407614': [

# Count category id occurrences per image

In [40]:
# For every image, count how many times each category id occurs.
# NOTE: this reuses the name `new_data`, replacing the unique-id mapping built earlier.
new_data = {
    file_stem: dict(Counter(category_ids))
    for file_stem, category_ids in data.items()
}
new_data

{'000000289343': {18: 1, 1: 1, 15: 1, 2: 1},
 '000000061471': {18: 1, 44: 1, 70: 1},
 '000000472375': {18: 1, 4: 1, 47: 2},
 '000000520301': {18: 1},
 '000000579321': {18: 1, 1: 1},
 '000000494869': {18: 1, 1: 2, 50: 1, 51: 4, 44: 1},
 '000000554002': {18: 1, 1: 10, 31: 2, 62: 1},
 '000000078823': {18: 1, 64: 1, 3: 3},
 '000000419974': {18: 1,
  44: 3,
  62: 1,
  67: 2,
  1: 2,
  64: 6,
  48: 2,
  49: 4,
  15: 1,
  46: 1,
  47: 1,
  79: 1},
 '000000404484': {18: 1, 64: 1, 72: 1, 1: 1, 88: 1},
 '000000329219': {18: 1, 1: 1, 49: 1, 47: 13, 48: 1, 50: 3, 81: 1},
 '000000068078': {18: 1, 44: 1, 70: 1, 81: 1},
 '000000170893': {18: 1, 70: 1},
 '000000065485': {18: 1, 3: 1},
 '000000498286': {18: 1, 3: 1, 8: 1},
 '000000424162': {18: 1, 2: 4, 3: 3, 1: 3, 27: 1, 8: 1},
 '000000061108': {18: 1, 2: 1, 3: 4, 15: 3},
 '000000067213': {18: 1, 3: 5, 1: 8},
 '000000365207': {18: 1, 3: 3, 15: 1},
 '000000131273': {18: 1},
 '000000279278': {18: 1, 1: 7, 2: 11, 41: 1, 47: 1, 31: 2},
 '000000482100': {6

In [41]:
# Persist the current `new_data` mapping to the id-counts pickle file.
with ids_num_path.open(mode="wb") as f:
    pickle.dump(new_data, f)

In [42]:
# Round-trip check: read the pickle back and print what was stored.
# The file is the one this notebook writes itself; do not point this at
# untrusted pickles, since unpickling can execute arbitrary code.
print(pickle.loads(ids_num_path.read_bytes()))

{'000000289343': {18: 1, 1: 1, 15: 1, 2: 1}, '000000061471': {18: 1, 44: 1, 70: 1}, '000000472375': {18: 1, 4: 1, 47: 2}, '000000520301': {18: 1}, '000000579321': {18: 1, 1: 1}, '000000494869': {18: 1, 1: 2, 50: 1, 51: 4, 44: 1}, '000000554002': {18: 1, 1: 10, 31: 2, 62: 1}, '000000078823': {18: 1, 64: 1, 3: 3}, '000000419974': {18: 1, 44: 3, 62: 1, 67: 2, 1: 2, 64: 6, 48: 2, 49: 4, 15: 1, 46: 1, 47: 1, 79: 1}, '000000404484': {18: 1, 64: 1, 72: 1, 1: 1, 88: 1}, '000000329219': {18: 1, 1: 1, 49: 1, 47: 13, 48: 1, 50: 3, 81: 1}, '000000068078': {18: 1, 44: 1, 70: 1, 81: 1}, '000000170893': {18: 1, 70: 1}, '000000065485': {18: 1, 3: 1}, '000000498286': {18: 1, 3: 1, 8: 1}, '000000424162': {18: 1, 2: 4, 3: 3, 1: 3, 27: 1, 8: 1}, '000000061108': {18: 1, 2: 1, 3: 4, 15: 3}, '000000067213': {18: 1, 3: 5, 1: 8}, '000000365207': {18: 1, 3: 3, 15: 1}, '000000131273': {18: 1}, '000000279278': {18: 1, 1: 7, 2: 11, 41: 1, 47: 1, 31: 2}, '000000482100': {64: 5, 70: 2}, '000000540502': {64: 1, 82: 1