In [34]:
import os
import json
from urllib import request

import pandas as pd
from rich import print

In [2]:
def load_meta(anno_path):
    """Read an annotation JSON file and split it into two tables.

    Parameters
    ----------
    anno_path : str or path-like
        Path to a JSON file whose top-level object has the keys
        "annotations" and "images".

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The frame built from "annotations" first, then the frame built
        from "images".

    Raises
    ------
    KeyError
        If either of the two expected keys is absent from the JSON object.
    """
    with open(anno_path, 'r') as handle:
        payload = json.load(handle)

    # Build the frames in the order they are returned: annotations, images.
    tables = tuple(pd.DataFrame(payload[key]) for key in ("annotations", "images"))
    return tables


def target_id(img_meta, imgs_dir):
    """Return the image ids listed in the metadata that have no file on disk.

    Parameters
    ----------
    img_meta : pandas.DataFrame
        Image metadata with an ``id`` column (the same column the other
        helpers in this notebook use to identify an image).
    imgs_dir : str or path-like
        Directory holding already-downloaded images, named
        ``<zero-padded id>.jpg`` (e.g. ``000000000009.jpg``).

    Returns
    -------
    set
        Ids present in ``img_meta`` for which no ``.jpg`` file exists in
        ``imgs_dir``.
    """
    # .tolist() yields plain Python scalars rather than numpy scalars.
    imgs_id = set(img_meta["id"].tolist())

    existent_img_ids = set()
    for file_name in os.listdir(imgs_dir):
        stem, ext = os.path.splitext(file_name)
        # Skip anything that is not "<digits>.jpg" (hidden files, partial
        # downloads, ...) instead of crashing on int().
        if ext.lower() == '.jpg' and stem.isdigit():
            existent_img_ids.add(int(stem))

    inexistent_img_ids = imgs_id - existent_img_ids

    return inexistent_img_ids


def query_img(inexistent_img_ids, img_meta):
    """Select the image-metadata rows whose ``id`` is in the given collection.

    Parameters
    ----------
    inexistent_img_ids : iterable
        Image ids to look up.
    img_meta : pandas.DataFrame
        Image metadata with an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``img_meta`` with their original columns,
        index labels and row order. Empty (same columns) when nothing
        matches.
    """
    # One vectorised membership mask instead of one filtered frame per id;
    # wrapping a list of DataFrames in pd.DataFrame() does not concatenate them.
    wanted_ids = list(inexistent_img_ids)
    target_meta = img_meta[img_meta["id"].isin(wanted_ids)]
    return target_meta


def downloader(target_meta, imgs_dir):
    """Download every URL in ``target_meta.coco_url`` into ``imgs_dir``.

    Each file is stored under the last path component of its URL
    (``.../train2017/000000000009.jpg`` becomes
    ``<imgs_dir>/000000000009.jpg``).

    Parameters
    ----------
    target_meta : pandas.DataFrame
        Frame with a ``coco_url`` column holding one URL per row.
    imgs_dir : str or path-like
        Destination directory; created if it does not exist yet.

    Returns
    -------
    None
    """
    # Function-scope import: only this helper needs URL parsing.
    from urllib.parse import urlsplit

    os.makedirs(imgs_dir, exist_ok=True)

    for url in target_meta.coco_url:
        # urlretrieve's second argument is the destination *file*; passing
        # the directory itself would fail, so derive a per-image file name.
        file_name = os.path.basename(urlsplit(url).path)
        request.urlretrieve(url, os.path.join(imgs_dir, file_name))

    return None


def valid_anno_existent(anno_meta, img_meta):
    """Check that every annotated image id also appears in the image metadata.

    Parameters
    ----------
    anno_meta : pandas.DataFrame
        Annotation metadata with an ``image_id`` column.
    img_meta : pandas.DataFrame
        Image metadata with an ``id`` column.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If at least one ``image_id`` in ``anno_meta`` has no matching ``id``
        in ``img_meta``; the message lists the missing ids.
    """
    # .tolist() yields plain Python scalars so the message prints e.g. {5}.
    id_1 = set(anno_meta["image_id"].tolist())
    id_2 = set(img_meta["id"].tolist())
    judge = id_1 - id_2

    # `judge` is a set, and a set never compares equal to `{}` (an empty
    # dict), so emptiness is tested by truthiness. Raising explicitly keeps
    # the check active even when assertions are disabled with `python -O`.
    if judge:
        raise AssertionError(
            f"In spite of anno meta existent, image meta is inexistent. {judge} is missing"
        )

    return None


def main(anno_path, imgs_dir):
    """Run the whole pipeline: load metadata, validate it, fetch missing images.

    Parameters
    ----------
    anno_path : str or path-like
        Annotation JSON file, forwarded to ``load_meta``.
    imgs_dir : str or path-like
        Image directory, forwarded to ``target_id`` and ``downloader``.

    Returns
    -------
    None
    """
    anno_meta, img_meta = load_meta(anno_path)
    valid_anno_existent(anno_meta, img_meta)
    inexistent_img_ids = target_id(img_meta, imgs_dir)
    # query_img is declared as (inexistent_img_ids, img_meta); the metadata
    # frame must be passed along or the call raises TypeError.
    target_meta = query_img(inexistent_img_ids, img_meta)
    downloader(target_meta, imgs_dir)

    return None


In [16]:
# Annotation records as a DataFrame, ordered by image_id for inspection.
# NOTE(review): `meta` is not assigned by any visible top-level cell
# (load_meta only binds it locally) — presumably left over from earlier
# kernel state; confirm this cell survives Restart & Run All.
meta_anno = pd.DataFrame(meta["annotations"]).sort_values("image_id")
meta_anno

Unnamed: 0,segmentation,area,iscrowd,image_id,bbox,category_id,id
765531,"[[376.2, 61.55, 391.86, 46.35, 424.57, 40.36, ...",2239.29240,0,9,"[376.2, 40.36, 75.55, 46.53]",55,1913551
765615,"[[473.92, 85.64, 469.58, 83.47, 465.78, 78.04,...",1658.89130,0,9,"[465.78, 38.97, 58.07, 46.67]",55,1913746
496775,"[[357.03, 69.03, 311.73, 15.1, 550.11, 4.31, 6...",44434.75110,0,9,"[311.73, 4.31, 319.28, 228.68]",51,1039564
503967,"[[249.6, 348.99, 267.67, 311.72, 291.39, 294.7...",49577.94435,0,9,"[249.6, 229.27, 316.24, 245.08]",56,1058555
765729,"[[458.81, 24.94, 437.61, 4.99, 391.48, 2.49, 3...",2975.27600,0,9,"[364.05, 2.49, 94.76, 71.07]",55,1914001
...,...,...,...,...,...,...,...
155718,"[[309.27, 366.57, 283.15, 305.9, 282.3, 258.71...",38299.31380,0,581913,"[282.3, 169.38, 216.58, 201.41]",60,1079230
410872,"[[291.7, 217.82, 306.1, 185.19, 306.1, 171.76,...",12501.82855,0,581921,"[165.04, 7.68, 213.98, 220.69]",1,459186
463879,"[[286.15, 214.61, 278.23, 236.7, 278.23, 236.7...",1618.13685,0,581921,"[275.31, 159.6, 41.26, 106.69]",36,618855
115195,"[[259.1, 175.49, 245.86, 188.07, 243.88, 210.5...",22303.98445,0,581929,"[243.88, 162.92, 203.14, 176.01]",19,55070


In [15]:
# Image records as a DataFrame, ordered by id for inspection.
# NOTE(review): `meta` is not assigned by any visible top-level cell
# (load_meta only binds it locally) — presumably left over from earlier
# kernel state; confirm this cell survives Restart & Run All.
meta_img = pd.DataFrame(meta["images"]).sort_values("id")
meta_img

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
98486,3,000000000009.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-19 20:40:11,http://farm5.staticflickr.com/4026/4622125393_...,9
63326,1,000000000025.jpg,http://images.cocodataset.org/train2017/000000...,426,640,2013-11-16 14:11:30,http://farm1.staticflickr.com/94/241612385_d9e...,25
96855,4,000000000030.jpg,http://images.cocodataset.org/train2017/000000...,428,640,2013-11-24 03:32:32,http://farm4.staticflickr.com/3377/3573516590_...,30
86232,6,000000000034.jpg,http://images.cocodataset.org/train2017/000000...,425,640,2013-11-18 16:32:48,http://farm5.staticflickr.com/4024/4599442031_...,34
91250,3,000000000036.jpg,http://images.cocodataset.org/train2017/000000...,640,481,2013-11-18 06:56:10,http://farm8.staticflickr.com/7216/7200825264_...,36
...,...,...,...,...,...,...,...,...
92309,3,000000581906.jpg,http://images.cocodataset.org/train2017/000000...,428,640,2013-11-18 07:15:43,http://farm7.staticflickr.com/6205/6045723042_...,581906
107620,1,000000581909.jpg,http://images.cocodataset.org/train2017/000000...,426,640,2013-11-18 02:28:50,http://farm1.staticflickr.com/229/505814994_41...,581909
12287,1,000000581913.jpg,http://images.cocodataset.org/train2017/000000...,375,500,2013-11-20 17:16:02,http://farm2.staticflickr.com/1426/626848713_0...,581913
37159,1,000000581921.jpg,http://images.cocodataset.org/train2017/000000...,427,640,2013-11-20 13:14:15,http://farm4.staticflickr.com/3612/3348961791_...,581921


In [68]:
# Toy check of the id validation: id 5 has an annotation but no image entry.
anno = {1,2,3,4,5}
img = {1,2,3,4}
judge = anno - img
try:
    # A set never compares equal to `{}` (an empty dict), so emptiness is
    # tested by truthiness; otherwise the assert would fire even when no id
    # is missing.
    assert not judge, f"In spite of anno meta existent, image meta is inexistent. lack meta of ids which is {judge}"
except AssertionError as err:
    # The failure is the demonstration: display it without aborting Run All.
    print(f"{type(err).__name__}: {err}")

AssertionError: In spite of anno meta existent, image meta is inexistent. lack meta of ids which is {5}

In [64]:
# Preview of the assertion message text on its own.
# NOTE(review): uses `judge`, which is not defined in this cell — it relies on
# a cell that assigns `judge` having been run first.
f"In spite of anno meta existent, image meta is inexistent. lack meta of ids which is {judge}"


'In spite of anno meta existent, image meta is inexistent. lack meta of ids which is {5}'