In [1]:
'''This notebook heavily reuses code from https://patrickwasp.com/create-your-own-coco-style-dataset/'''
'''Thank you to @waspinator for making this wonderful tool.'''


import datetime
import json
import os
import re
import fnmatch
from PIL import Image
import numpy as np
from pycococreatortools import pycococreatortools

In [26]:
# ---------------------------------------------------------------------------
# Dataset selection and derived directories.
# ---------------------------------------------------------------------------
DATASET_NAME = 'tesri_rndspd_100_valid'
ROOT_DIR = '/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100'

# Images are read from <ROOT_DIR>/<DATASET_NAME>; the annotation files are
# read from the sibling directory <ROOT_DIR>/mask_<DATASET_NAME>.
IMAGE_DIR = os.path.join(ROOT_DIR, DATASET_NAME)
ANNOTATION_DIR = os.path.join(ROOT_DIR, 'mask_' + DATASET_NAME)

# ---------------------------------------------------------------------------
# Static sections copied verbatim into the output JSON
# ("info", "licenses", "categories").
# ---------------------------------------------------------------------------
INFO = dict(
    description="Moth Dataset_%s" % (DATASET_NAME),
    url="none",
    version="0.1.0",
    year=2018,
    contributor="KatieHYT",
    # Creation timestamp in UTC, formatted with a space between date and time.
    date_created=datetime.datetime.utcnow().isoformat(' '),
)

LICENSES = [
    dict(id=1, name="yeah", url="yeah"),
]

# Single-class dataset: every annotation is assigned to the 'moth' category.
CATEGORIES = [
    dict(id=1, name='moth', supercategory='Moth'),
]

In [27]:
def filter_for_jpeg(root, files):
    """Select the JPEG files of one directory.

    Args:
        root: Directory path that is joined in front of every file name.
        files: File names found in ``root``.

    Returns:
        A list of ``os.path.join(root, name)`` paths, in the order of
        ``files``, restricted to paths ending in ``.jpeg`` or ``.jpg``
        (the match is case-sensitive).
    """
    # Translate both glob patterns and OR them into one compiled regex; it is
    # applied to the joined path, not to the bare file name.
    jpeg_regex = re.compile(
        r'|'.join(fnmatch.translate(glob) for glob in ('*.jpeg', '*.jpg')))

    joined_paths = (os.path.join(root, name) for name in files)
    return [path for path in joined_paths if jpeg_regex.match(path)]

def filter_for_annotations(root, files, image_filename):
    """Select the PNG annotation files that belong to one image.

    A file is kept when its joined path ends in ``.png`` and its base name
    (extension removed) starts with the image's base name (extension removed)
    followed by an underscore, e.g. image ``foo_25.jpg`` selects
    ``foo_25_moth_0.png`` but not ``foo_250_moth_0.png``.

    The image base name is compared literally. It used to be spliced into a
    regular expression unescaped, so characters such as ``.``, ``(`` or ``[``
    in an image name selected the wrong masks or raised ``re.error``.

    Args:
        root: Directory path that is joined in front of every file name.
        files: File names found in ``root``.
        image_filename: Path (or bare name) of the image whose annotations
            are wanted; only its base name without extension is used.

    Returns:
        A list of ``os.path.join(root, name)`` paths, in the order of
        ``files``, for the matching annotation files.
    """
    png_regex = re.compile(fnmatch.translate('*.png'))
    image_stem = os.path.splitext(os.path.basename(image_filename))[0]
    # Literal prefix every matching annotation stem must start with.
    required_prefix = image_stem + '_'

    matches = []
    for name in files:
        path = os.path.join(root, name)
        if not png_regex.match(path):
            continue
        annotation_stem = os.path.splitext(os.path.basename(path))[0]
        if annotation_stem.startswith(required_prefix):
            matches.append(path)

    # Progress marker; the text is kept identical to what this function has
    # always printed so existing notebook output stays comparable.
    print(image_stem + '_.*')

    return matches

In [28]:
# Output location of the annotation file:
#   <ROOT_DIR>/annotations/instances_<DATASET_NAME>.json
SAVE_DIR = os.path.join(ROOT_DIR,'annotations')
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
save_path = os.path.join(SAVE_DIR, 'instances_%s.json' % DATASET_NAME)

# NOTE: the JSON is deliberately NOT written in this cell. `coco_output` is
# only created by the following cell, so dumping it here raised NameError on
# a fresh kernel (Restart & Run All) or, on a warm kernel, wrote the stale
# result of a previous run under the current DATASET_NAME. The following cell
# writes the file itself once `coco_output` has been fully populated.

In [29]:
# Build the COCO-style dictionary for DATASET_NAME and write it to disk.
#
# For every JPEG under IMAGE_DIR one "images" entry is created, and for every
# PNG under ANNOTATION_DIR whose name starts with "<image name>_" one
# "annotations" entry is created (when pycococreatortools returns one).
coco_output = {
    "info": INFO,
    "licenses": LICENSES,
    "categories": CATEGORIES,
    "images": [],
    "annotations": []
}

# Both counters are initialised outside the loops on purpose: they number the
# images / annotations consecutively across everything that is processed.
image_id = 1
segmentation_id = 1

# Walk the annotation tree once up front instead of once per image; the
# listing does not depend on the image being processed.
annotation_tree = list(os.walk(ANNOTATION_DIR))

# filter for jpeg images
for root, _, files in os.walk(IMAGE_DIR):
    image_files = filter_for_jpeg(root, files)

    # go through each image
    for image_filename in image_files:
        image_basename = os.path.basename(image_filename)

        # Only the size is needed; the context manager closes the file handle
        # right away instead of leaving it to the garbage collector.
        with Image.open(image_filename) as image:
            image_size = image.size

        image_info = pycococreatortools.create_image_info(
            image_id, image_basename, image_size)
        coco_output["images"].append(image_info)

        # filter for associated png annotations
        # (distinct loop names so the outer `root` / `files` are not shadowed)
        for mask_root, _, mask_files in annotation_tree:
            annotation_files = filter_for_annotations(mask_root, mask_files, image_filename)
            check_mask_count = 1

            # go through each associated annotation
            for annotation_filename in annotation_files:
                mask_basename = os.path.basename(annotation_filename)

                # Match the category / crowd markers against the file names
                # only. Matching the full path let directory names decide:
                # ROOT_DIR itself contains "moth", so every file matched the
                # 'moth' category regardless of its own name.
                class_id = [x['id'] for x in CATEGORIES if x['name'] in mask_basename][0]
                category_info = {'id': class_id, 'is_crowd': 'crowd' in image_basename}

                # Convert the mask to 1-bit and then to a uint8 array; the
                # source file is closed as soon as the pixels are copied.
                with Image.open(annotation_filename) as mask_image:
                    binary_mask = np.asarray(mask_image.convert('1')).astype(np.uint8)

                annotation_info = pycococreatortools.create_annotation_info(
                    segmentation_id, image_id, category_info, binary_mask,
                    image_size, tolerance=2)

                # create_annotation_info may return None; such masks are skipped.
                if annotation_info is not None:
                    coco_output["annotations"].append(annotation_info)
                print('IMAGE: %d/%d, MASK: %d/%d' % (image_id, len(image_files), check_mask_count, len(annotation_files)))
                print(annotation_filename)
                check_mask_count = check_mask_count + 1
                segmentation_id = segmentation_id + 1

        image_id = image_id + 1

# Write the result to <ROOT_DIR>/annotations/instances_<DATASET_NAME>.json,
# creating the directory when it does not exist yet.
SAVE_DIR = os.path.join(ROOT_DIR,'annotations')
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
save_path = os.path.join(SAVE_DIR, 'instances_%s.json' % DATASET_NAME)

with open(save_path, 'w') as output_json_file:
    json.dump(coco_output, output_json_file)

# will print out processed mask name


tesri_rndspd_100_valid_25_.*
IMAGE: 1/30, MASK: 1/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_25_moth_2.png
IMAGE: 1/30, MASK: 2/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_25_moth_4.png
IMAGE: 1/30, MASK: 3/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_25_moth_3.png
IMAGE: 1/30, MASK: 4/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_25_moth_0.png
IMAGE: 1/30, MASK: 5/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_25_moth_1.png
tesri_rndspd_100_valid_1_.*
IMAGE: 2/30, MASK: 1/6
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_1_moth_4.png
IMAGE: 2/30, MASK: 2/6
/home/put_data/moth/data/CO

IMAGE: 10/30, MASK: 2/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_4_moth_0.png
IMAGE: 10/30, MASK: 3/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_4_moth_2.png
IMAGE: 10/30, MASK: 4/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_4_moth_1.png
tesri_rndspd_100_valid_23_.*
IMAGE: 11/30, MASK: 1/9
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_23_moth_3.png
IMAGE: 11/30, MASK: 2/9
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_23_moth_6.png
IMAGE: 11/30, MASK: 3/9
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_23_moth_1.png
IMAGE: 11/30, MASK: 4/9
/home/put_data/moth/data/COCOdataset_format/TESRI_

IMAGE: 20/30, MASK: 2/3
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_28_moth_0.png
IMAGE: 20/30, MASK: 3/3
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_28_moth_1.png
tesri_rndspd_100_valid_7_.*
IMAGE: 21/30, MASK: 1/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_7_moth_1.png
IMAGE: 21/30, MASK: 2/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_7_moth_0.png
IMAGE: 21/30, MASK: 3/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_7_moth_4.png
IMAGE: 21/30, MASK: 4/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_7_moth_2.png
IMAGE: 21/30, MASK: 5/5
/home/put_data/moth/data/COCOdataset_format/TESRI_RN

IMAGE: 29/30, MASK: 1/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_11_moth_3.png
IMAGE: 29/30, MASK: 2/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_11_moth_1.png
IMAGE: 29/30, MASK: 3/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_11_moth_2.png
IMAGE: 29/30, MASK: 4/4
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_11_moth_0.png
tesri_rndspd_100_valid_6_.*
IMAGE: 30/30, MASK: 1/2
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_6_moth_0.png
IMAGE: 30/30, MASK: 2/2
/home/put_data/moth/data/COCOdataset_format/TESRI_RNDSPD_100/mask_tesri_rndspd_100_valid/tesri_rndspd_100_valid_6_moth_1.png


In [None]:
# # don't know why the file cannot be saved directly in annotations_dir
# ! mv ../moth/random_spread/instances_random_spread.json ../moth/random_spread/annotations/