In [None]:
# https://github.com/microsoft/CameraTraps/blob/main/data_management/lila/create_lila_test_set.py
# https://github.com/cindyweng/coco-to-yolo-by-category/blob/5fcd1ae51af89c1c678d903a4aff5d32cba25b0b/coco-to-yolo-by-category.py#L41

In [15]:
import json
import random
import numpy as np
import shutil
import os
import glob
from pathlib import Path
import matplotlib.pyplot as plt
from operator import itemgetter 
from itertools import groupby
from os.path import exists


random.seed(42)

In [16]:
# Source data: root folder of the image files and the JSON metadata file.
basepath = './data/islands/images/images/' # TODO: refactor to match the naming of the other path variables
metadata_path = './data/islands/metadata.json'
# Output: one image folder per split (train / val / test) ...
train_path = './data/ultralytics/images/train/'
val_path = './data/ultralytics/images/val/'
test_path = './data/ultralytics/images/test/'
# ... and the label root; the split name ('train/', 'val/', 'test/') is appended to it later.
label_path = './data/ultralytics/labels/'

In [17]:
# Parse the metadata JSON; later cells read its 'images', 'annotations' and 'categories' entries.
with open(metadata_path) as f:
    d = json.load(f)

In [4]:
# Find metadata entries whose image file is not present under basepath
ne = [img for img in d['images'] if not exists(basepath + img['file_name'])]
i = len(ne)
print("{} images do not exist (images may overlap)".format(i))

# Ids of the missing images, used later to exclude their annotations
remove_missing_id = {n.get('id') for n in ne}

5071 images do not exist (images may overlap)


In [18]:
def _sample_up_to(population, k, label):
    """Draw k random items from population, clamping k to the population size with a warning."""
    if k > len(population):
        print('Warning: only {} {} annotations available, {} requested'.format(len(population), label, k))
        k = len(population)
    return random.sample(population, k)


def gen_dataset(d, remove_missing_id, n_empty= 1000, n_nempty=1000):
    """Randomly select empty and non-empty annotations and return their image records.

    Annotations of the 'human' category and annotations whose image id is in
    ``remove_missing_id`` are never selected.

    Parameters
    ----------
    d : dict
        Metadata with 'categories' (entries with 'id' and 'name'),
        'annotations' (entries with 'category_id' and 'image_id') and
        'images' (entries with 'id').
    remove_missing_id : container
        Image ids to leave out; only membership tests are performed on it.
    n_empty : int
        Number of annotations of the 'empty' category to sample.
    n_nempty : int
        Number of annotations of any other (non-human) category to sample.

    Returns
    -------
    list
        The entries of ``d['images']`` whose id belongs to a sampled
        annotation, in the order in which they appear in ``d['images']``.

    Notes
    -----
    If fewer annotations are available than requested, all available ones are
    used and a warning is printed. An image that owns several sampled
    annotations is returned once, so the result can be shorter than
    ``n_empty + n_nempty``.
    """
    category_name_to_id = {c['name']: c['id'] for c in d['categories']}

    # -1 stands in for a category that does not exist in this metadata
    human_category_id = category_name_to_id.get('human', -1)  # humans are filtered out

    if 'empty' not in category_name_to_id:
        print('Warning: no empty images available for {}'.format('dataset'))
        empty_category_id = -1
        empty_annotations_to_download = []
    else:
        empty_category_id = category_name_to_id['empty']
        empty_annotations = [
            ann for ann in d['annotations']
            if ann['category_id'] == empty_category_id and ann['image_id'] not in remove_missing_id
        ]
        empty_annotations_to_download = _sample_up_to(empty_annotations, n_empty, 'empty')

    excluded_category_ids = (empty_category_id, human_category_id)
    non_empty_annotations = [
        ann for ann in d['annotations']
        if ann['category_id'] not in excluded_category_ids and ann['image_id'] not in remove_missing_id
    ]
    non_empty_annotations_to_download = _sample_up_to(non_empty_annotations, n_nempty, 'non-empty')

    annotations_to_download = empty_annotations_to_download + non_empty_annotations_to_download
    # A set: each image is kept once even when several of its annotations were drawn
    image_ids_to_download = {ann['image_id'] for ann in annotations_to_download}

    images_to_download = [im for im in d['images'] if im['id'] in image_ids_to_download]
    # Fails when a selected id is absent from d['images'] or listed there more than once
    assert len(images_to_download) == len(image_ids_to_download), \
        'selected image ids do not map one-to-one onto image records'

    return images_to_download

In [31]:
images_to_download = gen_dataset(d, remove_missing_id, 20000, 2000)

# Cut the sequence at 80 % and 90 % -> train / validate / test.
# np.split keeps the existing order of the images; nothing is shuffled here.
n_selected = len(images_to_download)
split_points = [int(.8 * n_selected), int(.9 * n_selected)]
train, validate, test = np.split(images_to_download, split_points)

In [20]:
trlabelpath, vlabelpath, telabelpath = label_path + "train/", label_path + "val/", label_path + 'test/'

# Make sure every output folder exists and remove images/labels left over from a previous run.
for p in (train_path, val_path, test_path, trlabelpath, vlabelpath, telabelpath):
    os.makedirs(p, exist_ok=True)
    # os.path.join gives a wildcard pattern that works on every platform
    for f in glob.glob(os.path.join(p, '*')):
        if f.endswith(('.jpg', '.txt')):
            os.remove(f)

# Copy each selected image into the folder of its split.
for split_images, split_dir in ((train, train_path), (validate, val_path), (test, test_path)):
    for im in split_images:
        # Flatten the relative path into the file name: the same base name (001, 002, ...)
        # occurs in several source folders, and the name has to match the label file.
        flat_name = im['file_name'].replace("/", "-")
        shutil.copy2(basepath + im['file_name'], os.path.join(split_dir, flat_name))

In [21]:

def truncate(n, decimals=0):
    """Cut ``n`` off after ``decimals`` decimal places, without rounding.

    The value is scaled by ``10 ** decimals``, reduced to its integer part
    with ``int()`` (i.e. towards zero) and scaled back, so the result is a float.
    """
    shifted = n * 10 ** decimals
    return int(shifted) / 10 ** decimals


def createLabelsSingle(imageList, basedir, labeldirname, metadata_full):
    """Write one YOLO-style label line per image (single object per image only).

    For every image a line ``<category_id> <x_center> <y_center> <width> <height>``
    is appended to ``<basedir>/../../labels/<labeldirname><flattened file name>.txt``.
    The four box values are divided by the image width/height and truncated to
    7 decimals.

    Parameters
    ----------
    imageList : iterable of dict
        Image records providing 'id', 'file_name', 'width' and 'height'.
    basedir : str
        Image folder of the split; the label root is derived from its grandparent.
    labeldirname : str
        Folder of the split below ``labels/``, including the trailing slash
        (e.g. ``'train/'``).
    metadata_full : dict
        Metadata whose 'annotations' entries provide 'image_id', 'category_id'
        and optionally 'bbox', read as ``[x_min, y_min, width, height]``.

    Raises
    ------
    KeyError
        If an annotation has no 'bbox' although its category id is not 0, or
        if an image has no annotation at all.

    Notes
    -----
    When an image has several annotations only the last one in
    ``metadata_full['annotations']`` is used. Files are opened in append mode,
    so calling this again for the same image adds another line.
    """
    # A set keeps the membership test below O(1) per annotation
    ids = {image.get('id') for image in imageList}

    print("!WARNING: hardcoded fix for islands dataset")

    # Lookup of bbox and category id by image id
    lookup = {}
    for meta in metadata_full["annotations"]:
        if meta["image_id"] not in ids:
            continue

        if 'bbox' in meta:
            bb = meta['bbox']
        elif meta['category_id'] != 0:
            # Only category 0 (treated as the empty class) may lack a bounding box
            raise KeyError('Keyerror on boundingbox but not an empty image!')
        else:
            bb = [0, 0, 1919, 1079]  # TODO this is hardcoded fix/default for the islands dataset

        lookup[meta['image_id']] = {"bbox": bb, "category_id": meta["category_id"]}

    # Label folder of this split; it does not change between images
    labeldir = str(Path(basedir).parent.parent) + "/labels/" + labeldirname

    for im in imageList:
        ann = lookup.get(im['id'])
        if ann is None:
            # Fail before an empty label file gets created for this image
            raise KeyError('No annotation found for image id {!r}'.format(im['id']))

        dw = 1. / im['width']
        dh = 1. / im['height']

        # Corner coordinates from [x_min, y_min, width, height]
        xmin = ann["bbox"][0]
        ymin = ann["bbox"][1]
        xmax = ann["bbox"][2] + ann["bbox"][0]
        ymax = ann["bbox"][3] + ann["bbox"][1]

        # Box centre and size, scaled by the image dimensions
        x = (xmin + xmax) / 2 * dw
        y = (ymin + ymax) / 2 * dh
        w = (xmax - xmin) * dw
        h = (ymax - ymin) * dh

        fields = [ann['category_id']] + [truncate(value, 7) for value in (x, y, w, h)]
        mystring = " ".join(str(field) for field in fields)

        # Same flattened name as the copied image, with a .txt extension
        filename = im['file_name'].replace(".jpg", ".txt").replace("/", "-")
        with open(labeldir + filename, "a") as myfile:
            myfile.write(mystring)
            myfile.write("\n")

In [22]:
# Write the label files of all three splits
for split_images, split_dir, split_name in (
    (train, train_path, 'train/'),
    (validate, val_path, 'val/'),
    (test, test_path, 'test/'),
):
    createLabelsSingle(split_images, split_dir, split_name, d)

# sanity check: the label folders must hold as many files as there are images in the splits
n_split_images = len(train) + len(validate) + len(test)
n_label_files = sum(len(glob.glob(label_path + pattern)) for pattern in ("/test/*", "/train/*", "/val/*"))
assert n_split_images == n_label_files

# TODO perhaps add labels?
# Record the size of each split next to the labels folder
description_file = str(Path(label_path).parent) + '/description'
with open(description_file, 'w') as f:
    f.write('train, val, test,\n{}, {}, {}'.format(len(train), len(validate), len(test)))



In [23]:
# Re-check: number of label files on disk equals the number of images in the three splits
label_file_count = sum(len(glob.glob(label_path + pattern)) for pattern in ("/test/*", "/train/*", "/val/*"))
assert (len(train) + len(validate) + len(test)) == label_file_count

In [None]:
# TODO auto generate YAML
# import yaml

# data = dict('path' : './data/islands',  # dataset root dir
#     'train' : 'images/train2017',  # train images (relative to 'path') 128 images
#     'val' : 'images/train2017',  # val images (relative to 'path') 128 images
#     A = 'a',
#     B = dict(
#         C = 'c',
#         D = 'd',
#         E = 'e',
#     )
# )


# with open('data.yml', 'w') as outfile:
#     yaml.dump(data, outfile, default_flow_style=False)