# Join all LabelAR collects in a given folder
So that we have one annotation file and one image folder for easy training. (One each for train and val, actually)
The folder structure will be:
 - `data/annotations/new-annotation-file.json`
 - `data/images/new-image-folder/*.png`
 
__Before running this notebook__, copy all the LabelAR collection folders you want to join into a single folder inside `data/`, and give that folder a unique dataset name, so that you have:
  - `data/unique-dataset-name/collect-id-1/`, `data/unique-dataset-name/collect-id-2/`, ...

In [None]:
import os
import subprocess
from pathlib import Path
import contextlib

# Ask git for the top-level directory of the repository and make it the
# working directory, so the paths built from os.getcwd() and the relative
# sys.path entry used further down do not depend on where the notebook was started.
git_toplevel = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])
root_dir = Path(git_toplevel.strip().decode("utf-8"))
os.chdir(root_dir)

import glob
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline
from PIL import Image

Importing from pycocotools can be problematic.  Did you activate the right conda env e.g. `conda activate labelar_demo`?

In [None]:
import sys
# Fallback location for pycocotools: a relative path, so it is looked up from the current working directory.
sys.path.append('vendor/cocoapi/PythonAPI') # not needed as long as you have "labelar_demo" selected as the conda environment
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils

# Set variables
Set some path variables so that `collection_path` leads to a directory of labelar collection folders

In [None]:
# SET THESE PATH VARIABLES
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
name = 'demo-mugs' # the name of your new dataset  !! - this needs to match the unique dataset name you gave the collection folder e.g. data/alphamugs/ (see top cell instructions)
split = 'train'
#split = 'val'
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

data_dir = os.path.join(os.getcwd(),'data')

# this should lead to a directory of labelar collection folders
collection_path = os.path.join(data_dir,name)

# The collection ids are the names of the sub-folders of collection_path.
# Plain files sitting next to them (e.g. .DS_Store) are skipped, because the
# cells below treat every id as a folder that contains annotation files.
collection_ids = [
    entry for entry in os.listdir(collection_path)
    if os.path.isdir(os.path.join(collection_path, entry))
]
print(collection_ids)

# View the disjoint annotations
Gather the annotation files and display over the images...

In [None]:
# Display every image of one collect with its annotations drawn on top.

# pick one of those collection ids
cid = collection_ids[1] # <- change this number to view different collects

# path to the folder that contains this collect's annotations and images
src_folder = os.path.join(collection_path, cid)
# every .json file in that folder is treated as an annotation file
ann_files = glob.glob(os.path.join(src_folder, '*.json'))

for ann_file in ann_files:
    # anything COCO() prints while loading is sent to os.devnull
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
        coco = COCO(ann_file)

    coco_imgs = coco.loadImgs(ids=coco.getImgIds())

    # view the collected labelar images
    for coco_im in coco_imgs:
        im = Image.open(os.path.join(src_folder, coco_im['file_name']))

        fig = plt.figure(figsize=(10, 10)) # change display size of image here
        plt.title('Image id: {}'.format(coco_im['id']))
        plt.axis('off')
        imshow(im)

        # annotations that belong to this image
        ann_ids = coco.getAnnIds(imgIds=[coco_im['id']])
        anns = coco.loadAnns(ann_ids)

        # Reshape the annotations (in place, on this throw-away COCO object)
        # the same way the merge cell does: the points of the first
        # segmentation entry become the only polygon, and 'box' is copied to
        # 'bbox' as-is (no extra list around it).
        for ann in anns:
            ann['segmentation'] = [ann['segmentation'][0]['points']]
            ann['bbox'] = ann['box']

        coco.showAnns(anns)

# Check for category misspelling

In [None]:
# Check for category misspellings: print the category list of every collect
# so the names can be compared by eye.
for i, cid in enumerate(collection_ids):
    # folder that contains this collect's annotations and images
    src_folder = os.path.join(collection_path, cid)

    # annotation files in the folder
    json_files = glob.glob(os.path.join(src_folder, '*.json'))
    if not json_files:
        # nothing to load for this collect; report it instead of failing on json_files[0]
        print('\nFor cid:', cid, '\n', 'no annotation (.json) files found - skipped', '\n', "i: ", i)
        continue

    # load the first annotation file as a COCO object (its printed output is suppressed)
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
        coco_misp = COCO(json_files[0])
    print('\nFor cid:', cid, '\n', coco_misp.dataset['categories'], '\n', "i: ", i)
    

## Make a spelling correction here
If anything was misspelled, enter it as `{'collection_id': {'misspelling': 'correction'}}` in the cell below

In [None]:
# Category-name corrections per collect, written as
# {'collection_id': {'name as spelled in that collect': 'corrected name'}}.
# A collect that needs no correction gets an empty dictionary, so at a minimum:
#mispelled = {'FPLM':{},'SWKW':{}}

# two collects share the same pair of corrections; each gets its own copy below
_white_to_wht = {'mug-white-t': 'mug-wht-t', 'mug-white-s': 'mug-wht-s'}

# collect 3RPC names its categories '0'..'4'; the list position is that name
_names_3rpc = ['mug-wht-s', 'mug-blu-t', 'mug-wht-t', 'mug-blu-s', 'mug-red']

mispelled = {
    'FPLM': {'mug-blue-s': 'mug-blu-s'},
    'SWKW': {},
    '3RPC': {str(pos): fixed for pos, fixed in enumerate(_names_3rpc)},
    'VHR7': {},
    '1DKT': dict(_white_to_wht),
    'LYVP': dict(_white_to_wht),
    'KS9A': {},
}

## Merge Categories from all the collects into one category set

...and create forwards and backwards mappings between the categories from individual collects and the merged&remapped categories. The cells after this that process annotations and images can use the old_to_new_cats mapping to remap references to the old/individual collect categories

In [None]:
# Build one merged category set from all collects.
#   new_cats:        corrected category name -> merged category entry
#   old_to_new_cats: (collection id, category id inside that collect) -> details of the merged category
# Merged ids are handed out in order of first appearance, starting at 0.
new_cats = {}
old_to_new_cats = {}
for cid in collection_ids:
    src_folder = os.path.join(collection_path, cid)
    json_files = glob.glob(os.path.join(src_folder, '*.json'))
    if not json_files:
        # no annotation file to read categories from; report it instead of failing on json_files[0]
        print(f"Skipping {cid}: no annotation (.json) files found")
        continue

    # Only the first annotation file of the collect is read here.
    # NOTE(review): this assumes every file of a collect lists the same categories - confirm.
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): # suppressed output
        coco = COCO(json_files[0])
    cats = coco.dataset["categories"]

    # spelling corrections registered for this collect (none if the collect is not listed)
    corrections = mispelled.get(cid, {})
    for c in cats:
        corrected_name = corrections.get(c["name"], c["name"])

        # Register the name on first sight; afterwards reuse the id it was given.
        if corrected_name not in new_cats:
            new_cats[corrected_name] = {
                "supercategory": "",
                "id": len(new_cats),
                "name": corrected_name,
            }
        new_cat_id = new_cats[corrected_name]["id"]

        old_to_new_cats[(cid, c["id"])] = {
            "cid": cid,
            "old_file": json_files[0],
            "old_cat": c,
            "old_id": c["id"],
            "old_name": c["name"],
            "id": new_cat_id,
            "name": corrected_name,
        }

print("\nNEW CATS (merged & remapped):")
for k, v in new_cats.items():
    print(f"{k}: {v}")

print("\nOLD TO NEW:")
for k, v in old_to_new_cats.items():
    print(k, v)

# Create and save the joined json (along with the new joined image folder)
Go through each collect and add images and annotations to respective master lists

In [None]:
from shutil import copyfile

# Join all collects: renumber images and annotations, write the images into one
# folder and the merged annotations into one json file.

# Set to True if the images are in portrait mode: 4-channel images are then
# rotated by 270 degrees before being saved.
portrait = False

ann_counter = 0
old_to_new_imgs = {}
master_imgs, master_anns = [],[]

newImgFolder = os.path.join(data_dir,'images','{}_{}'.format(name,split))

# New image directory
os.makedirs(newImgFolder, exist_ok=True)

# for each collect
for cid in collection_ids:
    print("CID: ", cid)
    # folder that contains this collect's annotations and images
    src_folder = os.path.join(collection_path,cid)

    # annotation files in the folder
    json_files = glob.glob(os.path.join(src_folder,'*.json'))

    # for each json file in the folder
    for json_file in json_files:
        print("JSON_FILE: ", json_file)
        # load data as a COCO object (its printed output is suppressed)
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            coco = COCO(json_file)

        # APPEND IMAGES TO MASTER LIST
        # -  -  -  -  -  -  -  -  -  -  -  -
        # Old image id -> new image id for THIS file only. Image ids can repeat
        # across files and collects, so annotations are remapped with this
        # per-file table rather than the global one.
        file_img_ids = {}
        for img in coco.loadImgs(ids=coco.getImgIds()):
            new_img = img.copy()
            new_img['id'] = len(master_imgs)
            new_img['collection_id'] = cid
            new_img['old_file_name'] = str(img['id']).zfill(12) + '.png'
            new_img['file_name'] = str(new_img['id']).zfill(12) + '.png'
            new_img['old_id'] = img['id']
            file_img_ids[img['id']] = new_img['id']
            old_to_new_imgs[img["id"]] = new_img['id']
            master_imgs.append(new_img)

            src = os.path.join(src_folder, img['file_name'])
            dst = os.path.join(newImgFolder, new_img['file_name'])
            # read the pixels, then release the file handle
            with Image.open(src) as I:
                npim = np.array(I)

            # If the image has 4 channels, save only the first 3;
            # anything else (including 2-D grayscale arrays) is copied unchanged.
            if npim.ndim == 3 and npim.shape[2] == 4:
                img_3chan = Image.fromarray(npim[...,:3])
                if portrait:
                    img_3chan = img_3chan.rotate(270)
                img_3chan.save(dst)
                print('Saved image: {}'.format(dst))
            else:
                copyfile(src, dst)

        # APPEND ANNOTATIONS TO MASTER LIST
        # -  -  -  -  -  -  -  -  -  -  -  -
        for ann in coco.loadAnns(coco.getAnnIds()):
            new_ann = ann.copy()
            # quick formatting mods: the points of the first segmentation entry
            # become the only polygon, and 'box' is copied to 'bbox'
            new_ann['segmentation'] = [ann['segmentation'][0]['points']]
            new_ann['bbox'] = ann['box']
            # update to keep category-id consistent across multiple collects
            new_ann['category_id'] = old_to_new_cats[(cid, ann['category_id'])]["id"]
            new_ann['id'] = ann_counter
            # raises KeyError if the annotation refers to an image that is not in this file
            new_ann["image_id"] = file_img_ids[ann["image_id"]]
            ann_counter += 1
            master_anns.append(new_ann)

new_instances = {'images':master_imgs, 'annotations':master_anns, 'categories':list(new_cats.values())}

# Make sure the 'annotations' directory exists before writing into it
annotations_folder = os.path.join(data_dir,'annotations')
os.makedirs(annotations_folder, exist_ok=True)

newAnnFile = os.path.join(annotations_folder,'instances_{}_{}.json'.format(name,split))

with open(newAnnFile, 'w') as outfile:
    json.dump(new_instances, outfile)

print('New instance annotations save as {}'.format(newAnnFile))

In [None]:
# Sanity check: reload the merged annotation file and compare its image and
# annotation counts with the counters kept while merging.
print("img_count: ", len(master_imgs))
# newAnnFile already is the full path of the merged file, so it is loaded directly
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): #suppresses output
    coco_all = COCO(newAnnFile)
print("num images: ", len(coco_all.dataset["images"]))
print("num anns: ", len(coco_all.dataset["annotations"]))
print("ann_counter: ", ann_counter)
# NOTE(review): the numbers below look like counts noted from an earlier run - confirm what they refer to
#2, 5, 5, 2, 5, 5, 3


# Visually verify the new annotations

In [None]:
# Show every image of the merged dataset with its remapped annotations on top.
# newAnnFile already is the full path of the merged file, so it is loaded directly.
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): #suppresses output
    coco_all = COCO(newAnnFile)

coco_imgs = coco_all.loadImgs(ids=coco_all.getImgIds())
print(f"Found {len(coco_imgs)} images in dataset.")

# view the collected labelar images
for coco_im in coco_imgs:
    im = Image.open(os.path.join(newImgFolder,coco_im['file_name']))

    fig = plt.figure(figsize=(10, 10)) #change display size of image here
    plt.title('Image id: {}'.format(coco_im['id']))
    plt.axis('off')
    imshow(im)

    # display the annotations that belong to this image
    ann_ids = coco_all.getAnnIds(imgIds=[coco_im['id']])
    anns = coco_all.loadAnns(ann_ids)
    coco_all.showAnns(anns)

## Check categories one at a time to see if all images in the merged coco map same objects to same categories

e.g., check that "small-blue-mug" refers to the same small blue mug across all collects


In [None]:
# Show only the images and annotations of one category of the merged dataset.
cat_name = 'mug-red'

# newAnnFile already is the full path of the merged file, so it is loaded directly
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): #suppresses output
    coco_all = COCO(newAnnFile)

cat_ids = coco_all.getCatIds(catNms=[cat_name])
print("cat_ids: ", cat_ids)

# An empty category filter selects everything in getImgIds/getAnnIds, so an
# unknown name must not fall through to them: it would show the entire dataset.
if cat_ids:
    coco_imgs = coco_all.loadImgs(ids=coco_all.getImgIds(catIds=cat_ids))
else:
    print(f"No category named '{cat_name}' in {newAnnFile}")
    coco_imgs = []

# view the collected labelar images
for coco_im in coco_imgs:
    im = Image.open(os.path.join(newImgFolder, coco_im['file_name']))
    fig = plt.figure(figsize=(10, 10)) #change display size of image here
    plt.title(f"{coco_im['file_name']} - Image id: {coco_im['id']}")
    plt.axis('off')
    imshow(im)
    # display only the annotations of the selected category
    ann_ids = coco_all.getAnnIds(imgIds=[coco_im['id']], catIds=cat_ids)
    anns = coco_all.loadAnns(ann_ids)
    coco_all.showAnns(anns)