# Join all LabelAR collects in a given folder
So that we have one annotation file and one image folder for easy training. (One each for train and val, actually)
The folder structure will be:
 - `data/annotations/new-annotation-file.json`
 - `data/images/new-image-folder/*.png`
 
__Before running this notebook__, copy all the LabelAR collection folders you want to join into a single folder inside `data/`, and give that folder a unique dataset name, so that you have:
  - `data/unique-dataset-name/collect-id-1/`, `data/unique-dataset-name/collect-id-2/`, ...

In [1]:
import os
import subprocess

import contextlib

# Work from the repository root so the relative `data/` and `vendor/` paths
# used below resolve no matter where the notebook server was started.
# check_output() returns bytes; os.fsdecode turns that into a normal str path
# so root_dir can be combined with other str paths.
root_dir = os.fsdecode(subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip())
os.chdir(root_dir)

import glob
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline
from PIL import Image

Importing from pycocotools can be problematic.  Did you activate the right conda env e.g. `conda activate labelar_demo`?

In [2]:
import sys
# Make the cocoapi Python package under vendor/ importable. The path is
# relative, so it relies on the working directory being the repository root.
# Note: append() puts it at the END of sys.path, so a pycocotools that is
# already installed in the active environment would be found first.
sys.path.append('vendor/cocoapi/PythonAPI')
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils

# Set variables
Set some path variables so that `collection_path` leads to a directory of labelar collection folders

In [55]:
# SET THESE PATH VARIABLES
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# Name of the new dataset. It must match the unique dataset folder created
# under data/ (e.g. data/alphamugs/) as described in the top cell.
name = 'alphamugs'
split = 'train'
#split = 'val'
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

data_dir = os.path.join(os.getcwd(),'data')

# Directory that contains one sub-folder per LabelAR collection
collection_path = os.path.join(data_dir,name)

# Collection ids are the sub-folder names. Plain files that happen to sit in
# the dataset folder (e.g. .DS_Store) are skipped: they are not collections
# and would break the per-collection cells below.
collection_ids = [
    entry for entry in os.listdir(collection_path)
    if os.path.isdir(os.path.join(collection_path, entry))
]
print(collection_ids)

['SWKW', 'FPLM']


# View the disjoint annotations
Gather the annotation files and display over the images...

In [None]:
# Pick one of the collection ids to inspect
cid = collection_ids[1] # <- change this number to view different collects

# Folder that contains this collect's annotation files and images
src_folder = os.path.join(collection_path,cid)
# All annotation files of the collect
ann_files = glob.glob(os.path.join(src_folder,'*'+'.json' ))

for ann_file in ann_files:
    # Silence whatever COCO() prints while loading the file
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
        coco = COCO(ann_file)

    coco_imgs = coco.loadImgs(ids=coco.getImgIds())

    # Show every collected image with its annotations drawn on top
    for coco_im in coco_imgs:

        im = Image.open(os.path.join(src_folder,coco_im['file_name']))

        plt.figure(figsize=(10, 10)) # change display size of image here
        plt.title('Image id: {}'.format(coco_im['id']))
        plt.axis('off')
        imshow(im)

        ann_ids = coco.getAnnIds(imgIds=[coco_im['id']])
        anns = coco.loadAnns(ann_ids)

        # Build display copies in the layout showAnns() reads, applying the
        # same conversion as the join cell further down: 'segmentation'
        # becomes a list holding the first segment's points, and 'bbox' is
        # taken from 'box' as-is (not wrapped in an extra list). Working on
        # copies leaves the annotations held by `coco` untouched.
        display_anns = []
        for ann in anns:
            shown = ann.copy()
            shown['segmentation'] = [ann['segmentation'][0]['points']]
            shown['bbox'] = ann['box']
            display_anns.append(shown)

        coco.showAnns(display_anns)

# Check for category misspelling

In [58]:
# Print each collection's category list so misspelled names can be spotted
for cid in collection_ids:
    # Folder that contains this collect's annotation files and images
    src_folder = os.path.join(collection_path,cid)

    # Annotation files in that folder
    json_files = glob.glob(os.path.join(src_folder,'*'+'.json' ))

    # A folder without any annotation file has nothing to check; report it
    # instead of failing on json_files[0].
    if not json_files:
        print('\nFor cid:', cid, '\n', 'no annotation (.json) files found', '\n')
        continue

    # Only the first annotation file of the collect is inspected.
    # Silence whatever COCO() prints while loading the file.
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
        coco_misp = COCO(json_files[0])
    print('\nFor cid:', cid, '\n', coco_misp.dataset['categories'], '\n')
    


For cid: SWKW 
 [{'supercategory': '', 'id': 0, 'name': 'mug-blu-s'}, {'supercategory': '', 'id': 1, 'name': 'mug-blu-t'}, {'supercategory': '', 'id': 2, 'name': 'mug-red'}, {'supercategory': '', 'id': 3, 'name': 'mug-wht-s'}, {'supercategory': '', 'id': 4, 'name': 'mug-wht-t'}] 


For cid: FPLM 
 [{'supercategory': '', 'id': 0, 'name': 'mug-blu-t'}, {'supercategory': '', 'id': 1, 'name': 'mug-wht-t'}, {'supercategory': '', 'id': 2, 'name': 'mug-red'}, {'supercategory': '', 'id': 3, 'name': 'mug-wht-s'}, {'supercategory': '', 'id': 4, 'name': 'mug-blue-s'}] 



## Make a spelling correction here
If any category name was misspelled, enter the fix as `{'collection_id': {'misspelling': 'correction'}}` in the cell below.

In [38]:
# Category-name corrections, keyed by collection id:
#   {'collection_id': {'misspelled-name': 'corrected-name'}}
mispelled = {'FPLM':{'mug-blue-s':'mug-blu-s'},'SWKW':{}}

# If nothing needs correcting, still provide an empty dictionary for every collection id, e.g.:
#mispelled = {'FPLM':{},'SWKW':{}}


# Create and save the joined json (along with the new joined image folder)
Go through each collect and add images and annotations to respective master lists

In [None]:
from shutil import copyfile

# Set to True to rotate images by 270 degrees when they are re-saved.
# Only applies to 4-channel images, which are the ones that get re-encoded.
portrait = False

ann_counter = 0  # running id, so annotation ids are unique across all collects

master_imgs, master_anns = [],[]
master_cats = None

# New image directory
newImgFolder = os.path.join(data_dir,'images','{}_{}'.format(name,split))
os.makedirs(newImgFolder, exist_ok=True)

# for each collect
for cid in collection_ids:

    # Folder that contains this collect's annotation files and images
    src_folder = os.path.join(collection_path,cid)

    # Annotation files in that folder
    json_files = glob.glob(os.path.join(src_folder,'*'+'.json' ))

    # Spelling corrections for this collect; a collect without an entry in
    # `mispelled` is treated as having nothing to correct.
    corrections = mispelled.get(cid, {})

    # for each json file in the folder
    for json_file in json_files:

        # Load as a COCO object, silencing what COCO() prints while loading
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            coco = COCO(json_file)

        # APPEND IMAGES TO MASTER LIST
        # -  -  -  -  -  -  -  -  -  -  -  -

        for img in coco.loadImgs(ids=coco.getImgIds()):

            # NOTE(review): the new file name is derived from the image id
            # alone, so two collects that reuse an id would overwrite each
            # other's image -- confirm ids are unique across collects.
            new_name = str(img['id']).zfill(12) + '.png'

            new_img = img.copy()
            new_img['collection_id'] = cid
            new_img['file_name'] = new_name
            master_imgs.append(new_img)

            src = os.path.join(src_folder,img['file_name'])
            dst = os.path.join(newImgFolder,new_name)

            # Read the pixels and release the file handle straight away
            with Image.open(src) as I:
                npim = np.array(I)

            # If the image has 4 channels, save only the first 3.
            # The ndim guard keeps 2-D (single-channel) images from failing
            # on shape[2]; they are copied unchanged like any other image.
            if npim.ndim == 3 and npim.shape[2] == 4:
                img_3chan = Image.fromarray(npim[...,:3])
                if portrait:
                    img_3chan = img_3chan.rotate(270) # if in portrait mode
                img_3chan.save(dst)

                print('Saved image: {}'.format(dst))
            else:
                # Copy the file as-is into the new image folder
                copyfile(src,dst)

        # APPEND ANNOTATIONS TO MASTER LIST
        # -  -  -  -  -  -  -  -  -  -  -  -

        anns = coco.loadAnns(coco.getAnnIds())
        # category dictionaries for this collect
        collect_cats = coco.dataset['categories']

        # correct mispelled category names
        if corrections:
            print('Correcting mispelling for collection:', cid, '\n\n')
            for d in collect_cats:
                d['name'] = corrections.get(d['name'], d['name'])

        # Sort COPIES of the categories alphabetically by name and number them
        # 0..n-1. The old id must be read before it is overwritten, and from
        # the same category it belongs to; sorting the original dicts and
        # zipping them with the unsorted list would pair unrelated categories.
        sorted_cats = sorted((dict(c) for c in collect_cats), key=lambda k: k['name'])
        oldId2NewId = {}
        for new_id, cat in enumerate(sorted_cats):
            oldId2NewId[cat['id']] = new_id
            cat['id'] = new_id

        # The re-assigned ids are only consistent across collects when every
        # collect ends up with the same category names.
        cat_names = [c['name'] for c in sorted_cats]
        if master_cats is not None and cat_names != [c['name'] for c in master_cats]:
            raise ValueError(
                'Categories in {} {} do not match the other collects {}; '
                'check the spelling corrections'.format(
                    json_file, cat_names, [c['name'] for c in master_cats]))
        master_cats = sorted_cats

        for ann in anns:

            new_ann = ann.copy()

            # quick formatting mods: 'segmentation' becomes a list holding the
            # first segment's points, 'bbox' is taken from 'box'
            new_ann['segmentation'] = [ann['segmentation'][0]['points']]
            new_ann['bbox'] = ann['box']

            # update to keep category-id consistent across multiple collects
            new_ann['category_id'] = oldId2NewId[ann['category_id']]

            new_ann['id'] = ann_counter
            ann_counter += 1

            master_anns.append(new_ann)

if master_cats is None:
    raise ValueError('No annotation (.json) files found under {}'.format(collection_path))

new_instances = {'images':master_imgs, 'annotations':master_anns, 'categories':master_cats}

# Make sure the 'annotations' directory exists before writing into it
annotations_folder = os.path.join(data_dir,'annotations')
os.makedirs(annotations_folder, exist_ok=True)

newAnnFile = os.path.join(annotations_folder,'instances_{}_{}.json'.format(name,split))

with open(newAnnFile, 'w') as outfile:
    json.dump(new_instances, outfile)

print('New instance annotations save as {}'.format(newAnnFile))

# Visually verify the new annotations

In [None]:
# Load the joined annotation file written by the previous cell.
# newAnnFile is already the complete path to that file, so it is passed
# directly rather than being joined onto the annotations folder again.
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): # suppresses output
    coco_all = COCO(newAnnFile)

coco_imgs = coco_all.loadImgs(ids=coco_all.getImgIds())

# Show every image from the new image folder with its joined annotations
for coco_im in coco_imgs:

    im = Image.open(os.path.join(newImgFolder,coco_im['file_name']))

    plt.figure(figsize=(10, 10)) # change display size of image here
    plt.title('Image id: {}'.format(coco_im['id']))
    plt.axis('off')
    imshow(im)

    # display annotations
    ann_ids = coco_all.getAnnIds(imgIds=[coco_im['id']])
    anns = coco_all.loadAnns(ann_ids)

    coco_all.showAnns(anns)