# mix_annotations.ipynb
Combines coco .json files into one

### Import json files from a list of paths

In [1]:
import json 

def read_annotations(ann_paths):
    """Load every annotation file listed in ``ann_paths``.

    Args:
        ann_paths: Iterable of paths to ``.json`` files.

    Returns:
        list: The parsed JSON content of each file, in the same order as
        ``ann_paths``.
    """
    loaded = []
    for path in ann_paths:
        with open(path) as handle:
            content = json.load(handle)
        loaded.append(content)
        # One progress line per successfully parsed file.
        print("Completed: ", path )
    return loaded

### Mix annotations from the loaded dicts

In [2]:
import copy
def mix_annotations(my_dicts):
    """Merge several COCO-style annotation dicts into a single one.

    The first dict is deep-copied and used as the base, so every key other
    than ``"images"`` and ``"annotations"`` comes from it. Images and
    annotations of each following dict are appended with their ids shifted
    past the highest id already stored, so ids stay unique no matter how many
    datasets are merged or whether their ids are contiguous.

    Args:
        my_dicts: Non-empty sequence of dicts, each holding an ``"images"``
            and an ``"annotations"`` list. Image entries need an ``"id"``
            key; annotation entries need ``"id"`` and ``"image_id"`` keys.
            Ids are assumed to be non-negative integers.

    Returns:
        dict: The combined dataset. The input dicts are not modified.

    Raises:
        IndexError: If ``my_dicts`` is empty.
    """
    combined_dict = copy.deepcopy(my_dicts[0])
    # Offsets = first id that is still free in the combined dataset
    # (0 when the base has no entries yet).
    im_id = max((im["id"] for im in combined_dict["images"]), default=-1) + 1
    ann_id = max((an["id"] for an in combined_dict["annotations"]), default=-1) + 1
    for d in my_dicts[1:]:
        highest_im = im_id - 1
        highest_ann = ann_id - 1
        for image in d["images"]:
            temp = image.copy()
            temp["id"] = im_id + temp["id"]
            combined_dict["images"].append(temp)
            highest_im = max(highest_im, temp["id"])
        for ann in d["annotations"]:
            temp = ann.copy()
            temp["id"] = ann_id + temp["id"]
            # Same shift as the images so the annotation keeps pointing at
            # its own image.
            temp["image_id"] = im_id + temp["image_id"]
            combined_dict["annotations"].append(temp)
            highest_ann = max(highest_ann, temp["id"])

        # Move the offsets past everything added so far; they must grow
        # cumulatively or the next dataset would reuse ids.
        im_id = highest_im + 1
        ann_id = highest_ann + 1
    return combined_dict


## Mixing images example

In [3]:
# Annotation files to merge. The first entry becomes the base dataset that
# mix_annotations extends with the following ones.
ann_paths = [
    "/home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/filtered_mixed_ucb_gait.json",
    "/home/josmar/proyectos/codes/datasets/ucb_gait_frames/annotations/filtered_ucb_gait_poly.json",
    ]
my_dicts= read_annotations(ann_paths)

Completed:  /home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/filtered_mixed_ucb_gait.json
Completed:  /home/josmar/proyectos/codes/datasets/ucb_gait_frames/annotations/filtered_ucb_gait_poly.json


In [4]:
# Report how many image and annotation entries each loaded dataset holds.
for index, d in enumerate(my_dicts):
    n_images = len(d["images"])
    n_annotations = len(d["annotations"])
    print("\nDataset ",index)
    print("Image IDs:\t" , n_images)
    print("Annotation IDs:\t" , n_annotations)


Dataset  0
Image IDs:	 10000
Annotation IDs:	 20000

Dataset  1
Image IDs:	 11750
Annotation IDs:	 11750


In [5]:
# Merge all loaded datasets into a single dict.
combined_dict = mix_annotations(my_dicts)

In [6]:
# Entry counts of the merged dataset; they should equal the sums of the
# per-dataset counts printed above.
print("\nCombined dataset")
print("Image IDs:\t" , len(combined_dict["images"]))
print("Annotation IDs:\t" , len(combined_dict["annotations"]))


Combined dataset
Image IDs:	 21750
Annotation IDs:	 31750


### Saving the generated dataset

In [7]:
# Write the merged dataset to disk as a single JSON file.
out_path = "/home/josmar/proyectos/codes/datasets/ucb_gait_combined/annotations/ucb_gait_combined.json"
with open(out_path, 'w') as fp:
    json.dump(combined_dict, fp)

## Creating Train Test and Val datasets with the generated json
Based on https://github.com/akarazniewicz/cocosplit/blob/master/cocosplit.py

In [1]:
def save_coco(file, info, licenses, images, annotations, categories):
    """Write the given sections to ``file`` as a single JSON document.

    Args:
        file: Destination path; opened in text mode with UTF-8 encoding.
        info: Value stored under the ``'info'`` key.
        licenses: Value stored under the ``'licenses'`` key.
        images: Value stored under the ``'images'`` key.
        annotations: Value stored under the ``'annotations'`` key.
        categories: Value stored under the ``'categories'`` key.
    """
    payload = {
        'info': info,
        'licenses': licenses,
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }
    with open(file, 'wt', encoding='UTF-8') as out_file:
        # Indented, key-sorted output keeps the file readable and diffable.
        json.dump(payload, out_file, indent=2, sort_keys=True)

In [2]:
def filter_annotations(annotations, images):
    """Keep only the annotations that belong to one of ``images``.

    Args:
        annotations: Iterable of dicts with an ``'image_id'`` key.
        images: Iterable of dicts with an ``'id'`` key.

    Returns:
        list: The annotations, in their original order, whose ``image_id``
        (compared as ``int``) matches the ``id`` of an entry in ``images``.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # annotation is quadratic on large datasets.
    image_ids = {int(image['id']) for image in images}
    return [ann for ann in annotations if int(ann['image_id']) in image_ids]

In [3]:
import random
import json
import funcy
from sklearn.model_selection import train_test_split

# File to split. The name `out_path` is kept from the original cell even
# though the file is only read here.
out_path = "/home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_images_ucb.json"
# When True, images without any annotation are left out of every split.
has_annotations = True
# Fractions of the whole dataset; the remainder (1 - train - val) is the test set.
train_split = 0.8
val_split = 0.1
# Fixed seed so that re-running the cell reproduces the same split.
split_seed = 42
train_file = "/home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_train_ucb_gait.json"
val_file = "/home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_val_ucb_gait.json"
test_file = "/home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_test_ucb_gait.json"

# Only the parsing needs the open file; everything else works on the dict.
with open(out_path, 'rt', encoding='UTF-8') as coco_file:
    coco = json.load(coco_file)

info = coco['info']
licenses = coco['licenses']
images = coco['images']
annotations = coco['annotations']
categories = coco['categories']

number_of_images = len(images)

# Set instead of list: the membership test below runs once per image.
images_with_annotations = {int(a['image_id']) for a in annotations}

if has_annotations:
    images = [i for i in images if i['id'] in images_with_annotations]

x, y = train_test_split(images, train_size=train_split, shuffle=True, random_state=split_seed)

# `y` holds the non-training remainder, so the validation share has to be
# expressed relative to that remainder. A separate name keeps `val_split`
# at the value configured above.
val_fraction_of_rest = round(val_split/(1-train_split) , 2)
y, z = train_test_split(y, train_size=val_fraction_of_rest, shuffle=True, random_state=split_seed)

save_coco(train_file, info, licenses, x, filter_annotations(annotations, x), categories)
save_coco(val_file, info, licenses, y, filter_annotations(annotations, y), categories)
save_coco(test_file, info, licenses, z, filter_annotations(annotations, z), categories)

print("Saved\n \
        {} entries in {}\n \
        {} entries in {}\n \
        {} entries in {}".format(len(x), train_file, len(y), val_file, len(z), test_file))

Saved
         17400 entries in /home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_train_ucb_gait.json
         2175 entries in /home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_val_ucb_gait.json
         2175 entries in /home/josmar/proyectos/codes/01_annotation_tools/test_data/annotations/filtered/cropped_test_ucb_gait.json


In [30]:
# Scratch check of the split fractions.
# NOTE(review): the recorded output below (0.5) is not what this cell prints
# as written (val_split is set to 0.1 right before the print) — presumably
# stale output; re-run or remove this cell.
train_split = 0.8
val_split = 0.1

print(val_split)

0.5


## Creating ORDERED Train Test and Val datasets with the generated json

In [1]:
import json
import pandas as pd
import numpy as np

# Dataset that the background-based ("ordered") split below works on.
dataset_path = "/home/josmar/proyectos/codes/datasets/ucb_gait_cropped/cropped_images_ucb.json"

with open(dataset_path) as f:
    dataset = json.load(f)

In [2]:
# List the top-level sections of the loaded dataset.
dataset.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [5]:
# Count the images per background. The background name is the second
# "_"-separated token of the file name; a token containing "-" is counted
# under "lab".
bg_dict = {}
for image in dataset["images"]:
    place = image["file_name"].split("_")[1]
    if "-" in place:
        place = "lab"
    bg_dict[place] = bg_dict.get(place, 0) + 1
total = sum(bg_dict.values())
print("Total images: ",total)
print("Total backgrounds: ", len(bg_dict), "\n")

# Build the summary table in one go; appending rows one by one with
# df.loc re-allocates the frame on every insertion.
df = pd.DataFrame(
    [(key, value, round(value/total*100,2)) for key, value in bg_dict.items()],
    columns=['Background', 'N° images', 'Percentage'],
)
display(df)

Total images:  21750
Total backgrounds:  33 



Unnamed: 0,Background,N° images,Percentage
0,abbey,277,1.27
1,basket,288,1.32
2,beach,292,1.34
3,buildings2,328,1.51
4,buildings3,316,1.45
5,buildings,289,1.33
6,coffee,314,1.44
7,curtain,293,1.35
8,desert2,314,1.44
9,desert,302,1.39


In [6]:
# Every background name found, used to fill in the split lists by hand.
print(bg_dict.keys())

dict_keys(['abbey', 'basket', 'beach', 'buildings2', 'buildings3', 'buildings', 'coffee', 'curtain', 'desert2', 'desert', 'elevator', 'fall', 'forest', 'lab', 'monastery', 'out', 'park', 'parking2', 'parking3', 'parking', 'road', 'room', 'snow2', 'snow', 'stage', 'street2', 'street3', 'street', 'toy', 'train', 'valley', 'wall', 'whitehouse'])


In [7]:
# Manual assignment of the backgrounds to the three splits
# (17 train / 8 eval / 8 test).
train_bg = ['abbey', 'basket', 'beach', 'buildings', 'curtain', 'desert', 'forest', 'lab', 'monastery', 'out', 'parking', 'snow', 'stage', 'street', 'toy', 'train', 'whitehouse']
eval_bg = ['buildings2', 'parking2', 'street2', 'desert2', 'fall', 'elevator', 'park', 'wall']
test_bg = ['buildings3', 'parking3', 'street3', 'snow2', 'valley', 'coffee', 'road', 'room']

In [8]:
# Look at one image entry to see which fields are available.
dataset["images"][0]

{'license': 0,
 'file_name': 'crop_abbey_002-009_032-227.jpg',
 'width': 512,
 'height': 512,
 'id': 0}

We store the images of each split (train, eval, test) in `dataset_images` and build a mapping list called `dataset_convert`, holding `[original_id, new_id]` pairs, that will help us distribute the annotations.

In [9]:
# Distribute the images over the splits according to their background and
# renumber them from 0 inside each split. `bg_dict` from the statistics cell
# is deliberately left alone here: nothing in this cell needs it, and
# clearing it would invalidate the counts computed above.
bg_list = [train_bg, eval_bg, test_bg]
dataset_images = [[],[],[]]
# Per split: [original_id, new_id] pairs, used later to remap annotations.
dataset_convert = [[],[],[]]
counter_images = [0,0,0]
# File names whose background is in none of the lists; such images end up
# in no split, so this should stay empty.
unassigned_files = []
for image in dataset["images"]:
    place = image["file_name"].split("_")[1]
    if "-" in place:
        place = "lab"
    assigned = False
    for idx, split_backgrounds in enumerate(bg_list):
        if place in split_backgrounds:
            current_id = image["id"]
            # Copy so the entry in `dataset` keeps its original id.
            new_image = dict(image)
            new_image["id"] = counter_images[idx]
            dataset_images[idx].append(new_image)
            dataset_convert[idx].append([current_id, counter_images[idx]])
            counter_images[idx] += 1
            assigned = True
    if not assigned:
        unassigned_files.append(image["file_name"])

Now we create, for each split, a sequence with the original ids of its images.

In [10]:
# For every split, the original image ids in the order the images were added
# (position in the tuple == new id inside the split). Built pair by pair so
# that a split without images yields an empty tuple instead of failing.
convert_originals = [
    tuple(original_id for original_id, _new_id in conv)
    for conv in dataset_convert
]


In [11]:
# Spot check: original id of the image that became number 1800 of the first split.
print(convert_originals[0][1800])

3670


In [13]:
# Sanity check: total number of images and of id pairs over the three splits.
print(len(dataset_images[0])+len(dataset_images[1])+len(dataset_images[2]))
print(len(dataset_convert[0])+len(dataset_convert[1])+len(dataset_convert[2]))

21750
21750


In [14]:
# Spot check: a renumbered image next to its [original_id, new_id] pair.
idx=1800
print(dataset_images[0][idx])
print(dataset_convert[0][idx])

{'license': 0, 'file_name': 'crop_forest_012-174_040-239.jpg', 'width': 512, 'height': 512, 'id': 1800}
[3670, 1800]


In [15]:
# Number of annotations in the loaded dataset.
len (dataset["annotations"])

25349

In [16]:
# Send every annotation to the split that contains its image, pointing it at
# the image's new id and renumbering the annotations from 0 inside each split.
counter_annotations=[0,0,0]
dataset_anns = [[],[],[]]

# original image id -> new image id, one dict per split. A dict lookup
# replaces scanning the whole id sequence for every annotation.
image_id_maps = []
for conv in dataset_convert:
    id_map = {}
    for original_id, new_id in conv:
        # Keep the first pair if an original id ever appears twice.
        id_map.setdefault(original_id, new_id)
    image_id_maps.append(id_map)

for ann in dataset["annotations"]:
    for idx, id_map in enumerate(image_id_maps):
        if ann["image_id"] in id_map:
            # Copy so the annotation stored in `dataset` is not modified.
            new_ann = dict(ann)
            new_ann["image_id"] = id_map[ann["image_id"]]
            new_ann["id"]=counter_annotations[idx]
            dataset_anns[idx].append(new_ann)
            counter_annotations[idx]+=1


In [302]:
# Number of splits.
len(dataset_images)

3

In [30]:
# Gather, per split, the image ids referenced by the annotations and the ids
# the images actually carry, then reduce both to their unique values.
image_ids = [[],[],[]]
real_image_ids = [[],[],[]]
for idx in range(len(dataset_images)):
    image_ids[idx].extend(ann["image_id"] for ann in dataset_anns[idx])
    real_image_ids[idx].extend(img["id"] for img in dataset_images[idx])
image_ids = [list(set(referenced)) for referenced in image_ids]
real_image_ids = [list(set(existing)) for existing in real_image_ids]

In [41]:
# Count annotation image ids that do not exist among the images of the same
# split; anything other than 0 means the remapping went wrong.
wrong_counter=0
for idx in range (len(real_image_ids)):
    # Set lookup instead of scanning the id list once per annotation id.
    existing_ids = set(real_image_ids[idx])
    for im_id in image_ids[idx]:
        if im_id not in existing_ids:
            wrong_counter+=1
print("Wrong images:", wrong_counter)

Wrong images: 0


In [39]:
# Scratch cell: checks how `not x in list` evaluates.
a=[2,4,6,8]
b=3
not b in a

True

In [314]:
import copy
# Assemble one complete dataset dict per split.
dataset_titles = ["Reordered Train ucb-gait crop", "Reordered Val ucb-gait crop", "Reordered Test ucb-gait crop"]
dataset_dicts = []

for idx in range(len(dataset_images)):
    # dict(dataset) copies only the top level, so "info" would still be the
    # very same object in `dataset` and in every split. Give each split its
    # own copy before changing the description; otherwise all three files
    # end up with the last title and `dataset` itself is modified.
    new_dict = dict(dataset)
    new_dict["info"] = copy.deepcopy(dataset["info"])
    new_dict["info"]["description"]=dataset_titles[idx]
    new_dict["annotations"] = dataset_anns[idx]
    new_dict["images"] = dataset_images[idx]

    dataset_dicts.append(new_dict)

In [319]:
#Saving
dataset_files = ["/home/josmar/proyectos/codes/datasets/ucb_gait_cropped/train_reordered_ucb-gait_crop.json",
                    "/home/josmar/proyectos/codes/datasets/ucb_gait_cropped/val_reordered_ucb-gait_crop.json",
                    "/home/josmar/proyectos/codes/datasets/ucb_gait_cropped/test_reordered_ucb-gait_crop.json"]
for idx in range(len(dataset_dicts)):
    with open(dataset_files[idx], 'w') as json_file:
        json.dump(dataset_dicts[idx], json_file)

In [311]:
# Show the metadata block of the loaded dataset. (`a` is bound to a plain
# list in an earlier scratch cell, so indexing it with "info" cannot work on
# a fresh top-to-bottom run.)
dataset["info"]

{'description': 'crop_ucb_gait',
 'url': '',
 'version': '0.1',
 'year': 2020,
 'contributor': 'Josmar Suarez',
 'date_created': '2020/07/14'}