# Data Preparation
Prepares the TACO data for training with Keras YOLO v3.
Converts the annotations from the COCO dataset format to the YOLO v3 data format.


## Setup & Imports

In [38]:
from pycocotools.coco import COCO
import os
import numpy as np
import skimage.io as matplotlib
import matplotlib.pyplot 

%matplotlib inline

## Load Coco Annotations
Load the COCO annotations using the pycocotools COCO API.

In [47]:
# Root directory of the dataset; image file names are joined onto this path
# when the image paths are compiled.
DATA_DIR = "./data/TACO/data"
# COCO-format annotation file describing the dataset.
ANNOTATIONS = "./data/TACO/data/annotations.json"
# Parse the annotation file with pycocotools; the resulting object is what the
# rest of the notebook queries for image ids, annotations and categories.
dataset_coco = COCO(ANNOTATIONS)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!


## Extract Bounding Boxes
Extract bounding box annotations for each image

In [29]:
def unpack_annotation(dataset_coco, annotation_id):
    """Return the bounding box and class of a single annotation.

    Args:
        dataset_coco: object exposing ``loadAnns`` (the loaded COCO dataset).
        annotation_id: id of the annotation to look up.

    Returns:
        A dict with two keys: ``"bound_box"`` holds the annotation's ``bbox``
        value and ``"class"`` holds its ``category_id``.

    Raises:
        AssertionError: if the lookup does not yield exactly one annotation.
    """
    loaded = dataset_coco.loadAnns(annotation_id)
    # A single id is expected to resolve to a single annotation record.
    assert len(loaded) == 1
    record = loaded[0]
    return {
        "bound_box": record["bbox"],
        "class": record["category_id"],
    }

def extract_bound_boxes(dataset_coco, image_id):
    """Collect the box/class dict of every annotation attached to an image.

    Args:
        dataset_coco: object exposing ``getAnnIds`` and ``loadAnns``.
        image_id: id of the image whose annotations are wanted.

    Returns:
        A list with one ``unpack_annotation`` result per annotation id, in the
        order ``getAnnIds`` returned the ids.
    """
    ids_for_image = dataset_coco.getAnnIds(imgIds=[image_id])
    return [
        unpack_annotation(dataset_coco, ann_id)
        for ann_id in ids_for_image
    ]

In [37]:
# One list of box/class dicts per image, in the order getImgIds() yields the ids.
boxes_annotations = [
    extract_bound_boxes(dataset_coco, img_id)
    for img_id in dataset_coco.getImgIds()
]

## Compile Image Paths

In [48]:
def get_file_path(dataset_coco, dataset_path, img_id):
    """Build the on-disk path of an image from its dataset record.

    Args:
        dataset_coco: object exposing ``loadImgs``.
        dataset_path: directory the image file names are relative to.
        img_id: id of the image to look up.

    Returns:
        ``dataset_path`` joined with the ``file_name`` of the first record
        returned by ``loadImgs``.
    """
    image_records = dataset_coco.loadImgs(img_id)
    file_name = image_records[0]["file_name"]
    return os.path.join(dataset_path, file_name)

In [55]:
# One file path per image, in the order getImgIds() yields the ids.
img_paths = [
    get_file_path(dataset_coco, DATA_DIR, img_id)
    for img_id in dataset_coco.getImgIds()
]

## Convert to Yolo v3 Annotation Format
According to the [docs](https://github.com/qqwweee/keras-yolo3):
```
One row for one image;
Row format: image_file_path box1 box2 ... boxN;
Box format: x_min,y_min,x_max,y_max,class_id (no space).
```

In [77]:
def convert_format(img_path, boxes_annotations):
    """Build one keras-yolo3 annotation row for a single image.

    The target row format is ``image_file_path box1 box2 ... boxN`` where each
    box is ``x_min,y_min,x_max,y_max,class_id`` with no spaces inside a box.

    COCO stores a bounding box as ``[x, y, width, height]``, so the far corner
    is computed as ``x + width`` / ``y + height`` before formatting. Boxes are
    separated from each other (and from the path) by a single space.

    Args:
        img_path: path of the image file, written as the first field.
        boxes_annotations: iterable of dicts with a ``"bound_box"`` entry
            (COCO ``[x, y, width, height]``) and a ``"class"`` entry.

    Returns:
        The annotation row as a string without a trailing newline. An image
        with no boxes yields just ``img_path``.
    """
    boxes = []
    for annotation in boxes_annotations:
        x_min, y_min, width, height = annotation["bound_box"]
        # Convert corner + size into the two opposite corners.
        corners = (x_min, y_min, x_min + width, y_min + height)
        # Coordinates are truncated to integers, as the row format expects ints.
        fields = [str(int(value)) for value in corners]
        fields.append(str(annotation["class"]))
        boxes.append(",".join(fields))

    # Space-separate the path and every box so each box stays a distinct token.
    return " ".join([img_path, *boxes])

In [84]:
# Pair each image path with its boxes and format one row per image.
annotation_rows = [
    convert_format(path, boxes)
    for path, boxes in zip(img_paths, boxes_annotations)
]
# Newline-separated rows with a terminating newline.
annotation = "\n".join(annotation_rows) + "\n"

## Commit Annotation to Disk for Training

In [85]:
# Write the assembled annotation rows to the training list inside the
# keras-yolo3 checkout; "w" mode overwrites any existing file.
with open("./tools/keras-yolo3/train.txt", "w") as f:
    f.write(annotation)

## Save Class Names

In [99]:
# Fetch the category record for every category id, preserving getCatIds() order.
# NOTE(review): the class file is line-ordered, so this presumably relies on the
# category ids matching their position in this list - confirm against the dataset.
class_metas = [
    records[0]
    for records in map(dataset_coco.loadCats, dataset_coco.getCatIds())
]
# One class name per line, newline-terminated.
classes = "\n".join(meta["name"] for meta in class_metas) + "\n"

In [102]:
# Write the newline-separated class names into the keras-yolo3 model_data
# directory; "w" mode overwrites any existing file at that path.
with open("./tools/keras-yolo3/model_data/voc_classes.txt", "w") as f:
    f.write(classes)