# Coco dataset Preprocessor

The purpose of this notebook is to convert a given COCO dataset into the format that YOLO models expect.  
YOLO needs the following directory tree:
- images
    - train
    - val
    - test
- labels
    - train
    - val
    - test

Moreover, labels need to be in .txt format, and each label file must have the same base name as its corresponding image. Each .txt file contains one bounding box per line, in the following structure:  

**_classId x y width height_**  

Where:
* classId: Id of the class of an object represented by the bounding box
* x: x coordinate of the midpoint of the bounding box (relative to the image width ([0-1] values))
* y: y coordinate of the midpoint of the bounding box (relative to the image height ([0-1] values))
* width: width of the bounding box (relative to the image width ([0-1] values))
* height: height of the bounding box (relative to the image height ([0-1] values))

In [1]:
import os
import shutil
import commonPaths
import commonCocoPreprocessingFunctions as preprocFuncs
from importlib import reload
# Re-import the project modules so that edits made to them after the first import take effect.
reload(commonPaths)
reload(preprocFuncs);

In [3]:
def preprocessBoundingBoxCoordinates(bbox, imageHeight, imageWidth):
    '''
        ### preprocessBoundingBoxCoordinates
        converts a Coco bbox (top left corner + size) into the YOLO representation
        (middle point + size, all relative to the image dimensions).

        :param bbox: 4 element sequence: [0] X coordinate of top left corner; [1] Y coordinate of top left corner;
        [2] width of bbox; [3] height of bbox. Each element is converted with float().
        :param imageHeight: height of the original image
        :param imageWidth: width of the original image

        :return: tuple (xMid, yMid, width, height): the middle point of the bbox and its size.
        X values are divided by imageWidth, Y values by imageHeight; every value is rounded to 6 decimal places.
    '''
    left, top, boxWidth, boxHeight = (float(bbox[index]) for index in range(4))

    # Shift from the top left corner to the centre of the box.
    centreX = left + boxWidth/2
    centreY = top + boxHeight/2

    return (
        round(centreX / imageWidth, 6),
        round(centreY / imageHeight, 6),
        round(boxWidth / imageWidth, 6),
        round(boxHeight / imageHeight, 6),
    )

In [4]:
def createTxtFilesWithBoundingBoxes(destinationDirLabels, imageIdToPropsAndAnnots, categoryIdToNameAndYoloId, safe=True, copyImages=False, sourceDirImages=None, destinationDirImages=None ):
    '''
        ### createTxtFilesWithBoundingBoxes
        function that creates txt files with bbox information that is needed by YOLO models.
        One txt file is written per image (an empty file when the image has no annotations); it is named
        after the image file with its last extension replaced by ".txt". Each line has the form
        "yoloId x y width height".

        :param destinationDirLabels: directory in which to create txt files (trailing slash is optional)
        :param imageIdToPropsAndAnnots: dictionary where key=imageId, value=object exposing fileName, height, width
          and annotations (e.g. result of associateImageIdWithItsPropsAndAnnots)
        :param categoryIdToNameAndYoloId: dictionary where key=categoryId, value=object exposing yoloId
          (e.g. result of associateCategoryIdWithItsNameAndYoloId)
        :param safe: whether to check if the directory is empty before creating files there. Set to True by default. [If set
          to False, existing files may be overwritten!]
        :param copyImages: whether to copy images as well. Not recommended! It is faster to copy it by hand. Set to False by default
        :param sourceDirImages: source directory from which to copy the images. Set to None by default (copying is not recommended)
        :param destinationDirImages: destination directory to which to copy the images. Set to None by default (copying is not recommended)

        :raises Exception: when safe=True and a destination directory is not empty
    '''
    # Images are copied only when copying was requested AND both directories were given.
    shouldCopyImages = copyImages and sourceDirImages is not None and destinationDirImages is not None

    if safe:
        dirsToCheck = [("labels", destinationDirLabels)]
        if shouldCopyImages:
            dirsToCheck.append(("images", destinationDirImages))
        for kind, directory in dirsToCheck:
            if os.listdir(directory):
                msg = f"The {kind} destination directory \"{directory}\" is not empty! If you wish to create files anyway,"
                msg += " run the function with safe=False. Some files may be overwritten!"
                raise Exception(msg)

    for image in imageIdToPropsAndAnnots.values():
        # splitext drops only the last extension, so "img.v2.jpg" -> "img.v2.txt"
        # (splitting on the first dot would yield "img.txt" and break the image/label pairing).
        labelFileName = os.path.splitext(image.fileName)[0] + ".txt"

        if shouldCopyImages:
            shutil.copy(os.path.join(sourceDirImages, image.fileName), os.path.join(destinationDirImages, image.fileName))

        # Build all lines first so that a failing annotation does not leave a half-written file behind.
        lines = []
        for ann in image.annotations:
            yoloId = categoryIdToNameAndYoloId[ann["category_id"]].yoloId
            x, y, width, height = preprocessBoundingBoxCoordinates(ann["bbox"], image.height, image.width)
            lines.append(f"{yoloId} {x} {y} {width} {height}")

        # Lines are separated by "\n" without a trailing newline.
        with open(os.path.join(destinationDirLabels, labelFileName), 'w') as labelFile:
            labelFile.write("\n".join(lines))

In [6]:
def createConfigFile(configFilePath, yoloDirectory, relativeImgTrainDir, relativeImgValDir, categoryIdToNameAndYoloId, safe=True):
    '''
        ### createConfigFile
        creates appropriate config file for YOLO models. The file contains the keys "path", "train", "val"
        and a "names" mapping of yoloId -> category name.

        :param configFilePath: path to config file
        :param yoloDirectory: path to directory containing appropriate directory structure with images and labels
        :param relativeImgTrainDir: name of the directory with training images. Relative to yoloDirectory
        :param relativeImgValDir: name of the directory with validation images. Relative to yoloDirectory
        :param categoryIdToNameAndYoloId: dictionary where key=categoryId, value=object exposing yoloId and categoryName
          (e.g. result of associateCategoryIdWithItsNameAndYoloId)
        :param safe: whether to check if the config file exists (when safe=False, the file may be overwritten!)

        :raises Exception: when safe=True and the config file already exists
    '''
    if safe and os.path.isfile(configFilePath):
        msg = f"Config file already exists at \"{configFilePath}\""
        raise Exception(msg)

    lines = [
        f"path: {yoloDirectory}",
        f"train: {relativeImgTrainDir}",
        f"val: {relativeImgValDir}",
        "",
        "",
        "names:",
    ]
    for cat in categoryIdToNameAndYoloId.values():
        # Escape backslashes and double quotes so the name stays a valid double-quoted scalar.
        escapedName = cat.categoryName.replace("\\", "\\\\").replace('"', '\\"')
        # Indent with spaces: YAML does not allow tab characters for indentation.
        lines.append(f'  {cat.yoloId}: "{escapedName}"')

    with open(configFilePath, "w") as configFile:
        configFile.write("\n".join(lines))

In [8]:
def providePaths(forTrain=False):
    '''
        ### providePaths
        provides paths for data preprocessing for preprocessCocoData function. The paths differ depending on
        whether one wants to preprocess the train or the val set.

        :param forTrain: defines if paths should be returned for train set. (They will be for val set otherwise)
        :return: dictionary with paths (taken from commonPaths) applicable for train or val set, plus the
          set-independent entries "YOLO_CONFIG_FILE" and "YOLO_DATA_DIR_FOR_CONFIG"
    '''
    if forTrain:
        paths = {
            "COCO_ANNOT_DIR": commonPaths.COCO_TRAIN_ANNOT_DIR,
            "ANNOT_FILENAME": commonPaths.TRAIN_ANNOT_FILENAME,
            "COCO_IMG_DIR": commonPaths.COCO_TRAIN_IMG_DIR,
            "COCO_DUMP_IMG_DIR": commonPaths.COCO_DUMP_TRAIN_IMG_DIR,
            "ANNOT_YOLO_JSON_FILE": commonPaths.ANNOT_YOLO_TRAIN_JSON_FILE,
            "YOLO_LABELS_DIR": commonPaths.YOLO_TRAIN_LABELS_DIR,
        }
    else:
        paths = {
            "COCO_ANNOT_DIR": commonPaths.COCO_VAL_ANNOT_DIR,
            "ANNOT_FILENAME": commonPaths.VAL_ANNOT_FILENAME,
            "COCO_IMG_DIR": commonPaths.COCO_VAL_IMG_DIR,
            "COCO_DUMP_IMG_DIR": commonPaths.COCO_DUMP_VAL_IMG_DIR,
            "ANNOT_YOLO_JSON_FILE": commonPaths.ANNOT_YOLO_VAL_JSON_FILE,
            "YOLO_LABELS_DIR": commonPaths.YOLO_VAL_LABELS_DIR,
        }

    paths["YOLO_CONFIG_FILE"] = commonPaths.YOLO_CONFIG_FILE

    # Drop one trailing path separator, but only when it is really there; slicing off the last
    # character unconditionally would corrupt a path that has no trailing separator.
    yoloDataDir = commonPaths.YOLO_DATA_DIR
    if yoloDataDir.endswith(("/", "\\")):
        yoloDataDir = yoloDataDir[:-1]
    paths["YOLO_DATA_DIR_FOR_CONFIG"] = yoloDataDir
    return paths

In [9]:
def preprocessCocoData(preprocessTrain=False):
    '''
        ### preprocessCocoData
        Runs the whole preprocessing pipeline that prepares a coco dataset for YOLO models: loads the
        annotations, filters the image set through clearDataSetFromNotAnnotatedImgs, writes the YOLO
        annotation JSON, the YOLO config file and the per-image txt label files.

        :param preprocessTrain: defines whether preprocessing should be done for train or val set. [val set by default]
    '''
    paths = providePaths(preprocessTrain)

    # Load the annotations and index them by image id and by category id.
    cocoInstances = preprocFuncs.getInstancesAsJSON(paths["COCO_ANNOT_DIR"], paths["ANNOT_FILENAME"])
    imagesById = preprocFuncs.associateImageIdWithItsPropsAndAnnots(cocoInstances)
    categoriesById = preprocFuncs.associateCategoryIdWithItsNameAndYoloId(cocoInstances)

    # Presumably drops images without annotations (moving them to the dump dir) - verify against the helper.
    imagesById = preprocFuncs.clearDataSetFromNotAnnotatedImgs(
        paths["COCO_IMG_DIR"],
        paths["COCO_DUMP_IMG_DIR"],
        imagesById,
    )

    preprocFuncs.createAnnotJSONForYolo(categoriesById, imagesById, paths["ANNOT_YOLO_JSON_FILE"])

    # NOTE(review): with the default safe=True this raises when the config file already exists,
    # in which case the label files below are not written - confirm this is intended for repeated runs.
    createConfigFile(
        configFilePath=paths["YOLO_CONFIG_FILE"],
        yoloDirectory=paths["YOLO_DATA_DIR_FOR_CONFIG"],
        relativeImgTrainDir="images/train",
        relativeImgValDir="images/val",
        categoryIdToNameAndYoloId=categoriesById,
    )
    createTxtFilesWithBoundingBoxes(
        destinationDirLabels=paths["YOLO_LABELS_DIR"],
        imageIdToPropsAndAnnots=imagesById,
        categoryIdToNameAndYoloId=categoriesById,
    )

In [13]:
# Run the preprocessing with the default argument (preprocessTrain=False, i.e. the val set paths).
preprocessCocoData()