In [1]:
import numpy as np
from datetime import date
import cv2
import os
import json
import ffmpeg

In [2]:
"""
- image (a list that we already know the length by the length of the video, so we can make it ordered)
- annotation (a list that will be randomized order, but annotation id would be the same)

Parameters:
    filepath
    fileid - for making the image and annotation id

get the overall video length
get the general width and height of the video

image
    - go through and create the image files 

annotation
    - it's a dictionary, we just iterate through them
    - get the time stamp so that we can create the annotation id (how to deal with the object annotation part)
    - have a dictionary (we could initialize it): for each time stamp (key), we keep track of the curr object annotation id that we should use
    - steal; "area": float, "bbox": [x,y,width,height],
    - iscrowd = 0
"""

'\n- image (a list that we already know the length by the length of the video, so we can make it ordered)\n- annotation (a list that will be randomized order, but annotation id would be the same)\n\nParameters:\n    filepath\n    fileid - for making the image and annotation id\n\nget the overall video length\nget the general width and height of the video\n\nimage\n    - go through and create the image files \n\nannotation\n    - it\'s a dictionary, we just iterate through them\n    - get the time stamp so that we can create the annotation id (how to deal with the object annotation part)\n    - have a dictionary (we could initialize it): for each time stamp (key), we keep track of the curr object annotation id that we should use\n    - steal; "area": float, "bbox": [x,y,width,height],\n    - iscrowd = 0\n'

In [3]:
class CocoIdGenerator:
    """Build integer COCO image/annotation ids from zero-padded digit fields.

    An image id is the concatenation ``<file_id><frame_index>`` and an
    annotation id is ``<file_id><frame_index><obj_id>``; each field is
    left-padded with zeros to its configured width before the string is
    converted to ``int``.

    Parameters:
        file_id: identifier of the source video/file.
        digits_for_file: minimum width of the file field.
        digits_for_image: minimum width of the frame-index field.
        digits_for_obj: minimum width of the per-frame object field.
    """

    # Spacing (in seconds) between consecutive frames/timestamps.
    FRAME_INTERVAL_SEC = 0.1

    def __init__(self, file_id, digits_for_file=3, digits_for_image=5, digits_for_obj=3):
        self.file_id = file_id
        self.digits_for_file = digits_for_file
        self.digits_for_image = digits_for_image
        self.digits_for_obj = digits_for_obj

    def _frameIndex(self, curr_time):
        """Return the 0-based frame index that the timestamp falls into.

        A plain ``int(curr_time / 0.1)`` is off by one for many on-grid
        timestamps because of floating-point error (``0.3 / 0.1`` evaluates
        to ``2.9999999999999996``). Rounding the quotient to 6 decimals
        first removes that noise while still truncating genuinely
        off-grid timestamps (0.35 -> 3).
        """
        return int(round(curr_time / self.FRAME_INTERVAL_SEC, 6))

    def _fileSection(self):
        """Zero-padded file-id field shared by image and annotation ids."""
        return str(self.file_id).zfill(self.digits_for_file)

    def _imageSection(self, curr_time):
        """Zero-padded frame-index field for the given timestamp."""
        return str(self._frameIndex(curr_time)).zfill(self.digits_for_image)

    def generateImageId(self, curr_time):
        """Return the image id for the frame at ``curr_time`` (seconds)."""
        return int(self._fileSection() + self._imageSection(curr_time))

    def generateAnnId(self, curr_time, obj_id):
        """Return the annotation id for object ``obj_id`` on the frame at ``curr_time``."""
        obj_sec = str(obj_id).zfill(self.digits_for_obj)
        return int(self._fileSection() + self._imageSection(curr_time) + obj_sec)

In [4]:
def getLabelAndId(attrDict, attrConfigDict):
    """Derive the standard category label and the original object id of one annotation.

    Parameters:
        attrDict: the annotation's attribute values, keyed by attribute id.
        attrConfigDict: attribute definitions keyed by the same ids; each
            entry has an "aname" and, for "object_label", an "options"
            mapping of original category id -> category name.

    Returns:
        (label, object_id) where label is 1 for shark, 2 for human, or None
        when it could not be determined, and object_id is the integer value
        of the "object_id" attribute or None when that attribute is absent.

    Unrecognized attributes/options are reported with print() and then the
    function waits on input() so the problem can be acknowledged.
    """
    label = None
    object_id = None

    # Attribute ids under which each known attribute was found (None = absent).
    object_present_attr_key = None
    object_id_attr_key = None
    object_label_attr_key = None

    # original category id (int) -> standard label (1 = shark, 2 = human)
    mapToStandardLabel = {}

    for k in attrDict:
        attr_name = attrConfigDict[k]["aname"]
        if attr_name == "object_present":
            object_present_attr_key = k
        elif attr_name == "object_id":
            object_id_attr_key = k
        elif attr_name == "object_label":
            object_label_attr_key = k

            cat_options = attrConfigDict[k]["options"]
            for og_cat_id in cat_options:
                if "shark" in cat_options[og_cat_id]:
                    mapToStandardLabel[int(og_cat_id)] = 1
                elif "human" in cat_options[og_cat_id]:
                    mapToStandardLabel[int(og_cat_id)] = 2
                else:
                    print(f"not recognizable options in cat_options: {cat_options}")
                    input("error at getLabelAndId")
        else:
            print(f"attrDict key: {k}")
            input("error at getLabelAndId")

    # default map
    #   shark = 0 -> 1
    #   human = 1 -> 2
    # Only fall back to the default when the options gave us nothing; the
    # previous check was inverted and discarded the map built above.
    if not mapToStandardLabel:
        mapToStandardLabel = {0: 1, 1: 2}

    if object_label_attr_key is not None:
        original_label = int(attrDict[object_label_attr_key])
        label = mapToStandardLabel[original_label]
    elif object_present_attr_key is not None:
        if "shark" in attrDict[object_present_attr_key]:
            label = 1  # shark (category ids start at 1)
        elif "human" in attrDict[object_present_attr_key]:
            label = 2
        else:
            print(f"cannot identify object id based on: { attrDict[object_present_attr_key]}")
            input("error at getLabelAndId")
    else:
        print("cannot use any attribute to find label")
        input("error at getLabelAndId")

    if object_id_attr_key is not None:
        object_id = int(attrDict[object_id_attr_key])

    return label, object_id

def getSegmentation(xy_region):
  """Return the rectangle's corners as one flat polygon [[x0, y0, x1, y1, ...]].

  xy_region is a 5-item sequence whose first item is ignored and whose
  remaining items are x, y, width, height. Corners are listed starting at
  (x, y), then (x+w, y), (x+w, y+h), (x, y+h).
  """
  _shape, left, top, width, height = xy_region
  right = left + width
  bottom = top + height
  polygon = [left, top, right, top, right, bottom, left, bottom]
  return [polygon]

def getBBox(xy_region):
  """Return [x, y, width, height] from a 5-item region, dropping its first item."""
  _shape, left, top, width, height = xy_region
  bbox = [left, top, width, height]
  return bbox

def getArea(xy_region):
  """Return width * height of a 5-item region (its last two items)."""
  _shape, _left, _top, width, height = xy_region
  area = width * height
  return area

def getCurrObjId(curr_time, cat_id, og_obj_id, curr_obj_id_dict):
  """Hand out the next per-timestamp object id, skipping repeated objects.

  curr_obj_id_dict maps float timestamps to
  {"curr_obj_id": next id, "existing_obj_ids": {cat_id: [seen original ids]}}.

  Returns (None, curr_obj_id_dict) when og_obj_id is not None and was already
  recorded for this timestamp and category. Otherwise records og_obj_id,
  returns the timestamp's current counter value and advances the counter.
  The dictionary is updated in place and also returned.
  """
  frame_state = curr_obj_id_dict[float(curr_time)]
  seen_for_category = frame_state["existing_obj_ids"]

  is_duplicate = og_obj_id is not None and og_obj_id in seen_for_category[cat_id]
  if is_duplicate:
    print(f"Ann (cat_id={cat_id}, og_obj_id={og_obj_id}, t={curr_time}) already got added, ignore this one")
    return None, curr_obj_id_dict

  seen_for_category[cat_id].append(og_obj_id)
  assigned_id = frame_state["curr_obj_id"]
  frame_state["curr_obj_id"] = assigned_id + 1
  return assigned_id, curr_obj_id_dict

# Minimum bounding-box area (width * height, in the annotation's own
# coordinate units) an annotation needs to be kept; anything smaller is
# reported and left out of the COCO output.
ANN_AREA_FILTER_THRESHOLD = 5

# https://stackoverflow.com/questions/42021972/truncating-decimal-digits-numpy-array-of-floats
def trunc(values, decs=1):
    """Truncate (toward zero) a number or numpy array to `decs` decimal places."""
    scale = 10**decs
    shifted = values * scale
    return np.trunc(shifted) / scale

"""
- get the time stamp so that we can create the annotation id (how to deal with the object annotation part)
- have a dictionary (we could initialize it): for each time stamp (key), we keep track of the curr object annotation id that we should use
- steal; "area": float, "bbox": [x,y,width,height],
- iscrowd = 0
- errorhandling
"""
def createCocoAnnotationDict(viaObjAnnotation, viaCatConfig, higest_z, idGenerator):
  """Build the COCO "annotations" list from VIA annotation entries.

  Parameters:
      viaObjAnnotation: dict of annotation entries; each entry provides
          "z" (timestamps), "xy" (region) and "av" (attribute values).
      viaCatConfig: attribute definitions passed through to getLabelAndId.
      higest_z: upper bound (exclusive) of the 0.1-spaced timestamps for
          which per-timestamp object counters are prepared.
      idGenerator: object providing generateAnnId / generateImageId.

  Only entries with exactly one timestamp are converted. Entries whose area
  is below ANN_AREA_FILTER_THRESHOLD, or that getCurrObjId rejects as
  already added, are skipped (a message is printed for each).
  """
  # One counter + "already seen" registry per truncated timestamp.
  curr_obj_id_dict = {
      trunc(z): {"curr_obj_id": 0, "existing_obj_ids": {1: [], 2: []}}
      for z in np.arange(0.0, higest_z, 0.1)
  }

  cocoAnnotations = []

  for ann in viaObjAnnotation.values():
    timestamps = ann["z"]
    # if len of z is more than 1, not a bounding box annotation
    if len(timestamps) != 1:
      continue

    curr_time = timestamps[0]
    cat_id, og_obj_id = getLabelAndId(ann["av"], viaCatConfig)
    region = ann["xy"]
    area = getArea(region)

    if area < ANN_AREA_FILTER_THRESHOLD:
      print(f"Ann (cat_id={cat_id}, og_obj_id={og_obj_id}, t={curr_time}) has unreasonably small area: {area}")
      continue

    curr_obj_id, curr_obj_id_dict = getCurrObjId(curr_time, cat_id, og_obj_id, curr_obj_id_dict)
    if curr_obj_id is None:
      continue

    coco_ann = {
        "id": idGenerator.generateAnnId(curr_time, curr_obj_id),
        "image_id": idGenerator.generateImageId(curr_time),
        "category_id": cat_id,
        "segmentation": getSegmentation(region),
        "area": area,
        "bbox": getBBox(region),
        "iscrowd": 0,
    }
    cocoAnnotations.append(coco_ann)

  return cocoAnnotations


def createCocoImageDict(w, h, higest_z, video_filename, idGenerator):
  """Build the COCO "images" list: one entry per 0.1 s step in [0, higest_z).

  Parameters:
      w, h: width and height stored on every image entry.
      higest_z: end (exclusive) of the timestamp range, in seconds.
      video_filename: prefix used for each image's file name.
      idGenerator: object providing generateImageId(time) and
          digits_for_image (zero-padding width of the frame number).

  Returns:
      A list of COCO image dicts ordered by frame.
  """
  frame_step = 0.1
  # Same value for every entry, so compute it once.
  captured_on = date.today().strftime("%m/%d/%Y")
  num_frames = len(np.arange(0.0, higest_z, frame_step))

  cocoImages = []
  for frame_idx in range(num_frames):
    # Nominal timestamp of this frame, free of accumulated float error.
    curr_time = round(frame_idx * frame_step, 1)

    # Use the integer loop index for the frame number. Re-deriving it with
    # int(curr_time / 0.1) is off by one for values such as 0.3
    # (0.3 / 0.1 == 2.9999999999999996), which produced duplicate and
    # missing frame numbers in the file names.
    image_num_in_filename = str(frame_idx).zfill(idGenerator.digits_for_image)
    image_filename = f"{video_filename}_" + image_num_in_filename + ".jpg"

    cocoImages.append({
                        "id": idGenerator.generateImageId(curr_time),
                        "width": w,
                        "height": h,
                        "file_name": image_filename,
                        "license": 0,
                        "flickr_url": "",
                        "coco_url": "",
                        "date_captured": captured_on,
    })

  return cocoImages


def createCocoInfoDict(video_filename):
  """Return the COCO "info" section, using the video name as description
  and today's date for the year / creation date."""
  info = dict(
      year=date.today().year,
      version="",
      description=video_filename,
      contributor="",
      url="",
      date_created=date.today().strftime("%m/%d/%Y"),
  )
  return info

# from the original converter
def createCocoLisenses():
  """Return the COCO "licenses" section: a single placeholder license with id 0."""
  unknown_license = dict(id=0, name="Unknown License", url="")
  return [unknown_license]

def createCocoCategories():
  """Return the COCO "categories" section: shark (id 1) and human (id 2)."""
  category_names = ("shark", "human")
  return [
      {"supercategory": "object_label", "id": cat_id, "name": cat_name}
      for cat_id, cat_name in enumerate(category_names, start=1)
  ]

def getFilenameWithoutPath(file_path):
  """Return the last "/"-separated component of file_path with its extension removed."""
  stem, _extension = os.path.splitext(file_path)
  _parent, _sep, name = stem.rpartition("/")
  return name

def convertToCocoFormat(via_json_path, video_path, video_id, coco_json_dir):
  """Convert one video's VIA annotation file into a COCO-format JSON file.

  Parameters:
      via_json_path: path of the VIA project JSON (its "metadata" and
          "attribute" sections are read).
      video_path: path of the annotated video; probed for frame size and
          duration.
      video_id: file id used as the leading field of generated ids.
      coco_json_dir: directory in which "<video name>_coco.json" is written.

  Raises:
      ValueError: if the probed file contains no video stream.
  """
  vid_info = ffmpeg.probe(video_path)

  # The first stream is not necessarily the video one (containers may list
  # an audio stream first), so pick the first stream of type "video".
  video_stream = next(
      (stream for stream in vid_info['streams'] if stream.get('codec_type') == 'video'),
      None,
  )
  if video_stream is None:
    raise ValueError(f"no video stream found in {video_path}")

  # https://stackoverflow.com/questions/7362130/getting-video-dimension-resolution-width-x-height-from-ffmpeg
  height = int(video_stream['height'])
  width = int(video_stream['width'])
  print(f"h = {height}, w = {width}")

  # https://stackoverflow.com/questions/3844430/how-to-get-the-duration-of-a-video-in-python
  vid_length = float(vid_info['format']['duration'])
  print(f"vid length (in sec): {vid_length}")

  video_name = getFilenameWithoutPath(video_path)
  # os.path.join keeps the path valid whether or not coco_json_dir ends
  # with a separator (plain concatenation required the trailing "/").
  coco_json_save_path = os.path.join(coco_json_dir, video_name + "_coco.json")

  idGen = CocoIdGenerator(file_id = video_id)

  with open(via_json_path, 'r') as f:
    via_json = json.load(f)

  coco_json = {
                  "info": createCocoInfoDict(video_name),
                  "images": createCocoImageDict(width, height, vid_length, video_name, idGen),
                  "annotations": createCocoAnnotationDict(via_json["metadata"], via_json["attribute"],vid_length, idGen),
                  "categories": createCocoCategories(),
                  "licenses": createCocoLisenses(),
                }

  with open(coco_json_save_path, 'w') as f:
    json.dump(coco_json, f)

  

In [5]:
# Convert the VIA annotations of one video (file id 1); the result is written
# as "<video name>_coco.json" using the "coco_annotations/" output location.
convertToCocoFormat("via_annotations/20200805_OneSharkSUPSurfers.json", "videos/20200805_OneSharkSUPSurfers.mp4", 1, "coco_annotations/")

h = 2160, w = 3840
vid length (in sec): 16.182833
Ann (cat_id=2, og_obj_id=12, t=6.5) already got added, ignore this one
Ann (cat_id=2, og_obj_id=8, t=3.7) already got added, ignore this one
Ann (cat_id=2, og_obj_id=6, t=3.9) already got added, ignore this one
Ann (cat_id=2, og_obj_id=12, t=7.7) already got added, ignore this one
Ann (cat_id=2, og_obj_id=0, t=9.1) already got added, ignore this one
Ann (cat_id=2, og_obj_id=2, t=11.3) already got added, ignore this one
Ann (cat_id=2, og_obj_id=0, t=12.6) already got added, ignore this one
Ann (cat_id=2, og_obj_id=4, t=0.1) already got added, ignore this one
Ann (cat_id=2, og_obj_id=None, t=0.3) has unreasonably small area: 0.0
Ann (cat_id=2, og_obj_id=None, t=0.3) has unreasonably small area: 0.0
