In [19]:
# !pip install openimages

In [13]:
#import basic libraries
import numpy as np
import subprocess as sbp
import os
import datetime
import json
#import cv2 for image visualization,resizing
import cv2 

#for downloading images and their annotations from openimages api
from openimages.download import download_dataset
#for parsing xml
from bs4 import BeautifulSoup as bs

In [None]:
# Download the "Bottle" and "Fedora" classes from Open Images into
# /dataset_open_images, requesting annotations in the "pascal" format
# (the conversion code further down reads them as .xml files).
# NOTE(review): limit = 1000 presumably caps the number of images per class -
# confirm against the openimages package documentation.
download_dataset("/dataset_open_images", ["Bottle", "Fedora"] , annotation_format="pascal" , limit = 1000)

In [1]:
 def create_directories(dataset = "/dataset") :
   '''
     This function makes sure that the dataset directory and its train, test
     and val sub-directories exist.

     The sub-directories are created even when the dataset directory itself is
     already present, so a partially created tree (for example after an
     interrupted run) is completed instead of being left without some splits.

     params :
       dataset : path where the train, test and validation directories are to be made -> default = "/dataset"
   '''
   if os.path.isdir(dataset):
     print("Directory already exists")
   else:
     print("Creating directory")

   # os.makedirs also creates the missing parents, so the dataset directory
   # itself is created here when needed; exist_ok keeps re-runs harmless.
   for split in ("train", "test", "val"):
     os.makedirs(os.path.join(dataset, split), exist_ok=True)

In [5]:
def get_xml_paths (folder) :
  '''
    This function walks folder recursively and returns the paths of all files
    whose extension is .xml (compared case-insensitively).

    Directories and file names are visited in sorted order so that the
    returned list - and therefore any split derived from it - does not depend
    on the order in which the file system lists its entries.

    params :
      folder : directory that is searched for .xml files

    returns :
      xml_paths : list of paths (folder joined with the relative location);
                  empty when folder does not exist or holds no .xml file
  '''
  xml_paths = []
  for root, dirs, filenames in os.walk(folder):
    # Sorting dirs in place also fixes the order in which os.walk descends.
    dirs.sort()
    for name in sorted(filenames):
      # Compare the real extension: a substring test would also accept
      # names such as "a.xml.bak".
      if name.lower().endswith('.xml'):
        xml_paths.append(os.path.join(root, name))
  return xml_paths

In [6]:
def parse_xml (xml_path):
  '''
    This function reads the file at xml_path and hands its complete text to
    BeautifulSoup using the "lxml" parser.

    params :
      xml_path : path to the annotation file

    returns :
      the BeautifulSoup object built from the file content
  '''
  with open(xml_path , "r") as xml_file :
    raw_xml = xml_file.read()
    soup = bs (raw_xml , "lxml")
  return soup

In [7]:
def create_info(year , month , day) :
  '''
    This function builds the "info" section of a COCO annotations file.

    params :
      year, month, day : date parts; year is stored as given and all three
                         are combined into "date_created" as year-month-day

    returns :
      dictionary with the keys year, version, description, contributor, url
      and date_created
  '''
  date_created = '{}-{}-{}'.format(year, month, day)
  return {
      "year" : year,
      "version" : 1,
      "description" : "",
      "contributor" : "AKI",
      "url" : "",
      "date_created" : date_created,
  }

In [8]:
def resizing_image(img_path , x_resolution = 416 , y_resolution = 416):
  '''
    This function takes image path and returns resized image, xscale and yscale

    params :
      img_path : path to .jpg , .jpeg, .png file
      x_resolution : no of pixels required on x-axis , by default 416
      y_resolution : no of pixels required on y-axis , by default 416

    returns :
     img_resized : image resized to x_resolution * y_resolution
     x_scale, y_scale : target size divided by original size per axis, used
                        to rescale bounding box coordinates

    raises :
     ValueError : when the image cannot be read (cv2.imread returned None)
  '''
  # NOTE(review): the flag value 3 is kept from the original code; it looks
  # like IMREAD_COLOR | IMREAD_ANYDEPTH - confirm against the OpenCV docs.
  image_to_predict = cv2.imread(img_path,3)

  # cv2.imread does not raise on a missing or undecodable file, it returns
  # None; fail here with the offending path instead of an AttributeError below.
  if image_to_predict is None:
    raise ValueError("Could not read image: {}".format(img_path))

  # shape is (rows, columns, ...) i.e. height first, width second
  original_height, original_width = image_to_predict.shape[:2]

  x_scale = x_resolution/original_width
  y_scale = y_resolution/original_height
  img_resized = cv2.resize(image_to_predict,(x_resolution,y_resolution))
  return img_resized,x_scale , y_scale

In [15]:
def roboflow(image_root, xml_paths, dataset_location, coco_annotation_json_path=None, x_resolution = 416 , y_resolution = 416):
  ''' 
    This function converts the annotations given in xml format to COCO annotations format

    For every xml file the referenced image is read from image_root, resized
    to x_resolution * y_resolution and written to dataset_location; its
    bounding boxes are rescaled accordingly. Images whose file already exists
    in dataset_location are skipped, which allows the function to be re-run
    against an existing dataset. The result is always written to
    "_annotations.coco.json" inside dataset_location.

    Ids are assigned as follows:
      image id      : number of images already in the annotation file, counting up
      annotation id : number of annotations already in the file, counting up
      category id   : position of the category name in the list of known
                      categories, which is also the value stored as
                      "category_id" in the annotations

    params : 
      image_root : folder that contains images 
      xml_paths : list of xml paths 
      dataset_location : path where the images and annotation files have to be stored
      coco_annotation_json_path : path to an existing annotations json file to
                                  extend; None starts a new annotation set
      x_resolution : no of pixels required on x-axis , by default 416
      y_resolution : no of pixels required on y-axis , by default 416
  '''

  now = datetime.datetime.now()
  year = '{:02d}'.format(now.year)
  month = '{:02d}'.format(now.month)
  day = '{:02d}'.format(now.day)
  date_captured = '{}-{}-{}'.format(year, month, day)

  license_id = 1

  #checking for pre-existing JSON file
  if coco_annotation_json_path is not None: 
    with open(coco_annotation_json_path , "r") as json_file :
      coco_annotation = json.load(json_file)
    category_list = [category['name'] for category in coco_annotation.get("categories")]
  else:
    coco_annotation = {}
    coco_annotation["info"] = create_info(year , month , day)
    coco_annotation["licenses"] =  [{"id": license_id,"url": "","name": "Unknown"}]
    coco_annotation["categories"] = []
    coco_annotation["images"] = []
    coco_annotation["annotations"] = []
    category_list = []

  # New ids continue after whatever the annotation file already holds.
  next_image_id = len(coco_annotation.get("images"))
  next_annotation_id = len(coco_annotation.get("annotations"))

  for xml_path in xml_paths :
    parsed_xml = parse_xml(xml_path)
    file_name = parsed_xml.find("filename").text
    output_path = os.path.join(dataset_location, file_name)

    # Already converted in an earlier run: leave image and annotations alone.
    if os.path.isfile(output_path):
      continue 

    # Add image to dataset folder
    source_path = os.path.join(image_root, file_name)
    img_resized , x_scale , y_scale = resizing_image(img_path=source_path , x_resolution= x_resolution , y_resolution= y_resolution)
    cv2.imwrite(output_path , img_resized)

    #Adding image to JSON
    image = {}
    image["id"] = next_image_id
    image["license"] = license_id
    image["file_name"] = file_name
    image["height"] = y_resolution
    image["width"] = x_resolution
    image["date_captured"] = date_captured
    coco_annotation["images"].append(image)
    next_image_id += 1

    # The i-th entries of these lists are taken to describe the i-th box.
    names = parsed_xml.find_all("name")
    xmins = parsed_xml.find_all("xmin")
    xmaxs = parsed_xml.find_all("xmax")
    ymins = parsed_xml.find_all("ymin")
    ymaxs = parsed_xml.find_all("ymax")

    for i in range(len(names)) :
      name = names[i].text 

      if name not in category_list :
        # The id is the index the name is about to get in category_list, so
        # it is unique and matches the "category_id" written below.
        dic = {'id' : len(category_list) , "name" : name , "supercategory" : "none"}
        coco_annotation["categories"].append(dic)
        category_list.append(name)

      xmin = int(xmins[i].text)
      ymin = int(ymins[i].text)
      box_width = x_scale*(int(xmaxs[i].text)-xmin)
      box_height = y_scale*(int(ymaxs[i].text)-ymin)

      #Creating the annotation
      annotation = {}
      annotation["id"] = next_annotation_id
      next_annotation_id += 1
      annotation["image_id"] = image["id"]
      annotation["category_id"] = category_list.index(name)
      # COCO boxes are [x, y, width, height] in pixels of the resized image
      annotation["bbox"] = [x_scale*xmin , y_scale*ymin , box_width , box_height]
      annotation["area"] = box_width * box_height
      annotation["segmentation"] = []
      annotation["iscrowd"] = 0

      #Adding the annotation to JSON
      coco_annotation["annotations"].append(annotation)

  with open(os.path.join(dataset_location, "_annotations.coco.json"), "w") as json_file :
    json.dump(coco_annotation , json_file)

In [10]:
def add_to_dataset(paths, dataset = "/dataset", percentages = [90, 5, 5]):
  '''
    This function splits the annotated images of every source directory into
    train, val and test and adds them to the dataset directories.

    params : 
      paths : source directories; each one is expected to contain an "images"
              and a "pascal" sub-directory
      dataset : path to directory where the images and annotations file have to be stored
      percentages : split of the data in percent, in the order train, val,
                    test; when the values do not add up to 100 the split
                    90, 5, 5 is used instead
  '''
  for source_dir in paths:
    print("Started: " + source_dir)
    create_directories(dataset)
    image_root = os.path.join(source_dir, "images")
    xml_paths = list(get_xml_paths(os.path.join(source_dir, "pascal")))

    if sum(percentages) != 100:
      print("Split sumcheck failed going with 90, 5, 5 split")
      percentages = [90, 5, 5]

    # Cut points: the first share goes to train, the second to val and
    # everything after that ends up in test.
    total = len(xml_paths)
    train_end = int(total*percentages[0]/100)
    val_end = int(total*(percentages[0] + percentages[1])/100)

    splits = {
        "train" : xml_paths[:train_end],
        "val" : xml_paths[train_end:val_end],
        "test" : xml_paths[val_end:],
    }

    print(len(splits["train"]), len(splits["test"]), len(splits["val"]))
    print(type(splits["train"]), type(splits["test"]), type(splits["val"]))

    for split_name, split_xml_paths in splits.items():
      dataset_location = os.path.join(dataset, split_name)
      annotation_file_path = os.path.join(dataset_location, "_annotations.coco.json")
      # Only hand over the annotation file when an earlier run created it.
      if not os.path.isfile(annotation_file_path):
        annotation_file_path = None
      roboflow(image_root, split_xml_paths, dataset_location, coco_annotation_json_path = annotation_file_path)

    print("Completed: " + source_dir)

In [11]:
def generate_paths(root, names):
  '''
    This function joins root with every entry of names.

    params :
      root : common parent directory
      names : names to be appended to root

    returns :
      list with one joined path per entry of names, in the same order
  '''
  joined_paths = []
  for entry in names:
    joined_paths.append(os.path.join(root, entry))
  return joined_paths

In [None]:
# Convert the "fedora" and "bottle" folders below /dataset_open_images and add
# them to the dataset at add_to_dataset's default location ("/dataset") using
# its default 90 / 5 / 5 split.
# NOTE(review): the lower-case folder names are presumably what the download
# step creates for the "Fedora" and "Bottle" classes - verify on disk.
add_to_dataset(paths = generate_paths("/dataset_open_images", ["fedora", "bottle"]))


In [None]:
# Sanity check: print the number of image entries stored in the COCO
# annotation file of each split under /dataset.
for split in ["train", "val", "test"]:
    with open("/dataset/{}/_annotations.coco.json".format(split) , "r") as json_file :
      coco_annotation = json.load(json_file)
      # length of the "images" list of the loaded annotation file
      counter = len(coco_annotation.get("images")) 
    print(counter)