In [1]:
import numpy as np 
import pandas as pd
import os
from glob import glob
from tqdm.notebook import tqdm
tqdm.pandas()
import json
from sklearn.model_selection import train_test_split
import shutil
import yaml

In [2]:
BASE_DIR = '../datasets'
annotation_list = glob(f'{BASE_DIR}/ArTaxOr/*/annotations/*.json')

In [3]:
def read_json_data(jsonfile):
    rows = {"img_dir":[] ,"img_w":[] , "img_h" : [], "sp_type": [] ,"xc":[] , "yc":[], "bb_height" : [] , "bb_width": [] , "bb_left":[] , "bb_top":[]}
    
    # read file
    json_f = open(jsonfile)
    json_f = json.load(json_f)
 
    for region in json_f["regions"]:
        img_dir = json_f["asset"]["path"][7:]
        
        sp_type = region["tags"][0]
        
        img_w = json_f["asset"]["size"]["width"]
        img_h = json_f["asset"]["size"]["height"]
        
        # Normalize Bounding Box 
        bb_height = region["boundingBox"]["height"] / img_h
        bb_width = region["boundingBox"]["width"] / img_w
        
        bb_left = region["boundingBox"]["left"] / img_w
        bb_top = region["boundingBox"]["top"] / img_h
        
        xcenter = region['boundingBox']['left']/img_w+0.5*bb_width
        ycenetr = region['boundingBox']['top']/img_h+0.5*bb_height
        
        rows["img_dir"].append(BASE_DIR+img_dir)
        rows["sp_type"].append(sp_type)
        rows["img_w"].append(img_w)
        rows["img_h"].append(img_h)
        rows["bb_height"].append(bb_height)
        rows["bb_width"].append(bb_width)
        rows["bb_left"].append(bb_left)
        rows["bb_top"].append(bb_top)
        rows["xc"].append(xcenter)
        rows["yc"].append(ycenetr)
        
    return rows

In [4]:
df = pd.DataFrame(columns=["img_dir","img_w","img_h","sp_type","xc","yc","bb_height","bb_width","bb_left","bb_top"])
df = df.astype({"img_dir":object,"img_w":np.int64,"img_h":np.int64,"sp_type":object,"xc":float,"yc":float,"bb_height":float,"bb_width":float,"bb_left":float,"bb_top":float})

In [5]:
for i, json_file in enumerate(annotation_list):
    df = pd.concat([df, pd.DataFrame.from_dict(read_json_data(json_file))],axis=0,ignore_index=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19605 entries, 0 to 19604
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   img_dir    19605 non-null  object 
 1   img_w      19605 non-null  int64  
 2   img_h      19605 non-null  int64  
 3   sp_type    19605 non-null  object 
 4   xc         19605 non-null  float64
 5   yc         19605 non-null  float64
 6   bb_height  19605 non-null  float64
 7   bb_width   19605 non-null  float64
 8   bb_left    19605 non-null  float64
 9   bb_top     19605 non-null  float64
dtypes: float64(6), int64(2), object(2)
memory usage: 1.5+ MB


In [7]:
classes_name = ["Hymenoptera","Hemiptera","Lepidoptera","Coleoptera","Diptera","Araneae","Odonata"]
classes_num = [0,1,2,3,4,5,6]

df.loc[:,"sp_type"] = df.loc[:,"sp_type"].replace(classes_name, classes_num).infer_objects(copy=False)

  df.loc[:,"sp_type"] = df.loc[:,"sp_type"].replace(classes_name, classes_num).infer_objects(copy=False)


In [8]:
os.makedirs('../datasets/ArTaxOrYolo/test/images/', exist_ok=True)
os.makedirs('../datasets/ArTaxOrYolo/train/images/', exist_ok=True)
os.makedirs('../datasets/ArTaxOrYolo/test/labels/', exist_ok=True)
os.makedirs('../datasets/ArTaxOrYolo/train/labels/', exist_ok=True)

In [9]:
train, test = train_test_split(df, test_size=0.2)

In [10]:
def add_data_to_folder(file_type: str, data: pd.DataFrame) -> None:
    for index, row in tqdm(data.iterrows(), total=len(data)):
        shutil.copy(row.loc["img_dir"], f"../datasets/ArTaxOrYolo/{file_type}/images/")
        with open(f'../datasets/ArTaxOrYolo/{file_type}/labels/{row.loc["img_dir"].split("/")[-1][:-4]}.txt', "w") as f:
            f.write(f'{row["sp_type"]} {row["xc"]} {row["yc"]} {row["bb_width"]} {row["bb_height"]}\n')


In [11]:
add_data_to_folder("train" , train)
add_data_to_folder("test" , test)

  0%|          | 0/15684 [00:00<?, ?it/s]

  0%|          | 0/3921 [00:00<?, ?it/s]

In [12]:
yaml_dict = dict(
    train = '../../../datasets/ArTaxOrYolo/train/images',
    val = '../../../datasets/ArTaxOrYolo/test/images',
    nc    = len(classes_num),
    names = classes_name
)

with open('../datasets/ArTaxOrYolo/data.yaml', 'w') as outfile:
    yaml.dump(yaml_dict, outfile, default_flow_style=False)