In [None]:
import numpy as np 
import pandas as pd
import os
from glob import glob
from tqdm.notebook import tqdm
tqdm.pandas()
import json
from sklearn.model_selection import train_test_split
import shutil
import yaml

In [None]:
# Root folder of the Kaggle input dataset; annotation files are searched beneath it.
BASE_DIR = '/kaggle/input/arthropod-taxonomy-orders-object-detection-dataset'

# glob() returns matches in arbitrary, filesystem-dependent order. Sorting keeps the
# order in which the annotation files are processed stable from one run to the next.
annotation_list = sorted(glob(f'{BASE_DIR}/ArTaxOr/*/annotations/*.json'))

In [None]:
# Arthropod orders to keep. read_json_data() stores a region's class as the position
# of its tag in this list, so the order here defines the integer class ids.
classes_name = ["Hymenoptera", "Diptera", "Lepidoptera"]
# One integer id per entry of classes_name, counting up from 0.
classes_num = list(range(len(classes_name)))

In [None]:
def read_json_data(jsonfile, classes=None, base_dir=None):
    """Collect the bounding boxes of one annotation file as column-oriented rows.

    Only regions whose first tag is one of ``classes`` are kept. Box coordinates
    are divided by the image width/height read from the same file.

    Parameters
    ----------
    jsonfile : str
        Path of the JSON annotation file. It must contain a ``"regions"`` list;
        ``"asset"`` is read as soon as there is at least one region.
    classes : list of str, optional
        Class names to keep; a kept region's ``sp_type`` is the index of its tag
        in this list. Defaults to the module-level ``classes_name``.
    base_dir : str, optional
        Prefix prepended to the image path taken from the annotation. Defaults
        to the module-level ``BASE_DIR``.

    Returns
    -------
    dict of str -> list
        One list per column (``img_dir``, ``img_w``, ``img_h``, ``sp_type``,
        ``xc``, ``yc``, ``bb_height``, ``bb_width``, ``bb_left``, ``bb_top``),
        all of equal length: one entry per kept region. Every list is empty when
        no region matches.
    """
    if classes is None:
        classes = classes_name
    if base_dir is None:
        base_dir = BASE_DIR

    columns = ("img_dir", "img_w", "img_h", "sp_type", "xc", "yc",
               "bb_height", "bb_width", "bb_left", "bb_top")
    rows = {column: [] for column in columns}

    # The context manager closes the file handle as soon as the JSON is parsed.
    with open(jsonfile) as json_f:
        annotation = json.load(json_f)

    regions = annotation["regions"]
    if not regions:
        return rows

    # The image path is the same for every region of the file, so build it once.
    # The first 7 characters of the stored path are dropped — presumably a fixed
    # "file:X:"-style prefix; TODO confirm against the dataset.
    img_path = base_dir + annotation["asset"]["path"][7:]

    for region in regions:
        # Only the first tag of a region is considered.
        sp_type = region["tags"][0]
        if sp_type not in classes:
            continue

        img_w = annotation["asset"]["size"]["width"]
        img_h = annotation["asset"]["size"]["height"]
        box = region["boundingBox"]

        # Normalise the box by the image dimensions.
        bb_height = box["height"] / img_h
        bb_width = box["width"] / img_w
        bb_left = box["left"] / img_w
        bb_top = box["top"] / img_h

        # Box centre = top-left corner + half the box extent.
        xcenter = bb_left + 0.5 * bb_width
        ycenter = bb_top + 0.5 * bb_height

        rows["img_dir"].append(img_path)
        rows["sp_type"].append(classes.index(sp_type))
        rows["img_w"].append(img_w)
        rows["img_h"].append(img_h)
        rows["bb_height"].append(bb_height)
        rows["bb_width"].append(bb_width)
        rows["bb_left"].append(bb_left)
        rows["bb_top"].append(bb_top)
        rows["xc"].append(xcenter)
        rows["yc"].append(ycenter)

    return rows

In [None]:
# Column name -> dtype of the annotation table. The frame starts empty; rows are
# appended to it in a later cell.
column_dtypes = {
    "img_dir": object,
    "img_w": np.int64,
    "img_h": np.int64,
    "sp_type": object,
    "xc": float,
    "yc": float,
    "bb_height": float,
    "bb_width": float,
    "bb_left": float,
    "bb_top": float,
}
df = pd.DataFrame(columns=list(column_dtypes)).astype(column_dtypes)

In [None]:
# Parse every annotation file first, then concatenate once. Calling pd.concat inside
# the loop copies all rows accumulated so far on every iteration, which is quadratic
# in the number of files.
annotation_frames = [
    pd.DataFrame.from_dict(read_json_data(json_file))
    for json_file in annotation_list
]
# The existing (typed, empty) df stays first in the list, as it was in the loop version.
df = pd.concat([df, *annotation_frames], axis=0, ignore_index=True)

In [None]:
# Print a summary of the assembled annotation table (DataFrame.info()).
df.info()

In [None]:
# Create the output tree: an images/ and a labels/ folder for each of the two splits.
# exist_ok=True makes the cell safe to re-run.
for subfolder in ("images", "labels"):
    for split in ("test", "train"):
        os.makedirs(f'../datasets/ArTaxOrYolo/{split}/{subfolder}/', exist_ok=True)

In [None]:
# df holds one row per bounding box, so a row-wise split can place boxes of the same
# image in both subsets. Split the unique image paths instead and select the rows of
# each subset afterwards; test_size is therefore a fraction of images, not of boxes.
# A fixed random_state makes the split reproducible across runs.
SPLIT_SEED = 42
train_images, test_images = train_test_split(
    df["img_dir"].unique(), test_size=0.2, random_state=SPLIT_SEED
)
train = df[df["img_dir"].isin(train_images)]
test = df[df["img_dir"].isin(test_images)]

In [None]:
def add_data_to_folder(file_type: str, data: pd.DataFrame,
                       output_root: str = "../datasets/ArTaxOrYolo") -> None:
    """Copy the images of ``data`` into a split folder and write their label files.

    Rows are grouped by image, so each image is copied once and its label file
    receives one line per bounding box. Opening the label file in ``"w"`` mode
    once per row would leave only the last box of a multi-box image.

    Parameters
    ----------
    file_type : str
        Name of the split sub-folder (the notebook uses ``"train"`` and ``"test"``).
        ``<output_root>/<file_type>/images/`` and ``.../labels/`` must already exist.
    data : pd.DataFrame
        One row per bounding box, with columns ``img_dir``, ``sp_type``, ``xc``,
        ``yc``, ``bb_width`` and ``bb_height``.
    output_root : str, optional
        Root folder of the output dataset. Defaults to the location used by the
        rest of the notebook.

    Returns
    -------
    None
        Files are written to disk as a side effect.
    """
    images_dir = f"{output_root}/{file_type}/images/"
    labels_dir = f"{output_root}/{file_type}/labels"

    # sort=False keeps the images in their order of first appearance in `data`.
    boxes_per_image = data.groupby("img_dir", sort=False)
    for img_path, boxes in tqdm(boxes_per_image, total=boxes_per_image.ngroups):
        shutil.copy(img_path, images_dir)
        # splitext() handles extensions of any length (e.g. ".jpeg"), unlike a
        # fixed [:-4] slice.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        with open(f"{labels_dir}/{stem}.txt", "w") as f:
            for box in boxes.itertuples(index=False):
                # int() keeps the class id an integer even if the column dtype is not.
                f.write(f"{int(box.sp_type)} {box.xc} {box.yc} {box.bb_width} {box.bb_height}\n")

In [None]:
# Write each split's images and labels into its folder, train first, then test.
for split_name, split_frame in (("train", train), ("test", test)):
    add_data_to_folder(split_name, split_frame)

In [None]:
# Dataset description written to data.yaml: image folders of the two splits, the
# number of classes and the class names.
# NOTE(review): the train/val paths are relative — presumably to the working directory
# of the training code that reads data.yaml; confirm before moving the file.
yaml_dict = {
    "train": '../../../datasets/ArTaxOrYolo/train/images',
    "val": '../../../datasets/ArTaxOrYolo/test/images',
    "nc": len(classes_num),
    "names": classes_name,
}

yaml_path = '../datasets/ArTaxOrYolo/data.yaml'
with open(yaml_path, 'w') as yaml_out:
    # Block style (default_flow_style=False) puts each key on its own line.
    yaml.dump(yaml_dict, yaml_out, default_flow_style=False)

In [None]:
import os
import zipfile

from IPython.display import FileLink, display


def zip_folder(folder: str, zip_path: str) -> None:
    """Write every file found under ``folder`` into a deflate-compressed zip.

    Parameters
    ----------
    folder : str
        Directory that is walked recursively.
    zip_path : str
        Path of the archive to create; an existing file is overwritten.

    Returns
    -------
    None
        Member names inside the archive are relative to ``folder``.
    """
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for root, _dirs, files in os.walk(folder):
            for file in files:
                file_path = os.path.join(root, file)
                zip_file.write(file_path, os.path.relpath(file_path, folder))


# Zip the train folder
train_folder = '../datasets/ArTaxOrYolo/train'
train_zip_file = 'train.zip'
zip_folder(train_folder, train_zip_file)

# Zip the val folder
val_folder = '../datasets/ArTaxOrYolo/test'  # the 'test' split is used as the validation set
val_zip_file = 'val.zip'
zip_folder(val_folder, val_zip_file)

# Display download links. A cell only renders its last bare expression, so both links
# are passed to display() explicitly; otherwise the train link would never be shown.
display(FileLink(train_zip_file), FileLink(val_zip_file))

In [None]:
# Zip the data.yaml file
yaml_zip_file = 'data.zip'
yaml_file_path = '../datasets/ArTaxOrYolo/data.yaml'
with zipfile.ZipFile(yaml_zip_file, 'w', zipfile.ZIP_DEFLATED) as zip_file:
    # Store the file under its bare name. Without arcname the member name is derived
    # from the path as given, i.e. it would keep the "../datasets/ArTaxOrYolo/" prefix.
    zip_file.write(yaml_file_path, arcname=os.path.basename(yaml_file_path))