# SGG YOLO CONVERTER

### This notebook converts an SGG (scene graph generation) annotation file in COCO format (PSG dataset) into YOLO-format labels for object detection training

In [24]:
import os
import shutil
import random
import h5py
import json
from collections import Counter
from tqdm import tqdm

# Load the PSG annotation file. The inspection cell below shows its top-level
# keys: 'data', 'thing_classes', 'stuff_classes', 'predicate_classes',
# 'test_image_ids'.
# Files are opened with a context manager so the handles are closed right
# after parsing instead of being left to the garbage collector.
with open('/home/maelic/Documents/PhD/MyModel/SGG-Benchmark/datasets/psg/psg_train_val.json', encoding='utf-8') as f:
    data = json.load(f)

# Directory the source images are copied from during the conversion.
img_path = '/home/maelic/Documents/Datasets/VG/VG_100K'
# Image metadata file (the path suggests Visual Genome -- confirm); it is only
# indexed for inspection in the following cell.
img_data = "/home/maelic/Documents/PhD/MyModel/SGG-Benchmark/datasets/vg/image_data.json"
with open(img_data, encoding='utf-8') as f:
    img_data = json.load(f)

In [29]:
# Quick look at the loaded structures: top-level keys, one annotation entry
# and one image-metadata entry.
print(data.keys())
print(data['data'][4])
print(img_data[50000])

# Show the first entry whose COCO image id is '548168', if there is one.
matching_entry = next(
    (entry for entry in data['data'] if entry['coco_image_id'] == '548168'),
    None,
)
if matching_entry is not None:
    print(matching_entry)

dict_keys(['data', 'thing_classes', 'stuff_classes', 'predicate_classes', 'test_image_ids'])
{'file_name': 'train2017/000000285388.jpg', 'height': 500, 'width': 327, 'image_id': '107914', 'pan_seg_file_name': 'panoptic_train2017/000000285388.png', 'segments_info': [{'id': 5463914, 'category_id': 22, 'iscrowd': 0, 'isthing': 1, 'attribute_ids': [], 'gqa_category_id': 813, 'area': 45716}, {'id': 14605006, 'category_id': 113, 'iscrowd': 0, 'isthing': 0, 'attribute_ids': [], 'gqa_category_id': 1645, 'area': 8121}, {'id': 4805982, 'category_id': 116, 'iscrowd': 0, 'isthing': 0, 'attribute_ids': [], 'gqa_category_id': 1578, 'area': 28523}, {'id': 3488836, 'category_id': 126, 'iscrowd': 0, 'isthing': 0, 'attribute_ids': [], 'gqa_category_id': 1582, 'area': 49620}], 'relations': [[0, 3, 3], [1, 0, 3], [2, 1, 3]], 'location': '', 'weather': '', 'annotations': [{'bbox': [21.0, 64.0, 304.0, 481.0], 'bbox_mode': 0, 'category_id': 22}, {'bbox': [0.0, 101.0, 327.0, 183.0], 'bbox_mode': 0, 'category_

In [30]:
img_prefix = "/home/maelic/Documents/Datasets/VG/VG_100K"
# Keep the sample file path in its own variable: `img_path` holds the image
# *directory* that the conversion cell joins with '<image_id>.jpg', so
# overwriting it here would break the image copy when the notebook is run
# top to bottom.
sample_img_path = img_prefix + '/2370584.jpg'

import cv2
# img size
img = cv2.imread(sample_img_path)
# cv2.imread signals an unreadable/missing file by returning None instead of
# raising, so fail with an explicit message rather than an AttributeError.
if img is None:
    raise FileNotFoundError('could not read image: ' + sample_img_path)
print(img.shape)

(334, 500, 3)


In [14]:
# Class vocabulary: "thing" classes first, then "stuff" classes. The position
# of a name in CLASSES is stored (as a string) in object_to_idx.
THING_CLASSES = data['thing_classes']
STUFF_CLASSES = data['stuff_classes']
CLASSES = THING_CLASSES + STUFF_CLASSES
object_to_idx = {}
for position, class_name in enumerate(CLASSES):
    object_to_idx[class_name] = str(position)

# Root folder for everything this notebook writes.
OUT_PATH = "/home/maelic/Documents/Datasets/VG/PSG/YOLO_anno/"
if not os.path.exists(OUT_PATH):
    os.makedirs(OUT_PATH)

# classes.txt: one class name per line, in index order.
with open(os.path.join(OUT_PATH, 'classes.txt'), 'w') as f:
    f.writelines(class_name + '\n' for class_name in object_to_idx)

In [21]:
dest_folder = os.path.join(OUT_PATH, 'annotations')
image_dest_folder = os.path.join(OUT_PATH, 'images_all')

import numpy as np

os.makedirs(dest_folder, exist_ok=True)
os.makedirs(image_dest_folder, exist_ok=True)

# `bbox_mode` codes, following the detectron2 BoxMode convention.
# The sample entry printed earlier in this notebook has 'bbox_mode': 0 with
# bbox [21.0, 64.0, 304.0, 481.0] in a 327x500 image: 64 + 481 exceeds the
# image height, so those values are corners (x1, y1, x2, y2), not width/height.
BBOX_MODE_XYXY_ABS = 0
BBOX_MODE_XYWH_ABS = 1


def _bbox_to_yolo(box, bbox_mode, img_width, img_height):
    """Convert an absolute-pixel box to normalized YOLO (xc, yc, w, h).

    Args:
        box: four numbers, interpreted according to ``bbox_mode``.
        bbox_mode: BBOX_MODE_XYXY_ABS (x1, y1, x2, y2) or
            BBOX_MODE_XYWH_ABS (x, y, width, height).
        img_width: image width in pixels.
        img_height: image height in pixels.

    Returns:
        Tuple ``(x_center, y_center, width, height)`` of floats in [0, 1].

    Raises:
        ValueError: if ``bbox_mode`` is not one of the two supported codes.
    """
    if bbox_mode == BBOX_MODE_XYXY_ABS:
        x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]
    elif bbox_mode == BBOX_MODE_XYWH_ABS:
        x_min, y_min = box[0], box[1]
        x_max, y_max = box[0] + box[2], box[1] + box[3]
    else:
        raise ValueError('unsupported bbox_mode: {!r}'.format(bbox_mode))

    # Clip the corners to the image first. Clipping the center and the size
    # independently (as was done before) can yield a box that no longer
    # matches the visible part of the object.
    x_min, x_max = np.clip([x_min, x_max], 0, img_width)
    y_min, y_max = np.clip([y_min, y_max], 0, img_height)

    return (
        float((x_min + x_max) / 2 / img_width),
        float((y_min + y_max) / 2 / img_height),
        float((x_max - x_min) / img_width),
        float((y_max - y_min) / img_height),
    )


num_classes = len(CLASSES)

for entry in tqdm(data['data']):
    img_width = entry['width']
    img_height = entry['height']
    image_id = str(entry['image_id'])

    label_lines = []
    for obj in entry['annotations']:
        obj_cat = obj['category_id']
        # Explicit check instead of `assert` (asserts vanish under `python -O`)
        # and bounded by the real class count instead of a hard-coded 133.
        if not 0 <= obj_cat < num_classes:
            raise ValueError(
                'category_id {!r} out of range for image {}'.format(obj_cat, image_id)
            )

        # Entries without a 'bbox_mode' key are treated as COCO-style XYWH,
        # which is how every box was interpreted before this fix.
        bbox_mode = obj.get('bbox_mode', BBOX_MODE_XYWH_ABS)
        x_center, y_center, w, h = _bbox_to_yolo(
            obj['bbox'], bbox_mode, img_width, img_height
        )
        label_lines.append(
            '{} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format(obj_cat, x_center, y_center, w, h)
        )

    # One write per image in 'w' mode: re-running the cell no longer appends
    # duplicate boxes, and an image without objects still gets an (empty)
    # label file so that images and label files stay one-to-one for the split.
    with open(os.path.join(dest_folder, image_id + '.txt'), 'w') as f:
        f.writelines(label_lines)

    # copy image to dest folder
    shutil.copyfile(
        os.path.join(img_path, image_id + '.jpg'),
        os.path.join(image_dest_folder, image_id + '.jpg'),
    )

100%|██████████| 46697/46697 [00:24<00:00, 1915.21it/s]


### With val split (70% train / 15% val / 15% test)

In [10]:
from sklearn.model_selection import train_test_split

base_path = OUT_PATH
dest_folder = os.path.join(base_path, 'annotations')
image_dest_folder = os.path.join(base_path, 'images_all')

os.makedirs(dest_folder, exist_ok=True)
os.makedirs(image_dest_folder, exist_ok=True)

images = sorted(
    os.path.join(image_dest_folder, x) for x in os.listdir(image_dest_folder)
)
annotations = sorted(
    os.path.join(dest_folder, x) for x in os.listdir(dest_folder) if x.endswith('.txt')
)

# The split below pairs images[i] with annotations[i] purely by position.
# Verify that the two sorted lists really describe the same samples, so a
# stray or missing file raises here instead of silently mispairing labels.
image_stems = [os.path.splitext(os.path.basename(p))[0] for p in images]
label_stems = [os.path.splitext(os.path.basename(p))[0] for p in annotations]
if image_stems != label_stems:
    unmatched = sorted(set(image_stems) ^ set(label_stems))
    raise ValueError(
        'images and label files do not match one-to-one '
        '({} images, {} labels); first unmatched names: {}'.format(
            len(images), len(annotations), unmatched[:10]
        )
    )

# 70% train, then the remaining 30% is halved into val and test (15% each).
train_images, val_images, train_annotations, val_annotations = train_test_split(
    images, annotations, test_size=0.3, random_state=1
)
val_images, test_images, val_annotations, test_annotations = train_test_split(
    val_images, val_annotations, test_size=0.5, random_state=1
)

# Create <split>/images and <split>/labels; exist_ok lets the cell be re-run
# without failing on folders created by a previous run.
for split_name in ('train', 'test', 'val'):
    for sub_folder in ('images', 'labels'):
        os.makedirs(os.path.join(base_path, split_name, sub_folder), exist_ok=True)

def move_files_to_folder(list_of_files, destination_folder):
    """Move every file in ``list_of_files`` into ``destination_folder``.

    Args:
        list_of_files: iterable of source file paths.
        destination_folder: directory (or target path) passed to ``shutil.move``.

    Raises:
        OSError: if a move fails (``shutil.Error`` is an ``OSError`` subclass).
            The offending path is printed first and the original exception is
            re-raised, so the real cause stays visible. The previous
            ``assert False`` hid it and is removed entirely under ``python -O``.
    """
    for f in list_of_files:
        try:
            shutil.move(f, destination_folder)
        except OSError:
            print(f)
            raise

# Dispatch every split to its folder: images first, then the label files.
for file_group, target_suffix in (
    (train_images, '/train/images/'),
    (val_images, '/val/images/'),
    (test_images, '/test/images/'),
    (train_annotations, '/train/labels/'),
    (val_annotations, '/val/labels/'),
    (test_annotations, '/test/labels/'),
):
    move_files_to_folder(file_group, base_path + target_suffix)

### Without a separate test split (80% train / 20% val)

In [22]:
from sklearn.model_selection import train_test_split

base_path = OUT_PATH
dest_folder = os.path.join(base_path, 'annotations')
image_dest_folder = os.path.join(base_path, 'images_all')

os.makedirs(dest_folder, exist_ok=True)
os.makedirs(image_dest_folder, exist_ok=True)

images = sorted(
    os.path.join(image_dest_folder, x) for x in os.listdir(image_dest_folder)
)
annotations = sorted(
    os.path.join(dest_folder, x) for x in os.listdir(dest_folder) if x.endswith('.txt')
)

# The split below pairs images[i] with annotations[i] purely by position.
# Verify that the two sorted lists really describe the same samples, so a
# stray or missing file raises here instead of silently mispairing labels.
image_stems = [os.path.splitext(os.path.basename(p))[0] for p in images]
label_stems = [os.path.splitext(os.path.basename(p))[0] for p in annotations]
if image_stems != label_stems:
    unmatched = sorted(set(image_stems) ^ set(label_stems))
    raise ValueError(
        'images and label files do not match one-to-one '
        '({} images, {} labels); first unmatched names: {}'.format(
            len(images), len(annotations), unmatched[:10]
        )
    )

# 80% train / 20% held out; the held-out part is stored as the "val" split.
train_images, test_images, train_annotations, test_annotations = train_test_split(
    images, annotations, test_size=0.2, random_state=1
)

# Create <split>/images and <split>/labels; exist_ok lets the cell be re-run
# without failing on folders created by a previous run.
for split_name in ('train', 'val'):
    for sub_folder in ('images', 'labels'):
        os.makedirs(os.path.join(base_path, split_name, sub_folder), exist_ok=True)

def move_files_to_folder(list_of_files, destination_folder):
    """Move every file in ``list_of_files`` into ``destination_folder``.

    Args:
        list_of_files: iterable of source file paths.
        destination_folder: directory (or target path) passed to ``shutil.move``.

    Raises:
        OSError: if a move fails (``shutil.Error`` is an ``OSError`` subclass).
            The offending path is printed first and the original exception is
            re-raised, so the real cause stays visible. The previous
            ``assert False`` hid it and is removed entirely under ``python -O``.
    """
    for f in list_of_files:
        try:
            shutil.move(f, destination_folder)
        except OSError:
            print(f)
            raise

# Dispatch both splits to their folders: images first, then the label files.
# The held-out ("test_*") lists are stored under val/.
for file_group, target_suffix in (
    (train_images, '/train/images/'),
    (test_images, '/val/images/'),
    (train_annotations, '/train/labels/'),
    (test_annotations, '/val/labels/'),
):
    move_files_to_folder(file_group, base_path + target_suffix)

In [11]:
# Write the dataset description file (train.yaml): split folders, number of
# classes and class names in index order.
classes = list(object_to_idx)

# os.path.join with a trailing '' yields '<base>/train/' whether or not
# base_path itself ends with a slash (plain concatenation required it to).
train_path = os.path.join(base_path, 'train', '')
val_path = os.path.join(base_path, 'val', '')
test_path = os.path.join(base_path, 'test', '')

n_classes = len(classes)

with open(os.path.join(base_path, 'train.yaml'), 'w') as f:
    f.write('train: ' + train_path + '\n')
    f.write('val: ' + val_path + '\n')
    # The two-way split variant of this notebook creates no test folder;
    # only reference it when it actually exists.
    if os.path.isdir(test_path):
        f.write('test: ' + test_path + '\n')
    f.write('nc: ' + str(n_classes) + '\n')
    # JSON is valid YAML flow syntax and, unlike a Python repr, quotes and
    # escapes every name in a way YAML parsers understand.
    f.write('names: ' + json.dumps(classes) + '\n')