### **Preparing the [QMUL-OpenLogo](https://hangsu0730.github.io/qmul-openlogo/) Dataset for yolov5**

This notebook does the following:
- Download the QMUL-OpenLogo dataset from Google Drive
- Create `datasets/openlogo` (relative to the working directory) with
    - yolo labels (.txt) in `/labels`
    - images (.jpg) in `/images`
    - classes and class splits (.txt) in `/details`
    - train/val/test split (.txt)
- Delete leftover files from download

In [3]:
import xml.etree.ElementTree as et
import os
import shutil
import math
import tarfile
import gdown

In [4]:
# Fetch the QMUL-OpenLogo archive from Google Drive into the working directory.
# The call is the cell's last expression, so its return value is displayed.
archive_path = 'openlogo.tar'
gdown.download(id='1p1BWofDJOKXqCtO0JPT5VyuIPOsuxOuj', output=archive_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1p1BWofDJOKXqCtO0JPT5VyuIPOsuxOuj
To: c:\Users\Hang\source\repos\SmartSee\model\openlogo.tar
100%|██████████| 4.71G/4.71G [03:12<00:00, 24.5MB/s]


'openlogo.tar'

In [5]:
# Extract openlogo.tar
# Open the archive in a context manager so its file handle is closed
# deterministically before the archive is deleted (removing a file that is
# still open fails on Windows).
with tarfile.open('openlogo.tar') as archive:
    archive.extractall()

# The archive is no longer needed once its contents are on disk
os.remove('openlogo.tar')

In [6]:
# Create directories
# os.makedirs creates any missing parent directories and, with exist_ok=True,
# does nothing for directories that already exist, so this cell can be re-run
# without raising FileExistsError.
for subdirectory in ('images', 'labels', 'details'):
    os.makedirs('datasets/openlogo/' + subdirectory, exist_ok=True)

In [8]:
# Create class name to id map
# Produces two notebook-level names:
#   class_names        - alphabetically sorted list of every distinct class name
#   class_name_id_map  - dict mapping each class name to its index in that list

# Collect distinct names in a set so the "already seen?" check is O(1)
unique_class_names = set()

# Iterate through each XML
for filename in os.listdir('openlogo/annotations'):
    xml_file = os.path.join('openlogo/annotations', filename)
    root = et.parse(xml_file).getroot()

    # Every <object> directly under the root contributes its <name> child(ren)
    for object_elem in root.findall('object'):
        for name_elem in object_elem.findall('name'):
            unique_class_names.add(name_elem.text)

# Assign ids to classes: the id is the position in the sorted name list
class_names = sorted(unique_class_names)
class_name_id_map = {name: class_id for class_id, name in enumerate(class_names)}

# Save list and map in .txt (each written as its Python repr).
# `with` guarantees the files are flushed and closed.
with open('datasets/openlogo/details/class-list.txt', 'w') as class_list_file:
    print(class_names, file=class_list_file)
with open('datasets/openlogo/details/class-id-map.txt', 'w') as class_map_file:
    print(class_name_id_map, file=class_map_file)

In [9]:
# Generate yolo labels from each XML
# For every annotation XML this writes one .txt file to
# datasets/openlogo/labels containing one line per bounding box:
#   <class_id> <x_center> <y_center> <width> <height>
# with the four box values divided by the image width/height.
# Requires `class_name_id_map` (class name -> integer id) to be defined already.

# Iterate through each XML
for filename in os.listdir('openlogo/annotations'):
    xml_file = os.path.join('openlogo/annotations', filename)
    root = et.parse(xml_file).getroot()

    # Initialize dict to hold annotation data
    voc_annotation = {'bnboxes': []}

    # Parse XML Tree (only direct children of the root are inspected)
    for elem in root:

        # Get file name
        if elem.tag == 'filename':
            voc_annotation['filename'] = elem.text

        # Get image size: every child of <size> is stored as an int under its tag
        elif elem.tag == 'size':
            voc_annotation['size'] = {dim.tag: int(dim.text) for dim in elem}

        # Get bounding box(es)
        elif elem.tag == 'object':
            bnbox = {}
            for field in elem:
                if field.tag == 'name':
                    bnbox['class'] = field.text

                # Every child of <bndbox> is stored as an int under its tag;
                # xmin/xmax/ymin/ymax are the ones read below
                elif field.tag == 'bndbox':
                    for coord in field:
                        bnbox[coord.tag] = int(coord.text)
            voc_annotation['bnboxes'].append(bnbox)

    # Initialize list to hold yolo label
    yolo_label = []

    # Iterate through each annotation box
    for bnbox in voc_annotation['bnboxes']:

        # Normalize Pascal VOC info for yolo label
        image_width = voc_annotation['size']['width']
        image_height = voc_annotation['size']['height']

        class_id = class_name_id_map[bnbox['class']]

        # Corner coordinates -> box center and extent, as fractions of the image
        bnbox_x_center = ((bnbox['xmin'] + bnbox['xmax']) / 2) / image_width
        bnbox_y_center = ((bnbox['ymin'] + bnbox['ymax']) / 2) / image_height
        bnbox_width = (bnbox['xmax'] - bnbox['xmin']) / image_width
        bnbox_height = (bnbox['ymax'] - bnbox['ymin']) / image_height

        # Save normalized info to list
        yolo_label.append('{} {:.6f} {:.6f} {:.6f} {:.6f}'.format(
            class_id, bnbox_x_center, bnbox_y_center, bnbox_width, bnbox_height))

    # Save formatted annotation to .txt, named after the <filename> recorded in
    # the XML. `with` closes each label file as soon as it is written instead
    # of leaving one open handle per annotation to the garbage collector.
    save_filename = os.path.splitext(voc_annotation['filename'])[0] + '.txt'
    save_file = os.path.join('datasets/openlogo/labels', save_filename)
    with open(save_file, 'w') as label_file:
        print('\n'.join(yolo_label), file=label_file)

In [10]:
# Relocate every entry of the extracted JPEGImages folder into the dataset's
# images folder
source_dir = 'openlogo/JPEGImages'
target_dir = 'datasets/openlogo/images'
for image_name in os.listdir(source_dir):
    shutil.move(os.path.join(source_dir, image_name), target_dir)

In [11]:
# Split dataset into train/val/test
# Reads the split files in openlogo/ImageSets/class_sep and writes
# train.txt / val.txt / test.txt into datasets/openlogo, one image path per line.

# Remove excess class files
# Their names would also pass the "train.txt"/"test.txt" check below, so they
# would be read as additional split files if left in place. The existence
# check lets this cell be re-run after they have already been deleted.
excess_files = ['all_train.txt', 'all_test.txt', 'train.txt', 'test.txt']
for filename in excess_files:
    excess_file = os.path.join('openlogo/ImageSets/class_sep', filename)
    if os.path.exists(excess_file):
        os.remove(excess_file)

# Initialize lists to hold dataset split
train_ids = []
val_ids = []
test_ids = []

# Iterate through every class file
for filename in os.listdir('openlogo/ImageSets/class_sep'):

    # Get IDs from file, one per line. Blank lines are skipped rather than
    # blindly dropping the last entry, so a file without a trailing newline
    # does not lose its final id.
    split_file = os.path.join('openlogo/ImageSets/class_sep', filename)
    with open(split_file, 'r') as id_file:
        split_ids = [line.strip() for line in id_file if line.strip()]

    # Entries point at the images folder (where the .jpg files were moved) and
    # use forward slashes so the lists are identical on every OS.
    # NOTE(review): YOLOv5 is expected to derive each label path from the
    # image path by swapping "images" for "labels" - confirm against the
    # YOLOv5 version in use.
    image_paths = ['./images/' + image_id + '.jpg' for image_id in split_ids]

    # Get split type (the part of the file name after the last underscore)
    split_type = filename.split("_")[-1]

    # Every id of a test file goes to the test split
    if split_type == 'test.txt':
        test_ids.extend(image_paths)

    # The first 1/7 (rounded up) of each train file goes to the val split,
    # the rest to the train split
    elif split_type == 'train.txt':
        val_count = math.ceil(len(image_paths) / 7)
        val_ids.extend(image_paths[:val_count])
        train_ids.extend(image_paths[val_count:])

# Create .txts; `with` guarantees each file is flushed and closed
for split_name, split_paths in (('train', train_ids), ('val', val_ids), ('test', test_ids)):
    with open('datasets/openlogo/' + split_name + '.txt', 'w') as split_list_file:
        print('\n'.join(split_paths), file=split_list_file)

In [12]:
# DESTRUCTIVE: Cleanup
# Relocate the class split folder into the dataset's details folder, then
# give it its hyphenated name
class_sep_source = 'openlogo/ImageSets/class_sep'
details_dir = 'datasets/openlogo/details'
moved_dir = 'datasets/openlogo/details/class_sep'
renamed_dir = 'datasets/openlogo/details/class-sep'

shutil.move(class_sep_source, details_dir)
os.rename(moved_dir, renamed_dir)

# Delete whatever is left of the extracted openlogo folder
shutil.rmtree('openlogo')

# To wipe the generated dataset and start over, uncomment:
# shutil.rmtree('datasets')