# Convert Data from RDD2022 to YOLO Format
<h4>
En este notebook preparamos los datos que vamos a utilizar para entrenar nuestros modelos YOLO. Los entrenamientos se van a realizar en Google Colabs y posteriormente nos descargaremos los pesos para utilizarlos localmente en la detección de objetos en nuestras imágenes test. Es por ello que vamos a ignorar las imágenes test en el datasets que subimos a Google Drive.
</h4>

In [19]:
import os, sys
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [20]:
path = '../datasets/RDD2022/images'

In [21]:
# Remove contents inside ../datasets/RDD2022/
os.system('rm -r ../datasets/RDD2022/*')

# Copy ../data/RDD2022/ folder to ../datasets/RDD2022/
os.system('cp -r ../data/RDD2022/ ../datasets/RDD2022/')

rm: ../datasets/RDD2022/images/Czech/train: Permission denied
rm: ../datasets/RDD2022/images/Czech: Permission denied
rm: ../datasets/RDD2022/images: Permission denied


0

In [22]:
subdatasets = ['China_Drone', 'China_MotorBike', 'Czech', 'India', 'Japan', 'United_States']

class_name_to_id_dict = {
    'D00': 0,
    'D10': 1,
    'D20': 2,
    'D40': 3,
}
class_list = list(class_name_to_id_dict.keys())

problematic_files = []

print('Removing problematic files...')
# Remove any object in anotations that has a name not in class list
class_list = class_name_to_id_dict.keys()
for subdataset in tqdm(subdatasets):
    train_path = os.path.join(path, subdataset, 'train')
    test_path = os.path.join(path, subdataset, 'test')
    anotations_path = os.path.join(train_path, 'annotations')
    labels_path = os.path.join(train_path, 'labels')
    xmls_path = os.path.join(anotations_path, 'xmls')
    file_names = os.listdir(xmls_path)

    # Check if the {subdataset}/test folder exist and delete it
    if os.path.exists(test_path):
        os.system(f'rm -r {test_path}')

    # For each xml file, check every object and remove those that have names not in class list
    for file_name in file_names:
        try:
            with open(os.path.join(xmls_path, file_name), 'r') as xml:
                tree = ET.parse(xml)
                root = tree.getroot()
                for obj in root.findall('object'):
                    class_name = obj.find('name').text
                    if class_name not in class_list:
                        root.remove(obj)
                    # He decidido dejar aquellas imagenes de background que no tienen ningun objeto para que el modelo aprenda a distinguirlo. No obstante,
                    # es posible que sea necesario eliminar algunas ya que al solo quedarnos con cuatro etiquetas, es posible que haya muchas imagenes de background.
                    tree.write(os.path.join(xmls_path, file_name)) # Save the xml with the removed objects
        except Exception as e:
            problematic_files.append(file_name)
            continue

# For each subdataset open the train/anotations/xmls folder and add to train/labels/ a file_name.txt with the labels in YOLO format
# For example, for China_Drone_000000.xml we will create a file China_Drone_000000.txt with the anotations in YOLO format at train/labels/
# YOLO format: class x_center y_center width height with values normalized between 0 and 1
print('Creating anotations in YOLO format...')
for subdataset in tqdm(subdatasets):
    train_path = os.path.join(path, subdataset, 'train')
    anotations_path = os.path.join(train_path, 'annotations')
    labels_path = os.path.join(train_path, 'labels')
    xmls_path = os.path.join(anotations_path, 'xmls')
    file_names = os.listdir(xmls_path)
    
    for file_name in file_names:
        try:
            file_name = file_name.split('.')[0]
            # If labels folder doesn't exist, create it
            if not os.path.exists(labels_path):
                os.makedirs(labels_path)
            with open(os.path.join(labels_path, file_name + '.txt'), 'w') as file:
                with open(os.path.join(xmls_path, file_name + '.xml'), 'r') as xml:
                    tree = ET.parse(xml)
                    root = tree.getroot()
                    # Get the width and height of the image to normalize the anotations
                    size = root.find('size')
                    img_width = int(size.find('width').text)
                    img_height = int(size.find('height').text)

                    # Write the anotations in YOLO format
                    for obj in root.findall('object'):
                        # Class id:
                        class_name = obj.find('name').text
                        class_id = class_name_to_id_dict[class_name]
                        # Bounding box:
                        bbox = obj.find('bndbox')
                        x_center = (int(bbox.find('xmin').text) + int(bbox.find('xmax').text)) / 2
                        y_center = (int(bbox.find('ymin').text) + int(bbox.find('ymax').text)) / 2
                        width = int(bbox.find('xmax').text) - int(bbox.find('xmin').text) 
                        height = int(bbox.find('ymax').text) - int(bbox.find('ymin').text)
                        # Normalize the values
                        x_center /= img_width
                        y_center /= img_height
                        width /= img_width
                        height /= img_height
                        # Write the anotation
                        file.write(f'{class_id} {x_center} {y_center} {width} {height}\n')
                       
        except Exception as e:
            # Most of the errors are due to label not in ['D00', 'D10', 'D20', 'D40']
            # print(f'Error with {file_name}: {e}')
            problematic_files.append(file_name)
            continue

    # Remove the anotations folder
    if os.path.exists(anotations_path):
        os.system(f'rm -r {anotations_path}')

    # Create dataset_{subdataset}.yaml in yaml/ folder with YOLO dataset configuration
    if not os.path.exists('yaml'):
        os.makedirs('yaml')
    with open(f'yaml/dataset_{subdataset}.yaml', 'w') as file:
        file.write(f'path: ../tfg-informatica/datasets/RDD2022/images\n')
        file.write(f'train: {subdataset}/train/images\n')
        file.write(f'val: {subdataset}/train/images\n') # We don't have validation data. For now...
        # Check if there is a {subdataset}/test folder exist
        if os.path.exists(os.path.join(path, subdataset, 'test')):
            file.write(f'test: {subdataset}/test/images\n')
        # Class names
        file.write(f'\nnc: 4\n')
        file.write(f'names:\n  0: D00 - Longitudinal Crack\n  1: D10 - Transverse Crack\n  2: D20 - Alligator Crack\n  3: D40 - Potholes\n')

    # Create dataset_{subdataset}.yaml in yaml_gdrive/ folder with YOLO dataset configuration with the paths in MY Google Drive
    if not os.path.exists('yaml_gdrive'):
        os.makedirs('yaml_gdrive')
    with open(f'yaml_gdrive/dataset_{subdataset}.yaml', 'w') as file:
        file.write(f'path: gdrive/MyDrive/tfg-informatica/datasets/RDD2022/images\n')
        file.write(f'train: {subdataset}/train/images\n')
        file.write(f'val: {subdataset}/train/images\n')
        # Check if there is a {subdataset}/test folder exist
        if os.path.exists(os.path.join(path, subdataset, 'test')):
            file.write(f'test: {subdataset}/test/images\n')
        # Class names
        file.write(f'\nnc: 4\n')
        file.write(f'names:\n  0: D00 - Longitudinal Crack\n  1: D10 - Transverse Crack\n  2: D20 - Alligator Crack\n  3: D40 - Potholes\n')

print(f'Num. Problematic files: {len(problematic_files)}')

Removing problematic files...


100%|██████████| 6/6 [00:13<00:00,  2.24s/it]


Creating anotations in YOLO format...


100%|██████████| 6/6 [00:11<00:00,  1.86s/it]

Num. Problematic files: 0





In [23]:
# We create a dataset_All.yaml with all the subdatasets
# Create dataset_All.yaml in yaml/ folder with YOLO dataset configuration
if not os.path.exists('yaml'):
        os.makedirs('yaml')
with open(f'yaml/dataset_All.yaml', 'w') as file:
    file.write(f'path: ../tfg-informatica/datasets/RDD2022/images\n')
    file.write(f'train: \n')
    for subdataset in subdatasets:
        train_path = os.path.join(path, subdataset, 'train')
        if os.path.exists(train_path):
            file.write(f'- {subdataset}/train/images\n')
    file.write(f'val: \n')
    for subdataset in subdatasets:
        train_path = os.path.join(path, subdataset, 'train')
        if os.path.exists(train_path):
            file.write(f'- {subdataset}/train/images\n')
    # Class names
    file.write(f'\nnc: 4\n')
    file.write(f'names:\n  0: D00 - Longitudinal Crack\n  1: D10 - Transverse Crack\n  2: D20 - Alligator Crack\n  3: D40 - Potholes\n')

# Create dataset_All.yaml in yaml_gdrive/ folder with YOLO dataset configuration with the paths in MY Google Drive
if not os.path.exists('yaml_gdrive'):
    os.makedirs('yaml_gdrive')
with open(f'yaml_gdrive/dataset_All.yaml', 'w') as file:
    file.write(f'path: gdrive/MyDrive/tfg-informatica/datasets/RDD2022/images\n')
    file.write(f'train: \n')
    for subdataset in subdatasets:
        train_path = os.path.join(path, subdataset, 'train')
        if os.path.exists(train_path):
            file.write(f'- {subdataset}/train/images\n')
    file.write(f'val: \n')
    for subdataset in subdatasets:
        train_path = os.path.join(path, subdataset, 'train')
        if os.path.exists(train_path):
            file.write(f'- {subdataset}/train/images\n')
    # Class names
    file.write(f'\nnc: 4\n')
    file.write(f'names:\n  0: D00 - Longitudinal Crack\n  1: D10 - Transverse Crack\n  2: D20 - Alligator Crack\n  3: D40 - Potholes\n')