In [16]:
# Imports
from xml.dom import minidom
import bs4 as bs
import os
from pathlib import Path
import glob
from tqdm import tqdm
import random
import shutil
import pandas as pd

In [2]:
# Input / output location parameters (all paths are relative)
PRM_INPUT_ROOT_PATH = 'datasets_raw'
PRM_OUTPUT_ROOT_PATH = 'datasets'
PRM_OUTPUT_BASE_PATH = 'labels/val'

# Output root and base path joined with a single forward slash
PRM_OUTPUT_MAIN_PATH = '/'.join([PRM_OUTPUT_ROOT_PATH, PRM_OUTPUT_BASE_PATH])

# Substrings that flag a file name as unwanted; filter_words and
# get_nasty_files compare them case-insensitively
PRM_WORDS_TO_IGNORE = ['Zone.Identifier']

In [26]:
def filter_words(list_of_words, words_to_ignore=None):
  """Return the entries of ``list_of_words`` that contain no ignore word.

  Matching is a case-insensitive substring test.

  Args:
    list_of_words: iterable of strings to filter (e.g. file names).
    words_to_ignore: optional iterable of substrings to reject. When
      omitted, the module-level ``PRM_WORDS_TO_IGNORE`` is used.

  Returns:
    A new list, in the original order, with every entry that contains at
    least one ignore word removed.
  """
  if words_to_ignore is None:
    words_to_ignore = PRM_WORDS_TO_IGNORE

  # Upper-case the ignore words once instead of once per candidate
  needles = [needle.upper() for needle in words_to_ignore]

  return [word for word in list_of_words
          if not any(needle in word.upper() for needle in needles)]

def get_nasty_files(list_of_words, words_to_ignore=None):
  """Return the entries of ``list_of_words`` that contain any ignore word.

  Matching is a case-insensitive substring test. An entry is selected when
  it contains AT LEAST ONE ignore word. The previous implementation narrowed
  the list once per ignore word, so with several ignore words an entry had
  to contain all of them, and with an empty ignore list every entry was
  returned.

  Args:
    list_of_words: iterable of strings to inspect (e.g. file names).
    words_to_ignore: optional iterable of substrings that mark an entry as
      unwanted. When omitted, the module-level ``PRM_WORDS_TO_IGNORE`` is
      used.

  Returns:
    A new list, in the original order, of the entries that matched.
  """
  if words_to_ignore is None:
    words_to_ignore = PRM_WORDS_TO_IGNORE

  # Upper-case the ignore words once instead of once per candidate
  needles = [needle.upper() for needle in words_to_ignore]

  return [word for word in list_of_words
          if any(needle in word.upper() for needle in needles)]

def delete_nasty_files(path):
  """Delete the entries of directory ``path`` that get_nasty_files selects.

  Does nothing when ``path`` does not exist. Returns None.
  """
  # nothing to clean when the location is missing
  if not os.path.exists(path):
    return

  # entries of `path` that get_nasty_files reports as unwanted
  unwanted = get_nasty_files(os.listdir(path))

  for name in unwanted:
    target = f'{path}/{name}'
    # remove only what is still present on disk
    if os.path.exists(target):
      os.remove(target)

In [4]:
def convertPascal2YOLOv8(filePath):
    """Convert one Pascal VOC XML annotation file into a YOLOv8 label file.

    The image width/height are read from the first ``<size>`` element. Every
    ``<object>`` element is mapped to an integer class id, and its box is
    converted from corner form (xmin, ymin, xmax, ymax) to center form
    (cx, cy, w, h), normalized by the image size and rounded to 4 decimals.

    Only class ids 0-3 are written; all other objects are dropped. One line
    ``"<class> <cx> <cy> <w> <h>"`` is written per kept object, so a file
    with no kept object produces an empty label file.

    The label file is written to ``<parents[2] of filePath>/labels/`` with
    the XML file's name and a ``.txt`` suffix. The directory is created when
    missing.

    Args:
        filePath: path of the XML annotation file. It must have at least
            three parent directories.

    Returns:
        None.
    """

    class_mapping = {
        "D00": 0,
        "D10": 1,
        "D20": 2,
        "D40": 3,
        "D01": 4,
        "D11": 5,
        "D43": 6,
        "D44": 7,
        "D50": 8
    }
    # Id given to class names missing from class_mapping; it is above the
    # cut-off below, so such objects are never written.
    unknown_class_id = 10
    # Ids below this value are kept:
    # 0: D00 > Longitudinal Crack
    # 1: D10 > Transverse Crack
    # 2: D20 > Alligator Crack
    # 3: D40 > Potholes
    kept_class_limit = 4

    # Read the annotation; the context manager closes the handle, which the
    # previous bare open() never did.
    with open(filePath, "r") as xml_file:
        contents = xml_file.read()

    # Parse the XML and fetch the image dimensions
    soup = bs.BeautifulSoup(contents, 'xml')
    image_size = soup.find_all("size")[0]
    image_width = int(image_size.find_all("width")[0].get_text())
    image_height = int(image_size.find_all("height")[0].get_text())

    annotation_lines = []
    # `obj` rather than `object`, so the builtin is not shadowed
    for obj in soup.find_all("object"):

        # Map the class name to its integer id
        class_name = obj.find_all("name")[0].get_text()
        class_id = class_mapping.get(class_name, unknown_class_id)

        # Drop unused classes before touching their box
        if class_id >= kept_class_limit:
            continue

        # Corner coordinates in pixels
        xmin = float(obj.find_all("xmin")[0].get_text())
        ymin = float(obj.find_all("ymin")[0].get_text())
        xmax = float(obj.find_all("xmax")[0].get_text())
        ymax = float(obj.find_all("ymax")[0].get_text())

        # Center form in pixels
        w = xmax - xmin
        h = ymax - ymin
        cx = xmin + (w / 2)
        cy = ymin + (h / 2)

        # Normalize by the image size
        w = round(w / image_width, 4)
        h = round(h / image_height, 4)
        cx = round(cx / image_width, 4)
        cy = round(cy / image_height, 4)

        # YOLOv8 line: class x_center y_center width height
        annotation_lines.append(f"{class_id} {cx} {cy} {w} {h}\n")

    # Swap only the final suffix for ".txt"; str.replace would also rewrite
    # a ".xml" occurring in the middle of the name.
    outputFilename = Path(filePath).with_suffix(".txt").name

    # <three levels above the XML file>/labels
    outputDir = Path(filePath).parents[2] / "labels"
    # exist_ok avoids the check-then-create race
    os.makedirs(outputDir, exist_ok=True)

    outputPath = outputDir / outputFilename

    # Write to .txt file (left empty when no object was kept)
    with open(outputPath, 'w') as f:
        f.writelines(annotation_lines)

Remove nasty files

In [27]:
# Root folder of the raw dataset (no trailing slash; paths below add "/")
ROOTDIR = "datasets_raw"

# Per-country dataset folders, relative to ROOTDIR
country_dir = [
    "RDD2022_all_countries/Japan/Japan",
    "RDD2022_all_countries/India/India",
    "RDD2022_all_countries/China_Drone/China_Drone",
    "RDD2022_all_countries/China_MotorBike/China_MotorBike",
    "RDD2022_all_countries/Czech/Czech",
    "RDD2022_all_countries/Norway/Norway",
    "RDD2022_all_countries/United_States/United_States",
]

# Sub-folders of every country folder that are cleaned
sufix_dir = ['train/images', 'train/annotations/xmls', 'test/images']

# Every country/sub-folder combination, country-major order
paths_to_clean = [
    f'{ROOTDIR}/{country}/{sufix}'
    for country in country_dir
    for sufix in sufix_dir
]

# delete_nasty_files ignores folders that do not exist
for path_to_delete in paths_to_clean:
    delete_nasty_files(path_to_delete)

Count files

In [29]:
def _count_entries(directory):
    """Number of entries in ``directory``, or 0 when it does not exist."""
    return len(os.listdir(directory)) if os.path.exists(directory) else 0


# One row per country: entry counts of its train/test image folders and of
# its XML annotation folder
records = pd.DataFrame([
    {
        'Country': country.split('/')[1],
        'Train_size': _count_entries(f'{ROOTDIR}/{country}/train/images'),
        'Test_size': _count_entries(f'{ROOTDIR}/{country}/test/images'),
        'Label_size': _count_entries(f'{ROOTDIR}/{country}/train/annotations/xmls'),
    }
    for country in country_dir
])

# Total counts images only (train + test); labels are not included
records['Total_size'] = records['Train_size'] + records['Test_size']
records

Unnamed: 0,Country,Train_size,Test_size,Label_size,Total_size
0,Japan,10506,2627,10506,13133
1,India,7706,1959,7706,9665
2,China_Drone,2401,0,2401,2401
3,China_MotorBike,1977,500,1977,2477
4,Czech,2829,709,2829,3538
5,Norway,8161,2040,8161,10201
6,United_States,4805,1200,4805,6005


In [34]:
# Column totals over all countries; the text column is dropped first
records.drop(labels='Country', axis=1).sum(axis='index')

Train_size    38385
Test_size      9035
Label_size    38385
Total_size    47420
dtype: int64

In [32]:
# Root folder of the raw dataset; the trailing slash matters because the
# folders below are appended by plain string concatenation
ROOTDIR = "datasets_raw/"

# XML annotation folder of every country, relative to ROOTDIR
CountryListDir = [
    "RDD2022_all_countries/Japan/Japan/train/annotations/xmls",
    "RDD2022_all_countries/India/India/train/annotations/xmls",
    "RDD2022_all_countries/China_Drone/China_Drone/train/annotations/xmls",
    "RDD2022_all_countries/China_MotorBike/China_MotorBike/train/annotations/xmls",
    "RDD2022_all_countries/Czech/Czech/train/annotations/xmls",
    "RDD2022_all_countries/Norway/Norway/train/annotations/xmls",
    "RDD2022_all_countries/United_States/United_States/train/annotations/xmls",
]

for xml_dir in (ROOTDIR + relative_dir for relative_dir in CountryListDir):

    # Clean the folder before listing it
    delete_nasty_files(xml_dir)

    # All XML annotations of this country, in sorted order
    xml_files = glob.glob(xml_dir + "/*.xml")
    xml_files.sort()

    # Convert each annotation, one progress bar per country
    for xml_path in tqdm(xml_files):
        convertPascal2YOLOv8(xml_path)

100%|██████████| 10506/10506 [00:09<00:00, 1066.24it/s]
100%|██████████| 7706/7706 [00:04<00:00, 1648.84it/s]
100%|██████████| 2401/2401 [00:01<00:00, 1286.58it/s]
100%|██████████| 1977/1977 [00:01<00:00, 988.58it/s] 
100%|██████████| 2829/2829 [00:01<00:00, 2247.12it/s]
100%|██████████| 8161/8161 [00:06<00:00, 1202.66it/s]
100%|██████████| 4805/4805 [00:04<00:00, 1070.24it/s]


In [35]:
def CopyDatasetSplit(baseDir, baseOutputDir="datasets/init_base", split_ratio=0.9,
                     backgroundImages_Percentage=1, seed=1337):
    """Split one country's labelled images into train/val sets and copy them.

    Images are read from ``<baseDir>/images`` and label files from
    ``<baseDir>/labels``. Each label file is paired with the image that has
    the same file stem. The previous implementation paired the two sorted
    listings by position, which mispaired files (or raised IndexError) as
    soon as an image had no label or a label had no image. Labels without a
    matching image are skipped and reported; images without a label are
    ignored.

    The kept pairs are shuffled with the seeded module-level ``random``
    generator and split into a training part and a validation part. Files
    are copied with ``shutil.copy2`` into::

        <baseOutputDir><country>/images/{train,val}/
        <baseOutputDir><country>/labels/{train,val}/

    ``baseOutputDir`` is used as a plain string prefix: the country name is
    appended directly, with no path separator.

    Args:
        baseDir: folder that contains the ``images`` and ``labels``
            sub-folders. A trailing slash is optional. The country name is
            the name of its parent folder.
        baseOutputDir: string prefix of the output location.
        split_ratio: fraction of the kept pairs that goes to training.
        backgroundImages_Percentage: cap on pairs whose label file is empty,
            as a fraction of the number of files found in the image folder.
        seed: value passed to ``random.seed`` before shuffling.

    Returns:
        None.
    """
    # Seed the global generator so the shuffle, and therefore the split, is
    # reproducible from run to run
    random.seed(seed)

    # Country name = name of the folder that contains baseDir
    countryName = os.path.basename(Path(baseDir).parent)

    # os.path.join works whether or not baseDir ends with a slash
    baseImageDir = os.path.join(baseDir, "images")
    baseAnnotDir = os.path.join(baseDir, "labels")

    image_list_all = sorted(glob.glob(os.path.join(baseImageDir, "*")))
    annot_list_all = sorted(glob.glob(os.path.join(baseAnnotDir, "*")))

    # Index the images by stem so every label finds its own image
    image_by_stem = {Path(image_path).stem: image_path for image_path in image_list_all}

    # Upper bound on pairs whose label file is empty (background images)
    max_background_image = int(len(image_list_all) * backgroundImages_Percentage)
    background_count = 0
    unmatched_labels = 0

    image_list = []
    annot_list = []

    for annot_path in annot_list_all:

        image_path = image_by_stem.get(Path(annot_path).stem)
        if image_path is None:
            # A label without its image cannot be used
            unmatched_labels += 1
            continue

        with open(annot_path) as f:
            has_objects = bool(f.read())

        if not has_objects:
            # Empty label = background image; keep it only while under the cap
            if background_count >= max_background_image:
                continue
            background_count += 1

        image_list.append(image_path)
        annot_list.append(annot_path)

    if unmatched_labels:
        print("Skipped", unmatched_labels, "label files without a matching image")

    # Shuffle the indices of the kept pairs and cut them at the split point
    dataset_length = len(image_list)
    middle_point = round(split_ratio * dataset_length)

    numberList = list(range(dataset_length))
    random.shuffle(numberList)
    trainNumberList = numberList[:middle_point]
    validNumberList = numberList[middle_point:]
    print("Training/Validation Samples :", len(trainNumberList), len(validNumberList))

    # (message wording, output sub-folder, indices) for each part of the split
    splits = (
        ("training", "train", trainNumberList),
        ("validation", "val", validNumberList),
    )

    for description, split_name, indices in splits:
        print("Copying " + description + " images and labels for", countryName)

        outputImagesDir = baseOutputDir + countryName + "/images/" + split_name + "/"
        outputAnnotDir = baseOutputDir + countryName + "/labels/" + split_name + "/"

        # Create the target folders once per split instead of once per file,
        # and only when there is something to copy
        if indices:
            os.makedirs(outputImagesDir, exist_ok=True)
            os.makedirs(outputAnnotDir, exist_ok=True)

        for i in tqdm(indices):
            shutil.copy2(image_list[i], outputImagesDir)
            shutil.copy2(annot_list[i], outputAnnotDir)

# baseDir = "../dataset/RDD2022/RDD2022_all_countries/Japan/train/"
# CopyDatasetSplit(baseDir)

In [36]:
# Root folder of the raw dataset; the trailing slash matters because the
# folders below are appended by plain string concatenation
ROOTDIR = "datasets_raw/"

# Training folder of every country, relative to ROOTDIR
CountryListDir = [
    "RDD2022_all_countries/Japan/Japan/train/",
    "RDD2022_all_countries/India/India/train/",
    "RDD2022_all_countries/China_Drone/China_Drone/train/",
    "RDD2022_all_countries/Czech/Czech/train/",
    "RDD2022_all_countries/Norway/Norway/train/",
    "RDD2022_all_countries/United_States/United_States/train/",
    "RDD2022_all_countries/China_MotorBike/China_MotorBike/train/",
]

# Split and copy each country in the listed order
for train_dir in (ROOTDIR + relative_dir for relative_dir in CountryListDir):
    CopyDatasetSplit(train_dir)

Training/Validation Samples : 9455 1051
Copying training images and labels for Japan


100%|██████████| 9455/9455 [00:06<00:00, 1457.90it/s]


Copying validation images and labels for Japan


100%|██████████| 1051/1051 [00:00<00:00, 1183.82it/s]


Training/Validation Samples : 6935 771
Copying training images and labels for India


100%|██████████| 6935/6935 [00:04<00:00, 1469.18it/s]


Copying validation images and labels for India


100%|██████████| 771/771 [00:00<00:00, 1499.79it/s]


Training/Validation Samples : 2161 240
Copying training images and labels for China_Drone


100%|██████████| 2161/2161 [00:01<00:00, 1896.42it/s]


Copying validation images and labels for China_Drone


100%|██████████| 240/240 [00:00<00:00, 2026.98it/s]


Training/Validation Samples : 2546 283
Copying training images and labels for Czech


100%|██████████| 2546/2546 [00:01<00:00, 1708.72it/s]


Copying validation images and labels for Czech


100%|██████████| 283/283 [00:00<00:00, 1793.89it/s]


Training/Validation Samples : 7345 816
Copying training images and labels for Norway


100%|██████████| 7345/7345 [00:22<00:00, 321.28it/s]


Copying validation images and labels for Norway


100%|██████████| 816/816 [00:02<00:00, 398.39it/s]


Training/Validation Samples : 4324 481
Copying training images and labels for United_States


100%|██████████| 4324/4324 [00:01<00:00, 2308.29it/s]


Copying validation images and labels for United_States


100%|██████████| 481/481 [00:00<00:00, 2308.53it/s]


Training/Validation Samples : 1779 198
Copying training images and labels for China_MotorBike


100%|██████████| 1779/1779 [00:00<00:00, 2438.93it/s]


Copying validation images and labels for China_MotorBike


100%|██████████| 198/198 [00:00<00:00, 2129.93it/s]


Report

In [41]:
countries = [
    'Japan',
    'India',
    'Czech',
    'Norway',
    'United_States',
    'China_MotorBike',
    'China_Drone',
]

dataset_size = []

# One row per country: number of entries in its train and val image folders
# (0 when a folder does not exist)
rows = []
for country in countries:
    split_dirs = {
        'Train_size': f'datasets/init_base{country}/images/train',
        'Val_size': f'datasets/init_base{country}/images/val',
    }

    row = {'Country': country}
    for column, directory in split_dirs.items():
        row[column] = len(os.listdir(directory)) if os.path.exists(directory) else 0

    rows.append(row)

# Build the frame once and add the per-country total
records = pd.DataFrame(rows).assign(
    Total_size=lambda frame: frame['Train_size'] + frame['Val_size']
)
records

Unnamed: 0,Country,Train_size,Val_size,Total_size
0,Japan,9455,1051,10506
1,India,6935,771,7706
2,Czech,2546,283,2829
3,Norway,7345,816,8161
4,United_States,4324,481,4805
5,China_MotorBike,1779,198,1977
6,China_Drone,2161,240,2401


In [42]:
# Column totals over all countries; the text column is dropped first
records.drop('Country', axis='columns').sum()

Train_size    34545
Val_size       3840
Total_size    38385
dtype: int64