# Extract files corresponding to images

In [4]:
import os
import shutil
import xml.etree.ElementTree as ET
import glob

class XMLExtractor:
    def __init__(self, image_folders, xml_folder, output_folders):
        self.image_folders = image_folders
        self.xml_folder = xml_folder
        self.output_folders = output_folders

    def extract_xmls(self):
        for i, img_folder in enumerate(self.image_folders):
            output_folder = self.output_folders[i]
            self.check_folder_exists(output_folder)
            self.process_images(img_folder, output_folder)

    def process_images(self, img_folder, output_folder):
        img_filenames = set()
        for img_file in os.listdir(img_folder):
            img_filenames.add(os.path.splitext(img_file)[0])

        for xml_file in glob.glob(os.path.join(self.xml_folder, '*.xml')):
            root = ET.parse(xml_file).getroot()
            filename = root.find('filename').text
            if os.path.splitext(filename)[0] in img_filenames:
                output_xml_path = os.path.join(output_folder, os.path.basename(xml_file))
                shutil.copy(xml_file, output_xml_path)
                print(f"Copied {xml_file} to {output_xml_path}")

    def check_folder_exists(self, path):
        if not os.path.exists(path):
            try:
                os.makedirs(path)
                print(f'Created {path}')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

if __name__ == '__main__':
    image_folders = ['./raw_data_I_changed/raw_data/test/images',
                     './raw_data_I_changed/raw_data/train/images',
                     './raw_data_I_changed/raw_data/valid/images']
    xml_folder = './raw_data_I_changed/direction_xmls'
    output_folders = ['./raw_data_I_changed/raw_data/test/xmls',
                        './raw_data_I_changed/raw_data/train/xmls',
                        './raw_data_I_changed/raw_data/valid/xmls'
                      ]

    extractor = XMLExtractor(image_folders, xml_folder, output_folders)
    extractor.extract_xmls()

Copied ./raw_data_I_changed/direction_xmls\TE_C_00004.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_C_00004.xml
Copied ./raw_data_I_changed/direction_xmls\TE_C_00009.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_C_00009.xml
Copied ./raw_data_I_changed/direction_xmls\TE_C_00018.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_C_00018.xml
Copied ./raw_data_I_changed/direction_xmls\TE_C_00023.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_C_00023.xml
Copied ./raw_data_I_changed/direction_xmls\TE_D_00004.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_D_00004.xml
Copied ./raw_data_I_changed/direction_xmls\TE_D_00033.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_D_00033.xml
Copied ./raw_data_I_changed/direction_xmls\TE_D_00035.xml to ./raw_data_I_changed/raw_data/test/xmls\TE_D_00035.xml
Copied ./raw_data_I_changed/direction_xmls\TR_C_00017.xml to ./raw_data_I_changed/raw_data/test/xmls\TR_C_00017.xml
Copied ./raw_data_I_changed/direction_xmls\TR_C_00022.xml to ./raw_data_

# Split to train, test and valid. 
## Condition: similar images and xmls stays in train folder

In [7]:
import os
import shutil
import xml.etree.ElementTree as ET
import glob
import math
import random

class DataSplitter:
    def __init__(self, similar_img_folder, unique_img_folder, similar_xml_folder, unique_xml_folder, output_folder):
        self.similar_img_folder = similar_img_folder
        self.unique_img_folder = unique_img_folder
        self.similar_xml_folder = similar_xml_folder
        self.unique_xml_folder = unique_xml_folder
        self.output_folder = output_folder
        self.total_files = 0

    def split_data(self):
        self.check_folder_exists(self.output_folder)
        train_folder = os.path.join(self.output_folder, 'train')
        valid_folder = os.path.join(self.output_folder, 'valid')
        test_folder = os.path.join(self.output_folder, 'test')

        self.check_folder_exists(train_folder)
        self.check_folder_exists(valid_folder)
        self.check_folder_exists(test_folder)

        train_img_folder = os.path.join(train_folder, 'images')
        train_xml_folder = os.path.join(train_folder, 'xmls')
        valid_img_folder = os.path.join(valid_folder, 'images')
        valid_xml_folder = os.path.join(valid_folder, 'xmls')
        test_img_folder = os.path.join(test_folder, 'images')
        test_xml_folder = os.path.join(test_folder, 'xmls')

        self.check_folder_exists(train_img_folder)
        self.check_folder_exists(train_xml_folder)
        self.check_folder_exists(valid_img_folder)
        self.check_folder_exists(valid_xml_folder)
        self.check_folder_exists(test_img_folder)
        self.check_folder_exists(test_xml_folder)

        # Copy similar and xmls_similar data to train
        self.copy_similar_data(train_img_folder, train_xml_folder)

        # Split unique and unique_xmls data into valid and test
        self.split_unique_data(train_img_folder, train_xml_folder, valid_img_folder, valid_xml_folder, test_img_folder, test_xml_folder)

        print(f"Total files processed: {self.total_files}")

    def copy_similar_data(self, train_img_folder, train_xml_folder):
        self.copy_files(self.similar_img_folder, train_img_folder)
        self.copy_files(self.similar_xml_folder, train_xml_folder, prefix='xmls_')

    def split_unique_data(self, train_img_folder, train_xml_folder, valid_img_folder, valid_xml_folder, test_img_folder, test_xml_folder):
        total_files = 176 + 186  # Similar + Unique
        train_count = 254 - 176  # 254 - similar
        valid_count = 73
        test_count = 35

        unique_img_files = self.get_all_image_files(self.unique_img_folder)
        unique_xml_files = self.get_all_xml_files(self.unique_xml_folder)
        random.shuffle(unique_img_files)
        random.shuffle(unique_xml_files)

        for i, img_file in enumerate(unique_img_files):
            img_path = os.path.join(self.unique_img_folder, img_file)
            xml_file = os.path.splitext(os.path.basename(img_file))[0] + '.xml'
            xml_path = os.path.join(self.unique_xml_folder, xml_file)

            if i < train_count:
                self.copy_files(img_path, train_img_folder)
                self.copy_files(xml_path, train_xml_folder)
            elif i < train_count + valid_count:
                self.copy_files(img_path, valid_img_folder)
                self.copy_files(xml_path, valid_xml_folder)
            else:
                self.copy_files(img_path, test_img_folder)
                self.copy_files(xml_path, test_xml_folder)

    def get_all_image_files(self, folder):
        image_files = []
        for filename in os.listdir(folder):
            if any(filename.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.PNG', '.gif']):
                image_files.append(filename)
        return image_files

    def get_all_xml_files(self, folder):
        xml_files = []
        for filename in os.listdir(folder):
            if filename.endswith('.xml'):
                xml_files.append(filename)
        return xml_files

    def copy_files(self, src, dst, prefix=''):
        if os.path.isfile(src):
            dst_path = os.path.join(dst, prefix + os.path.basename(src))
            shutil.copy2(src, dst_path)
            self.total_files += 1
            print(f"Copied {src} to {dst_path}")

    def check_folder_exists(self, path):
        if not os.path.exists(path):
            try:
                os.makedirs(path)
                print(f'Created {path}')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

if __name__ == '__main__':
    similar_img_folder = './data/similar'
    unique_img_folder = './data/unique'
    similar_xml_folder = './data/xmls_similar'
    unique_xml_folder = './data/xmls_unique'
    output_folder = './raw_data'

    splitter = DataSplitter(similar_img_folder, unique_img_folder, similar_xml_folder, unique_xml_folder, output_folder)
    splitter.split_data()

Created ./raw_data
Created ./raw_data\train
Created ./raw_data\valid
Created ./raw_data\test
Created ./raw_data\train\images
Created ./raw_data\train\xmls
Created ./raw_data\valid\images
Created ./raw_data\valid\xmls
Created ./raw_data\test\images
Created ./raw_data\test\xmls
Copied ./data/unique\VA_C_00008.PNG to ./raw_data\train\images\VA_C_00008.PNG
Copied ./data/xmls_unique\VA_C_00008.xml to ./raw_data\train\xmls\VA_C_00008.xml
Copied ./data/unique\TE_C_00021.png to ./raw_data\train\images\TE_C_00021.png
Copied ./data/xmls_unique\TE_C_00021.xml to ./raw_data\train\xmls\TE_C_00021.xml
Copied ./data/unique\TR_C_00007.PNG to ./raw_data\train\images\TR_C_00007.PNG
Copied ./data/xmls_unique\TR_C_00007.xml to ./raw_data\train\xmls\TR_C_00007.xml
Copied ./data/unique\TR_C_00130.PNG to ./raw_data\train\images\TR_C_00130.PNG
Copied ./data/xmls_unique\TR_C_00130.xml to ./raw_data\train\xmls\TR_C_00130.xml
Copied ./data/unique\TR_D_00031.PNG to ./raw_data\train\images\TR_D_00031.PNG
Copied ./