# In [None]:

import os
import shutil
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

# Define paths (relative; resolved against the current working directory)
# Directory the '.jpg' images are copied from.
raw_data_path = 'data/raw_data'
# Directory scanned for '.xml' annotation files.
labeled_data_path = 'data/labeled_data'
# Directory that receives the train/test image folders and the annotation CSVs.
output_path = 'data/prepared_data'

# Read labeled data information (assuming XML format, adjust accordingly)
def _parse_coordinate(text):
    """Convert the text of a bounding-box coordinate element to an int.

    Integer strings are converted directly. Float-formatted strings
    (e.g. ``"30.7"``) are accepted as well and truncated toward zero.
    """
    try:
        return int(text)
    except ValueError:
        # Some annotation tools write coordinates as floats; int() rejects
        # those strings, so go through float() first.
        return int(float(text))


def read_xml_annotations(xml_path):
    """Parse one XML annotation file into a list of bounding-box records.

    Every ``<object>`` element found at any depth of the document is read.
    Its ``<name>`` text becomes the label and the ``<xmin>``, ``<ymin>``,
    ``<xmax>`` and ``<ymax>`` children of its ``<bndbox>`` become integer
    coordinates. (This layout looks like Pascal VOC-style XML -- adjust
    the tag names if your annotations differ.)

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A list with one dict per object, each holding the keys ``'label'``,
        ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``. The list is empty
        when the file contains no ``<object>`` elements.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        AttributeError: If an object lacks ``<name>``, ``<bndbox>`` or one of
            the coordinate elements.
        ValueError: If a coordinate is not numeric.
        TypeError: If a coordinate element is empty.
    """
    root = ET.parse(xml_path).getroot()

    annotations = []

    for obj in root.findall('.//object'):
        bbox = obj.find('bndbox')
        record = {'label': obj.find('name').text}
        for key in ('xmin', 'ymin', 'xmax', 'ymax'):
            record[key] = _parse_coordinate(bbox.find(key).text)
        annotations.append(record)

    return annotations

# Collect the annotation files in sorted order: os.listdir() returns entries
# in arbitrary order, and a stable order is needed for random_state to give
# the same split on every run.
xml_files = sorted(
    name for name in os.listdir(labeled_data_path) if name.endswith('.xml')
)

# Create a DataFrame to store annotations. Each row also records the XML
# file it came from, so rows can be matched to their image when splitting.
annotations_list = []

for xml_file in xml_files:
    xml_path = os.path.join(labeled_data_path, xml_file)
    for annotation in read_xml_annotations(xml_path):
        annotations_list.append({**annotation, 'xml_file': xml_file})

# Explicit columns keep the frame well-formed even when no objects were found.
annotations_df = pd.DataFrame(
    annotations_list,
    columns=['label', 'xmin', 'ymin', 'xmax', 'ymax', 'xml_file'],
)

# Split data into training and testing sets. The split is done on files, not
# on individual annotation rows, so that all objects of one image end up in
# the same set and each image has exactly one destination folder.
train_files, test_files = train_test_split(
    xml_files, test_size=0.2, random_state=42
)
train_file_set = set(train_files)

in_train = annotations_df['xml_file'].isin(train_file_set)
train_df = annotations_df[in_train]
test_df = annotations_df[~in_train]

# Copy images and create new folders for training and testing
train_output_folder = os.path.join(output_path, 'train')
test_output_folder = os.path.join(output_path, 'test')

os.makedirs(train_output_folder, exist_ok=True)
os.makedirs(test_output_folder, exist_ok=True)

for xml_file in xml_files:
    # Swap only the extension; str.replace would also rewrite any other
    # '.xml' occurring inside the file name.
    image_file = os.path.splitext(xml_file)[0] + '.jpg'
    if xml_file in train_file_set:
        destination_folder = train_output_folder
    else:
        destination_folder = test_output_folder
    # A missing image raises FileNotFoundError here rather than silently
    # producing annotations without a matching image.
    shutil.copy(os.path.join(raw_data_path, image_file), destination_folder)

# Save the CSV files
train_df.to_csv(os.path.join(output_path, 'train_annotations.csv'), index=False)
test_df.to_csv(os.path.join(output_path, 'test_annotations.csv'), index=False)

print("Data preparation completed.")