# Combining DataV2 and DataV3 into Dataset_XML

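This notebook combines the XML-based license plate datasets (DataV2 and DataV3) into a single unified dataset called Dataset_XML. Every image and its annotation are copied with a `v2_` or `v3_` filename prefix so that files from the two sources cannot overwrite each other, and the `filename`/`path` fields inside each XML annotation are updated to match the new names.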

Both datasets use the same annotation format with bounding boxes defined as:
- `xmin, ymin`: Top-left corner
- `xmax, ymax`: Bottom-right corner

In [1]:
import os
import shutil
from tqdm import tqdm
import xml.etree.ElementTree as ET

# Paths
# The project root can be overridden with the CAR_PLATE_BASE_DIR environment
# variable so the notebook can run on a machine other than the one it was
# written on; when the variable is unset the original location is used.
base_dir = os.environ.get(
    "CAR_PLATE_BASE_DIR",
    r"c:\ULB\MA1\Proj\PROJ-H419\Car-plate-detection",
)

# Source datasets
datav2_dir = os.path.join(base_dir, "DataV2")
datav3_dir = os.path.join(base_dir, "DataV3")

# Combined dataset: images and annotations go into separate sub-folders
output_dir = os.path.join(base_dir, "Dataset_XML")
output_images_dir = os.path.join(output_dir, "images")
output_anno_dir = os.path.join(output_dir, "annotations")

# Make sure output directories exist (exist_ok keeps re-runs from failing)
os.makedirs(output_images_dir, exist_ok=True)
os.makedirs(output_anno_dir, exist_ok=True)

# Function to copy and rename files
def copy_files_with_prefixes(source_img_dir, source_anno_dir, prefix, is_same_dir=False):
    """
    Copy image/annotation pairs into the combined dataset, prefixing each filename.

    For every image in ``source_img_dir`` that has a same-named ``.xml`` file in
    ``source_anno_dir``, the image is copied into the module-level
    ``output_images_dir`` and the annotation is written into the module-level
    ``output_anno_dir``, both renamed to ``<prefix>_<original name>``. The
    annotation's ``filename`` and ``path`` elements (when present) are rewritten
    to refer to the copied image.

    The annotation is parsed before the image is copied, and the copied image is
    removed again if the annotation cannot be written, so a failing XML file
    never leaves an image without an annotation in the output.

    Args:
        source_img_dir: Directory containing images (.jpg, .jpeg, .png).
        source_anno_dir: Directory containing the XML annotations.
        prefix: Prefix to add to filenames to avoid name conflicts.
        is_same_dir: If True, annotations and images are in the same directory.
            Kept for backward compatibility: the annotation is always looked up
            as ``<image basename>.xml`` inside ``source_anno_dir``, so the flag
            does not change the behaviour.

    Returns:
        Number of image/annotation pairs successfully written.
    """
    print(f"Processing {source_img_dir} with prefix {prefix}...")

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    image_exts = ('.jpg', '.jpeg', '.png')
    source_files = sorted(
        f for f in os.listdir(source_img_dir) if f.lower().endswith(image_exts)
    )

    n_processed = 0
    for file in tqdm(source_files, desc=f"Copying {prefix} files"):
        base_name = os.path.splitext(file)[0]
        img_path = os.path.join(source_img_dir, file)
        anno_path = os.path.join(source_anno_dir, f"{base_name}.xml")

        # Skip if annotation doesn't exist
        if not os.path.exists(anno_path):
            print(f"  Skipping {file} - no matching annotation")
            continue

        # Target paths with prefix
        target_img_file = f"{prefix}_{file}"
        target_img_path = os.path.join(output_images_dir, target_img_file)
        target_anno_path = os.path.join(output_anno_dir, f"{prefix}_{base_name}.xml")

        # Parse and patch the annotation BEFORE touching the output folder, so a
        # malformed XML file does not leave an orphaned image behind.
        try:
            tree = ET.parse(anno_path)
            root = tree.getroot()

            # Update filename in XML
            filename_elem = root.find('filename')
            if filename_elem is not None:
                filename_elem.text = target_img_file

            # Update path in XML if it exists
            path_elem = root.find('path')
            if path_elem is not None:
                path_elem.text = target_img_path
        except Exception as e:
            print(f"  Error processing XML {anno_path}: {e}")
            continue

        # Copy image (errors here propagate to the caller, as before)
        shutil.copy2(img_path, target_img_path)

        # Write updated XML to target; roll the pair back if that fails.
        try:
            tree.write(target_anno_path)
        except Exception as e:
            print(f"  Error processing XML {anno_path}: {e}")
            for leftover in (target_img_path, target_anno_path):
                if os.path.exists(leftover):
                    os.remove(leftover)
            continue

        n_processed += 1

    return n_processed

# --- DataV2: images and annotations are stored in sibling folders ---
datav2_images = os.path.join(datav2_dir, "images")
datav2_annotations = os.path.join(datav2_dir, "annotations")

if not (os.path.exists(datav2_images) and os.path.exists(datav2_annotations)):
    print("DataV2 directories not found")
else:
    n_v2 = copy_files_with_prefixes(datav2_images, datav2_annotations, "v2", is_same_dir=False)
    print(f"Processed {n_v2} files from DataV2")

# --- DataV3: the same folder is used for both images and annotations ---
datav3_images = os.path.join(datav3_dir, "images")

if not os.path.exists(datav3_images):
    print("DataV3 directory not found")
else:
    n_v3 = copy_files_with_prefixes(datav3_images, datav3_images, "v3", is_same_dir=True)
    print(f"Processed {n_v3} files from DataV3")

# --- Summary: tally what is now present in the output folders ---
output_images = sum(
    1
    for entry in os.listdir(output_images_dir)
    if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
)
output_annotations = sum(
    1
    for entry in os.listdir(output_anno_dir)
    if entry.lower().endswith('.xml')
)

print(f"\nCombined dataset created at {output_dir}")
print(f"Total images: {output_images}")
print(f"Total annotations: {output_annotations}")

Processing c:\ULB\MA1\Proj\PROJ-H419\Car-plate-detection\DataV2\images with prefix v2...


Copying v2 files: 100%|██████████| 433/433 [00:02<00:00, 200.67it/s]
Copying v2 files: 100%|██████████| 433/433 [00:02<00:00, 200.67it/s]


Processed 433 files from DataV2
Processing c:\ULB\MA1\Proj\PROJ-H419\Car-plate-detection\DataV3\images with prefix v3...


Copying v3 files: 100%|██████████| 207/207 [00:02<00:00, 73.98it/s]

Processed 207 files from DataV3

Combined dataset created at c:\ULB\MA1\Proj\PROJ-H419\Car-plate-detection\Dataset_XML
Total images: 640
Total annotations: 640



