In [3]:
import os
import json
import polars as pl

import yaml
import shutil

In [4]:
# Root folder that holds the downloaded Roboflow exports.
baseFolder = './roboflow/'

# The two source datasets live side by side under the root folder.
dataset1_path, dataset2_path = (
    os.path.join(baseFolder, export_name)
    for export_name in ("Document-Segmentation-1", "ID-Cards-Segmentation-1")
)

# Directory that receives the merged dataset.
output_path = "datasets"

In [5]:
# Merge the two source datasets into one YOLO-style segmentation dataset under
# `output_path`:
#   1. build a mapping from each source dataset's class ids to the new class ids,
#   2. copy every image that keeps at least one polygon, rewriting its label file,
#   3. write a merged data.yaml and print a short summary.
# The source `test` split is folded into the output `valid` split.

# File extensions treated as images (compared case-insensitively).
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

# New class name -> source class names (from either dataset) folded into it.
# The position of each key becomes the new class id.
chosen_classes = {
    'Document': ['0'],
    'ID Card': ['Cards'],
}

with open(os.path.join(dataset1_path, "data.yaml"), 'r') as f:
    yaml1 = yaml.safe_load(f)
with open(os.path.join(dataset2_path, "data.yaml"), 'r') as f:
    yaml2 = yaml.safe_load(f)


def _names_by_id(names):
    """Return a ``{class_id: name}`` dict; a list is indexed by position, a mapping is returned as-is."""
    return dict(enumerate(names)) if isinstance(names, list) else names


def _remap_label_lines(lines, dataset_num, class_mapping):
    """Rewrite label lines of one source file with the merged class ids.

    A line is kept only when it is ``<class_id> x1 y1 x2 y2 x3 y3 ...`` with at
    least three coordinate pairs and its class id has an entry in
    ``class_mapping`` under the key ``"<dataset_num>:<class_id>"``.

    Returns:
        (kept_lines, malformed_count): the rewritten lines (each ending in a
        newline) and the number of lines skipped because the class id was not
        an integer.
    """
    kept_lines = []
    malformed_count = 0
    for line in lines:
        parts = line.split()
        # One class token plus an even number (>= 6) of coordinates.
        if len(parts) < 7 or len(parts) % 2 == 0:
            continue
        try:
            old_class_id = int(parts[0])
        except ValueError:
            # Skip the line instead of aborting the whole merge half-way through.
            malformed_count += 1
            continue
        key = f"{dataset_num}:{old_class_id}"
        if key in class_mapping:
            kept_lines.append(f"{class_mapping[key]} {' '.join(parts[1:])}\n")
    return kept_lines, malformed_count


def _count_images(directory):
    """Count the files in ``directory`` whose extension is in IMAGE_EXTENSIONS."""
    return sum(1 for name in os.listdir(directory) if name.lower().endswith(IMAGE_EXTENSIONS))


names1 = _names_by_id(yaml1['names'])
names2 = _names_by_id(yaml2['names'])

# "<dataset_num>:<old_class_id>" -> new class id
class_mapping = {}
new_class_names = []
unmatched_sources = []

for new_class_idx, (new_class_name, old_class_names) in enumerate(chosen_classes.items()):
    new_class_names.append(new_class_name)
    for old_class_name in old_class_names:
        matched = False
        for dataset_num, names in ((1, names1), (2, names2)):
            for class_id, name in names.items():
                # str(): an unquoted numeric name in data.yaml (e.g. `names: [0]`)
                # is loaded as an int and would never equal the string '0'.
                if str(name) == old_class_name:
                    class_mapping[f"{dataset_num}:{class_id}"] = new_class_idx
                    matched = True
        if not matched:
            unmatched_sources.append(old_class_name)

if unmatched_sources:
    # Without this, a typo in chosen_classes silently yields an empty class.
    print(f"Warning: source classes not found in any dataset: {unmatched_sources}")

for out_split in ('train', 'valid'):
    for kind in ('images', 'labels'):
        os.makedirs(os.path.join(output_path, out_split, kind), exist_ok=True)

datasets = [(dataset1_path, 1), (dataset2_path, 2)]
files_processed = 0
malformed_lines = 0

for dataset_path, dataset_num in datasets:
    for split in ['train', 'valid', 'test']:
        images_dir = os.path.join(dataset_path, split, "images")
        labels_dir = os.path.join(dataset_path, split, "labels")

        if not (os.path.exists(images_dir) and os.path.exists(labels_dir)):
            continue

        # There is no separate test split in the output; test images go to valid.
        output_split = 'valid' if split == 'test' else split

        for file in os.listdir(images_dir):
            if not file.lower().endswith(IMAGE_EXTENSIONS):
                continue

            base_name = os.path.splitext(file)[0]
            label_file = os.path.join(labels_dir, f"{base_name}.txt")
            if not os.path.exists(label_file):
                continue

            with open(label_file, 'r') as f:
                new_lines, skipped = _remap_label_lines(f.readlines(), dataset_num, class_mapping)
            malformed_lines += skipped

            # Images that keep no polygon of a chosen class are left out entirely.
            if not new_lines:
                continue

            # Prefix with the dataset number so equal file names from the two
            # datasets cannot overwrite each other.
            new_img_name = f"ds{dataset_num}_{file}"
            new_label_name = f"ds{dataset_num}_{base_name}.txt"

            shutil.copy2(
                os.path.join(images_dir, file),
                os.path.join(output_path, output_split, "images", new_img_name),
            )
            with open(os.path.join(output_path, output_split, "labels", new_label_name), 'w') as f:
                f.writelines(new_lines)

            files_processed += 1

if malformed_lines:
    print(f"Warning: skipped {malformed_lines} label lines with a non-integer class id")

merged_yaml = {
    'names': new_class_names,
    'nc': len(new_class_names),
    'path': output_path,
    'train': "train/images",
    'val': "valid/images",
}

with open(os.path.join(output_path, "data.yaml"), 'w') as f:
    yaml.dump(merged_yaml, f, default_flow_style=False)

# Counted from disk, so files left over from an earlier run are included.
train_count = _count_images(os.path.join(output_path, "train", "images"))
valid_count = _count_images(os.path.join(output_path, "valid", "images"))

print(f"Processed: {files_processed} files")
print(f"Output: {train_count} train, {valid_count} valid")
print(f"Classes: {new_class_names}")

Processed: 4615 files
Output: 3723 train, 892 valid
Classes: ['Document', 'ID Card']


In [6]:
import os
import cv2
import numpy as np
from pathlib import Path
import shutil

# Scan the merged dataset for label files whose polygons are all (nearly)
# axis-aligned rectangles, report them, and optionally delete those image/label
# pairs after confirmation.

base_path = "datasets"
splits = ["train", "valid"]

# File suffixes treated as images (compared case-insensitively).
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# A 4-point polygon counts as rectangular when its vertices lie, on average,
# at most this many pixels from the nearest corner of its bounding box.
RECT_TOLERANCE_PX = 10


def polygon_is_rectangular(points, tolerance_px=RECT_TOLERANCE_PX):
    """Return True when `points` is a 4-vertex polygon that hugs its bounding box.

    Args:
        points: array-like of shape (n_points, 2) holding pixel coordinates.
        tolerance_px: largest allowed mean distance (pixels) between each
            vertex and its nearest bounding-box corner.

    Any polygon without exactly four vertices is reported as not rectangular.
    """
    points = np.asarray(points)
    if len(points) != 4:
        return False

    min_x, min_y = points.min(axis=0)
    max_x, max_y = points.max(axis=0)
    corners = np.array([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])

    # Distance from every vertex to every corner, then keep the nearest corner.
    nearest = np.linalg.norm(points[:, None, :] - corners[None, :, :], axis=2).min(axis=1)
    return bool(nearest.mean() <= tolerance_px)


def label_is_rectangular(label_lines, width, height, tolerance_px=RECT_TOLERANCE_PX):
    """Return True unless some polygon in `label_lines` is not rectangular.

    Each line is expected to be ``<class_id> x1 y1 x2 y2 ...`` with normalised
    coordinates, which are scaled by `width`/`height` and truncated to integers.
    Lines with fewer than 6 tokens or an odd number of coordinates are ignored,
    so a label with no usable polygon line returns True.
    """
    for line in label_lines:
        parts = line.strip().split()
        if len(parts) < 6:
            continue

        coords = [float(x) for x in parts[1:]]
        if len(coords) % 2:
            # Cannot be split into (x, y) pairs; skip rather than crash in reshape.
            continue

        points = np.array(coords).reshape(-1, 2)
        points[:, 0] *= width
        points[:, 1] *= height
        points = points.astype(np.int32)

        if not polygon_is_rectangular(points, tolerance_px):
            return False
    return True


rectangular_files = []
segmentation_files = []
total_processed = 0

for split in splits:
    images_dir = os.path.join(base_path, split, "images")
    labels_dir = os.path.join(base_path, split, "labels")

    if not os.path.exists(images_dir) or not os.path.exists(labels_dir):
        print(f"Skipping {split} - directory not found")
        continue

    # One directory listing with a case-insensitive suffix test: each file is
    # listed exactly once on any filesystem, and mixed-case suffixes are found.
    image_files = sorted(
        p for p in Path(images_dir).iterdir()
        if p.suffix.lower() in image_extensions
    )

    print(f"Processing {split}: {len(image_files)} images")

    for img_path in image_files:
        label_path = os.path.join(labels_dir, img_path.stem + '.txt')

        if not os.path.exists(label_path):
            continue

        # The image is read only for its size, to convert the normalised
        # label coordinates to pixels.
        img = cv2.imread(str(img_path))
        if img is None:
            continue

        h, w = img.shape[:2]

        with open(label_path, 'r') as f:
            lines = f.readlines()

        if label_is_rectangular(lines, w, h):
            rectangular_files.append((split, str(img_path), label_path))
        else:
            segmentation_files.append((split, str(img_path), label_path))

        total_processed += 1

print(f"\nAnalysis Results:")
print(f"Rectangular (bounding box-like) masks: {len(rectangular_files)}")
print(f"Segmentation masks: {len(segmentation_files)}")
print(f"Total processed: {total_processed}")

print(f"\nBreakdown by split:")
for split in splits:
    rect_count = sum(1 for s, _, _ in rectangular_files if s == split)
    seg_count = sum(1 for s, _, _ in segmentation_files if s == split)
    print(f"{split}: {rect_count} rectangular, {seg_count} segmentation")

if rectangular_files:
    print(f"\nFirst 5 files to be removed:")
    for i, (split, img_path, label_path) in enumerate(rectangular_files[:5]):
        print(f"{i+1}. {split}/{Path(img_path).name}")

    confirm = input(f"\nDo you want to remove {len(rectangular_files)} rectangular mask files? (y/n): ")
else:
    # Nothing to delete, so do not block a "Run All" on an interactive prompt.
    confirm = 'n'

if confirm.lower() == 'y':
    removed_count = 0

    for split, img_path, label_path in rectangular_files:
        try:
            if os.path.exists(img_path):
                os.remove(img_path)
                print(f"Removed image: {Path(img_path).name}")

            if os.path.exists(label_path):
                os.remove(label_path)
                print(f"Removed label: {Path(label_path).name}")

            removed_count += 1

        except OSError as e:
            # Best effort: report the failure and continue with the next pair.
            print(f"Error removing {Path(img_path).name}: {e}")

    print(f"\nRemoval complete!")
    print(f"Successfully removed {removed_count} file pairs")
    print(f"Remaining files: {len(segmentation_files)} segmentation mask pairs")

    print(f"\nFinal dataset size:")
    for split in splits:
        remaining_count = sum(1 for s, _, _ in segmentation_files if s == split)
        print(f"{split}: {remaining_count} files")

else:
    print("Operation cancelled. No files were removed.")

Processing train: 3723 images
Processing valid: 892 images

Analysis Results:
Rectangular (bounding box-like) masks: 0
Segmentation masks: 4615
Total processed: 4615

Breakdown by split:
train: 0 rectangular, 3723 segmentation
valid: 0 rectangular, 892 segmentation
Operation cancelled. No files were removed.
