In [2]:
import os
import json
import polars as pl

import yaml
import shutil

In [6]:
baseFolder = './roboflow/'

dataset1_path = os.path.join(baseFolder, "Document-Segmentation-2")
dataset2_path = os.path.join(baseFolder, "ID-Cards-Segmentation-1")
output_path = "datasets"

In [8]:
dataset1_path, dataset2_path

('./roboflow/Document-Segmentation-2', './roboflow/ID-Cards-Segmentation-1')

In [11]:
with open(f"{dataset1_path}/data.yaml", 'r') as f:
    yaml1 = yaml.safe_load(f)
with open(f"{dataset2_path}/data.yaml", 'r') as f:
    yaml2 = yaml.safe_load(f)

def get_class_names(yaml_data):
    if isinstance(yaml_data['names'], dict):
        return yaml_data['names']
    else:
        return {i: name for i, name in enumerate(yaml_data['names'])}

names1 = get_class_names(yaml1)
names2 = get_class_names(yaml2)

print("Dataset 1 classes:")
for i, name in names1.items():
    print(f"  {i}: {name}")
print("\nDataset 2 classes:")
for i, name in names2.items():
    print(f"  {i}: {name}")

# selections = input("\nEnter classes (format: 1:0,1:2,2:1): ").split(',')

Dataset 1 classes:
  0: 0

Dataset 2 classes:
  0: Cards
  1: Lanyard


In [13]:
chosen_classes = {
    'Lanyard': ['Lanyard'],
    'ID Card': ['0'],
}

with open(f"{dataset1_path}/data.yaml", 'r') as f:
    yaml1 = yaml.safe_load(f)
with open(f"{dataset2_path}/data.yaml", 'r') as f:
    yaml2 = yaml.safe_load(f)

names1 = {i: name for i, name in enumerate(yaml1['names'])} if isinstance(yaml1['names'], list) else yaml1['names']
names2 = {i: name for i, name in enumerate(yaml2['names'])} if isinstance(yaml2['names'], list) else yaml2['names']

class_mapping = {}
new_class_names = []

for new_class_idx, (new_class_name, old_class_names) in enumerate(chosen_classes.items()):
    new_class_names.append(new_class_name)
    for old_class_name in old_class_names:
        for class_id, name in names1.items():
            if name == old_class_name:
                class_mapping[f"1:{class_id}"] = new_class_idx
        for class_id, name in names2.items():
            if name == old_class_name:
                class_mapping[f"2:{class_id}"] = new_class_idx

os.makedirs(f"{output_path}/train", exist_ok=True)
os.makedirs(f"{output_path}/valid", exist_ok=True)

datasets = [(dataset1_path, 1), (dataset2_path, 2)]

for dataset_path, dataset_num in datasets:
    for split in ['train', 'valid']:
        split_path = f"{dataset_path}/{split}"
        
        if not os.path.exists(split_path):
            continue
            
        for file in os.listdir(split_path):
            if file.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                base_name = os.path.splitext(file)[0]
                label_file = f"{split_path}/{base_name}.txt"
                
                if os.path.exists(label_file):
                    with open(label_file, 'r') as f:
                        lines = f.readlines()
                    
                    new_lines = []
                    for line in lines:
                        parts = line.strip().split()
                        if len(parts) >= 5:
                            old_class_id = int(parts[0])
                            key = f"{dataset_num}:{old_class_id}"
                            if key in class_mapping:
                                new_class_id = class_mapping[key]
                                new_lines.append(f"{new_class_id} {' '.join(parts[1:])}\n")
                    
                    if new_lines:
                        new_img_name = f"ds{dataset_num}_{file}"
                        new_label_name = f"ds{dataset_num}_{base_name}.txt"
                        shutil.copy2(f"{split_path}/{file}", f"{output_path}/{split}/{new_img_name}")
                        with open(f"{output_path}/{split}/{new_label_name}", 'w') as f:
                            f.writelines(new_lines)

merged_yaml = {
    'names': new_class_names,
    'nc': len(new_class_names),
    'path': output_path,
    'train': "train",
    'val': "valid"
}

with open(f"{output_path}/data.yaml", 'w') as f:
    yaml.dump(merged_yaml, f, default_flow_style=False)

train_count = len([f for f in os.listdir(f"{output_path}/train") if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))])
valid_count = len([f for f in os.listdir(f"{output_path}/valid") if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))])

print(f"Merged: {len(new_class_names)} classes, {train_count} train, {valid_count} valid")

Merged: 2 classes, 0 train, 0 valid
