In [1]:
import glob
import yaml

In [4]:
# Root directory that is searched for per-dataset config files.
yolo_dir = "../data/reduced/"
# Every data.yaml at any depth below yolo_dir. glob returns matches in a
# filesystem-dependent order, so sort them: the merge cell further down is
# order-sensitive (for non-list keys the last file processed wins).
yaml_paths = sorted(glob.glob(f"{yolo_dir}/**/data.yaml", recursive=True))
print(len(yaml_paths))

118


In [6]:
# Merge every per-dataset data.yaml listed in `yaml_paths` into one combined
# config and write it to `<yolo_dir>/training_data.yaml`.
#
# Merge rules for each top-level key:
#   * 'train' / 'val' -> gathered across all files, de-duplicated, and written
#                        as a sorted list
#   * list values     -> concatenated; duplicates dropped, first-seen order kept
#   * anything else   -> the value from the last file that defines the key wins
combined_data = {}

# Distinct train / val entries seen across all files
train_paths = set()
val_paths = set()


def _collect_split(entry, bucket):
    """Add a train/val entry (a single value or a list of values) to `bucket`.

    Falsy entries (missing key, None, empty string, empty list) are ignored.
    """
    if not entry:
        return
    if isinstance(entry, list):
        # A list is unhashable, so add its items one by one
        bucket.update(entry)
    else:
        bucket.add(entry)


# Loop through each YAML file
for yaml_path in yaml_paths:
    with open(yaml_path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # An empty file loads as None, and a non-mapping document has no keys to
    # merge; report it and move on instead of failing mid-merge.
    if not isinstance(data, dict):
        print(f"Skipping {yaml_path}: expected a mapping, got {type(data).__name__}")
        continue

    _collect_split(data.get('train'), train_paths)
    _collect_split(data.get('val'), val_paths)

    for key, value in data.items():
        # train / val were collected above
        if key in ('train', 'val'):
            continue

        existing = combined_data.get(key)
        if key in combined_data and isinstance(existing, list) and isinstance(value, list):
            # dict.fromkeys removes duplicates while keeping first-seen order;
            # a set would reorder the items differently from run to run.
            combined_data[key] = list(dict.fromkeys(existing + value))
        else:
            # First occurrence of the key, or a non-list value: last file wins
            combined_data[key] = value

# Keep the class count in step with the merged class collection.
# NOTE(review): assumes 'nc' is the number of entries in 'names'
# (YOLO dataset-config convention) — confirm before relying on it.
merged_names = combined_data.get('names')
if isinstance(combined_data.get('nc'), int) and isinstance(merged_names, (list, dict)):
    combined_data['nc'] = len(merged_names)

# Sorted so the written file is identical from run to run
combined_data['train'] = sorted(train_paths, key=str)
combined_data['val'] = sorted(val_paths, key=str)

# Write the combined data to a new YAML file
output_path = f"{yolo_dir}/training_data.yaml"
with open(output_path, 'w', encoding='utf-8') as file:
    yaml.dump(combined_data, file, sort_keys=False)

print(f"Combined YAML file created at: {output_path}")
print(f"Keys in combined file: {list(combined_data.keys())}")
print(f"Number of train paths: {len(train_paths)}")
print(f"Number of val paths: {len(val_paths)}")

Combined YAML file created at: ../data/reduced//training_data.yaml
Keys in combined file: ['names', 'nc', 'train', 'val']
Number of train paths: 118
Number of val paths: 118
