# In [11]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
# Import numpy for array manipulation
import numpy as np
import os
import yaml

# Build a k-fold cross-validation split over image IDs and write it to
# "output.yaml". Each fold records a test set, a train set (all other folds),
# and an internal validation set sampled from that train set.

# Seed NumPy's global RNG so the shuffle and the validation sampling below
# give the same result on every run.
np.random.seed(0)

# Split configuration. The fold count is used by the split and by both loops,
# so it lives in one constant to keep those uses in agreement.
N_FOLDS = 4
VALIDATION_FRACTION = 0.2
IMAGE_DIR = r"C:\Users\koenk\Documents\Master_Thesis\Data\Processed_data\images/"

files = os.listdir(IMAGE_DIR)

# Derive one integer ID per ".JPG" file (the suffix match is case-sensitive):
# take the text before the first underscore, drop its first character, and
# parse the rest as an int.
# NOTE(review): presumably names look like "<letter><number>_....JPG" --
# confirm against the dataset; any other shape raises ValueError in int().
# np.unique de-duplicates and sorts, so the result does not depend on the
# order in which os.listdir returns the entries.
ids = np.unique(
    [int(name.split("_")[0][1:]) for name in files if name.endswith(".JPG")]
)

# Shuffle the IDs in place (deterministic because of the seed above).
np.random.shuffle(ids)

# Split the shuffled IDs into N_FOLDS parts; array_split tolerates a length
# that is not evenly divisible, so fold sizes may differ by one.
folds = np.array_split(ids, N_FOLDS)

# IDs already drawn for internal validation in an earlier fold. Excluding them
# from later draws keeps the validation sets of different folds disjoint.
used_ids = set()

output = {}

for i in range(N_FOLDS):
    # The i-th part is held out as the test set ...
    test_set = folds[i]

    # ... and every other part together forms the training set.
    train_set = np.concatenate([folds[j] for j in range(N_FOLDS) if j != i])

    # Validation candidates: training IDs not used for validation before.
    candidates = [
        sample_id for sample_id in train_set if sample_id not in used_ids
    ]

    # The sample size is a fraction of the FULL training set (rounded down),
    # not of the remaining candidates. np.random.choice raises ValueError if
    # that size exceeds the number of candidates when replace=False.
    valid_set = np.random.choice(
        candidates,
        size=int(len(train_set) * VALIDATION_FRACTION),
        replace=False,
    )

    used_ids.update(valid_set)

    # NOTE(review): "Train set" is stored as-is and therefore still contains
    # the IDs listed under "Validation set". Consumers must remove them before
    # training if the validation set is meant to be held out -- confirm.
    output[f"Fold {i+1}"] = {
        "Test set": test_set.tolist(),
        "Train set": train_set.tolist(),
        "Validation set": valid_set.tolist(),
    }

# Write the split in block-style YAML. The encoding is set explicitly so the
# file does not depend on the platform's default locale encoding.
with open("output.yaml", "w", encoding="utf-8") as yaml_file:
    yaml.dump(output, yaml_file, default_flow_style=False)