# In [2]: (Jupyter notebook cell prompt left over from the export)
#### Data splitting script
#### Quentin Coppola
#### This script splits and renames the raw png files (ML_Data_png.zip)
#### Unnecessary if you use pre-split data Ready_Data_png.zip


import os
import shutil
from sklearn.model_selection import train_test_split

# Dataset locations
data_dir = "/Users/quentin/Desktop/ML_MRIqc_DATASET/ML_Data/pics"  # Dir housing the raw PNGs (ML_Data_png.zip)
new_dir = "/Users/quentin/Desktop/ML_MRIqc_DATASET/ML_Data/Cleaned"  # Dir for all renamed accepted and rejected PNG files
output_dir = "/Users/quentin/Desktop/ML_MRIqc_DATASET/Ready_Data"  # Dir for this specific train/val/test split


def _list_pngs(top_dir):
    """Return the paths of all ``.png`` files found anywhere under *top_dir*."""
    found = []
    for dirpath, _subdirs, filenames in os.walk(top_dir):
        for filename in filenames:
            if filename.endswith(".png"):
                found.append(os.path.join(dirpath, filename))
    return found


# Raw class folders
accepted_dir = os.path.join(data_dir, "Accepted")
rejected_dir = os.path.join(data_dir, "Rejected")

# Raw PNG paths per class, gathered recursively
accepted = _list_pngs(accepted_dir)
rejected = _list_pngs(rejected_dir)

# Copy the raw PNGs into one flat folder, prefixing each file name with its parent folder name (the raw data was never renamed at the source)
def rename_files_in_folder(src_folder, dest_folder):
    """Copy every PNG under *src_folder* into the flat folder *dest_folder*.

    Each copy is named ``<parent folder name>_<original file name>``, where
    the parent folder is the directory that directly contains the file. The
    source files are left untouched and *dest_folder* is created if missing.
    Files already present in *dest_folder* from an earlier run are overwritten.

    Raises:
        FileExistsError: If two different source files would receive the same
            new name (same file name inside identically named folders). The
            later copy would otherwise silently overwrite the earlier one.
            Nothing is copied when this is raised.
    """
    # Plan every copy first so a name clash is reported before any file is written.
    planned = {}  # new file name -> source path that maps to it
    for root, _, files in os.walk(src_folder):
        folder_name = os.path.basename(root)
        for file in files:
            if not file.endswith(".png"):
                continue
            old_path = os.path.join(root, file)
            new_name = f"{folder_name}_{file}"
            if new_name in planned:
                raise FileExistsError(
                    f"{planned[new_name]!r} and {old_path!r} would both be "
                    f"copied to {new_name!r} in {dest_folder!r}"
                )
            planned[new_name] = old_path

    os.makedirs(dest_folder, exist_ok=True)
    for new_name, old_path in planned.items():
        shutil.copy(old_path, os.path.join(dest_folder, new_name))

# Flatten each class into new_dir/<class>, prefixing file names with their source folder
rename_files_in_folder(accepted_dir, os.path.join(new_dir, "Accepted"))
rename_files_in_folder(rejected_dir, os.path.join(new_dir, "Rejected"))

# List the renamed files.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the lists
# are sorted: without that, the fixed random_state below would not reproduce the
# same split on another machine or after the data is copied again.
accepted_clean = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(os.path.join(new_dir, "Accepted"))
    for file in files if file.endswith(".png")
)
rejected_clean = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(os.path.join(new_dir, "Rejected"))
    for file in files if file.endswith(".png")
)

# 80% train, 10% validation, 10% test: hold out 20%, then split the hold-out in half.
# Change random_state to draw a different split.
# NOTE(review): files are split individually, so PNGs that came from the same
# source folder can land in different splits -- confirm this is intended.
train_acc, temp_acc = train_test_split(accepted_clean, test_size=0.2, random_state=420)
val_acc, test_acc = train_test_split(temp_acc, test_size=0.5, random_state=420)

train_rej, temp_rej = train_test_split(rejected_clean, test_size=0.2, random_state=420)
val_rej, test_rej = train_test_split(temp_rej, test_size=0.5, random_state=420)

# Helper: copy one class's files into its folder inside a split directory
def create_split_folder(output_dir, data, class_name):
    """Copy each file path in *data* into ``output_dir/class_name``.

    The destination folder is created when it does not exist yet; copies keep
    their original file names.
    """
    destination = os.path.join(output_dir, class_name)
    os.makedirs(destination, exist_ok=True)
    for source_file in data:
        shutil.copy(source_file, destination)

# Per split: (accepted file paths, rejected file paths)
splits = {
    "train": (train_acc, train_rej),
    "val": (val_acc, val_rej),
    "test": (test_acc, test_rej),
}

# Write output_dir/<split>/<class>/ for every split, Accepted first, then Rejected
for split, class_lists in splits.items():
    split_dir = os.path.join(output_dir, split)
    for class_name, class_files in zip(("Accepted", "Rejected"), class_lists):
        create_split_folder(split_dir, class_files, class_name)