In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
def get_files(path):
    dir_name  = os.path.join(os.getcwd(), path)
    files     = os.listdir(dir_name)
    paths    = []

    for file in files:
        if '.jpg' in file:
            paths.append(os.path.join(path,file))
    
    return paths

In [3]:
def copy_images(paths, destination_folder):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    for path in paths:
        shutil.copy(path, destination_folder)
    
    print("Images copied to", destination_folder)

In [4]:
def print_path_info(paths):
    g_paths = [path for path in paths if '_g_' in path]
    non_g_paths = [path for path in paths if '_g_' not in path]

    print("Total ->", len(paths))
    print("Glaucoma ->", len(g_paths))
    print("No Glaucoma", len(non_g_paths)) 
    print("Glaucoma % ->", len(g_paths)/len(paths))
    print("No Glaucoma % ->", len(non_g_paths)/len(paths))
    print("")

In [5]:
DATASET_NAME = 'RIM'
path = './datasets/' + DATASET_NAME
paths = get_files(path)
print("Imagenes ->", len(paths))

Imagenes -> 455


In [6]:
# First split: 90% for training, 10% for validation
train_paths, val_paths = train_test_split(paths, test_size=0.10, random_state=42, shuffle=True)

# Second split: 90% of the first training set for training, 10% for testing
train_paths, test_paths = train_test_split(train_paths, test_size=0.10, random_state=30, shuffle=True)

print_path_info(train_paths)
print("--------------------")
print_path_info(val_paths)
print("--------------------")
print_path_info(test_paths)

Total -> 368
Glaucoma -> 148
No Glaucoma 220
Glaucoma % -> 0.40217391304347827
No Glaucoma % -> 0.5978260869565217

--------------------
Total -> 46
Glaucoma -> 24
No Glaucoma 22
Glaucoma % -> 0.5217391304347826
No Glaucoma % -> 0.4782608695652174

--------------------
Total -> 41
Glaucoma -> 22
No Glaucoma 19
Glaucoma % -> 0.5365853658536586
No Glaucoma % -> 0.4634146341463415



In [7]:
def k_fold_split(paths, test_paths, k, dataset_name):
    kf = KFold(n_splits=k, random_state=42, shuffle=True)
    fold = 0
    for train_index, val_index in kf.split(paths):
        fold += 1
        train_paths = [paths[i] for i in train_index]
        val_paths = [paths[i] for i in val_index]

        train_folder = f'datasets_split_folds/{dataset_name}/fold{fold}/train'
        valid_folder = f'datasets_split_folds/{dataset_name}/fold{fold}/valid'
        test_folder = f'datasets_split_folds/{dataset_name}/fold{fold}/test'

        copy_images(train_paths, train_folder)
        copy_images(val_paths, valid_folder)
        copy_images(test_paths, test_folder)

# Call the function
k_fold_split(train_paths, test_paths, k=5, dataset_name=DATASET_NAME)

Images copied to datasets_split_folds/RIM/fold1/train
Images copied to datasets_split_folds/RIM/fold1/valid
Images copied to datasets_split_folds/RIM/fold1/test
Images copied to datasets_split_folds/RIM/fold2/train
Images copied to datasets_split_folds/RIM/fold2/valid
Images copied to datasets_split_folds/RIM/fold2/test
Images copied to datasets_split_folds/RIM/fold3/train
Images copied to datasets_split_folds/RIM/fold3/valid
Images copied to datasets_split_folds/RIM/fold3/test
Images copied to datasets_split_folds/RIM/fold4/train
Images copied to datasets_split_folds/RIM/fold4/valid
Images copied to datasets_split_folds/RIM/fold4/test
Images copied to datasets_split_folds/RIM/fold5/train
Images copied to datasets_split_folds/RIM/fold5/valid
Images copied to datasets_split_folds/RIM/fold5/test
