## Reference: How to split image dataset into train, validation and test set?
https://aravinda-gn.medium.com/how-to-split-image-dataset-into-train-validation-and-test-set-5a41c48af332

#### Part1: Apply train test split to jpg files

In [11]:
import os
import random
import shutil


current_directory = os.getcwd()
print("Current Directory:", current_directory)

# List the contents of the current directory
current_list = os.listdir(current_directory)
print("Current List of Files and Directories:",'\n',current_list)


# input_folder = "red_cone"
input_folder = "earring_opencv_v2"
output_folder = 'earring_opencv/images'

# path to destination folders
train_folder = os.path.join(output_folder, "train")
val_folder = os.path.join(output_folder, "valid")
# test_folder = os.path.join(input_folder, 'test')

# Create destination folders if they don't exist
# Sol1
# os.makedirs(train_folder, exist_ok=True)
# os.makedirs(val_folder, exist_ok=True)

#Sol2
for folder_path in [train_folder, val_folder]: #, test_folder]:
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)



Current Directory: f:\
Current List of Files and Directories: 
 ['.Spotlight-V100', '.TemporaryItems', '拿掉空格及括號.bat', 'earring_opencv_v1.zip', '.Trashes', 'System Volume Information', '$RECYCLE.BIN', 'Ginger', 'Vicky', 'earring_original_v1.zip', 'label_bound_check.ipynb', 'opencv_load_save.ipynb', 'earring_original_v1', 'earring_original_v2', 'earring_opencv_v1', 'train_test_split.ipynb', 'earring_opencv_v2', 'earring_opencv']


In [8]:
# Define a list of image extensions
image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".JPG"]

# Create a list of image filenames in 'input_folder'
imgs_list = [
    filename
    for filename in os.listdir(input_folder)
    if os.path.splitext(filename)[-1] in image_extensions
]

print(imgs_list)
print("\n", len(imgs_list))

['gemstone_ring_combination56.jpg', 'gemstone_ring_combination57.jpg', 'gemstone_ring_combination58.jpg', 'gemstone_ring_combination59.jpg', 'gemstone_ring_combination60.jpg', 'gemstone_ring_combination61.jpg', 'gemstone_ring_combination62.jpg', 'gemstone_ring_combination63.jpg', 'gemstone_ring_combination64.jpg', 'gemstone_ring_combination65.jpg', 'gemstone_ring_combination66.jpg', 'gemstone_ring_combination67.jpg', 'gemstone_ring_combination68.jpg', 'gemstone_ring_combination69.jpg', 'gemstone_ring_combination71.jpg', 'gemstone_ring_combination72.jpg', 'gemstone_ring_combination73.jpg', 'gemstone_ring_combination75.jpg', 'gemstone_ring_combination76.jpg', 'gemstone_ring_combination77.jpg', 'gemstone_ring_combination78.jpg', 'gemstone_ring1.jpg', 'gemstone_ring2.jpg', 'gemstone_ring3.jpg', 'gemstone_ring4.jpg', 'gemstone_ring5.jpg', 'gemstone_ring7.jpg', 'gemstone_ring9.jpg', 'gemstone_ring11.jpg', 'gemstone_ring12.jpg', 'gemstone_ring13.jpg', 'gemstone_ring15.jpg', 'gemstone_ring17.j

In [9]:
# Sets the random seed
random.seed(42)

# Shuffle the list of image filenames
random.shuffle(imgs_list)

print(imgs_list)


['soju134.jpg', 'mix132.JPG', 'H472.jpg', 'water_droplets_combination76.jpg', 'H151.jpg', 'hand164.jpg', 'v379.jpg', 'red_cone301.jpg', 'white_thin_ring45.jpg', 'green_plate32.jpg', 'v335.jpg', 'green_ring_pin70.jpg', '17_flower245.jpg', 'green_ring28.jpg', 'water_droplets_combination59.jpg', 'gemstone_ring15.jpg', 'green_plate73.jpg', 'H269.jpg', 'golden_hook76.jpg', 'H843.jpg', '15_ear_clip373.jpg', '39_medium_black_ring112.jpg', 'water_droplets_assembly46.jpg', 'H63.jpg', '22_corgi11.jpg', 'Patty660.jpg', 'green_ring_pin279.jpg', '15_ear_clip371.jpg', 'slippers84.jpg', 'golden_hook215.jpg', 'eyes_assembly60.jpg', 'H461.jpg', '15_ear_clip2.jpg', 'gemstone_ring_assembly46.jpg', 'H180.jpg', 'H116.jpg', '15_ear_clip406.jpg', 'tassel113.jpg', 'golden_hook173.jpg', 'white_ring19.jpg', 'sliver_hook418.jpg', 'big_eye12.jpg', 'coke155.jpg', 'black_dot_pin134.jpg', 'coke145.jpg', '15_ear_clip97.jpg', 'mix46.jpg', 'water_droplets_combination1.jpg', 'H763.jpg', 'sliver_hook165.jpg', 'coke33.jpg

In [10]:
# determine the number of images for each set
train_size = int(len(imgs_list) * 0.8)
val_size = int(len(imgs_list) * 0.2)
# test_size = int(len(imgs_list) * 0.15)

print(train_size)
print(val_size)

10017
2504


In [12]:
# Copy image files to destination folders
t=0
v=0

for i, f in enumerate(imgs_list):
    if i < train_size:
        dest_folder = train_folder
        t += 1
    else:
        dest_folder = val_folder
        v += 1
    # elif i < train_size + val_size:
    #     dest_folder = val_folder
    # else:
    #     dest_folder = test_folder
    shutil.copy(os.path.join(input_folder, f), os.path.join(dest_folder, f))

print('jpg files in train folder:', t)
print('jpg files in valid folder:', v)

#### Part2: Apply the same train test split index to txt files

In [13]:
import os
import random
import shutil


current_directory = os.getcwd()
print("Current Directory:", current_directory)

# List the contents of the current directory
current_list = os.listdir(current_directory)
print("Current List of Files and Directories:",'\n',current_list)



# input_folder = "red_cone"
input_folder = "earring_opencv_v2"
output_folder = 'earring_opencv/labels'


# path to destination folders
train_folder = os.path.join(output_folder, "train")
val_folder = os.path.join(output_folder, "valid")
# test_folder = os.path.join(output_folder, 'test')


# Create destination folders if they don't exist
# Sol1
# os.makedirs(train_folder, exist_ok=True)
# os.makedirs(val_folder, exist_ok=True)
#Sol2
for folder_path in [train_folder, val_folder]: #, test_folder]:
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

Current Directory: f:\
Current List of Files and Directories: 
 ['.Spotlight-V100', '.TemporaryItems', '拿掉空格及括號.bat', 'earring_opencv_v1.zip', '.Trashes', 'System Volume Information', '$RECYCLE.BIN', 'Ginger', 'Vicky', 'earring_original_v1.zip', 'label_bound_check.ipynb', 'opencv_load_save.ipynb', 'earring_original_v1', 'earring_original_v2', 'earring_opencv_v1', 'train_test_split.ipynb', 'earring_opencv_v2', 'earring_opencv']


In [15]:
# Check Unique File Extension
base_names = []
file_extensions = []

for file_name in imgs_list:
    base_name, file_extension = file_name.split(".")
    base_names.append(base_name)
    file_extensions.append(file_extension)
    # print("Base Name:", base_name)
    # print("File Extension:", file_extension)
print(base_names)
print(file_extensions)

uni = list(set(file_extensions))
print(uni)

['soju134', 'mix132', 'H472', 'water_droplets_combination76', 'H151', 'hand164', 'v379', 'red_cone301', 'white_thin_ring45', 'green_plate32', 'v335', 'green_ring_pin70', '17_flower245', 'green_ring28', 'water_droplets_combination59', 'gemstone_ring15', 'green_plate73', 'H269', 'golden_hook76', 'H843', '15_ear_clip373', '39_medium_black_ring112', 'water_droplets_assembly46', 'H63', '22_corgi11', 'Patty660', 'green_ring_pin279', '15_ear_clip371', 'slippers84', 'golden_hook215', 'eyes_assembly60', 'H461', '15_ear_clip2', 'gemstone_ring_assembly46', 'H180', 'H116', '15_ear_clip406', 'tassel113', 'golden_hook173', 'white_ring19', 'sliver_hook418', 'big_eye12', 'coke155', 'black_dot_pin134', 'coke145', '15_ear_clip97', 'mix46', 'water_droplets_combination1', 'H763', 'sliver_hook165', 'coke33', 'mother_of_pearl87', '15_ear_clip114', 'v508', 'yellow_square_pin61', 'white_thin_ring300', 'small_eye5', 'green_ring67', 'golden_pin51', '16_ribbon195', 'Patty1673', 'H380', 'white_ring6', 'H321', 'ge

In [16]:
# Replace the shuffled list of .jpg files with .txt files.
txt_list = []

for img in imgs_list:
    if img.split(".")[-1] in uni:
        txt_list.append(img.replace(img.split(".")[-1], "txt"))

print(txt_list)
print("\n", len(txt_list))

['soju134.txt', 'mix132.txt', 'H472.txt', 'water_droplets_combination76.txt', 'H151.txt', 'hand164.txt', 'v379.txt', 'red_cone301.txt', 'white_thin_ring45.txt', 'green_plate32.txt', 'v335.txt', 'green_ring_pin70.txt', '17_flower245.txt', 'green_ring28.txt', 'water_droplets_combination59.txt', 'gemstone_ring15.txt', 'green_plate73.txt', 'H269.txt', 'golden_hook76.txt', 'H843.txt', '15_ear_clip373.txt', '39_medium_black_ring112.txt', 'water_droplets_assembly46.txt', 'H63.txt', '22_corgi11.txt', 'Patty660.txt', 'green_ring_pin279.txt', '15_ear_clip371.txt', 'slippers84.txt', 'golden_hook215.txt', 'eyes_assembly60.txt', 'H461.txt', '15_ear_clip2.txt', 'gemstone_ring_assembly46.txt', 'H180.txt', 'H116.txt', '15_ear_clip406.txt', 'tassel113.txt', 'golden_hook173.txt', 'white_ring19.txt', 'sliver_hook418.txt', 'big_eye12.txt', 'coke155.txt', 'black_dot_pin134.txt', 'coke145.txt', '15_ear_clip97.txt', 'mix46.txt', 'water_droplets_combination1.txt', 'H763.txt', 'sliver_hook165.txt', 'coke33.txt

In [17]:
# determine the number of images for each set
train_size = int(len(txt_list) * 0.8)
val_size = int(len(txt_list) * 0.2)
# test_size = int(len(imgs_list) * 0.15)

print(train_size)
print(val_size)

10017
2504


In [19]:
# Copy image files to destination folders

t=0
v=0

for i, f in enumerate(txt_list):
    if i < train_size:
        dest_folder = train_folder
        t += 1
    else:
        dest_folder = val_folder
        v += 1
    # elif i < train_size + val_size:
    #     dest_folder = val_folder
    # else:
    #     dest_folder = test_folder
    shutil.copy(os.path.join(input_folder, f), os.path.join(dest_folder, f))

print('txt files in train folder:', t)
print('txt files in valid folder:', v)

txt files in train folder: 10017
txt files in valid folder: 2505
