# Part 2: Create test set and training set

In this second notebook, the created png images and their equivalent masks are assigned to either the training set or the test set. Best practice is to shuffle the data and assign approximately 80% to the training set and the other 20% to the test set:

>Indicated with bucket size (percentage)

The following two things must be checked afterwards:

- Are the total amount of images in the two sets equal to the original number of images (same goes for masks). If not, either not all data is used or wrong data slipped in.
- Are the number of images and masks in each of does sets equal. If not, there are images without labels or labels without images, thus, this data can not be used further.

In [1]:
import os
from fastai.vision.all import *

In [2]:
DATASET_PATH = Path(os.getcwd()) / "data"
MASKS_PATH = DATASET_PATH  / "masks"
IMAGES_PNG_PATH = DATASET_PATH / "images_png"
TRAINING_PATH = DATASET_PATH/"training_set"
TEST_PATH =  DATASET_PATH/"test_set"

for path in [TRAINING_PATH,TEST_PATH]:
    if not path.exists():
        os.mkdir(path)
        os.mkdir(path/'images')
        os.mkdir(path/'masks')

TRAINING_IMAGES_PATH = TRAINING_PATH/'images'
TRAINING_MASK_PATH = TRAINING_PATH/'masks'
TEST_IMAGES_PATH = TEST_PATH/'images'
TEST_MASK_PATH = TEST_PATH/'masks'
    
def create_random_buckets(training_rte):
    print('Creating random sets..')
    training_amt = round(len(IMAGES_PNG_PATH.ls()) * training_rte)
    testing_amt = len(IMAGES_PNG_PATH.ls()) - training_amt

    total_idx = set(range(len(IMAGES_PNG_PATH.ls())))
    train_idx = set(random.sample(total_idx,training_amt))
    test_idx = total_idx-train_idx
    
    
    if not any(TRAINING_IMAGES_PATH.ls()):
        print('Copying training images/masks..')
        for idx in train_idx:
            shutil.copy(IMAGES_PNG_PATH.ls()[idx],TRAINING_IMAGES_PATH)
    
        [[shutil.copy(mask,TRAINING_MASK_PATH) for mask in MASKS_PATH.ls() if img.stem == mask.stem] for img in TRAINING_IMAGES_PATH.ls()]            
 
    
    if not any(TEST_IMAGES_PATH.ls()):
        print('Copying test images/masks..')
        for idx in test_idx:
            shutil.copy(IMAGES_PNG_PATH.ls()[idx],TEST_IMAGES_PATH)
                    
        [[shutil.copy(mask,TEST_MASK_PATH) for mask in MASKS_PATH.ls() if img.stem == mask.stem] for img in TEST_IMAGES_PATH.ls()]            

    
    print('Verifying correctness of splitting..')
    if len(IMAGES_PNG_PATH.ls()) == len(TRAINING_IMAGES_PATH.ls()) + len(TEST_IMAGES_PATH.ls()):
        print('  OK: Total images equal as train and test images!')
    else:
        print('  NOK: Total images NOT EQUAL to train and test images!')

    if len(TRAINING_IMAGES_PATH.ls()) == len(TRAINING_MASK_PATH.ls()) and len(TEST_IMAGES_PATH.ls()) == len(TEST_MASK_PATH.ls()):
        print('  OK: Number of Train/Test Images corresponds with Train/Test Mask!')
    else:
        print('  OK: Number of Images DOES NOT corresponds with Train/Test Mask!')    
create_random_buckets(0.8)

Creating random sets..
Copying training images/masks..


since Python 3.9 and will be removed in a subsequent version.
  train_idx = set(random.sample(total_idx,training_amt))


Copying test images/masks..
Verifying correctness of splitting..
  OK: Total images equal as train and test images!
  OK: Number of Train/Test Images corresponds with Train/Test Mask!
