In [1]:
import os
import random
import shutil

In [2]:
def list_files(base_path):
    # loop over the directory structure
    for (rootDir, dirNames, filenames) in os.walk(base_path):
        # loop over the filenames in the current directory
        for filename in filenames:
            
            #check that the file is a png image
            ext = filename[filename.rfind("."):].lower()
            if ext.endswith('.png'):
                image_path = os.path.join(rootDir, filename)
                yield image_path

In [3]:
image_paths = list(list_files('cell_images/'))
print(image_paths[:10])

['cell_images/Parasitized/C118P79ThinF_IMG_20151002_105018_cell_150.png', 'cell_images/Parasitized/C189P150ThinF_IMG_20151203_142224_cell_84.png', 'cell_images/Parasitized/C91P52ThinF_IMG_20150821_123116_cell_189.png', 'cell_images/Parasitized/C84P45ThinF_IMG_20150818_101226_cell_98.png', 'cell_images/Parasitized/C144P105ThinF_IMG_20151015_163432_cell_310.png', 'cell_images/Parasitized/C136P97ThinF_IMG_20151005_141803_cell_131.png', 'cell_images/Parasitized/C65P26N_ThinF_IMG_20150818_154050_cell_177.png', 'cell_images/Parasitized/C60P21thinF_IMG_20150804_105034_cell_105.png', 'cell_images/Parasitized/C109P70ThinF_IMG_20150930_103811_cell_159.png', 'cell_images/Parasitized/C176P137NThinF_IMG_20151201_122708_cell_126.png']


In [4]:
random.seed(123)
random.shuffle(image_paths)

In [5]:
# compute the training and testing split
i = int(len(image_paths) * 0.8)
train_paths = image_paths[:i]
test_paths = image_paths[i:]
 
# we'll be using part of the training data for validation
i = int(len(train_paths) * 0.1)
val_paths = train_paths[:i]
train_paths = train_paths[i:]

In [6]:
datasets = [
    ("training", train_paths, 'cell_images/training'),
    ("validation", val_paths, 'cell_images/validation'), 
    ("testing", test_paths, 'cell_images/testing')
]

In [7]:
# loop over the datasets
for (dataset, image_paths, dataset_dir) in datasets:
    # show which data split we are creating
    print("Building '{}' split".format(dataset))

    # if the output base output directory does not exist, create it
    if not os.path.exists(dataset_dir):
        print("Creating {}' directory".format(dataset_dir))
        os.makedirs(dataset_dir)

    # loop over the input image paths
    for inputPath in image_paths:
        # extract the filename of the input image along with its
        # corresponding class label
        filename = inputPath.split(os.path.sep)[-1]
        label = inputPath.split(os.path.sep)[-2]

        # build the path to the label directory
        label_path = os.path.sep.join([dataset_dir, label])

        # if the label output directory does not exist, create it
        if not os.path.exists(label_path):
            print("Creating {}' directory".format(label_path))
            os.makedirs(label_path)

        # construct the path to the destination image and then copy
        # the image itself
        p = os.path.sep.join([label_path, filename])
        shutil.copy2(inputPath, p)

Building 'training' split
Creating cell_images/training' directory
Creating cell_images/training/Parasitized' directory
Creating cell_images/training/Uninfected' directory
Building 'validation' split
Creating cell_images/validation' directory
Creating cell_images/validation/Uninfected' directory
Creating cell_images/validation/Parasitized' directory
Building 'testing' split
Creating cell_images/testing' directory
Creating cell_images/testing/Uninfected' directory
Creating cell_images/testing/Parasitized' directory
