Implement a utility function using TensorFlow's tf.data API to split image data into training, validation, and test sets from a directory.

In [1]:
import tensorflow as tf
import os

In [6]:
# Fetch the cats-vs-dogs archive (extracted on download) and locate its 'train' folder
dataset_url = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip"
path_to_zip = tf.keras.utils.get_file(
    'cats_and_dogs.zip',
    origin=dataset_url,
    extract=True,
)
# data_dir is built relative to the directory that holds the downloaded file
data_dir = os.path.join(
    os.path.dirname(path_to_zip),
    'cats_and_dogs_filtered',
    'train',
)

In [None]:
#Function to create split datasets
def create_split_datasets(data_dir,img_size=(160,160),batch_size=32,val_split=0.2,test_split=0.1,seed=123):
    """Build train, validation and test ``tf.data`` datasets from an image directory.

    The directory is loaded twice with ``tf.keras.utils.image_dataset_from_directory``
    using the same ``seed`` and ``validation_split=val_split + test_split``: once
    for the ``'training'`` subset and once for the ``'validation'`` (hold-out)
    subset. The hold-out subset is then divided, at batch granularity, into
    validation and test parts with ``take`` / ``skip``.

    Args:
        data_dir: Directory passed to ``image_dataset_from_directory``.
        img_size: ``(height, width)`` forwarded as ``image_size``.
        batch_size: Batch size forwarded to the loader.
        val_split: Fraction of all files intended for validation.
        test_split: Fraction of all files intended for testing.
        seed: Seed forwarded to both loader calls; it must be identical for the
            two calls so the training and hold-out subsets stay complementary.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple of datasets.

    Raises:
        ValueError: If a fraction is negative or ``val_split + test_split`` is
            not strictly between 0 and 1.

    Note:
        The hold-out images are read once inside this function and kept in an
        in-memory cache (see the comment in the body for why).
    """
    if val_split < 0 or test_split < 0:
        raise ValueError(
            f"val_split and test_split must be non-negative, got {val_split} and {test_split}"
        )
    holdout_split = val_split + test_split
    if not 0 < holdout_split < 1:
        raise ValueError(
            f"val_split + test_split must be strictly between 0 and 1, got {holdout_split}"
        )

    # Arguments shared by both loader calls; only `subset` differs.
    loader_kwargs = dict(
        image_size=img_size,
        batch_size=batch_size,
        validation_split=holdout_split,
        seed=seed,
    )
    train_dataset = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset='training', **loader_kwargs
    )
    holdout_dataset = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset='validation', **loader_kwargs
    )

    # Read the batch count before wrapping the dataset in further transformations.
    holdout_batches = len(holdout_dataset)

    # image_dataset_from_directory shuffles by default and the returned pipeline
    # re-shuffles on every iteration. Without freezing the order, take() and
    # skip() would each see a differently shuffled stream, so validation and
    # test would overlap. Caching and reading the dataset through once pins a
    # single order that every later iteration replays.
    holdout_dataset = holdout_dataset.cache()
    for _ in holdout_dataset:
        pass

    # The small epsilon guards against float artefacts such as
    # 0.3 / (0.3 + 0.1) * 4 == 2.9999999999999996 being truncated to 2.
    val_batches = int(val_split / holdout_split * holdout_batches + 1e-9)

    val_dataset = holdout_dataset.take(val_batches)
    test_dataset = holdout_dataset.skip(val_batches)

    return train_dataset, val_dataset, test_dataset

In [8]:
#Generate datasets
#Use the data_dir resolved from the downloaded archive instead of a path relative
#to the current working directory, so the cell works wherever the notebook is run
train_ds,val_ds,test_ds=create_split_datasets(data_dir)

Found 2000 files belonging to 2 classes.
Using 1400 files for training.
Found 2000 files belonging to 2 classes.
Using 600 files for validation.


In [9]:
# Report how many batches each split contains
for split_label, split_ds in (
    ('Train:', train_ds),
    ('Validation:', val_ds),
    ('Test:', test_ds),
):
    print(split_label, len(split_ds))

Train: 44
Validation: 12
Test: 7
