In [7]:
import tensorflow as tf
from tensorflow.keras.datasets import cifar100
from sklearn.model_selection import train_test_split
import numpy as np
import os

In [2]:

def prepare_dataset(batch_size=32, test_size=0.2, seed=42, crop_size=28):
    """
    Build batched tf.data pipelines for CIFAR-100 training and testing.

    The official CIFAR-100 train and test arrays are merged and re-split with
    scikit-learn, so ``test_size`` controls the final proportion. The arrays
    themselves are held in memory; flipping, cropping, normalization and
    batching are applied on the fly by the tf.data pipeline.

    Args:
        batch_size (int): The number of samples per batch.
        test_size (float): Fraction of data to reserve for testing.
        seed (int): Random seed used for the split and for the shuffle order.
        crop_size (int | None): Side length of the random square crop applied
            to training images. Pass ``None`` to skip cropping.

    Returns:
        train_dataset (tf.data.Dataset): Shuffled, augmented, normalized and
            batched training pipeline.
        test_dataset (tf.data.Dataset): Normalized and batched test pipeline
            (no shuffling, no augmentation).

    Note:
        Test images are never cropped. With the default ``crop_size=28`` the
        training batches are 28x28x3 while the test batches keep the size of
        the loaded images; pass ``crop_size=None`` when the model needs both
        datasets to share one input shape.
    """
    # Load CIFAR-100 data
    (x_train_full, y_train_full), (x_test, y_test) = cifar100.load_data()

    # Merge with NumPy rather than tf.concat: train_test_split selects rows
    # with integer index arrays, which TensorFlow eager tensors do not support.
    x_full = np.concatenate([x_train_full, x_test], axis=0)
    y_full = np.concatenate([y_train_full, y_test], axis=0)

    # Split into train and test
    x_train, x_test, y_train, y_test = train_test_split(
        x_full, y_full, test_size=test_size, random_state=seed
    )

    # Normalization transformation: scale pixel values into [0, 1] floats
    def normalize(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        return image, label

    # Augmentation transformation for training data
    def augment(image, label):
        image = tf.image.random_flip_left_right(image)
        # crop_size is a plain Python value, so this branch is resolved once
        # when the map function is traced, not per element.
        if crop_size is not None:
            image = tf.image.random_crop(image, size=[crop_size, crop_size, 3])
        return normalize(image, label)

    # Training pipeline: shuffle over the whole split, augment, batch, prefetch
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(buffer_size=len(x_train), seed=seed)
        .map(augment, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Test pipeline: deterministic order, normalization only
    test_dataset = (
        tf.data.Dataset.from_tensor_slices((x_test, y_test))
        .map(normalize, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    return train_dataset, test_dataset

In [8]:
def save_dataset(save_dir='cifar100_dataset'):
    """
    Save the CIFAR-100 dataset locally as four ``.npy`` files.

    The files ``x_train.npy``, ``y_train.npy``, ``x_test.npy`` and
    ``y_test.npy`` are written into ``<cwd>/data/<save_dir>``. The directory
    is created if it does not exist.

    Args:
        save_dir (str): Directory name, resolved under ``<cwd>/data``. An
            absolute path is used as-is, because ``os.path.join`` discards
            the preceding components when a later one is absolute.
    """
    # Resolve the destination once so the directory that gets created is the
    # same one the arrays are written to (previously the files were written
    # to a different, never-created path).
    target_dir = os.path.join(os.path.abspath(os.getcwd()), "data", save_dir)

    # Create directory if it does not exist
    os.makedirs(target_dir, exist_ok=True)

    # Load CIFAR-100 dataset
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()

    # Save data as numpy arrays
    arrays = {
        'x_train.npy': x_train,
        'y_train.npy': y_train,
        'x_test.npy': x_test,
        'y_test.npy': y_test,
    }
    for filename, array in arrays.items():
        np.save(os.path.join(target_dir, filename), array)

    # Report the real location so the files can be found afterwards
    print(f"Dataset saved in directory: {target_dir}")

In [None]:
# Export CIFAR-100 to disk using the default directory name; the first run
# downloads the archive, which can take a long time on a slow connection.
save_dataset()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
[1m 79028224/169001437[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m1:07:38[0m 45us/step