In [1]:
# TO RUN, MAKE SURE YOU HAV ~ 9Gb of RAM AVALABLE TO USE!!!

import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tqdm import tqdm

# Enable mixed precision
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

print("GPU Available: ", tf.config.list_physical_devices('GPU'))

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))
x = x.astype('float32') / 255.0

# Move data to GPU
x = tf.constant(x, dtype=tf.float32)
y = tf.constant(y, dtype=tf.int32)

@tf.function
def generate_samples(images, labels, num_samples):
    idx1 = tf.random.uniform([num_samples], 0, tf.shape(images)[0], dtype=tf.int32)
    idx2 = tf.random.uniform([num_samples], 0, tf.shape(images)[0], dtype=tf.int32)

    image1, image2 = tf.gather(images, idx1), tf.gather(images, idx2)
    label1, label2 = tf.gather(labels, idx1), tf.gather(labels, idx2)

    alpha = tf.random.uniform([num_samples, 1, 1], 0, 1)

    new_images = alpha * image1 + (1 - alpha) * image2
    new_labels = tf.cast(tf.round(alpha[:, 0, 0] * tf.cast(label1, tf.float32) + (1 - alpha[:, 0, 0]) * tf.cast(label2, tf.float32)), tf.int32)

    # Random shift
    shift = tf.random.uniform([num_samples, 2], -2, 3, dtype=tf.int32)
    new_images = tf.map_fn(lambda x: tf.roll(x[0], x[1], [0, 1]), (new_images, shift), fn_output_signature=tf.float32)

    # Random noise
    noise = tf.random.normal(tf.shape(new_images), mean=0.0, stddev=0.05)
    new_images = new_images + noise

    new_images = tf.clip_by_value(new_images, 0, 1)

    return new_images, new_labels

def generate_and_save_dataset(num_samples, batch_size=8192, base_dir="MNIST8M"):
    train_dir = os.path.join(base_dir, "train")
    test_dir = os.path.join(base_dir, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    num_batches = (num_samples + batch_size - 1) // batch_size
    samples_generated = 0

    for batch in tqdm(range(num_batches), desc="Generating and saving batches"):
        current_batch_size = min(batch_size, num_samples - samples_generated)

        new_images, new_labels = generate_samples(x, y, current_batch_size)

        # Split into train and test (70-30 split)
        split_idx = int(0.7 * current_batch_size)
        train_x, test_x = new_images[:split_idx], new_images[split_idx:]
        train_y, test_y = new_labels[:split_idx], new_labels[split_idx:]

        # Save train data
        np.save(os.path.join(train_dir, f"x_train_{batch}.npy"), train_x.numpy())
        np.save(os.path.join(train_dir, f"y_train_{batch}.npy"), train_y.numpy())

        # Save test data
        np.save(os.path.join(test_dir, f"x_test_{batch}.npy"), test_x.numpy())
        np.save(os.path.join(test_dir, f"y_test_{batch}.npy"), test_y.numpy())

        samples_generated += current_batch_size

    print("Data generation and saving complete.")
    print(f"Total samples generated: {samples_generated}")

# Generate and save the dataset
num_samples = 8000000
batch_size = 10000  # Adjust this based on your GPU memory

generate_and_save_dataset(num_samples, batch_size)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


GPU Available:  []
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


Generating and saving batches: 100%|██████████| 800/800 [10:51<00:00,  1.23it/s]

Data generation and saving complete.
Total samples generated: 8000000





In [5]:
!pip install tqdm



In [None]:
import os
import numpy as np
from tqdm import tqdm

def load_generated_dataset(base_dir="MNIST8M", batch_size=8000):
    train_dir = os.path.join(base_dir, "train")
    test_dir = os.path.join(base_dir, "test")

    def load_data(directory, prefix):
        files = sorted([f for f in os.listdir(directory) if f.startswith(prefix)])
        total_samples = sum(os.path.getsize(os.path.join(directory, f)) for f in files) // (28 * 28 * 4)  # Estimate total samples

        data_list = []
        with tqdm(total=total_samples, desc=f"Loading {prefix}", unit="samples") as pbar:
            for file in files:
                data = np.load(os.path.join(directory, file))
                data_list.append(data)
                pbar.update(len(data))

        return np.concatenate(data_list)

    x_train = load_data(train_dir, "x_train")
    y_train = load_data(train_dir, "y_train")
    x_test = load_data(test_dir, "x_test")
    y_test = load_data(test_dir, "y_test")

    return x_train, y_train, x_test, y_test

# Load the dataset
x_train, y_train, x_test, y_test = load_generated_dataset()

# Print shapes to verify
print("\nDataset loaded successfully.")
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

Loading x_train:  69%|██████▊   | 3843000/5600032 [01:12<00:40, 42890.15samples/s]