# Explore here


In [None]:
import torch

if torch.cuda.is_available():
    print("✅ GPU is available to Torch")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU name: {torch.cuda.get_device_name(1)}")
else:
    print("❌ GPU is NOT available to Torch")

✅ GPU is available to Torch
GPU name: NVIDIA GeForce RTX 3080 Ti
GPU name: NVIDIA GeForce RTX 3070


In [None]:
import os

USE_BOTH_GPUS = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from tensorflow.python.client import device_lib

print("🔍 TensorFlow build info:")
print(tf.sysconfig.get_build_info())

devices = device_lib.list_local_devices()
for d in devices:
    print(f"{d.name} - {d.device_type}")


2025-05-24 04:51:35.843958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748076697.146029   46070 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748076697.480535   46070 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748076700.174584   46070 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748076700.174623   46070 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748076700.174625   46070 computation_placer.cc:177] computation placer alr

🔍 TensorFlow build info:
OrderedDict([('cpu_compiler', '/usr/lib/llvm-18/bin/clang'), ('cuda_compute_capabilities', ['sm_60', 'sm_70', 'sm_80', 'sm_89', 'compute_90']), ('cuda_version', '12.5.1'), ('cudnn_version', '9'), ('is_cuda_build', True), ('is_rocm_build', False), ('is_tensorrt_build', False)])
/device:CPU:0 - CPU
/device:GPU:0 - GPU
/device:GPU:1 - GPU


I0000 00:00:1748076791.906539   46070 gpu_device.cc:2019] Created device /device:GPU:0 with 9446 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1748076791.915298   46070 gpu_device.cc:2019] Created device /device:GPU:1 with 5490 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:02:00.0, compute capability: 8.6


In [3]:
assert tf.config.list_physical_devices("GPU"), "❌ No GPU detected by TensorFlow"


## Image loading and preprocessing

In [1]:
import os

USE_BOTH_GPUS = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

In [2]:
from tensorflow.keras import mixed_precision

IMAGE_SIZE = 200  # allows choosing smaller than 200 to deal with memory constraints
BATCH_SIZE = 16
DATA_SUBSET = 10000  # allows choosing fewer than entirety of available photos for less memory & faster (albeit underfitting) training
TRAIN_DIR = "../data/raw/train/"
TEST_DIR = "../data/raw/test1/"

mixed_precision.set_global_policy("mixed_float16")

2025-05-24 06:18:01.454959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748081882.206803  106228 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748081882.529011  106228 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748081885.494893  106228 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748081885.494934  106228 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748081885.494937  106228 computation_placer.cc:177] computation placer alr

In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf

# prevent TensorFlow from preallocating all memory up front:
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


In [4]:
import os
import numpy as np
import cv2
import pandas as pd
from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Directory with images
filenames = sorted(os.listdir(TRAIN_DIR))[:DATA_SUBSET]
labels = [0 if fname.startswith("dog") else 1 for fname in filenames]

# Full paths
image_paths = [os.path.join(TRAIN_DIR, fname) for fname in filenames]

# Create TensorFlow Dataset
path_ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))


def list_images_and_labels(directory, limit=None):
    filenames = sorted(os.listdir(directory))
    if limit:
        filenames = filenames[:limit]
    paths = [os.path.join(directory, fname) for fname in filenames]
    labels = [0 if fname.startswith("dog") else 1 for fname in filenames]
    return paths, labels


train_paths, train_labels = list_images_and_labels(TRAIN_DIR, limit=DATA_SUBSET)
test_paths, test_labels = list_images_and_labels(
    TEST_DIR, limit=int(DATA_SUBSET * 0.25)
)


# Load, decode, resize, normalize images
def process_image(path, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])
    image = tf.cast(image, tf.float32) / 255.0
    return image, tf.one_hot(label, depth=2)


# create datasets
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    .map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(buffer_size=len(train_paths))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

valid_ds = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
    .map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Map preprocessing function
# ds = path_ds.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)


I0000 00:00:1748081942.146238  106228 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1748081942.150537  106228 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9536 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1748081942.158025  106228 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1748081942.158357  106228 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 5564 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:02:00.0, compute capability: 8.6


In [5]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


def train_cat_dog_model(
    train_generator,
    valid_generator,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    num_classes=2,
    epochs=30,
):
    # Strategy for multi-GPU support
    strategy = tf.distribute.MirroredStrategy()
    print(f"✅ Using {strategy.num_replicas_in_sync} GPU(s)")

    with strategy.scope():
        model = tf.keras.Sequential(
            [
                tf.keras.layers.Input(shape=input_shape),
                tf.keras.layers.Conv2D(64, 3, activation="relu", padding="same"),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Conv2D(128, 3, activation="relu", padding="same"),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(2, activation="softmax"),
            ]
        )
        model.compile(
            optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
        )

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            "checkpoints/best_model.keras", save_best_only=True, monitor="val_accuracy"
        ),
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    ]

    history = model.fit(
        train_ds, validation_data=valid_ds, epochs=epochs, callbacks=callbacks
    )
    return model, history


In [6]:
model, history = train_cat_dog_model(
    train_generator=train_ds,
    valid_generator=valid_ds,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    epochs=10,
)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
✅ Using 2 GPU(s)


2025-05-24 06:19:18.582262: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 4995 of 10000


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2025-05-24 06:19:17.738259: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1/10
INFO:tensorflow:Collective all_reduce tensors: 8 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


2025-05-24 06:19:33.611867: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 7266 of 10000
2025-05-24 06:19:29.973697: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.
I0000 00:00:1748081970.289589  107121 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1748081970.288927  107124 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m608/625[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m2s[0m 152ms/step - accuracy: 0.9971 - loss: 0.0079

2025-05-24 06:21:09.793617: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-05-24 06:21:09.793695: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2025-05-24 06:21:09.794366: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2025-05-24 06:21:14.107348: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 162ms/step - accuracy: 0.9972 - loss: 0.0077 - val_accuracy: 1.0000 - val_loss: 1.1921e-07
Epoch 2/10


2025-05-24 06:21:33.842860: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 4832 of 10000
2025-05-24 06:21:32.625284: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 151ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07


2025-05-24 06:23:07.185223: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


Epoch 3/10


2025-05-24 06:23:19.004468: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 5606 of 10000
2025-05-24 06:23:17.313068: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 152ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07
Epoch 4/10


2025-05-24 06:25:04.203467: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 3488 of 10000
2025-05-24 06:25:05.367930: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 156ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07


2025-05-24 06:26:42.752805: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
