# Explore here


In [1]:
import torch

if torch.cuda.is_available():
    print("✅ GPU is available to Torch")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU name: {torch.cuda.get_device_name(1)}")
else:
    print("❌ GPU is NOT available to Torch")

✅ GPU is available to Torch
GPU name: NVIDIA GeForce RTX 3080 Ti
GPU name: NVIDIA GeForce RTX 3070


In [2]:
import os

USE_BOTH_GPUS = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from tensorflow.python.client import device_lib

print("🔍 TensorFlow build info:")
print(tf.sysconfig.get_build_info())

devices = device_lib.list_local_devices()
for d in devices:
    print(f"{d.name} - {d.device_type}")


2025-05-27 23:15:15.662803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748402117.427350     887 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748402117.931068     887 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748402119.533285     887 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748402119.533340     887 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748402119.533346     887 computation_placer.cc:177] computation placer alr

🔍 TensorFlow build info:
OrderedDict([('cpu_compiler', '/usr/lib/llvm-18/bin/clang'), ('cuda_compute_capabilities', ['sm_60', 'sm_70', 'sm_80', 'sm_89', 'compute_90']), ('cuda_version', '12.5.1'), ('cudnn_version', '9'), ('is_cuda_build', True), ('is_rocm_build', False), ('is_tensorrt_build', False)])
/device:CPU:0 - CPU
/device:GPU:0 - GPU
/device:GPU:1 - GPU


I0000 00:00:1748402274.474264     887 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1748402274.484431     887 gpu_device.cc:2019] Created device /device:GPU:0 with 9446 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1748402274.492488     887 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1748402274.493140     887 gpu_device.cc:2019] Created device /device:GPU:1 with 5490 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:02:00.0, compute capability: 8.6


In [3]:
assert tf.config.list_physical_devices("GPU"), "❌ No GPU detected by TensorFlow"


## Image loading and preprocessing

In [4]:
import os

USE_BOTH_GPUS = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

In [5]:
from tensorflow.keras import mixed_precision

IMAGE_SIZE = 200  # allows choosing smaller than 200 to deal with memory constraints
BATCH_SIZE = 16
DATA_SUBSET = 10000  # allows choosing fewer than entirety of available photos for less memory & faster (albeit underfitting) training
TRAIN_DIR = "../data/raw/train/"
TEST_DIR = "../data/raw/test1/"

mixed_precision.set_global_policy("mixed_float16")

In [6]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf

# prevent TensorFlow from preallocating all memory up front:
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


In [7]:
import os
import numpy as np
import cv2
import pandas as pd
from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Directory with images
filenames = sorted(os.listdir(TRAIN_DIR))[:DATA_SUBSET]
labels = [0 if fname.startswith("dog") else 1 for fname in filenames]

# Full paths
image_paths = [os.path.join(TRAIN_DIR, fname) for fname in filenames]

# Create TensorFlow Dataset
path_ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))


def list_images_and_labels(directory, limit=None):
    filenames = sorted(os.listdir(directory))
    if limit:
        filenames = filenames[:limit]
    paths = [os.path.join(directory, fname) for fname in filenames]
    labels = [0 if fname.startswith("dog") else 1 for fname in filenames]
    return paths, labels


train_paths, train_labels = list_images_and_labels(TRAIN_DIR, limit=DATA_SUBSET)
test_paths, test_labels = list_images_and_labels(
    TEST_DIR, limit=int(DATA_SUBSET * 0.25)
)


# Load, decode, resize, normalize images
def process_image(path, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])
    image = tf.cast(image, tf.float32) / 255.0
    return image, tf.one_hot(label, depth=2)


# create datasets
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    .map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(buffer_size=len(train_paths))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

valid_ds = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
    .map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Map preprocessing function
# ds = path_ds.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)


I0000 00:00:1748402278.170218     887 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9446 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1748402278.170292     887 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 5490 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:02:00.0, compute capability: 8.6


In [8]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if USE_BOTH_GPUS else "0"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


def train_cat_dog_model(
    train_generator,
    valid_generator,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    num_classes=2,
    epochs=30,
):
    # Strategy for multi-GPU support
    strategy = tf.distribute.MirroredStrategy()
    print(f"✅ Using {strategy.num_replicas_in_sync} GPU(s)")

    with strategy.scope():
        model = tf.keras.Sequential(
            [
                tf.keras.layers.Input(shape=input_shape),
                tf.keras.layers.Conv2D(64, 3, activation="relu", padding="same"),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Conv2D(128, 3, activation="relu", padding="same"),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(2, activation="softmax"),
            ]
        )
        model.compile(
            optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
        )

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            "checkpoints/best_model.keras", save_best_only=True, monitor="val_accuracy"
        ),
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    ]

    history = model.fit(
        train_ds, validation_data=valid_ds, epochs=epochs, callbacks=callbacks
    )
    return model, history


In [9]:
model, history = train_cat_dog_model(
    train_generator=train_ds,
    valid_generator=valid_ds,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    epochs=10,
)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
✅ Using 2 GPU(s)


2025-05-27 23:18:19.210091: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 4346 of 10000
2025-05-27 23:18:29.218033: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 8740 of 10000
2025-05-27 23:18:32.707753: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

2025-05-27 23:18:48.075208: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 6468 of 10000
2025-05-27 23:18:52.184457: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.
I0000 00:00:1748402332.989620    6500 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1748402332.989623    6499 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - accuracy: 1.0000 - loss: 0.0069

2025-05-27 23:21:03.116110: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-05-27 23:21:03.116174: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2025-05-27 23:21:03.116858: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2025-05-27 23:21:15.256249: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 232ms/step - accuracy: 1.0000 - loss: 0.0069 - val_accuracy: 1.0000 - val_loss: 1.1921e-07
Epoch 2/10


2025-05-27 23:21:58.639158: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 2308 of 10000
2025-05-27 23:22:18.288577: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 3610 of 10000
2025-05-27 23:22:28.282605: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 5702 of 10000
2025-05-27 23:22:46.109910: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 1.0000 - loss: 1.1921e-07

2025-05-27 23:24:30.517709: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 167ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07
Epoch 3/10


2025-05-27 23:24:41.123606: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 2642 of 10000
2025-05-27 23:24:52.055255: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 3903 of 10000
2025-05-27 23:25:11.091399: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 5495 of 10000
2025-05-27 23:25:22.212203: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 7251 of 10000
2025-05-27 23:25:36.406018: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 167ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07
Epoch 4/10


2025-05-27 23:27:31.571979: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 5532 of 10000
2025-05-27 23:27:40.438699: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 1.0000 - loss: 1.1921e-07

2025-05-27 23:29:24.006086: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 166ms/step - accuracy: 1.0000 - loss: 1.1921e-07 - val_accuracy: 1.0000 - val_loss: 1.1921e-07


In [None]:
from tensorflow.keras.models import load_model

best_model = load_model("checkpoints/best_model.keras")
results = best_model.evaluate(valid_ds)
print(f"Validation accuracy: {results[1]:.4f}")


# playing around with pretrained VGG-16 and layering on top of it

In [10]:
from tensorflow.keras.applications import VGG16

# Load VGG-16 without the top (fully connected) layers
base_model = VGG16(
    weights="imagenet",  # Use pre-trained weights from ImageNet
    include_top=False,  # Drop the classifier head
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)

# Freeze all layers so we don't train them (yet)
for layer in base_model.layers:
    layer.trainable = False

# Confirm the number of layers and their trainable status
print(f"Total layers: {len(base_model.layers)}")
print(f"Trainable layers: {sum([layer.trainable for layer in base_model.layers])}")


Total layers: 19
Trainable layers: 0


Classifier put on top of pretrained VGG-16 base:
Flatten final convolutional features
Use a dense layer or two for learning dataset-specific patterns
end in a softmax because we're doing binary dog-or-cat classification

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.models import Model

# Define the new head
head_model = Sequential(
    [
        Flatten(input_shape=base_model.output_shape[1:]),
        Dense(256, activation="relu"),
        Dropout(0.5),
        Dense(2, activation="softmax"),  # Two classes: dog and cat
    ]
)

# Combine base + head into one model
vgg_based_model = Model(inputs=base_model.input, outputs=head_model(base_model.output))

# Print summary
vgg_based_model.summary()


  super().__init__(**kwargs)


In [None]:
vgg_based_model.compile(
    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
)

history = vgg_based_model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=10,
    callbacks=[ModelCheckpoint(...), EarlyStopping(...)],
)


### Evaluate VGG-based model

In [None]:
from tensorflow.keras.models import load_model

best_model = load_model("checkpoints/best_model.keras")
results = best_model.evaluate(valid_ds)
print(f"Validation accuracy: {results[1]:.4f}")


In [None]:
vgg_based_model.save("models/final_model.keras")