In [23]:
import tensorflow as tf
from keras.src.backend.jax.nn import sparse_categorical_crossentropy
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import VGG16
import tensorflow_datasets as tfds
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import optimizers
import numpy as np
import os



## Use the Cats & Dogs dataset: load a subset of the data
## Create a random bounding box for each image
## Define a model that is able to do both:
## 1. Classify the image as either cat or dog
## 2. Predict the bounding box that was generated at the beginning

### setup

In [24]:


# Directory holding the training images; the class is encoded in the file name.
data_dir = "data/train/"

img_size = 128
num_classes = 2
images = []  # image tensors, one entry per labelled file
labels = []  # class ids, aligned index-for-index with `images`

for fname in os.listdir(data_dir):
    if not fname.endswith(".jpg"):
        continue

    # Decide the label BEFORE loading the image, so that a file with an unclear
    # name is skipped entirely. Appending the image first would leave `images`
    # longer than `labels` and silently misalign every following sample.
    lowered = fname.lower()
    if "cat" in lowered:
        label = 0
    elif "dog" in lowered:
        label = 1
    else:
        continue  # neither cat nor dog: skip both the image and the label

    # Load, decode to 3 channels, resize to a square and cast to float32.
    img_path = os.path.join(data_dir, fname)
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (img_size, img_size))
    img = tf.cast(img, tf.float32)

    images.append(img)
    labels.append(label)

# Fail with a readable message instead of an opaque tf.stack error on an empty list.
if not images:
    raise FileNotFoundError(
        f"No .jpg files with 'cat' or 'dog' in their name were found in {data_dir!r}"
    )

# Convert the Python lists to tensors.
images = tf.stack(images)
labels = tf.convert_to_tensor(labels, dtype=tf.int32)







### Create a random bounding box in each image

In [25]:
# To create a random bounding box, pick two random x values and two random y values; the box is stored as [x_min, y_min, x_max, y_max], i.e. the minimum x and y first, then the maximum x and y.

def preprocess_bound_box(image, rng=None):
    """Normalize an image and draw a random bounding box for it.

    Args:
        image: Image tensor; it is resized to ``(img_size, img_size)`` and
            cast to float32 here.
        rng: Optional random source exposing ``uniform(low, high)``, e.g.
            ``np.random.default_rng(seed)``. Pass a seeded generator to get the
            same boxes on every run. Defaults to the global ``np.random``
            state, which is what this function used before the parameter
            existed.

    Returns:
        Tuple ``(image, boundbox)``: the resized image divided by 255.0 and a
        float32 NumPy array ``[x_min, y_min, x_max, y_max]`` in normalized
        coordinates.
    """
    sampler = np.random if rng is None else rng

    # Resize and scale the pixel values by 1/255 so the network sees a small range.
    image = tf.image.resize(image, (img_size, img_size))
    image = tf.cast(image, tf.float32) / 255.0

    # Minimums are drawn from [0, 0.5) and maximums from [0.5, 1.0), which
    # guarantees x_min < x_max and y_min < y_max without any extra check.
    # The draw order (x_min, y_min, x_max, y_max) is kept fixed so that a seeded
    # random source always produces the same box.
    x_min = sampler.uniform(0, 0.5)
    y_min = sampler.uniform(0, 0.5)

    x_max = sampler.uniform(0.5, 1.0)
    y_max = sampler.uniform(0.5, 1.0)

    boundbox = np.array([x_min, y_min, x_max, y_max], dtype=np.float32)

    return image, boundbox

# Run the preprocessing over every loaded image; each call yields an
# (image, bounding box) pair.
preprocessed_pairs = [preprocess_bound_box(raw_image) for raw_image in images]

# Split the pairs into two batched tensors: images and their boxes.
processed_images = tf.stack([pair[0] for pair in preprocessed_pairs])
boundboxes = tf.stack([pair[1] for pair in preprocessed_pairs])

# Quick sanity check of the resulting shapes and one sample box.
print("Processed images shape", processed_images.shape)
print("Boundboxes shape", boundboxes.shape)
print("Example Bound box:", boundboxes[0].numpy())





Processed images shape (4097, 128, 128, 3)
Boundboxes shape (4097, 4)
Example Bound box: [0.13846704 0.0780012  0.811932   0.8295029 ]


#### Interpretation

Processed images shape (4097, 128, 128, 3)
4097 is amount of images  in dataset
128, 128 is the height and width
3 is number of colour channels

Boundboxes shape (4097, 4)
4097 is the number of bounding boxes (1 per image)
4 each bounding box is defined by 4 coordinates

Example Bound box: [0.43754366 0.26004314 0.65264034 0.5635852 ]
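The values are ordered x_min, y_min, x_max, y_max, as fractions of the image width/height:
- x_min: left edge of the box (43.7%)
- y_min: top edge of the box (26.0%)
- x_max: right edge of the box (65.3%)
- y_max: bottom edge of the box (56.4%)

(These numbers differ from the printed output above because the boxes are drawn randomly on every run.)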


Model to classify the image and predict the bounding box that was generated
- Using vgg16 in this easy case

In [26]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, Model

IMG_SIZE = 128
NUM_CLASSES = 2

# Input tensor shared by the backbone and, through it, both heads.
inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# Frozen VGG16 convolutional base (ImageNet weights, no classifier top),
# wired directly onto `inputs`.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_tensor=inputs,
)
base_model.trainable = False

# Shared trunk on top of the backbone: flatten -> dense(512, relu) -> dropout(0.3).
x = base_model.output
for trunk_layer in (
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.3),
):
    x = trunk_layer(x)

# Two task heads reading the same features:
#   class_output - softmax over NUM_CLASSES classes
#   bbox_output  - four sigmoid units, one per box coordinate
class_output = layers.Dense(NUM_CLASSES, activation='softmax', name='class_output')(x)
bbox_output = layers.Dense(4, activation='sigmoid', name='bbox_output')(x)

# Assemble the multi-output model and print its layer summary.
model = Model(inputs=inputs, outputs=[class_output, bbox_output])
model.summary()


compile

In [27]:

# Per-head settings, keyed by the names of the model's output layers.
head_losses = {
    "class_output": "sparse_categorical_crossentropy",
    "bbox_output": "mse",
}
# Weight of each head's loss in the total loss: 20% classification, 80% box regression.
head_loss_weights = {
    "class_output": 0.2,
    "bbox_output": 0.8,
}
head_metrics = {
    "class_output": "accuracy",
    "bbox_output": "mse",
}

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=head_losses,
    loss_weights=head_loss_weights,
    metrics=head_metrics,
)



SyntaxError: invalid syntax. Perhaps you forgot a comma? (99886139.py, line 2)

prep dataset

In [None]:
# Pair every image with a dict of targets keyed by the model's output layer names.
dataset = tf.data.Dataset.from_tensor_slices(
    (processed_images, {'class_output': labels, 'bbox_output': boundboxes})
)

# Use a shuffle buffer that covers the whole dataset. The samples are stored in
# directory-listing order, so a smaller buffer (previously 1000) only mixes
# neighbouring files and can produce batches dominated by a single class.
# NOTE: the buffer holds every sample in memory; lower it if RAM is tight.
num_samples = int(processed_images.shape[0])
dataset = (
    dataset
    .shuffle(buffer_size=num_samples)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

train

In [None]:
# Number of full passes over the training dataset.
EPOCHS = 10

# Train both heads jointly; `history` keeps what model.fit returns.
history = model.fit(dataset, epochs=EPOCHS)

predict

In [None]:
# Slice with 0:1 (not [0]) to keep the batch dimension the model expects.
sample_img = processed_images[0:1]

# The Keras method is `predict`; `predicts` does not exist and raised
# AttributeError. The model has two outputs, so two arrays are unpacked,
# in the order the outputs were declared (class head, then box head).
class_pred, bbox_pred = model.predict(sample_img)

print("Class probabilities:", class_pred)
print("Predicted bounding box (normalized):", bbox_pred)