In [16]:
# Importing all the necessary libraries

import os
import random
import numpy as np
import tensorflow as tf
import xml.etree.ElementTree as ET
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split

In [17]:
#Assigning the paths of the directories

DATASET_PATH = '/content/drive/MyDrive/Annotated-PascalVOCxml'
ANNOTATIONS_PATH = '/content/drive/MyDrive/Annotated-PascalVOCxml/Annotations'
IMAGES_PATH = '/content/drive/MyDrive/Annotated-PascalVOCxml/PNGImages'
MODEL_PATH = "/content/drive/MyDrive/ssd_model_voc-Generated.h5"

In [18]:
IMAGE_SIZE = (300, 300)
BATCH_SIZE = 8
NUM_CLASSES = 5 # As I have annoted the dataset into 5 classes

In [19]:
# Load pretrained model - MobileNetV2

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(300, 300, 3))


  base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(300, 300, 3))


In [20]:
#model feature extraction

x = Flatten()(base_model.output)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.3)(x)

In [21]:
# This is for multi-object detection output

bbox_output = Dense(NUM_CLASSES * 4, activation='linear', name="bounding_box")(x)
bbox_output = Reshape((NUM_CLASSES, 4))(bbox_output)


In [22]:
# Define our SSD model

model = Model(inputs=base_model.input, outputs=bbox_output)
model.compile(optimizer=Adam(learning_rate=0.0001), loss="mse")


In [23]:
# To parse PASCAL VOC annotations

def parse_voc_annotations(annotations_path, images_path):
    """ Parses PASCAL VOC XML annotations and returns image-label pairs. """
    data = []

    for xml_file in os.listdir(annotations_path):
        xml_path = os.path.join(annotations_path, xml_file)
        image_name = xml_file.replace('.xml', '')

        # Find corresponding image file

        image_file = None
        for ext in ['.jpg', '.jpeg', '.png']:
            if os.path.exists(os.path.join(images_path, image_name + ext)):
                image_file = image_name + ext
                break

        # Skip if no matching image
        if not image_file:
            continue

        image_path = os.path.join(images_path, image_file)

        # Parse XML for bounding boxes

        tree = ET.parse(xml_path)
        root = tree.getroot()
        objects = []

        size = root.find('size')
        img_width = int(size.find('width').text)
        img_height = int(size.find('height').text)

        for obj in root.findall('object'):
            label = obj.find('name').text.strip().lower()
            bbox = obj.find('bndbox')
            xmin, ymin, xmax, ymax = (
                int(bbox.find('xmin').text) / img_width,
                int(bbox.find('ymin').text) / img_height,
                int(bbox.find('xmax').text) / img_width,
                int(bbox.find('ymax').text) / img_height
            )
            objects.append((label, (xmin, ymin, xmax, ymax)))

        data.append((image_path, objects))
    return data


In [24]:
# Load dataset

dataset = parse_voc_annotations(ANNOTATIONS_PATH, IMAGES_PATH)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)


In [25]:
# Data generator

def data_generator(dataset, batch_size=8):
    while True:
        random.shuffle(dataset)
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i + batch_size]
            images, targets = [], []

            for image_path, objects in batch:
                img = load_img(image_path, target_size=IMAGE_SIZE)
                img_array = img_to_array(img) / 255.0

                # Encode multiple objects per image
                bbox_target = np.zeros((NUM_CLASSES, 4))
                for j, obj in enumerate(objects[:NUM_CLASSES]):
                    _, (xmin, ymin, xmax, ymax) = obj
                    bbox_target[j] = [xmin, ymin, xmax, ymax]

                images.append(img_array)
                targets.append(bbox_target)

            yield np.array(images), np.array(targets)

In [26]:
# Train SSD model

history = model.fit(
    data_generator(train_data, BATCH_SIZE),
    steps_per_epoch=len(train_data) // BATCH_SIZE,
    validation_data=data_generator(val_data, BATCH_SIZE),
    validation_steps=len(val_data) // BATCH_SIZE,
    epochs=30
)


Epoch 1/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 5s/step - loss: 6.9473 - val_loss: 0.7945
Epoch 2/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 5s/step - loss: 4.9876 - val_loss: 0.1838
Epoch 3/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 5s/step - loss: 0.4153 - val_loss: 0.1768
Epoch 4/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 5s/step - loss: 0.2287 - val_loss: 0.1572
Epoch 5/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 10s/step - loss: 0.1721 - val_loss: 0.1596
Epoch 6/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 5s/step - loss: 0.1473 - val_loss: 0.1459
Epoch 7/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 5s/step - loss: 0.1401 - val_loss: 0.1754
Epoch 8/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 5s/step - loss: 0.1416 - val_loss: 0.1376
Epoch 9/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [29]:
# Save model

model.save(MODEL_PATH)
print(f"SSD Model saved at {MODEL_PATH}")




SSD Model saved at /content/drive/MyDrive/ssd_model_voc-Generated.h5


In [30]:
# Model Evaluation

eval_results = model.evaluate(data_generator(val_data, BATCH_SIZE), steps=len(val_data) // BATCH_SIZE)
print(f"Validation Loss: {eval_results}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 609ms/step - loss: 0.0782
Validation Loss: 0.07690971344709396
