<a href="https://colab.research.google.com/github/Ibrah-N/Deep-Learning-Projects-Computer-Vision/blob/main/dl_18_yolo_object_detection_model_scratch_with_yolo_custom_loss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import random
import shutil


import tensorflow as tf

## Preprocessing Data

### Download Dataset

In [3]:
# install kaggle

# !pip install -q kaggle

In [4]:
# copy the kaggle.json to kaggle

# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
# donwload kaggle dataset

# !kaggle datasets download -d huanghanchina/pascal-voc-2012
# !unzip pascal-voc-2012.zip -d dataset

In [3]:
# Copy the dataset to drive

# !cp -r /content/pascal-voc-2012.zip /content/drive/MyDrive/


!unzip -q /content/drive/MyDrive/pascal-voc-2012.zip -d dataset

### Configuration

In [38]:
DATASET_IMAGES = "/content/dataset/VOC2012/JPEGImages/"
DATASET_MAPS = "/content/dataset/VOC2012/Annotations/"



CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
           'dog', 'horse', 'motorbike', 'person', 'pttedplant', 'sheep', 'sofa', 'train', 'tvmonitor']



B = 2
N_CLASSES = len(CLASSES)
OBJ_IND = N_CLASSES+5*B
H, W = 224, 224
SPLIT_SIZE = int(H/32)
N_EPOCHS = 25
LR = 0.00001
BATCH_SIZE = 32

In [39]:
!mkdir /content/dataset/VOC2012/VAL_JPEGImages/
!mkdir /content/dataset/VOC2012/VAL_Annotations/

mkdir: cannot create directory ‘/content/dataset/VOC2012/VAL_JPEGImages/’: File exists
mkdir: cannot create directory ‘/content/dataset/VOC2012/VAL_Annotations/’: File exists


In [40]:
VAL_IMAGES = "/content/dataset/VOC2012/VAL_JPEGImages/"
VAL_MAPS = "/content/dataset/VOC2012/VAL_Annotations/"



val_list = [
    "2007_000027", "2007_000032", "2007_000033", "2007_000123", "2007_000559", "2007_000664", "2007_000676",
    "2007_000584", "2007_001073", "2007_001154", "2007_000925", "2007_001761", "2007_001724", "2007_001763",
    "2007_002445", "2007_002462", "2007_002488", "2007_002539", "2007_002545", "2007_002387", "2007_002400",
]

In [41]:
# move the validattion images
# and xmls into val directory

for val in val_list:
  shutil.move(DATASET_IMAGES+val+".jpg", VAL_IMAGES)


for val in val_list:
  shutil.move(DATASET_MAPS+val+".xml", VAL_MAPS)

Error: Destination path '/content/dataset/VOC2012/VAL_JPEGImages/2007_000027.jpg' already exists

In [42]:
# parse the xml tree elements

def parse_xml(filename):
  tree = ET.parse(filename)
  root = tree.getroot()
  size_tree = root.find('size')
  width = int(size_tree.find('width').text)
  height = int(size_tree.find('height').text)


  bounding_boxes = []
  for obj in root.findall('object'):
    for box in obj.iter('bndbox'):
      xmin = int(box.find('xmin').text)
      ymin = int(box.find('ymin').text)
      xmax = int(box.find('xmax').text)
      ymax = int(box.find('ymax').text)
      break


    class_name = obj.find('name').text
    class_dict = {CLASSES[i]:i for i in range(len(CLASSES))}
    box = [
        (xmin+xmax)/(2*width), (ymin+ymax)/(2*height),
        (xmax-xmin)/(width), (ymax-ymin)/(height),
        class_dict[class_name]
    ]
    bounding_boxes.append(box)


  return np.array(bounding_boxes, dtype=np.float32)

In [43]:
boundingbox = parse_xml(DATASET_MAPS+"2007_000061.xml")
boundingbox

array([[0.711     , 0.43543544, 0.326     , 0.8048048 , 3.        ],
       [0.465     , 0.6996997 , 0.194     , 0.11411411, 3.        ]],
      dtype=float32)

In [44]:
# def output labels

def output_labels(bbox):
  labels = np.zeros((SPLIT_SIZE,SPLIT_SIZE, int(N_CLASSES+5)))

  for b in range(len(bbox)):
    print(bbox)
    grid_x = bbox[...,b, 0] * SPLIT_SIZE
    grid_y = bbox[...,b, 1] * SPLIT_SIZE
    i = int(grid_x)
    j = int(grid_y)

    labels[i, j, 0:5] = [1, grid_x%1, grid_y%1, bbox[...,b, 2], bbox[...,b, 3]]
    labels[i, j, 5+int(bbox[...,b, 4])] = 1


  return tf.convert_to_tensor(labels, dtype=tf.float32)

In [45]:
output_lbl = output_labels(boundingbox)

[[0.711      0.43543544 0.326      0.8048048  3.        ]
 [0.465      0.6996997  0.194      0.11411411 3.        ]]
[[0.711      0.43543544 0.326      0.8048048  3.        ]
 [0.465      0.6996997  0.194      0.11411411 3.        ]]


In [46]:
output_lbl.shape

TensorShape([7, 7, 25])

In [47]:
img = cv.imread(DATASET_IMAGES+"2007_000032.jpg")

### Load Data

In [48]:
img_paths = []
xml_paths = []
for path in os.listdir(DATASET_IMAGES):
  img_paths.append(DATASET_IMAGES+path)
  xml_paths.append(DATASET_MAPS+path.split(".")[0]+".xml")



val_img_paths = []
val_xml_paths = []
for path in os.listdir(VAL_IMAGES):
  val_img_paths.append(VAL_IMAGES+path)
  val_xml_paths.append(VAL_MAPS+path.split(".")[0]+".xml")



print(f"Total Images: {len(img_paths)}")
print(f"Total Validation Images: {len(val_img_paths)}")

Total Images: 17104
Total Validation Images: 21


In [49]:
train_dataset = tf.data.Dataset.from_tensor_slices((img_paths, xml_paths))
val_dataset = tf.data.Dataset.from_tensor_slices((val_img_paths, val_xml_paths))

### Design Dataset

In [50]:
def get_img_box(img_path, xml_path):

  img = tf.io.decode_jpeg(tf.io.read_file(img_path))
  img = tf.cast(tf.image.resize(img, (H, W)), dtype=tf.float32)

  bbox = tf.numpy_function(func=parse_xml, inp=[xml_path], Tout=tf.float32)
  return img, bbox

In [51]:
train_dataset = (
    train_dataset
    .map(get_img_box)
)

val_dataset = (
    val_dataset
    .map(get_img_box)
)

In [52]:
def process_labels(img, bbox):
  labels = tf.numpy_function(func=output_labels, inp=[bbox], Tout=tf.float32)
  return img, labels

In [53]:
train_dataset = (
    train_dataset
    .map(process_labels)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


val_dataset = (
    val_dataset
    .map(process_labels)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

## Custom Yolo Loss

In [54]:
def IoU(boxes1, boxes2):
  boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                       boxes1[..., 1] - boxes1[..., 3] / 2.0,
                       boxes1[..., 0] + boxes1[..., 2] / 2.0,
                       boxes1[..., 1] + boxes1[..., 3] / 2.0],
                       axis=-1)
  boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                       boxes2[..., 1] - boxes2[..., 3] / 2.0,
                       boxes2[..., 0] + boxes2[..., 2] / 2.0,
                       boxes2[..., 1] + boxes2[..., 3] / 2.0],
                       axis=-1)

  lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
  rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

  intersection = tf.maximum(0.0, rd - lu)
  intersection_area = intersection[..., 0] * intersection[..., 1]
  boxes1_area = boxes1[..., 2] * boxes1[..., 3]
  boxes2_area = boxes2[..., 2] * boxes2[..., 3]

  union_area = tf.maximum(boxes1_area + boxes2_area - intersection_area, 1e-8)
  return tf.clip_by_value(intersection_area / union_area, 0.0, 1.0)

In [55]:
def differences(y_true, y_pred):
  return tf.reduce_sum(tf.square(y_true - y_pred))

In [56]:
# Yolo Model Loss
# 1.  Coordinates Loss
# 2.  Size Loss
# 3.  Objectness Loss
# 4.  No Objectness Loss
# 5.  Classes Loss

def yolo_loss(y_true, y_pred):
  target = y_true[..., 0]

  ########## Objectness Loss ##############
  y_target_extract = tf.gather_nd(y_true, tf.where(target[:]==1))
  y_pred_extract = tf.gather_nd(y_pred, tf.where(target[:]==1))

  print(tf.where(target[:]==1))
  print(y_target_extract)
  print(y_pred_extract)


  rescalar = tf.where(target[:]==1)*32
  upscalar_1 = tf.concat([rescalar[:, 1:], tf.zeros([len(rescalar), 2], dtype=tf.int64)], axis=-1)

  target_upscalar_2 = tf.repeat([[32., 32., 224., 224.]],
                                repeats=[len(rescalar)], axis=0)*tf.cast(y_target_extract[...,1:5], dtype=tf.float32)
  pred_1_upscalar_2 = tf.repeat([[32., 32., 224., 224.]],
                                repeats=[len(rescalar)], axis=0)*tf.cast(y_pred_extract[...,1:5], dtype=tf.float32)
  pred_2_upscalar_2 = tf.repeat([[32., 32., 224., 224.]],
                                repeats=[len(rescalar)], axis=0)*tf.cast(y_pred_extract[...,6:10], dtype=tf.float32)

  target_orig = tf.cast(upscalar_1, dtype=tf.float32)+tf.cast(target_upscalar_2, dtype=tf.float32)
  pred_1_orig = tf.cast(upscalar_1, dtype=tf.float32)+tf.cast(pred_1_upscalar_2, dtype=tf.float32)
  pred_2_orig = tf.cast(upscalar_1, dtype=tf.float32)+tf.cast(pred_2_upscalar_2, dtype=tf.float32)

  mask = tf.cast(tf.math.greater(IoU(target_orig, pred_1_orig), IoU(target_orig, pred_2_orig)), dtype=tf.int32)
  y_pred_joined = tf.transpose(tf.concat([tf.expand_dims(y_pred_extract[..., 0], axis=0),
                                         tf.expand_dims(y_pred_extract[...,5], axis=0)],
                                         axis=0))
  obj_pred = tf.gather_nd(y_pred_joined, tf.stack([tf.range(len(rescalar)), mask], axis=-1))
  object_loss = differences(tf.cast(obj_pred, dtype=tf.float32), tf.cast(tf.ones([len(rescalar)]), dtype=tf.float32))



  ########## No Objectness Loss ##############
  y_pred_extract = tf.gather_nd(y_pred[...,0:B*5], tf.where(target[:]==0))
  y_target_extract = tf.zeros(len(y_pred_extract))

  no_object_loss_1 = differences(tf.cast(y_pred_extract[..., 0], dtype=tf.float32), tf.cast(y_target_extract, dtype=tf.float32))
  no_object_loss_2 = differences(tf.cast(y_pred_extract[..., 5], dtype=tf.float32), tf.cast(y_target_extract, dtype=tf.float32))
  no_object_loss = no_object_loss_1 + no_object_loss_2



  ########## Classes Loss ##############
  y_pred_extract = tf.gather_nd(y_pred[..., B*5:], tf.where(target[:]==1))
  y_target_extract = tf.gather_nd(y_true[..., 5:], tf.where(target[:]==1))
  class_loss = differences(tf.cast(y_pred_extract, dtype=tf.float32), tf.cast(y_target_extract, dtype=tf.float32))



  ########## Center Co-ordinates Loss ##############
  y_pred_extract = tf.gather_nd(y_pred[..., 0:B*5], tf.where(target[:]==1))
  center_joined = tf.stack([y_pred_extract[...,1:3], y_pred_extract[...,6:8]], axis=1)

  center_pred = tf.gather_nd(center_joined, tf.stack([tf.range(len(rescalar)), mask], axis=-1))
  center_target = tf.gather_nd(y_true[..., 1:3], tf.where(target[:]==1))
  center_loss = differences(tf.cast(center_pred, dtype=tf.float32), tf.cast(center_target, dtype=tf.float32))




  ########## Size Co-ordinates Loss ##############
  size_joined = tf.stack([y_pred_extract[...,3:5], y_pred_extract[...,8:10]], axis=1)

  size_pred = tf.gather_nd(size_joined, tf.stack([tf.range(len(rescalar)), mask], axis=-1))
  size_target = tf.gather_nd(y_true[..., 3:5], tf.where(target[:]==1))

  size_loss = differences(tf.cast(tf.math.sqrt(tf.math.abs(size_pred)), dtype=tf.float32), tf.cast(tf.math.sqrt(tf.math.abs(size_target)), dtype=tf.float32))
  box_loss = center_loss + size_loss


  lambda_coord = 5.0
  lambda_no_obj = 0.5
  loss = (
         tf.cast(object_loss, dtype=tf.float32)
       + tf.cast((lambda_no_obj*no_object_loss), dtype=tf.float32)
       + tf.cast((lambda_coord*box_loss), dtype=tf.float32)
       + tf.cast(class_loss, dtype=tf.float32)
      )


  return loss

In [57]:
y_true = output_labels(np.array([[ 0.163     ,  0.25866666,  0.05      ,  0.08      , 14.        ],
                                 [ 0.787     ,  0.31866667,  0.078     ,  0.12533334, 14.        ]]))
y_true = tf.expand_dims(y_true, axis=0)


y_pred = np.random.normal(size = (1, SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+10))
y_pred[0][1][1] = [0.9, 0.15, 0.22, 0.04, 0.06,    0.5, 0.11, 0.1, 0.01, 0.02, 0.8,  0.12,0.0,0.,0.2,0.1,0.5,0.7,0.21,0.32,0.7,0.1,0.4,0.3,1.,0.12,0.0,0.,0.2,0.1]
y_pred[0][5][2] = [0.9, 0.77, 0.3, 0.04, 0.06,    0.8, 0.75, 0.3, 0.075, 0.02, 0.1,  0.1,0.1,0.2,0.,0.21,0.45,0.45,0.9,0.12,0.45,0.7,0.,0.,1.,0.12,0.0,0.,0.2,0.1]

yolo_loss(y_true, y_pred)

[[ 0.163       0.25866666  0.05        0.08       14.        ]
 [ 0.787       0.31866667  0.078       0.12533334 14.        ]]
[[ 0.163       0.25866666  0.05        0.08       14.        ]
 [ 0.787       0.31866667  0.078       0.12533334 14.        ]]
tf.Tensor(
[[0 1 1]
 [0 5 2]], shape=(2, 3), dtype=int64)
tf.Tensor(
[[1.         0.141      0.8106666  0.05       0.08       0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         1.         0.         0.         0.         0.
  0.        ]
 [1.         0.509      0.2306667  0.078      0.12533334 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         1.         0.         0.         0.         0.
  0.        ]], shape=(2, 25), dtype=float32)
tf.Tensor(
[[0.9   0.15  0.22  0.04  0.06  0.5   0.11  0.1   0.01  0.02  0.8   0.12
  0.    0.    0.2   0.1   0.5   0.7   0.21  0.32  0.7

<tf.Tensor: shape=(), dtype=float32, numpy=53.398106>

## Custom Yolo Model

In [58]:
checkpoint_filepath = "Checkpoint/yolo_resnet_$0.h5"
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [59]:
def scheduler(epoch, lr):
  if epoch < 10:
    return lr
  elif epoch >= 10 and epoch < 20:
    return 0.000001
  else:
    return 0.0000001

lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [60]:
NUM_FILTERS = 512
OUTPUT_DIMS = N_CLASSES+5*B

In [61]:
base_model = tf.keras.applications.ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(H, W, 3)
)
base_model.trainable = False

In [62]:
yolo_model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Conv2D(NUM_FILTERS, 3, padding="same", kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),

    tf.keras.layers.Conv2D(NUM_FILTERS, 3, padding="same", kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),


    tf.keras.layers.Conv2D(NUM_FILTERS, 3, padding="same", kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),

    tf.keras.layers.Conv2D(NUM_FILTERS, 3, padding="same", kernel_initializer='he_normal'),
    tf.keras.layers.LeakyReLU(alpha=0.1),
    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(NUM_FILTERS, kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),

    tf.keras.layers.Dense(SPLIT_SIZE*SPLIT_SIZE*OUTPUT_DIMS, activation='sigmoid'),
    tf.keras.layers.Reshape((SPLIT_SIZE, SPLIT_SIZE, OUTPUT_DIMS))
])
yolo_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50 (Functional)       (None, 7, 7, 2048)        23587712  
                                                                 
 conv2d_4 (Conv2D)           (None, 7, 7, 512)         9437696   
                                                                 
 batch_normalization_4 (Bat  (None, 7, 7, 512)         2048      
 chNormalization)                                                
                                                                 
 leaky_re_lu_5 (LeakyReLU)   (None, 7, 7, 512)         0         
                                                                 
 conv2d_5 (Conv2D)           (None, 7, 7, 512)         2359808   
                                                                 
 batch_normalization_5 (Bat  (None, 7, 7, 512)         2048      
 chNormalization)                                     

In [63]:
yolo_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    loss=yolo_loss
)

In [64]:
history = yolo_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=N_EPOCHS,
    callbacks=[callback, lr_schedule]
)

Epoch 1/25
Tensor("yolo_loss/Where_2:0", shape=(None, None), dtype=int64)
Tensor("yolo_loss/GatherNd:0", dtype=float32)
Tensor("yolo_loss/GatherNd_1:0", dtype=float32)
Tensor("yolo_loss/Where_2:0", shape=(None, None), dtype=int64)
Tensor("yolo_loss/GatherNd:0", dtype=float32)
Tensor("yolo_loss/GatherNd_1:0", dtype=float32)
[[0.659      0.50149703 0.682      0.997006   7.        ]]
[[ 0.50149256  0.738       0.99701494  0.512       1.        ]
 [ 0.71791047  0.894       0.2716418   0.212      14.        ]]
[[ 0.50149256  0.738       0.99701494  0.512       1.        ]
 [ 0.71791047  0.894       0.2716418   0.212      14.        ]]
[[ 0.649       0.7506667   0.37        0.49866667 14.        ]]
[[ 0.465      0.5828877  0.274      0.8342246 14.       ]]
[[ 0.283       0.5315315   0.13        0.0960961   6.        ]
 [ 0.917       0.6156156   0.082       0.0960961  13.        ]
 [ 0.915       0.5570571   0.038       0.11111111 14.        ]]
[[ 0.283       0.5315315   0.13        0.0960961 

UnknownError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) UNKNOWN:  KeyError: 'pottedplant'
Traceback (most recent call last):

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "<ipython-input-42-54d41e2b71be>", line 26, in parse_xml
    class_dict[class_name]

KeyError: 'pottedplant'


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[yolo_loss/cond_1/else/_13/yolo_loss/cond_1/StringJoin/_92]]
  (1) UNKNOWN:  KeyError: 'pottedplant'
Traceback (most recent call last):

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "<ipython-input-42-54d41e2b71be>", line 26, in parse_xml
    class_dict[class_name]

KeyError: 'pottedplant'


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_30332]

[[0.3918919 0.564     0.7777778 0.872     2.       ]]
[[ 0.59066665  0.653       0.264       0.33       14.        ]]
