In [None]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import json
import xml.etree.ElementTree as ET
import os
import time
import random
from google.colab import files
from PIL import Image
import albumentations as A
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (GlobalAveragePooling2D, Activation, MaxPooling2D, Add, Conv2D, MaxPool2D, Dense,
                                     Flatten, InputLayer, BatchNormalization, Input, Embedding, Permute,
                                     Dropout, RandomFlip, RandomRotation, LayerNormalization, MultiHeadAttention,
                                     RandomContrast, Rescaling, Resizing, Reshape,LeakyReLU)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (Callback, CSVLogger, EarlyStopping, LearningRateScheduler,
                                        ModelCheckpoint, ReduceLROnPlateau)
from tensorflow.keras.regularizers import L2, L1
from tensorflow.keras.initializers import RandomNormal
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature
from google.colab import drive


<H1>DATA PREPARATION</H1>

In [None]:
class Utils():
    """Turn Pascal-VOC style XML annotations into YOLO-style label tensors.

    Args:
        split_size: number of grid cells along each image axis.
        num_boxes: number of 5-value box slots appended after the class scores.
        classes: ordered list of class names; the position is the class channel.
    """

    # Anchor (width, height) priors, expressed in grid-cell units.
    ANCHOR_BOXES = ((1.3, 1.3), (2.3, 1.3), (1.3, 2.3))
    # Keeps log() arguments strictly positive when a centre sits on a cell edge
    # or a box has zero extent.
    _EPS = 1e-6

    def __init__(self, split_size, num_boxes, classes):
        self.split_size = split_size
        self.num_boxes = num_boxes
        self.classes = classes

    def preprocess_xml(self, filename, m_a_p=False):
        """Parse one annotation file.

        Args:
            filename: path or file object accepted by ``ET.parse``.
            m_a_p: when True, return integer pixel coordinates together with the
                image shape (used for evaluation); otherwise return coordinates
                normalised by the image width/height.

        Returns:
            ``[[xmin, ymin, xmax, ymax, class_name], ...]`` and, when ``m_a_p`` is
            True, additionally the tuple ``(height, width)``.
        """
        tree = ET.parse(filename)
        root = tree.getroot()
        bounding_boxes = []

        size_tree = root.find('size')
        height = float(size_tree.find('height').text)
        width = float(size_tree.find('width').text)

        for object_tree in root.findall('object'):
            # Only the first <bndbox> below the object is used.
            bounding_box = next(object_tree.iter('bndbox'), None)
            if bounding_box is None:
                # Without this guard the coordinates of the previous object
                # would be reused (or be undefined for the first object).
                continue

            xmin = float(bounding_box.find('xmin').text)
            ymin = float(bounding_box.find('ymin').text)
            xmax = float(bounding_box.find('xmax').text)
            ymax = float(bounding_box.find('ymax').text)

            if m_a_p:
                # Truncate the raw pixel values directly instead of dividing and
                # re-multiplying, which could lose a pixel to float rounding.
                coords = [int(xmin), int(ymin), int(xmax), int(ymax)]
            else:
                coords = [xmin/width, ymin/height, xmax/width, ymax/height]

            class_name = object_tree.find('name').text
            bounding_boxes.append(coords + [class_name])

        if m_a_p:
            return bounding_boxes, (height, width)
        return bounding_boxes

    def midpoint(self, x, y):
        """Return the arithmetic mean of ``x`` and ``y``."""
        return (x+y)/2

    def get_pos(self, classes, classe):
        """Return the index of ``classe`` in ``classes``, or None when absent."""
        for c in range(len(classes)):
            if(classes[c] == classe):
                return c

    def generate_pre_output(self, bounding_boxes, image_shape, target_size=224):
        """Rescale pixel boxes from ``image_shape`` to a square ``target_size`` image.

        Args:
            bounding_boxes: ``[x_min, y_min, x_max, y_max, class_name]`` entries in pixels.
            image_shape: ``(height, width)`` of the original image.
            target_size: side length of the resized image (224 by default).

        Returns:
            List of ``[x_min, y_min, x_max, y_max, class_name]`` with integer coordinates.
        """
        height, width = image_shape
        bbox = []
        for box in bounding_boxes:
            x_min, y_min, x_max, y_max, obj = box[0], box[1], box[2], box[3], box[4]
            bbox.append([int(x_min/width*target_size), int(y_min/height*target_size),
                         int(x_max/width*target_size), int(y_max/height*target_size), obj])
        return bbox

    def iou(self, bbox_1, bbox_2):
        """Intersection-over-union of two ``[x_min, y_min, x_max, y_max]`` boxes."""
        x_1 = max(bbox_1[0], bbox_2[0])
        y_1 = max(bbox_1[1], bbox_2[1])
        x_2 = min(bbox_1[2], bbox_2[2])
        y_2 = min(bbox_1[3], bbox_2[3])

        inter_area = max(x_2-x_1, 0.)*max(y_2-y_1, 0.)
        bbox_1_area = (bbox_1[2]-bbox_1[0])*(bbox_1[3]-bbox_1[1])
        bbox_2_area = (bbox_2[2]-bbox_2[0])*(bbox_2[3]-bbox_2[1])
        union_area = bbox_1_area+bbox_2_area-inter_area
        if union_area <= 0:
            return 0.
        return float(inter_area/union_area)

    def bounding_box_to_output(self, box, offset, anchor):
        """Encode a box given in grid-cell units relative to its cell and anchor.

        Args:
            box: ``[x_centre, y_centre, width, height]`` in grid-cell units.
            offset: ``[i, j]`` indices of the cell that owns the box centre.
            anchor: ``[width, height]`` of the anchor prior in grid-cell units.

        Returns:
            ``np.ndarray`` of 5 values: logit of the x and y offsets inside the cell,
            log of width and height relative to the anchor, and an objectness of 1.
        """
        eps = self._EPS
        # Offsets of 0 or 1 and zero-sized boxes would produce log(0); clip them.
        frac_x = min(max(box[0]-offset[0], eps), 1.-eps)
        frac_y = min(max(box[1]-offset[1], eps), 1.-eps)
        box_w = max(box[2], eps)
        box_h = max(box[3], eps)
        return np.array([
            np.log(frac_x/(1.-frac_x)),
            np.log(frac_y/(1.-frac_y)),
            np.log(box_w/anchor[0]),
            np.log(box_h/anchor[1]),
            1.,
        ])

    def generate_output(self, bounding_boxes):
        """Build the label tensor for one image.

        Args:
            bounding_boxes: entries ``[x_min, y_min, x_max, y_max, class_name]`` with
                normalised coordinates; optional 6th/7th elements are taken as the
                image width/height.

        Returns:
            ``np.ndarray`` of shape
            ``(split_size, split_size, n_anchors, len(classes) + 5*num_boxes)``.
            For every box, the anchor with the highest IoU in the centre cell receives
            the encoded box in channels ``len(classes):len(classes)+5`` and a 1 in the
            channel of its class.

        Raises:
            ValueError: if a box names a class that is not in ``self.classes``.
        """
        anchor_boxes = [list(anchor) for anchor in self.ANCHOR_BOXES]
        num_classes = len(self.classes)
        output_label = np.zeros((self.split_size, self.split_size, len(anchor_boxes),
                                 num_classes+(5*self.num_boxes)))

        for box in bounding_boxes:
            x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]

            class_pos = self.get_pos(self.classes, box[4])
            if class_pos is None:
                # Indexing with None would broadcast the 1 over the whole class axis.
                raise ValueError("unknown class name in annotation: %r" % (box[4],))

            # The IoU below is unaffected by per-axis scaling, so a unit image size
            # is used when the annotation does not carry width/height.
            width, height = (box[5], box[6]) if len(box) >= 7 else (1., 1.)

            x_centre = self.midpoint(x_min, x_max)
            y_centre = self.midpoint(y_min, y_max)

            x_width = (x_max - x_min)*self.split_size
            y_height = (y_max - y_min)*self.split_size

            b_x_centre_cell = (x_centre*self.split_size)
            b_y_centre_cell = (y_centre*self.split_size)

            # A centre exactly on the right/bottom border would index one past the grid.
            i = min(max(int(b_x_centre_cell), 0), self.split_size-1)
            j = min(max(int(b_y_centre_cell), 0), self.split_size-1)

            offset = [i, j]
            b = [b_x_centre_cell, b_y_centre_cell, x_width, y_height]
            target_box = [x_min*width, y_min*height, x_max*width, y_max*height]

            box_iou = []
            for anchor_w, anchor_h in anchor_boxes:
                # Anchor centred on the box centre, clamped at the top/left border.
                b_x_min = max(((b_x_centre_cell-(anchor_w/2))/self.split_size)*width, 0.)
                b_y_min = max(((b_y_centre_cell-(anchor_h/2))/self.split_size)*height, 0.)
                b_x_max = ((b_x_centre_cell+(anchor_w/2))/self.split_size)*width
                b_y_max = ((b_y_centre_cell+(anchor_h/2))/self.split_size)*height
                box_iou.append(self.iou(target_box, [b_x_min, b_y_min, b_x_max, b_y_max]))

            highest_a = int(np.argmax(box_iou))
            output_label[i, j, highest_a, num_classes:num_classes+5] = \
                self.bounding_box_to_output(b, offset, anchor_boxes[highest_a])
            output_label[i, j, highest_a, class_pos] = 1.

        return output_label

In [None]:
# Path of a sample image (placeholder) and the image itself, resized to 224x224 RGB.
test_image = '...'
image = tf.keras.preprocessing.image.load_img(
    test_image,
    color_mode='rgb',
    target_size=(224, 224),
)

# Class names; the position in this list is the class index used in the labels.
classes = [
    'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
    'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'book',
]

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding image batches with labels at two grid scales.

    Args:
        train_images: directory containing the images.
        train_maps: directory containing the XML annotations.
        split_size: grid size of the coarse label; the fine label uses ``split_size*4``.
        num_boxes: number of 5-value box slots per anchor in the label.
        classes: ordered list of class names.
        batch_size: number of samples per batch.
        shuffle: when True, the sample order is reshuffled after every epoch.
    """

    # Number of anchors per grid cell in the label tensors built by ``Utils``.
    NUM_ANCHORS = 3

    def __init__ (self, train_images, train_maps, split_size, num_boxes, classes, batch_size, shuffle = False):
        super().__init__()

        self.train_images = train_images
        self.train_maps = train_maps
        # os.listdir returns entries in arbitrary order. Sorting both listings keeps
        # image k paired with annotation k — assumes the two directories hold
        # matching base names; TODO confirm for the dataset in use.
        self.train_image_list = sorted(os.listdir(self.train_images))
        self.train_map_list = sorted(os.listdir(self.train_maps))
        self.split_size = split_size
        self.num_boxes = num_boxes
        self.classes = classes
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.indexes = np.arange(len(self.train_image_list))
        self.on_epoch_end()

    def __len__(self):
        """Number of complete batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.train_image_list)/self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the sample order when ``shuffle`` was requested."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, idx):
        """Return ``(images, [coarse_labels, fine_labels])`` for batch ``idx``."""
        x,y_1,y_2 = self.__data_generation(idx)
        return np.array(x), [y_1,y_2]

    def __data_generation(self, idx):
        """Load the images of batch ``idx`` and build both label tensors."""
        label_depth = len(self.classes) + 5*self.num_boxes
        fine_size = self.split_size*4

        x = np.empty((self.batch_size, 224,224,3))
        y_1 = np.zeros((self.batch_size, self.split_size, self.split_size, self.NUM_ANCHORS, label_depth))
        y_2 = np.zeros((self.batch_size, fine_size, fine_size, self.NUM_ANCHORS, label_depth))

        coarse_utils = Utils(self.split_size, self.num_boxes, self.classes)
        fine_utils = Utils(fine_size, self.num_boxes, self.classes)

        batch_indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]
        for i,j in enumerate(batch_indexes):
            image = tf.keras.preprocessing.image.load_img(
                os.path.join(self.train_images, self.train_image_list[j]),
                color_mode ='rgb', target_size = (224,224))
            x[i] = tf.keras.preprocessing.image.img_to_array(image)

            # The annotation is parsed once and encoded at both scales.
            bounding_boxes = coarse_utils.preprocess_xml(
                os.path.join(self.train_maps, self.train_map_list[j]))
            y_1[i] = coarse_utils.generate_output(bounding_boxes)
            # The fine-scale label belongs in y_2; writing it to y_1 (as before)
            # fails on the shape mismatch and leaves y_2 all zeros.
            y_2[i] = fine_utils.generate_output(bounding_boxes)

        return x,y_1,y_2

In [None]:
# Optimisation settings.
LR, BATCH_SIZE, EPOCH = 1e-3, 32, 100

# Dataset locations (placeholders): image and annotation folders per split.
train_images, train_maps = '...', '...'
val_images, val_maps = '...', '...'

# Label layout: grid cells per image side and box slots per anchor.
split_size, num_boxes = 7, 1

In [None]:
# Batch generators for the training and validation splits (same label layout for both).
train_gen = DataGenerator(
    train_images,
    train_maps,
    split_size,
    num_boxes,
    classes,
    batch_size=BATCH_SIZE,
)
val_gen = DataGenerator(
    val_images,
    val_maps,
    split_size,
    num_boxes,
    classes,
    batch_size=BATCH_SIZE,
)

<H1>MODELING</H1>

In [None]:
def get_base_model(input_dim=None):
    """Build a frozen MobileNetV2 feature extractor with two outputs.

    Args:
        input_dim: side length of the square RGB input. When None, the module-level
            ``INPUT_DIM`` is used if it is already defined, otherwise 224. This lets
            the function run before the cell that defines ``INPUT_DIM``.

    Returns:
        A ``Model`` mapping an image to ``[out_relu, block_6_expand_relu]``
        activations, in that order.
    """
    if input_dim is None:
        input_dim = globals().get('INPUT_DIM', 224)

    base_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
        weights='imagenet',
        input_shape=(input_dim,input_dim,3),
        include_top=False,)
    base_model.trainable=False
    block_6_output,out_relu=[base_model.get_layer(layer_name).output for layer_name in ["block_6_expand_relu","out_relu"]]

    # base_model.input is the single input tensor; wrapping base_model.inputs (already
    # a list) in another list produced a nested input structure.
    return Model(
        inputs=base_model.input,outputs=[out_relu,block_6_output]
    )

get_base_model().summary()

In [None]:
class Head(tf.keras.layers.Layer):
    """Convolutional prediction head that keeps the spatial grid of its input.

    Args:
        OUTPUT_DIM: total number of values predicted per image
            (``grid * grid * anchors * values_per_anchor``).
        OUTPUT_SHAPE: target shape ``(grid, grid, anchors, values_per_anchor)``.
        NUM_FILTERS: number of filters in the two hidden convolutions.

    Raises:
        ValueError: if ``OUTPUT_DIM`` is not a multiple of the number of grid cells.
    """
    def __init__(self,OUTPUT_DIM,OUTPUT_SHAPE,NUM_FILTERS):
        super(Head,self).__init__()

        grid_cells=OUTPUT_SHAPE[0]*OUTPUT_SHAPE[1]
        if OUTPUT_DIM%grid_cells!=0:
            raise ValueError("OUTPUT_DIM must be a multiple of OUTPUT_SHAPE[0]*OUTPUT_SHAPE[1]")
        # Each grid cell predicts its own slice of the output. With 'valid' 3x3
        # convolutions a 28x28 map shrank to 22x22 and the final reshape failed.
        channels_per_cell=OUTPUT_DIM//grid_cells

        # NOTE(review): the first positional argument of RandomNormal is the mean, so
        # this sets mean=0.01 with the default stddev. Confirm stddev=0.01 was not intended.
        self.conv_1=Conv2D(NUM_FILTERS,(3,3),padding='same',activation='relu',kernel_initializer=RandomNormal(0.01))
        self.conv_2=Conv2D(NUM_FILTERS,(3,3),padding='same',activation='relu',kernel_initializer=RandomNormal(0.01))
        self.conv_3=Conv2D(channels_per_cell,(3,3),padding='same',activation='relu',kernel_initializer=RandomNormal(0.01))
        self.norm_1=LayerNormalization()

        self.flatten=Flatten()
        self.reshape=Reshape(OUTPUT_SHAPE)
    def call(self,images,training=False):
        """Map a feature map of size grid x grid to a tensor of shape ``OUTPUT_SHAPE``."""
        x=self.conv_1(images)
        x=self.conv_2(x)
        x=self.conv_3(x)
        x=self.norm_1(x)
        x=self.flatten(x)
        x=self.reshape(x)
        return x

# Backward-compatible alias for the original lower-case class name.
head=Head

In [None]:
# Network input side length and width of the hidden head convolutions.
INPUT_DIM, NUM_FILTERS = 224, 16

# s: grid cells per side, b: box slots per anchor, c: number of classes.
s, b = 7, 1
c = len(classes)

# Per-image prediction layout: (grid, grid, anchors, class scores + box values).
OUTPUT_SHAPE = (s, s, 3, c + 5 * b)
OUTPUT_DIM = OUTPUT_SHAPE[0] * OUTPUT_SHAPE[1] * OUTPUT_SHAPE[2] * OUTPUT_SHAPE[3]

In [None]:
# Assemble the two-scale detector: a coarse head on the last backbone activation and
# a fine head on the upsampled activation concatenated with an earlier feature map.
inputs = tf.keras.Input(shape = (INPUT_DIM, INPUT_DIM, 3))
out_relu,block_6_output=get_base_model()(inputs)
head_7=Head(OUTPUT_DIM,OUTPUT_SHAPE,NUM_FILTERS)(out_relu)

# Conv2DTranspose is not among the layers imported at the top of the notebook, so it
# is referenced through tf.keras.layers. Stride 4 upsamples the map by a factor of 4.
pre_concat_28=tf.keras.layers.Conv2DTranspose(128,(1,1),strides=(4,4))(out_relu)
post_concat_28=tf.keras.layers.concatenate([pre_concat_28,block_6_output],axis=-1)

head_28=Head(3*s*4*s*4*(c+5*b), (s*4,s*4,3,c+5*b),NUM_FILTERS)(post_concat_28)
model=Model(inputs=inputs,outputs=[head_7,head_28])

In [None]:
# Print the layer-by-layer summary of the assembled two-output model.
model.summary()

<H1>TRAINING</H1>

In [None]:
class YOLOLoss(tf.losses.Loss):
    """Detection loss for labels laid out as ``[classes | tx, ty, tw, th | objectness]``.

    The loss is the sum of
      * binary cross-entropy of the objectness logit on cells that contain an object,
      * ``lambda_no_obj`` times the same on cells without an object,
      * ``lambda_coord`` times the mean squared error of the box values on object cells,
      * softmax cross-entropy of the class logits on object cells.

    Args:
        num_classes: number of class channels at the start of the last axis.
        lambda_coord: weight of the box regression term.
        lambda_no_obj: weight of the no-object term.
    """
    def __init__(self, num_classes=22, lambda_coord=5., lambda_no_obj=0.5):
        super(YOLOLoss,self).__init__()
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_no_obj = lambda_no_obj

    @staticmethod
    def _safe_mean(values):
        """Mean that evaluates to 0 (instead of NaN) for an empty tensor."""
        count = tf.cast(tf.size(values), values.dtype)
        return tf.math.divide_no_nan(tf.reduce_sum(values), count)

    def call(self, y_true, y_pred):
        """Return the scalar loss for one batch of labels and raw predictions."""
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels may arrive as float64 NumPy arrays; align them with the predictions.
        y_true = tf.cast(y_true, y_pred.dtype)

        n = self.num_classes
        objectness_index = n + 4

        # Cells are selected by the objectness channel. The previous mask used the
        # log-height channel, which can be 0 for a cell that does contain an object.
        has_object = y_true[..., objectness_index] > 0.5
        no_object = tf.logical_not(has_object)
        objectness_logits = y_pred[..., objectness_index]

        ###################### Object loss
        # Labels come first and predictions second; the previous calls had them swapped.
        object_logits = tf.boolean_mask(objectness_logits, has_object)
        object_loss = self._safe_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.ones_like(object_logits), logits=object_logits))

        ####################### No-object loss
        no_object_logits = tf.boolean_mask(objectness_logits, no_object)
        no_object_loss = self._safe_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.zeros_like(no_object_logits), logits=no_object_logits))

        # Rows of the cells that contain an object, shape (n_objects, depth).
        true_cells = tf.boolean_mask(y_true, has_object)
        pred_cells = tf.boolean_mask(y_pred, has_object)

        ####################### Object class loss
        class_loss = self._safe_mean(tf.nn.softmax_cross_entropy_with_logits(
            labels=true_cells[..., 0:n], logits=pred_cells[..., 0:n]))

        ######################## Object bounding box loss
        bounding_centre_loss = self._safe_mean(
            tf.square(true_cells[..., n:n+2] - pred_cells[..., n:n+2]))
        bounding_side_loss = self._safe_mean(
            tf.square(true_cells[..., n+2:n+4] - pred_cells[..., n+2:n+4]))
        bounding_loss = bounding_centre_loss + bounding_side_loss

        loss = (object_loss
                + self.lambda_no_obj*no_object_loss
                + self.lambda_coord*bounding_loss
                + class_loss)

        return loss

In [None]:
# Configure training: custom detection loss, Adam at the configured learning rate,
# and eager execution of the train step.
model.compile(
    loss=YOLOLoss(),
    optimizer=Adam(learning_rate=LR),
    run_eagerly=True,
)

In [None]:
# Keep only the weights of the epoch with the lowest training loss seen so far.
checkpoint_filepath = '...'
callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train on the generator; the checkpoint callback stores the best weights.
history = model.fit(
    train_gen,
    epochs=2500,
    shuffle=True,
    verbose=1,
    callbacks=[callback],
)

<h1>TESTING</h1>

In [None]:
test_image ='...'
X=[]
# Anchor (width, height) priors in grid-cell units; get_boxes reads `anchor_boxes`.
anchor_boxes=[[1.3,1.3],[2.3,1.3],[1.3,2.3]]
# Alias for the original misspelled name, in case other cells refer to it.
anchor_noxes=anchor_boxes

# Display copy: 500x500, scaled to [0, 1]. Detections are drawn in this pixel space.
im = tf.keras.preprocessing.image.load_img(test_image,color_mode='rgb',target_size=(500,500))
image_width,image_height=im.width,im.height
# Convert `im` itself; converting `image` here picked up a stale variable from an
# earlier cell and discarded the 500x500 copy.
im=tf.keras.preprocessing.image.img_to_array(im)/255.


# Network input: 224x224, raw pixel values.
image = tf.keras.preprocessing.image.load_img(test_image,color_mode='rgb',target_size=(224,224))
image=tf.keras.preprocessing.image.img_to_array(image)

X.append(np.array(image))
X=np.array(X)

output=model.predict(X)

final_boxes=[]
object_classes=[]
final_scores=[]

In [None]:
def get_boxes(out,scales,anchors=None,image_size=None,class_names=None,thresh=10):
    """Decode raw network outputs into pixel boxes.

    Args:
        out: sequence with one array per scale, each of shape
            ``(batch, grid, grid, n_anchors, n_classes + 5)``; the last five values
            are ``tx, ty, tw, th, objectness``.
        scales: grid size of each entry of ``out``.
        anchors: anchor ``[width, height]`` pairs in grid-cell units; defaults to the
            module-level ``anchor_boxes``.
        image_size: ``(width, height)`` of the image the boxes are drawn on; defaults
            to the module-level ``image_width`` / ``image_height``.
        class_names: ordered class names; defaults to the module-level ``classes``.
        thresh: minimum raw objectness value for a cell to be kept.

    Returns:
        ``(boxes, scores)``: ``boxes`` is an object array of shape ``(n, 5)`` with rows
        ``[x_min, y_min, x_max, y_max, class_name]``; ``scores`` is a float32 array of
        the raw objectness values.
    """
    if anchors is None:
        anchors=anchor_boxes
    if image_size is None:
        img_w,img_h=image_width,image_height
    else:
        img_w,img_h=image_size
    if class_names is None:
        class_names=classes
    num_classes=len(class_names)

    def _sigmoid(value):
        return 1./(1.+np.exp(-value))

    # Local accumulators: the module-level lists of the same name are later rebound
    # to arrays, which would break a second call that appended to them.
    boxes=[]
    scores=[]

    for scale_index,grid in enumerate(scales):
        scale_out=np.asarray(out[scale_index])
        for a,(anchor_w,anchor_h) in enumerate(anchors):
            anchor_out=scale_out[...,a,:]
            # Each hit is (batch index, cell index along x, cell index along y).
            hits=np.argwhere(anchor_out[...,num_classes+4]>=thresh)

            for batch_index,cell_x,cell_y in hits:
                cell=anchor_out[batch_index,cell_x,cell_y]
                class_id=int(np.argmax(cell[0:num_classes]))
                t_x,t_y,t_w,t_h,score=cell[num_classes:num_classes+5]
                scores.append(float(score))

                x_centre=int((img_w/grid)*(_sigmoid(t_x)+cell_x))
                y_centre=int((img_h/grid)*(_sigmoid(t_y)+cell_y))

                # tw/th are log ratios to the anchor in grid-cell units, so the
                # decoded size is scaled by the size of one cell, not the image.
                x_width=int((img_w/grid)*anchor_w*np.exp(t_w))
                y_height=int((img_h/grid)*anchor_h*np.exp(t_h))

                x_min=max(int(x_centre-(x_width/2)),0)
                y_min=max(int(y_centre-(y_height/2)),0)
                x_max=min(int(x_centre+(x_width/2)),img_w)
                y_max=min(int(y_centre+(y_height/2)),img_h)

                boxes.append([x_min,y_min,x_max,y_max,str(class_names[class_id])])

    # dtype=object keeps the coordinates numeric next to the class-name strings;
    # reshape gives an (0, 5) array when nothing was detected.
    return np.array(boxes,dtype=object).reshape(-1,5),np.array(scores,dtype=np.float32)

In [None]:
split_size=7
scales=[split_size,split_size*4]
# The prediction cell stores the network output as `output`.
final_boxes,final_scores=get_boxes(output,scales)
print('finalboxes',final_boxes)

final_boxes=np.asarray(final_boxes)
if final_boxes.size:
    object_classes=final_boxes[...,4]
    # NMS needs numeric boxes; the class-name column makes the raw array non-numeric.
    final_boxes=final_boxes[...,0:4].astype(np.float32)
else:
    object_classes=np.array([],dtype=object)
    final_boxes=np.zeros((0,4),dtype=np.float32)
final_scores=np.asarray(final_scores,dtype=np.float32).reshape(-1)

nms_output=tf.image.non_max_suppression(
    final_boxes,final_scores,max_output_size=20,iou_threshold=0.5,
    score_threshold=float('-inf')
)

final_nms_boxes=[]
final_nms_classes=[]

for i in nms_output.numpy():
    final_nms_boxes.append(list(final_boxes[i]))
    # Keep the label of the selected box; indexing labels by position in the
    # filtered list attached the wrong class names.
    final_nms_classes.append(str(object_classes[i]))

# Draw on `im`, the image whose pixel space the boxes were decoded for.
for box,label in zip(final_nms_boxes,final_nms_classes):
    cv2.rectangle(im, (int(box[0]),int(box[1])),(int(box[2]),int(box[3])),(0,0,255),1)
    cv2.putText(
      im,
      label,
      (int(box[0]),int(box[1])),
      cv2.FONT_HERSHEY_SIMPLEX,1,(222,0,0),2
      )

# cv2.imshow needs a GUI window, which a hosted notebook does not have; render with
# matplotlib instead. `im` is in [0, 1], so the drawn colours are clipped to that range.
plt.figure(figsize=(8,8))
plt.imshow(np.clip(im,0.,1.))
plt.title("YOU ONLY LOOK ONCE")
plt.axis('off')
plt.show()

In [None]:
def iou(bbox_1,bbox_2):
    """Intersection-over-union of two axis-aligned boxes.

    Args:
        bbox_1: ``[x_min, y_min, x_max, y_max]``.
        bbox_2: ``[x_min, y_min, x_max, y_max]``.

    Returns:
        IoU as a float in ``[0, 1]``; 0.0 when the union has no area.
    """
    a_x_min,a_y_min,a_x_max,a_y_max=(float(value) for value in bbox_1[:4])
    b_x_min,b_y_min,b_x_max,b_y_max=(float(value) for value in bbox_2[:4])

    # The intersection spans from the larger of the two minima to the smaller of the
    # two maxima (the far corner previously used the larger maximum).
    x_1=max(a_x_min,b_x_min)
    y_1=max(a_y_min,b_y_min)
    x_2=min(a_x_max,b_x_max)
    y_2=min(a_y_max,b_y_max)

    inter_area=max(x_2-x_1,0.)*max(y_2-y_1,0.)
    bbox_1_area=(a_x_max-a_x_min)*(a_y_max-a_y_min)
    bbox_2_area=(b_x_max-b_x_min)*(b_y_max-b_y_min)

    union_area=bbox_1_area+bbox_2_area-inter_area
    if union_area<=0:
        return 0.
    return inter_area/union_area

In [None]:
def area_polygon(x,y):
    """Area of the closed polygon with vertices ``(x[k], y[k])`` (shoelace formula).

    Returns ``0.5 * tf.abs(...)`` of the difference between the two cross-product sums.
    """
    last=len(x)-1
    forward_sum,backward_sum=0,0

    # Cross products of consecutive vertices.
    for k in range(last):
        forward_sum+=x[k]*y[k+1]
        backward_sum+=x[k+1]*y[k]

    # Closing edge from the last vertex back to the first.
    forward_sum+=x[last]*y[0]
    backward_sum+=x[0]*y[last]

    return 0.5*tf.abs(forward_sum-backward_sum)

In [None]:
def get_target(val_data):
    """Load the ground-truth boxes of one annotation file for evaluation.

    Args:
        val_data: path of the XML annotation.

    Returns:
        The list produced by ``Utils.generate_pre_output`` for the pixel boxes and
        image shape that ``Utils.preprocess_xml(..., m_a_p=True)`` returns.
    """
    utils=Utils(7,1,classes)
    bounding_boxes,image_shape=utils.preprocess_xml(val_data,m_a_p=True)
    return utils.generate_pre_output(bounding_boxes,image_shape)

In [None]:
def mean(arr):
    """Return the sum of the items of ``arr`` divided by ``len(arr)``."""
    total=0
    for item in arr:
        total+=item
    count=len(arr)
    return total/count

In [None]:
# Module-level list, empty at creation.
# NOTE(review): presumably meant to hold per-class average-precision values — confirm it is still needed.
aps=[]

In [None]:
def mean_average_precision(predict_fn=None):
    """Mean average precision over the validation set at an IoU threshold of 0.5.

    Args:
        predict_fn: callable mapping an image path to a list of predictions
            ``[x_min, y_min, x_max, y_max, class_name]`` in the coordinate space of
            ``get_target``. When None, the module-level ``generate_output`` is used,
            as in the original code. That name is not defined in the visible part of
            the notebook, so a callable must be supplied unless it exists elsewhere.

    Returns:
        The mean of the per-class average precisions, taken over the classes that
        occur in the ground truth; 0.0 when no class occurs.
    """
    if predict_fn is None:
        predict_fn=generate_output

    # os.listdir order is arbitrary; sorting both listings keeps image k paired with
    # annotation k — assumes matching base names in both folders, TODO confirm.
    image_files=sorted(os.listdir(val_images))
    map_files=sorted(os.listdir(val_maps))

    # Parse targets and run inference once per image instead of once per class.
    samples=[]
    for image,bbox in zip(image_files,map_files):
        y_target=get_target(os.path.join(val_maps,bbox))
        y_pred=predict_fn(os.path.join(val_images,image))
        samples.append((y_target,y_pred))

    ap_s=[]
    for object_class in classes:
        n_class=0
        for y_target,_ in samples:
            for target in y_target:
                if target[4]==object_class:
                    n_class+=1
        if n_class==0:
            # Recall is undefined without ground truth; the class is left out of the mean.
            continue

        precision=[1.]
        recall=[0.]
        tp,fp=0,0
        for y_target,y_pred in samples:
            for pred in y_pred:
                if pred[4]!=object_class:
                    continue
                # NOTE(review): one ground-truth box can be matched by several
                # predictions, hence the min(1., ...) on the recall below.
                found=False
                for target in y_target:
                    if target[4]==object_class and iou(pred[:4],target[:4])>0.5:
                        found=True
                        break
                if found:
                    tp+=1
                else:
                    fp+=1
                if tp/(tp+fp)>1e-3 and tp/n_class>1e-3:
                    precision.append(tp/(tp+fp))
                    recall.append(min(1.,tp/n_class))

        # Close the precision/recall curve into a polygon down to the recall axis.
        precision.append(0.)
        recall.append(max(recall))
        precision.append(0.)
        recall.append(0.)

        ap_s.append(float(area_polygon(recall,precision)))

    if not ap_s:
        return 0.
    return mean(ap_s)