In [1]:
# Import python library
import numpy as np
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import cv2

In [2]:
"""
Define the leaky ReLU that is used as the activation
function
"""
def ReLU_Leak(x):
    """Leaky ReLU: element-wise maximum of ``x`` and ``0.1 * x``.

    Positive inputs pass through unchanged; negative inputs are scaled
    by 0.1 instead of being clamped to zero.
    """
    leak_slope = 0.1
    leaked = leak_slope * x
    return tf.maximum(x, leaked)

In [3]:
"""
Yolo class
Code Reference: https://github.com/xiaohu2015/
"""
class YOLO(object):
    """Single-image YOLO detector built with the TensorFlow 1.x graph API.

    The constructor builds the network, the box-decoding / NMS graph and
    restores the weights from a checkpoint.  ``DetectFile`` then runs the
    detector on an image file and draws the result.

    Each instance owns a private ``tf.Graph`` and ``Session`` so that the
    class can be instantiated more than once in the same kernel (for
    example when a notebook cell is re-run) without the variables of a
    previous instance leaking into the ``Saver``.  Call ``close()`` to
    release the session.

    Code Reference: https://github.com/xiaohu2015/
    """

    # Side length (pixels) of the square image the network is built for.
    INPUT_SIZE = 448

    def __init__(self, weightFile, verbose=True):
        """Build the detection graph and restore the pretrained weights.

        Args:
            weightFile: path of the checkpoint passed to ``Saver.restore``.
            verbose: when true, ``PresentResult`` prints one line per
                detection.
        """
        self.verbose = verbose

        # detection params
        self.S = 7  # cell size
        self.B = 2  # boxes_per_cell
        self.classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
                        "bus", "car", "cat", "chair", "cow", "diningtable",
                        "dog", "horse", "motorbike", "person", "pottedplant",
                        "sheep", "sofa", "train", "tvmonitor"]
        self.C = len(self.classes)  # number of classes

        # Per-cell column / row index, shape [S, S, B]; added to the
        # predicted box centres in BoundBox.
        self.x_offset, self.y_offset = self._grid_offsets(self.S, self.B)

        # Threshold for confidence score
        self.threshold = 0.2
        self.iouThresh = 0.4

        # Maximum number of boxes kept by the NMS step
        self.maxOutput = 10

        # A private graph keeps variable names identical on every
        # instantiation, which is what the checkpoint restore relies on.
        self.graph = tf.Graph()
        self.sess = tf.compat.v1.Session(graph=self.graph)
        try:
            with self.graph.as_default():
                self.NNmodule()
                self.BoundBox()
            self.LoadWeight(weightFile)
        except Exception:
            # Do not leak the session when construction fails.
            self.sess.close()
            raise

    @staticmethod
    def _grid_offsets(S, B):
        """Return ``(x_offset, y_offset)`` integer arrays of shape [S, S, B].

        ``x_offset[i, j, b] == j`` and ``y_offset[i, j, b] == i``.
        """
        x_offset = np.tile(np.arange(S).reshape(1, S, 1), (S, 1, B))
        y_offset = np.transpose(x_offset, [1, 0, 2])
        return x_offset, y_offset

    def close(self):
        """Close the TensorFlow session owned by this detector."""
        self.sess.close()

    def NNmodule(self):
        """Construct the neural network module.

        Creates the ``self.images`` placeholder and stores the output of
        the last fully connected layer in ``self.predicts``.  Layers are
        created strictly in order 1..27 (weight before bias); the order
        must not change because it determines the auto-generated variable
        names.
        """
        size = self.INPUT_SIZE
        self.images = tf.compat.v1.placeholder(tf.float32, [None, size, size, 3])

        # Length of the prediction vector: S*S cells, each holding C class
        # probabilities plus 5 values for each of the B boxes.
        vpgrid = self.S * self.S * (self.C + 5 * self.B)

        # First convolution layer (7x7, 64 filters, stride 2)
        # followed by a 2x2 max pool with stride 2
        NN = self.ConvLayer(self.images, 1, 64, 7, 2)
        NN = self.MaxPoolLayer(NN, 1, 2, 2)

        # Second convolution layer (3x3, 192 filters)
        # followed by a 2x2 max pool with stride 2
        NN = self.ConvLayer(NN, 2, 192, 3, 1)
        NN = self.MaxPoolLayer(NN, 2, 2, 2)

        # Third group: 1x1x128, 3x3x256, 1x1x256 and 3x3x512
        # followed by a 2x2 max pool with stride 2
        NN = self.ConvLayer(NN, 3, 128, 1, 1)
        NN = self.ConvLayer(NN, 4, 256, 3, 1)
        NN = self.ConvLayer(NN, 5, 256, 1, 1)
        NN = self.ConvLayer(NN, 6, 512, 3, 1)
        NN = self.MaxPoolLayer(NN, 6, 2, 2)

        # Fourth group
        NN = self.ConvLayer(NN, 7, 256, 1, 1)
        NN = self.ConvLayer(NN, 8, 512, 3, 1)
        NN = self.ConvLayer(NN, 9, 256, 1, 1)
        NN = self.ConvLayer(NN, 10, 512, 3, 1)
        NN = self.ConvLayer(NN, 11, 256, 1, 1)
        NN = self.ConvLayer(NN, 12, 512, 3, 1)
        NN = self.ConvLayer(NN, 13, 256, 1, 1)
        NN = self.ConvLayer(NN, 14, 512, 3, 1)
        NN = self.ConvLayer(NN, 15, 512, 1, 1)
        NN = self.ConvLayer(NN, 16, 1024, 3, 1)
        NN = self.MaxPoolLayer(NN, 16, 2, 2)

        # Fifth group (the last layer uses stride 2)
        NN = self.ConvLayer(NN, 17, 512, 1, 1)
        NN = self.ConvLayer(NN, 18, 1024, 3, 1)
        NN = self.ConvLayer(NN, 19, 512, 1, 1)
        NN = self.ConvLayer(NN, 20, 1024, 3, 1)
        NN = self.ConvLayer(NN, 21, 1024, 3, 1)
        NN = self.ConvLayer(NN, 22, 1024, 3, 2)

        # Sixth group
        NN = self.ConvLayer(NN, 23, 1024, 3, 1)
        NN = self.ConvLayer(NN, 24, 1024, 3, 1)

        # Fully connected head; the last layer has no activation
        NN = self.Flatten(NN)
        NN = self.FullyConn(NN, 25, 512, activation=ReLU_Leak)
        NN = self.FullyConn(NN, 26, 4096, activation=ReLU_Leak)
        NN = self.FullyConn(NN, 27, vpgrid)
        self.predicts = NN

    def ConvLayer(self, x, num, numFilter, sizeFilter, stride):
        """Construct a convolutional layer followed by leaky ReLU.

        Args:
            x: input tensor; its last dimension is taken as the number of
                input channels.
            num: layer index; not used by the computation.
            numFilter: number of output filters.
            sizeFilter: side length of the square kernel.
            stride: stride applied along both spatial dimensions.

        Returns:
            The activated output tensor.
        """
        inputChan = x.get_shape().as_list()[-1]

        # Weight first, then bias: creation order fixes the variable names.
        w = tf.Variable(tf.compat.v1.truncated_normal(
            [sizeFilter, sizeFilter, inputChan, numFilter], stddev=0.1))
        b = tf.Variable(tf.zeros([numFilter, ]))

        # Explicit symmetric zero padding of sizeFilter // 2 on both spatial
        # axes, combined with a VALID convolution.
        pad_size = sizeFilter // 2
        pad_mat = np.array([[0, 0], [pad_size, pad_size], [pad_size, pad_size], [0, 0]])
        x_pad = tf.pad(x, pad_mat)

        conv = tf.nn.conv2d(x_pad, w, strides=[1, stride, stride, 1], padding="VALID")
        return ReLU_Leak(tf.nn.bias_add(conv, b))

    def FullyConn(self, x, num, num_out, activation=None):
        """Construct a fully connected layer.

        Args:
            x: 2-D input tensor; its last dimension is the input width.
            num: layer index; not used by the computation.
            num_out: number of output units.
            activation: optional callable applied to ``x @ w + b``.

        Returns:
            The (optionally activated) output tensor.
        """
        num_in = x.get_shape().as_list()[-1]

        # Weight first, then bias: creation order fixes the variable names.
        w = tf.Variable(tf.compat.v1.truncated_normal([num_in, num_out], stddev=0.1))
        b = tf.Variable(tf.zeros([num_out, ]))

        output = tf.compat.v1.nn.xw_plus_b(x, w, b)
        if activation:
            output = activation(output)

        return output

    def MaxPoolLayer(self, x, num, poolSize, stride):
        """Apply SAME-padded max pooling; ``num`` is not used."""
        return tf.nn.max_pool(x, [1, poolSize, poolSize, 1],
                              strides=[1, stride, stride, 1], padding="SAME")

    def Flatten(self, x):
        """Flatten every non-batch dimension of ``x`` in channel-first order."""
        x_tr = tf.transpose(x, [0, 3, 1, 2])  # channel first mode
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        flat_dim = int(np.prod(x.get_shape().as_list()[1:]))

        return tf.reshape(x_tr, [-1, flat_dim])

    def BoundBox(self):
        """Build the graph that turns ``self.predicts`` into detections.

        Only the first row of ``self.predicts`` (index 0 of the batch) is
        decoded.  Creates the ``self.width`` / ``self.height`` placeholders
        and the output tensors ``self.scores``, ``self.boxes`` (centre x,
        centre y, width, height) and ``self.box_classes``.

        Code Reference: https://github.com/xiaohu2015/
        """
        # Dimension of the input image
        self.width = tf.compat.v1.placeholder(tf.float32, name="img_wid")
        self.height = tf.compat.v1.placeholder(tf.float32, name="img_h")

        # Boundaries of the three sections of the flat prediction vector:
        # [class probabilities | box confidences | box coordinates]
        lim1 = self.S * self.S * self.C
        lim2 = lim1 + self.S * self.S * self.B

        # class prediction
        class_probs = tf.reshape(self.predicts[0, :lim1], [self.S, self.S, self.C])

        # confidence value
        confidence = tf.reshape(self.predicts[0, lim1:lim2], [self.S, self.S, self.B])

        # boxes shape
        boxes = tf.reshape(self.predicts[0, lim2:], [self.S, self.S, self.B, 4])

        # Centres: add the cell index, divide by S and scale to the image.
        # Sizes: the raw outputs are squared before scaling to the image.
        boxes = tf.stack([(boxes[:, :, :, 0] + tf.constant(self.x_offset, dtype=tf.float32)) / self.S * self.width,
                          (boxes[:, :, :, 1] + tf.constant(self.y_offset, dtype=tf.float32)) / self.S * self.height,
                          tf.square(boxes[:, :, :, 2]) * self.width,
                          tf.square(boxes[:, :, :, 3]) * self.height], axis=3)

        # class-specific confidence scores [S, S, B, C]
        scores = tf.expand_dims(confidence, -1) * tf.expand_dims(class_probs, 2)

        scores = tf.reshape(scores, [-1, self.C])
        boxes = tf.reshape(boxes, [-1, 4])

        # find each box class, only select the max score
        box_classes = tf.argmax(scores, axis=1)
        box_class_scores = tf.reduce_max(scores, axis=1)

        # filter the boxes by the score threshold
        filter_mask = box_class_scores >= self.threshold
        scores = tf.boolean_mask(box_class_scores, filter_mask)
        boxes = tf.boolean_mask(boxes, filter_mask)
        box_classes = tf.boolean_mask(box_classes, filter_mask)

        # NMS Algorithm (operates on corner coordinates)
        # Code Reference: https://tensorflow.google.cn/api_docs/python/tf/image/non_max_suppression
        _boxes = tf.stack([boxes[:, 0] - 0.5 * boxes[:, 2], boxes[:, 1] - 0.5 * boxes[:, 3],
                           boxes[:, 0] + 0.5 * boxes[:, 2], boxes[:, 1] + 0.5 * boxes[:, 3]], axis=1)
        nms_indices = tf.image.non_max_suppression(_boxes, scores, self.maxOutput, self.iouThresh)
        self.scores = tf.gather(scores, nms_indices)
        self.boxes = tf.gather(boxes, nms_indices)
        self.box_classes = tf.gather(box_classes, nms_indices)

    def LoadWeight(self, weightFile):
        """Load the pretrained weight file from the local drive.

        The ``Saver`` is created inside the session's own graph so that it
        collects exactly the variables of this detector.
        """
        with self.sess.graph.as_default():
            saver = tf.compat.v1.train.Saver()
        saver.restore(self.sess, weightFile)

    def DetectFile(self, image_file, imshow=True, detected_image_file="detected_image.jpg"):
        """Detect objects in the given image file and present the result.

        Args:
            image_file: path of the image to read with ``cv2.imread``.
            imshow: forwarded to ``PresentResult``.
            detected_image_file: forwarded to ``PresentResult``.

        Raises:
            IOError: if ``cv2.imread`` could not read ``image_file``.
        """
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Could not read image file: %r" % (image_file,))

        scores, boxes, box_classes = self.ConvertImage(image)

        predict_boxes = [
            (self.classes[int(box_classes[i])],
             boxes[i, 0], boxes[i, 1], boxes[i, 2], boxes[i, 3],
             scores[i])
            for i in range(len(scores))
        ]

        self.PresentResult(image, predict_boxes, imshow, detected_image_file)

    def ConvertImage(self, image):
        """Preprocess ``image`` and run the detection graph on it.

        The image is resized to the network input size, converted from BGR
        to RGB and rescaled from [0, 255] to [-1, 1].  The original image
        width and height are fed so that boxes are expressed in the
        coordinates of the original image.

        Returns:
            ``(scores, boxes, box_classes)`` as evaluated by the session.
        """
        # Get dimension of input images
        height, width, _ = image.shape

        size = self.INPUT_SIZE
        img_resized = cv2.resize(image, (size, size))

        # Change image color channel
        img_RGB = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
        img_resized_np = np.asarray(img_RGB)
        _images = np.zeros((1, size, size, 3), dtype=np.float32)
        _images[0] = (img_resized_np / 255.0) * 2.0 - 1.0

        scores, boxes, box_classes = self.sess.run(
            [self.scores, self.boxes, self.box_classes],
            feed_dict={self.images: _images, self.width: width, self.height: height})

        return scores, boxes, box_classes

    def PresentResult(self, image, results, imshow=True, detected_image_file=None):
        """Draw the recognized results on a copy of ``image``.

        Args:
            image: image to annotate; it is copied, not modified.
            results: sequence of ``(class_name, x, y, w, h, score)`` where
                ``x, y`` is the box centre and ``w, h`` its full size.
            imshow: when true, show the annotated image in a window.
            detected_image_file: when truthy, write the annotated image to
                this path.
        """
        img_cp = image.copy()

        for det in results:
            label = det[0]
            score = det[5]
            x = int(det[1])
            y = int(det[2])
            # Half extents: the rectangle is drawn around the centre.
            w = int(det[3]) // 2
            h = int(det[4]) // 2

            if self.verbose:
                print("class: %s, [x, y, w, h]=[%d, %d, %d, %d], confidence=%f" % (label, x, y, w, h, score))

            # Drawing is independent of `verbose`; only the printing is gated.
            cv2.rectangle(img_cp, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
            cv2.rectangle(img_cp, (x - w, y - h - 20), (x + w, y - h), (125, 125, 125), -1)
            cv2.putText(img_cp, label + ' : %.2f' % score, (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

        if imshow:
            cv2.imshow('Object Recognition Using YOLO', img_cp)
            cv2.waitKey(1)

        if detected_image_file:
            cv2.imwrite(detected_image_file, img_cp)

In [4]:
if __name__ == "__main__":
    # Build the detector from the pretrained checkpoint, then run it on
    # the sample image; results are printed, shown and written to disk
    # according to DetectFile's defaults.
    yolo_net = YOLO(weightFile="YOLO_Weight.ckpt")
    yolo_net.DetectFile(image_file="images.jpg")

INFO:tensorflow:Restoring parameters from YOLO_Weight.ckpt
class: car, [x, y, w, h]=[622, 505, 101, 75], confidence=0.300315
class: car, [x, y, w, h]=[348, 593, 143, 82], confidence=0.291154
class: car, [x, y, w, h]=[1084, 521, 65, 87], confidence=0.232172
