In [1]:
import numpy as np
import tensorflow as tf
import config as cfg

import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import tensorflow as tf


# Short alias for TF-Slim, used below for layers and the train op.
# NOTE(review): tf.contrib is being sunset (see the deprecation notice this
# cell prints), so this notebook presumably needs TensorFlow 1.x -- confirm.
slim = tf.contrib.slim


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [2]:
# Root directory holding every dataset used by this notebook.
DATA_PATH = 'data'
# PASCAL VOC lives directly under the data root ...
PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')
# ... and the pickled ground-truth label cache sits inside it.
CACHE_PATH = os.path.join(DATA_PATH, 'pascal_voc', 'cache')

In [3]:
class dataset_pascal_voc(object):
    """Mini-batch provider built on cached PASCAL VOC ground-truth labels.

    Each label record is a dict with the keys ``'imname'`` (image path),
    ``'flipped'`` (bool) and ``'label'`` (a ``cell_size x cell_size x
    (5 + num_classes)`` array; channel 0 is the object indicator, channel 1
    the box x-centre in pixels of the resized image, channels 5+ the classes).

    Args:
        phase: Dataset split name; it is embedded in the cache file name
            ``pascal_<phase>_gt_labels.pkl``.
        rebuild: When True the cache is not used. Rebuilding labels is not
            implemented in this class, so this currently raises.
    """

    def __init__(self, phase, rebuild=False):
        self.devkil_path = os.path.join(PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        self.cache_path = CACHE_PATH
        self.batch_size = 20
        self.image_size = 448
        self.cell_size = 7
        self.classes = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']
        # Maps class name -> integer class index.
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = True
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0   # index of the next record handed out by get()
        self.epoch = 1    # incremented every time the cursor wraps around
        self.labels_got = None
        self.prepare()

    def get(self):
        """Return the next mini-batch and advance the read cursor.

        Returns:
            (X_img, Y_labels): images of shape
            ``(batch_size, image_size, image_size, 3)`` and labels of shape
            ``(batch_size, cell_size, cell_size, 5 + num_classes)``.

        Raises:
            ValueError: If no label records are loaded.
        """
        if not self.labels_got:
            raise ValueError("No labels loaded; cannot build a batch")

        label_depth = 5 + len(self.classes)
        X_img = np.zeros((self.batch_size, self.image_size, self.image_size, 3))
        Y_labels = np.zeros((self.batch_size, self.cell_size, self.cell_size, label_depth))

        for slot in range(self.batch_size):
            record = self.labels_got[self.cursor]
            X_img[slot, :, :, :] = self.read_image(record['imname'], record['flipped'])
            Y_labels[slot, :, :, :] = record['label']

            # Move on to the next record; without this every slot of every
            # batch would contain the very same sample.
            self.cursor += 1
            if self.cursor >= len(self.labels_got):
                # End of an epoch: reshuffle and start over.
                np.random.shuffle(self.labels_got)
                self.cursor = 0
                self.epoch += 1

        return X_img, Y_labels

    def read_image(self, img_name, flipped=False):
        """Load an image as float32 RGB, resized and scaled to [-1, 1].

        Args:
            img_name: Path of the image file.
            flipped: When True the image is mirrored horizontally.

        Raises:
            IOError: If OpenCV cannot read the file.
        """
        image = cv2.imread(img_name)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Could not read image: " + str(img_name))
        # Resize to (image_size, image_size), e.g. 448 x 448.
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        """Load labels, optionally append mirrored copies, shuffle and store.

        Returns:
            The shuffled list of label records (also kept in
            ``self.labels_got``).
        """
        print("In prepare")
        labels_got = self.get_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            labels_copy = copy.deepcopy(labels_got)
            labels_got = self.data_augment(labels_got, labels_copy)
        # Shuffle whether or not augmentation ran, so flipped=False works too.
        np.random.shuffle(labels_got)
        self.labels_got = labels_got
        print("labels len", len(labels_got))
        return labels_got

    def data_augment(self, orig_labels, labels_copy):
        """Turn ``labels_copy`` into mirrored records and append them.

        Args:
            orig_labels: Original records; extended in place.
            labels_copy: Independent (deep) copy of the records; modified in
                place to describe the horizontally flipped images.

        Returns:
            ``orig_labels`` (same list object) followed by the mirrored records.
        """
        print("Create flipped data")
        for record in labels_copy:
            record['flipped'] = True
            # Mirror the grid along its column (x) axis.
            mirrored = record['label'][:, ::-1, :].copy()
            # In cells that hold an object, mirror the x-centre as well.
            has_object = mirrored[..., 0] == 1
            mirrored[has_object, 1] = self.image_size - 1 - mirrored[has_object, 1]
            record['label'] = mirrored
        orig_labels += labels_copy
        return orig_labels

    def get_labels(self):
        """Load the pickled ground-truth labels for ``self.phase``.

        Returns:
            The list of label records stored in the cache file.

        Raises:
            NotImplementedError: If ``self.rebuild`` is True (no rebuild path
                exists in this class).
            FileNotFoundError: If the cache file does not exist.
        """
        cache_file = os.path.join(self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')
        print(self.cache_path)
        if self.rebuild:
            raise NotImplementedError(
                "Rebuilding labels is not implemented; use rebuild=False with "
                "an existing cache file: " + cache_file)
        if not os.path.isfile(cache_file):
            raise FileNotFoundError("Label cache not found: " + cache_file)

        print("Getting labels from " + cache_file)
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # cache files you created yourself.
        with open(cache_file, 'rb') as labels_file:
            labels_got = pickle.load(labels_file)

        print("original labels length :", len(labels_got))
        return labels_got

In [4]:
class YoloNet(object):
    """YOLO-style detection network (TF 1.x graph mode, TF-Slim layers).

    Building an instance creates the ``images`` placeholder and the network
    output ``logits``. When ``is_training`` is True it additionally creates the
    ``labels`` placeholder, registers the four loss terms and exposes their sum
    (plus regularisation losses) as ``total_loss``.

    Args:
        is_training: Build the loss and enable dropout when True; build an
            inference-only graph with dropout disabled when False.
    """

    def __init__(self, is_training=True):
        self.classes = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']
        self.num_of_classes = len(self.classes)
        self.image_size = 448
        self.cell_size = 7
        self.boxes_per_cell = 2
        # Flat output length = S * S * (C + B * 5).
        self.output_size = (self.cell_size * self.cell_size) * (self.num_of_classes + self.boxes_per_cell * 5)
        # Weights of the individual loss terms.
        self.object_scale = 1.0
        self.no_object_scale = 1.0
        self.class_scale = 2.0
        self.coordi_scale = 5.0

        self.learning_rate = 0.0001
        self.batch_size = 20
        self.alpha = 0.1
        # The flat output is laid out as [class scores | box confidences | boxes].
        self.boundary1 = self.cell_size * self.cell_size * self.num_of_classes  # 7*7*20 = 980
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell  # 980 + 7*7*2 = 1078

        # Shape (S, S, B); offset[row, col, b] == col, i.e. the x index of the cell.
        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        # Forward the caller's mode so that dropout is off for inference graphs.
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_of_classes])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    def build_network(self, images, num_outputs, alpha, keep_prob=0.5, is_training=True, scope='yolo'):
        """Build the convolutional backbone and the fully connected head.

        Args:
            images: Input image tensor in NHWC layout.
            num_outputs: Width of the final (linear) fully connected layer.
            alpha: Negative slope of the leaky ReLU activations.
            keep_prob: Dropout keep probability.
            is_training: Enables dropout when True.
            scope: Variable scope that holds all layers.

        Returns:
            Tensor of shape ``(batch, num_outputs)`` with the raw predictions.
        """
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                # Explicit padding followed by a VALID conv: 64 filters, 7x7, stride 2.
                net = tf.pad(images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]), name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')

                net = slim.conv2d(net, 192, 3, scope='conv_4')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')

                net = slim.conv2d(net, 128, 1, scope='conv_6')
                net = slim.conv2d(net, 256, 3, scope='conv_7')
                net = slim.conv2d(net, 256, 1, scope='conv_8')
                net = slim.conv2d(net, 512, 3, scope='conv_9')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')

                net = slim.conv2d(net, 256, 1, scope='conv_11')
                net = slim.conv2d(net, 512, 3, scope='conv_12')
                net = slim.conv2d(net, 256, 1, scope='conv_13')
                net = slim.conv2d(net, 512, 3, scope='conv_14')
                net = slim.conv2d(net, 256, 1, scope='conv_15')
                net = slim.conv2d(net, 512, 3, scope='conv_16')
                net = slim.conv2d(net, 256, 1, scope='conv_17')
                net = slim.conv2d(net, 512, 3, scope='conv_18')
                net = slim.conv2d(net, 512, 1, scope='conv_19')
                net = slim.conv2d(net, 1024, 3, scope='conv_20')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')

                net = slim.conv2d(net, 512, 1, scope='conv_22')
                net = slim.conv2d(net, 1024, 3, scope='conv_23')
                net = slim.conv2d(net, 512, 1, scope='conv_24')
                net = slim.conv2d(net, 1024, 3, scope='conv_25')
                net = slim.conv2d(net, 1024, 3, scope='conv_26')
                net = tf.pad(net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]), name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')
                net = slim.conv2d(net, 1024, 3, scope='conv_29')
                net = slim.conv2d(net, 1024, 3, scope='conv_30')
                # NHWC -> NCHW before flattening.
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                # Dropout must follow the requested mode; it was previously
                # always active, which randomised inference outputs.
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                # Linear output layer, e.g. 7 * 7 * (2 * 5 + 20) = 1470 values.
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
                print(net)
        return net

    def calculate_iou(self, boxes1, boxes2, scope='iou'):
        """Element-wise intersection over union of two sets of boxes.

        Args:
            boxes1: Boxes whose last axis is (x_center, y_center, w, h).
            boxes2: Boxes in the same format, with a shape compatible with
                ``boxes1``.
            scope: Variable scope name for the ops.

        Returns:
            IoU values clipped to [0, 1], with the last axis removed.
        """
        with tf.variable_scope(scope):
            # Convert (x_center, y_center, w, h) to corners (x1, y1, x2, y2).
            boxes1_temp = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                    boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                    boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                    boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                   axis=-1)

            boxes2_temp = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                    boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                    boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                    boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                   axis=-1)

            # Corners of the overlap rectangle: largest (x1, y1), smallest (x2, y2).
            lu = tf.maximum(boxes1_temp[..., :2], boxes2_temp[..., :2])
            ru = tf.minimum(boxes1_temp[..., 2:], boxes2_temp[..., 2:])

            # Overlap width/height is (x2 - x1, y2 - y1); it is negative when
            # the boxes are disjoint, hence the clamp at zero. (The operands
            # were previously swapped, giving 0 for overlapping boxes.)
            intersection = tf.maximum(0.0, ru - lu)
            intersection_area = intersection[..., 0] * intersection[..., 1]

            box1_area = boxes1[..., 2] * boxes1[..., 3]
            box2_area = boxes2[..., 2] * boxes2[..., 3]

            # Lower bound avoids a division by zero for degenerate boxes.
            union = tf.maximum(box1_area + box2_area - intersection_area, 1e-10)

        return tf.clip_by_value(intersection_area / union, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope="loss_layer"):
        """Register the class, object, no-object and coordinate losses.

        The four terms are added to the TF loss collection via
        ``tf.losses.add_loss`` and logged as summaries; nothing is returned.

        Args:
            predicts: Flat network output of shape ``(batch_size, output_size)``.
            labels: Ground truth of shape ``(batch_size, S, S, 5 + C)`` with
                channel 0 = object indicator, 1:5 = box in pixels, 5: = classes.
            scope: Variable scope name for the ops.
        """
        with tf.variable_scope(scope):
            # Split the flat prediction into its three parts.
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_of_classes])
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

            classes = labels[..., 5:]
            response = tf.reshape(
                labels[..., 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[..., 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            # One copy of the ground-truth box per predictor, scaled by image size.
            boxes = tf.tile(boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size

            # offset[n, row, col, b] == col (x index of the cell).
            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            # offset_tran[n, row, col, b] == row (y index of the cell).
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))

            # Predictions are cell-relative centres and square roots of the
            # size; convert them to image-relative (x, y, w, h) for the IoU.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            iou_predict_truth = self.calculate_iou(predict_boxes_tran, boxes)

            # object_mask is 1 for the predictor with the highest IoU in every
            # cell that contains an object; shape (batch, S, S, B).
            object_mask = tf.reduce_max(iou_predict_truth, 3, keepdims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response

            # Complement of object_mask.
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask

            # Ground truth expressed in the same parametrisation as the raw
            # predictions (cell-relative centre, sqrt of width/height).
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # Class loss: squared class-score error in cells holding an object.
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale

            # Object loss: the responsible predictor's confidence should
            # match its IoU with the ground truth.
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale

            # No-object loss: every other predictor's confidence should be 0.
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.no_object_scale

            # Coordinate loss: squared error on (x, y, sqrt(w), sqrt(h)) for
            # the responsible predictors only.
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coordi_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)

            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)

            
            
def leaky_relu(alpha):
    """Return an activation callable applying a leaky ReLU with slope ``alpha``.

    The returned function takes only the input tensor, which is the form an
    ``activation_fn`` argument expects.
    """
    def _activation(inputs):
        # Slope and op name are fixed at factory time; only inputs vary.
        activated = tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
        return activated

    return _activation

In [5]:
#yolo = YoloNet(is_training= False)

In [6]:
class Yolotrain(object):
    """Gradient-descent training driver for a network and a batch provider.

    Args:
        net: Object exposing ``images`` and ``labels`` placeholders and a
            ``total_loss`` tensor.
        data: Object whose ``get()`` returns an ``(images, labels)`` batch.

    The TensorFlow session created here is stored in ``self.sess`` and is left
    open after training.
    """

    def __init__(self, net, data):
        self.net = net
        self.data = data
        #self.weights_file = cfg.WEIGHTS_FILE
        # Schedule settings come from the project's config module.
        self.max_iter = cfg.MAX_ITER
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.summary_iter = cfg.SUMMARY_ITER
        self.summary_op = tf.summary.merge_all()

        self.global_step = tf.train.create_global_step()
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def train(self, num_steps=1):
        """Run the training loop, printing the loss after every step.

        Args:
            num_steps: Number of optimisation steps to run. The default of 1
                keeps the original smoke-test behaviour; pass ``None`` to run
                the configured ``self.max_iter`` steps.
        """
        if num_steps is None:
            num_steps = self.max_iter

        for step in range(1, num_steps + 1):
            images, labels = self.data.get()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}
            print("Step", step)
            # The summary is evaluated but not written anywhere yet.
            _, loss, _ = self.sess.run(
                [self.summary_op, self.net.total_loss, self.train_op],
                feed_dict=feed_dict)
            print("loss :", loss)
            
            
            

In [7]:

# Assemble the pipeline: batch provider -> network graph (with loss) -> trainer.
pascal_dataset = dataset_pascal_voc(phase='train')
yolo = YoloNet(is_training=True)
train_yolo_obj = Yolotrain(net=yolo, data=pascal_dataset)

# Run the training loop, bracketed by progress messages.
print('Start training ...')
train_yolo_obj.train()
print('Done training.')

In prepare
data/pascal_voc/cache
Getting labels from data/pascal_voc/cache/pascal_train_gt_labels.pkl
original labels length : 5011
Appending horizontally-flipped training examples ...
Create flipped data
labels len 10022
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Tensor("yolo/fc_36/BiasAdd:0", shape=(?, 1470), dtype=float32)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.cast instead.
Start training ...
Step 1
loss : 63.854614
Done training.
