| @@ -0,0 +1,454 @@ | ||
| from keras.models import Model | ||
| from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda | ||
| from keras.layers.advanced_activations import LeakyReLU | ||
| import tensorflow as tf | ||
| import numpy as np | ||
| import os | ||
| import cv2 | ||
| from keras.applications.mobilenet import MobileNet | ||
| from keras.layers.merge import concatenate | ||
| from keras.optimizers import SGD, Adam, RMSprop | ||
| from preprocessing import BatchGenerator | ||
| from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard | ||
| from utils import BoundBox | ||
| from backend import TinyYoloFeature, FullYoloFeature, MobileNetFeature, SqueezeNetFeature, Inception3Feature, \ | ||
| VGG16Feature, ResNet50Feature | ||
|
|
||
|
|
||
class YOLO(object):
    def __init__(self, architecture,
                 input_size,
                 labels,
                 max_box_per_image,
                 anchors):
        """Build the YOLOv2 detection network on top of a backbone.

        architecture: backbone name ('Full Yolo', 'Tiny Yolo', 'MobileNet',
            'SqueezeNet', 'VGG16', 'ResNet50' or 'Inception3').
        input_size: side length of the square network input.
        labels: iterable of class names.
        max_box_per_image: capacity of the ground-truth box buffer input.
        anchors: flat list [w0, h0, w1, h1, ...] of anchor sizes in grid units.

        Raises Exception for an unknown architecture name.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        # '//' keeps this an int (anchors come in w/h pairs); plain '/' would
        # yield a float under Python 3 and break the Conv2D filter count below.
        self.nb_box = len(anchors) // 2
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if architecture == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif architecture == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif architecture == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif architecture == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif architecture == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif architecture == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif architecture == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception(
                'Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!')

        # print-function form also works under Python 2 for a single argument
        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer: one 1x1 conv producing
        # nb_box * (x, y, w, h, conf, class scores...) channels per grid cell
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1, 1), strides=(1, 1),
                        padding='same',
                        name='conv_23',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)
        # Lambda just forwards the prediction; true_boxes is threaded through
        # the graph so the custom loss can read the ground truth at train time
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer with small random
        # values scaled down by the number of grid cells
        layer = self.model.layers[-4]
        weights = layer.get_weights()

        new_kernel = np.random.normal(size=weights[0].shape) / (self.grid_h * self.grid_w)
        new_bias = np.random.normal(size=weights[1].shape) / (self.grid_h * self.grid_w)

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()
||
    def custom_loss(self, y_true, y_pred):
        """YOLOv2 training loss.

        y_true: (batch, grid_h, grid_w, nb_box, 4+1+nb_class) ground-truth
        tensor; y_pred: same-shaped raw network output. Also reads
        self.true_boxes (the per-image ground-truth buffer fed as a second
        model input) plus the scale hyper-parameters and warmup_bs set in
        train(). Returns a scalar tensor summing coordinate, confidence and
        classification terms.
        """
        mask_shape = tf.shape(y_true)[:4]

        # (1, grid_h, grid_w, 1, 2) grid of cell offsets, tiled over batch
        # and anchor boxes; added to the sigmoid'ed xy to get absolute centers
        cell_x = tf.to_float(
            tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))

        cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        # batch counters: 'seen' drives the warm-up schedule below
        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        """
        Adjust prediction
        """
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1, 1, 1, self.nb_box, 2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        """
        Adjust ground truth
        """
        ### adjust x and y
        true_box_xy = y_true[..., 0:2]  # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4]  # number of cells accross, horizontally and vertically

        ### adjust confidence: compute IOU of predicted vs true boxes
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        # target confidence = IOU with the assigned ground-truth box
        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        """
        Determine the masks
        """
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penelize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        # add an axis so each predictor is compared against every buffered box
        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale

        """
        Warm-up training
        """
        # for the first warmup_bs batches, every empty cell is trained towards
        # its anchor shape centered in the cell, to stabilize the box predictor
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
        seen = tf.assign_add(seen, 1.)

        true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, self.warmup_bs),
                                                       lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                                                                true_box_wh + tf.ones_like(true_box_wh) * np.reshape(
                                                                    self.anchors,
                                                                    [1, 1, 1, self.nb_box, 2]) * no_boxes_mask,
                                                                tf.ones_like(coord_mask)],
                                                       lambda: [true_box_xy,
                                                                true_box_wh,
                                                                coord_mask])

        """
        Finalize the loss
        """
        # counts of active entries in each mask, used to average the terms
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        loss_xy = tf.reduce_sum(tf.square(true_box_xy - pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(tf.square(true_box_wh - pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(tf.square(true_box_conf - pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = loss_xy + loss_wh + loss_conf + loss_class

        # if self.debug:
        #     nb_true_box = tf.reduce_sum(y_true[..., 4])
        #     nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))
        #
        #     current_recall = nb_pred_box/(nb_true_box + 1e-6)
        #     total_recall = tf.assign_add(total_recall, current_recall)
        #
        #     loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \t', summarize=1000)
        #     loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
        #     loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
        #     loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
        #     loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
        #     loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
        #     loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
        #     loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)

        return loss
|
|
||
    def load_weights(self, weight_path):
        """Load previously-saved Keras weights from `weight_path` into the model."""
        self.model.load_weights(weight_path)
|
|
||
    def predict(self, image):
        """Run detection on a single image and return a list of BoundBox.

        image: HxWx3 array; the [:, :, ::-1] channel flip below suggests it is
        expected in cv2's BGR order and converted to RGB — TODO confirm
        against callers.
        """
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        input_image = image[:, :, ::-1]
        input_image = np.expand_dims(input_image, 0)
        # the model graph takes true_boxes as a second input even at
        # inference time; feed zeros of the right shape
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = self.decode_netout(netout)

        return boxes
|
|
||
| def bbox_iou(self, box1, box2): | ||
| x1_min = box1.x - box1.w / 2 | ||
| x1_max = box1.x + box1.w / 2 | ||
| y1_min = box1.y - box1.h / 2 | ||
| y1_max = box1.y + box1.h / 2 | ||
|
|
||
| x2_min = box2.x - box2.w / 2 | ||
| x2_max = box2.x + box2.w / 2 | ||
| y2_min = box2.y - box2.h / 2 | ||
| y2_max = box2.y + box2.h / 2 | ||
|
|
||
| intersect_w = self.interval_overlap([x1_min, x1_max], [x2_min, x2_max]) | ||
| intersect_h = self.interval_overlap([y1_min, y1_max], [y2_min, y2_max]) | ||
|
|
||
| intersect = intersect_w * intersect_h | ||
|
|
||
| union = box1.w * box1.h + box2.w * box2.h - intersect | ||
|
|
||
| return float(intersect) / union | ||
|
|
||
| def interval_overlap(self, interval_a, interval_b): | ||
| x1, x2 = interval_a | ||
| x3, x4 = interval_b | ||
|
|
||
| if x3 < x1: | ||
| if x4 < x1: | ||
| return 0 | ||
| else: | ||
| return min(x2, x4) - x1 | ||
| else: | ||
| if x2 < x3: | ||
| return 0 | ||
| else: | ||
| return min(x2, x4) - x3 | ||
|
|
||
| def decode_netout(self, netout, obj_threshold=0.3, nms_threshold=0.3): | ||
| grid_h, grid_w, nb_box = netout.shape[:3] | ||
|
|
||
| boxes = [] | ||
|
|
||
| # decode the output by the network | ||
| netout[..., 4] = self.sigmoid(netout[..., 4]) | ||
| netout[..., 5:] = netout[..., 4][..., np.newaxis] * self.softmax(netout[..., 5:]) | ||
| netout[..., 5:] *= netout[..., 5:] > obj_threshold | ||
|
|
||
| for row in range(grid_h): | ||
| for col in range(grid_w): | ||
| for b in range(nb_box): | ||
| # from 4th element onwards are confidence and class classes | ||
| classes = netout[row, col, b, 5:] | ||
|
|
||
| if np.sum(classes) > 0: | ||
| # first 4 elements are x, y, w, and h | ||
| x, y, w, h = netout[row, col, b, :4] | ||
|
|
||
| x = (col + self.sigmoid(x)) / grid_w # center position, unit: image width | ||
| y = (row + self.sigmoid(y)) / grid_h # center position, unit: image height | ||
| w = self.anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width | ||
| h = self.anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height | ||
| confidence = netout[row, col, b, 4] | ||
|
|
||
| box = BoundBox(x, y, w, h, confidence, classes) | ||
|
|
||
| boxes.append(box) | ||
|
|
||
| # suppress non-maximal boxes | ||
| for c in range(self.nb_class): | ||
| sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) | ||
|
|
||
| for i in xrange(len(sorted_indices)): | ||
| index_i = sorted_indices[i] | ||
|
|
||
| if boxes[index_i].classes[c] == 0: | ||
| continue | ||
| else: | ||
| for j in xrange(i + 1, len(sorted_indices)): | ||
| index_j = sorted_indices[j] | ||
|
|
||
| if self.bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold: | ||
| boxes[index_j].classes[c] = 0 | ||
|
|
||
| # remove the boxes which are less likely than a obj_threshold | ||
| boxes = [box for box in boxes if box.get_score() > obj_threshold] | ||
|
|
||
| return boxes | ||
|
|
||
| def sigmoid(self, x): | ||
| return 1. / (1. + np.exp(-x)) | ||
|
|
||
| def softmax(self, x, axis=-1, t=-100.): | ||
| x = x - np.max(x) | ||
|
|
||
| if np.min(x) < t: | ||
| x = x / np.min(x) * t | ||
|
|
||
| e_x = np.exp(x) | ||
|
|
||
| return e_x / e_x.sum(axis, keepdims=True) | ||
|
|
||
| def train(self, train_imgs, # the list of images to train the model | ||
| valid_imgs, # the list of images used to validate the model | ||
| train_times, # the number of time to repeat the training set, often used for small datasets | ||
| valid_times, # the number of times to repeat the validation set, often used for small datasets | ||
| nb_epoch, # number of epoches | ||
| learning_rate, # the learning rate | ||
| batch_size, # the size of the batch | ||
| warmup_epochs, # number of initial batches to let the model familiarize with the new dataset | ||
| object_scale, | ||
| no_object_scale, | ||
| coord_scale, | ||
| class_scale, | ||
| saved_weights_name='best_weights.h5', | ||
| debug=False): | ||
|
|
||
| self.batch_size = batch_size | ||
| self.warmup_bs = warmup_epochs * ( | ||
| train_times * (len(train_imgs) / batch_size + 1) + valid_times * (len(valid_imgs) / batch_size + 1)) | ||
|
|
||
| self.object_scale = object_scale | ||
| self.no_object_scale = no_object_scale | ||
| self.coord_scale = coord_scale | ||
| self.class_scale = class_scale | ||
|
|
||
| self.debug = debug | ||
|
|
||
| if warmup_epochs > 0: nb_epoch = warmup_epochs # if it's warmup stage, don't train more than warmup_epochs | ||
|
|
||
| ############################################ | ||
| # Compile the model | ||
| ############################################ | ||
|
|
||
| optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) | ||
| self.model.compile(loss=self.custom_loss, optimizer=optimizer) | ||
|
|
||
| ############################################ | ||
| # Make train and validation generators | ||
| ############################################ | ||
|
|
||
| generator_config = { | ||
| 'IMAGE_H': self.input_size, | ||
| 'IMAGE_W': self.input_size, | ||
| 'GRID_H': self.grid_h, | ||
| 'GRID_W': self.grid_w, | ||
| 'BOX': self.nb_box, | ||
| 'LABELS': self.labels, | ||
| 'CLASS': len(self.labels), | ||
| 'ANCHORS': self.anchors, | ||
| 'BATCH_SIZE': self.batch_size, | ||
| 'TRUE_BOX_BUFFER': self.max_box_per_image, | ||
| } | ||
|
|
||
| train_batch = BatchGenerator(train_imgs, | ||
| generator_config, | ||
| norm=self.feature_extractor.normalize) | ||
| valid_batch = BatchGenerator(valid_imgs, | ||
| generator_config, | ||
| norm=self.feature_extractor.normalize, | ||
| jitter=False) | ||
|
|
||
| ############################################ | ||
| # Make a few callbacks | ||
| ############################################ | ||
|
|
||
| early_stop = EarlyStopping(monitor='val_loss', | ||
| min_delta=0.001, | ||
| patience=3, | ||
| mode='min', | ||
| verbose=1) | ||
| checkpoint = ModelCheckpoint(saved_weights_name, | ||
| monitor='val_loss', | ||
| verbose=1, | ||
| save_best_only=True, | ||
| mode='min', | ||
| period=1) | ||
| if os.path.exists('logs') is False: | ||
| os.mkdir('logs') | ||
|
|
||
| tb_counter = len([log for log in os.listdir(os.path.expanduser('logs/')) if 'yolo' in log]) + 1 | ||
| tensorboard = TensorBoard(log_dir=os.path.expanduser('logs/') + 'yolo' + '_' + str(tb_counter), | ||
| histogram_freq=0, | ||
| # write_batch_performance=True, | ||
| write_graph=True, | ||
| write_images=False) | ||
|
|
||
| ############################################ | ||
| # Start the training process | ||
| ############################################ | ||
|
|
||
| self.model.fit_generator(generator=train_batch, | ||
| steps_per_epoch=len(train_batch) * train_times, | ||
| epochs=nb_epoch, | ||
| verbose=1, | ||
| validation_data=valid_batch, | ||
| validation_steps=len(valid_batch) * valid_times, | ||
| callbacks=[early_stop, checkpoint, tensorboard]) |
| @@ -0,0 +1,330 @@ | ||
| import os | ||
| import cv2 | ||
| import copy | ||
| import numpy as np | ||
| import imgaug as ia | ||
| from imgaug import augmenters as iaa | ||
| from keras.utils import Sequence | ||
| import xml.etree.ElementTree as ET | ||
| from utils import BoundBox, normalize, bbox_iou | ||
| import sys | ||
|
|
||
|
|
||
def parse_annotation(ann_dir, img_dir, labels=[]):
    """Parse PASCAL-VOC style XML annotations.

    ann_dir: directory of per-image XML files (expected to end with a path
        separator, since it is concatenated directly with the file name).
    img_dir: prefix prepended to each <filename>.
    labels: when non-empty, objects with other names are skipped.

    Returns (all_imgs, seen_labels): a list of image dicts — each with
    'filename', 'width', 'height' and an 'object' list of box dicts — and a
    name -> count histogram over every object encountered (including ones
    filtered out by `labels`).

    NOTE: the mutable default `labels=[]` is only read, never mutated, so
    the shared-default pitfall does not bite here.
    """
    all_imgs = []
    seen_labels = {}

    for ann in sorted(os.listdir(ann_dir)):
        img = {'object': []}

        tree = ET.parse(ann_dir + ann)

        for elem in tree.iter():
            if 'filename' in elem.tag:
                img['filename'] = img_dir + elem.text
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}

                for attr in list(elem):
                    if 'name' in attr.tag:
                        obj['name'] = attr.text

                        # histogram counts every name, even filtered ones
                        if obj['name'] in seen_labels:
                            seen_labels[obj['name']] += 1
                        else:
                            seen_labels[obj['name']] = 1

                        # unlisted label: abandon this object entirely — the
                        # break also skips parsing its bndbox children
                        if len(labels) > 0 and obj['name'] not in labels:
                            break
                        else:
                            img['object'] += [obj]

                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            if 'xmin' in dim.tag:
                                obj['xmin'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                obj['ymin'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                obj['xmax'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                obj['ymax'] = int(round(float(dim.text)))

        # only keep images that ended up with at least one object
        if len(img['object']) > 0:
            all_imgs += [img]

    return all_imgs, seen_labels
|
|
||
|
|
||
def parse_rovio_annotation(ann_dir, img_dir, labels=[]):
    """Parse one-box-per-file Rovio annotations.

    Each file in `ann_dir` holds a single line "<tag> xmin ymin xmax ymax";
    the matching image is '<basename>.png' under `img_dir`. Returns a list of
    image dicts shaped like parse_annotation()'s output, each with a single
    'rovio' object. Malformed files are reported on stderr/stdout and skipped.
    """
    all_x = []

    for l in os.listdir(ann_dir):
        img = {'object': []}
        obj = {}
        sys.stdout.write('\rParsing annotation {}'.format(l))
        sys.stdout.flush()

        ann_path = os.path.join(ann_dir, l)
        if os.path.isfile(ann_path):
            with open(ann_path, 'r') as f:
                line = f.read()

            try:
                _, xmin, ymin, xmax, ymax = line.split()

                obj['xmin'] = int(round(float(xmin)))
                obj['ymin'] = int(round(float(ymin)))
                obj['xmax'] = int(round(float(xmax)))
                obj['ymax'] = int(round(float(ymax)))
                obj['name'] = 'rovio'
                # splitext handles dotted basenames, unlike l.split('.')[0]
                img['filename'] = os.path.join(img_dir, os.path.splitext(l)[0] + '.png')

                img['object'] += [obj]
                all_x += [img]
            except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
                import traceback

                # print_exc() writes the traceback itself and returns None;
                # the original print(traceback.print_exc()) also printed 'None'
                traceback.print_exc()
                print('something wrong with {}'.format(line))

    return all_x
|
|
||
|
|
||
class BatchGenerator(Sequence):
    def __init__(self, images,
                 config,
                 shuffle=True,
                 jitter=True,
                 norm=None):
        """Keras Sequence yielding YOLO training batches.

        images: list of image dicts ('filename' plus 'object' box list), as
            produced by parse_annotation().
        config: dict with IMAGE_H/W, GRID_H/W, BOX, LABELS, CLASS, ANCHORS,
            BATCH_SIZE and TRUE_BOX_BUFFER keys.
        shuffle: reshuffle `images` now and after every epoch.
        jitter: apply random scale/crop/flip and the imgaug pipeline.
        norm: optional pixel-normalization callable; when None, __getitem__
            draws debug boxes on the raw images instead.
        """
        self.generator = None

        self.images = images
        self.config = config

        self.shuffle = shuffle
        self.jitter = jitter
        self.norm = norm

        # anchors become zero-centered BoundBoxes so only w/h drive the IOU
        # when matching ground-truth boxes to anchors in __getitem__
        self.anchors = [BoundBox(0, 0, config['ANCHORS'][2 * i], config['ANCHORS'][2 * i + 1]) for i in
                        range(int(len(config['ANCHORS']) // 2))]

        ### augmentors by https://github.com/aleju/imgaug
        sometimes = lambda aug: iaa.Sometimes(0.5, aug)

        # Define our sequence of augmentation steps that will be applied to every image
        # All augmenters with per_channel=0.5 will sample one value _per image_
        # in 50% of all cases. In all other cases they will sample new values
        # _per channel_.
        self.aug_pipe = iaa.Sequential(
            [
                # apply the following augmenters to most images
                # iaa.Fliplr(0.5), # horizontally flip 50% of all images
                # iaa.Flipud(0.2), # vertically flip 20% of all images
                # sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width
                # NOTE(review): all Affine parameters are commented out, so
                # this augmenter is currently a no-op placeholder
                sometimes(iaa.Affine(
                    # scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis
                    # translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)
                    # rotate=(-5, 5), # rotate by -45 to +45 degrees
                    # shear=(-5, 5), # shear by -16 to +16 degrees
                    # order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
                    # cval=(0, 255), # if mode is constant, use a cval between 0 and 255
                    # mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)
                )),
                # execute 0 to 5 of the following (less important) augmenters per image
                # don't execute all of them, as that would often be way too strong
                iaa.SomeOf((0, 5),
                           [
                               # sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation
                               iaa.OneOf([
                                   iaa.GaussianBlur((0, 3.0)),  # blur images with a sigma between 0 and 3.0
                                   iaa.AverageBlur(k=(2, 7)),
                                   # blur image using local means with kernel sizes between 2 and 7
                                   iaa.MedianBlur(k=(3, 11)),
                                   # blur image using local medians with kernel sizes between 2 and 7
                               ]),
                               iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),  # sharpen images
                               # iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images
                               # search either for all edges or for directed edges
                               # sometimes(iaa.OneOf([
                               #    iaa.EdgeDetect(alpha=(0, 0.7)),
                               #    iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)),
                               # ])),
                               iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05 * 255), per_channel=0.5),
                               # add gaussian noise to images
                               iaa.OneOf([
                                   iaa.Dropout((0.01, 0.1), per_channel=0.5),  # randomly remove up to 10% of the pixels
                                   # iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),
                               ]),
                               # iaa.Invert(0.05, per_channel=True), # invert color channels
                               iaa.Add((-10, 10), per_channel=0.5),
                               # change brightness of images (by -10 to 10 of original value)
                               iaa.Multiply((0.5, 1.5), per_channel=0.5),
                               # change brightness of images (50-150% of original value)
                               iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5),  # improve or worsen the contrast
                               # iaa.Grayscale(alpha=(0.0, 1.0)),
                               # sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)
                               # sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around
                           ],
                           random_order=True
                           )
            ],
            random_order=True
        )

        # initial shuffle; on_epoch_end reshuffles between epochs
        if shuffle: np.random.shuffle(self.images)
|
|
||
| def __len__(self): | ||
| return int(np.ceil(float(len(self.images)) / self.config['BATCH_SIZE'])) | ||
|
|
||
    def __getitem__(self, idx):
        """Assemble batch `idx`.

        Returns ([x_batch, b_batch], y_batch):
          x_batch: (B, IMAGE_H, IMAGE_W, 3) input images;
          b_batch: (B, 1, 1, 1, TRUE_BOX_BUFFER, 4) ground-truth box buffer;
          y_batch: (B, GRID_H, GRID_W, BOX, 4+1+CLASS) desired network output.
        The last batch is right-aligned: it reuses earlier images so every
        batch has exactly BATCH_SIZE entries.
        """
        l_bound = idx * self.config['BATCH_SIZE']
        r_bound = (idx + 1) * self.config['BATCH_SIZE']

        # clamp the final batch back so it is always full-sized
        if r_bound > len(self.images):
            r_bound = len(self.images)
            l_bound = r_bound - self.config['BATCH_SIZE']

        instance_count = 0

        x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3))  # input images
        b_batch = np.zeros((r_bound - l_bound, 1, 1, 1, self.config['TRUE_BOX_BUFFER'],
                            4))  # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes
        y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'],
                            4 + 1 + self.config['CLASS']))  # desired network output

        for train_instance in self.images[l_bound:r_bound]:
            # augment input image and fix object's position and size
            img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

            # construct output from object's x, y, w, h
            true_box_index = 0

            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:
                    # box center in grid-cell units
                    center_x = .5 * (obj['xmin'] + obj['xmax'])
                    center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])
                    center_y = .5 * (obj['ymin'] + obj['ymax'])
                    center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])

                    grid_x = int(np.floor(center_x))
                    grid_y = int(np.floor(center_y))

                    if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                        obj_indx = self.config['LABELS'].index(obj['name'])

                        center_w = (obj['xmax'] - obj['xmin']) / (
                                float(self.config['IMAGE_W']) / self.config['GRID_W'])  # unit: grid cell
                        center_h = (obj['ymax'] - obj['ymin']) / (
                                float(self.config['IMAGE_H']) / self.config['GRID_H'])  # unit: grid cell

                        box = [center_x, center_y, center_w, center_h]

                        # find the anchor that best predicts this box
                        best_anchor = -1
                        max_iou = -1

                        # compare on w/h only: both boxes centered at origin
                        shifted_box = BoundBox(0,
                                               0,
                                               center_w,
                                               center_h)

                        for i in range(len(self.anchors)):
                            anchor = self.anchors[i]
                            iou = bbox_iou(shifted_box, anchor)

                            if max_iou < iou:
                                best_anchor = i
                                max_iou = iou

                        # assign ground truth x, y, w, h, confidence and class probs to y_batch
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1

                        # assign the true box to b_batch
                        b_batch[instance_count, 0, 0, 0, true_box_index] = box

                        # wrap around: buffer overflow overwrites oldest entry
                        true_box_index += 1
                        true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

            # assign input image to x_batch
            if self.norm != None:
                x_batch[instance_count] = self.norm(img)
            else:
                # plot image and bounding boxes for sanity check
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                        cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']),
                                      (255, 0, 0), 3)
                        cv2.putText(img[:, :, ::-1], obj['name'],
                                    (obj['xmin'] + 2, obj['ymin'] + 12),
                                    0, 1.2e-3 * img.shape[0],
                                    (0, 255, 0), 2)

                x_batch[instance_count] = img

            # increase instance counter in current batch
            instance_count += 1

        return [x_batch, b_batch], y_batch
|
|
||
| def on_epoch_end(self): | ||
| if self.shuffle: np.random.shuffle(self.images) | ||
|
|
||
| def aug_image(self, train_instance, jitter): | ||
| image_name = train_instance['filename'] | ||
| image = cv2.imread(image_name) | ||
|
|
||
| if image is None: print 'Cannot find ', image_name | ||
|
|
||
| h, w, c = image.shape | ||
| all_objs = copy.deepcopy(train_instance['object']) | ||
|
|
||
| if jitter: | ||
| ### scale the image | ||
| scale = np.random.uniform() / 10. + 1. | ||
| image = cv2.resize(image, (0, 0), fx=scale, fy=scale) | ||
|
|
||
| ### translate the image | ||
| max_offx = (scale - 1.) * w | ||
| max_offy = (scale - 1.) * h | ||
| offx = int(np.random.uniform() * max_offx) | ||
| offy = int(np.random.uniform() * max_offy) | ||
|
|
||
| image = image[offy: (offy + h), offx: (offx + w)] | ||
|
|
||
| ### flip the image | ||
| flip = np.random.binomial(1, .5) | ||
| if flip > 0.5: image = cv2.flip(image, 1) | ||
|
|
||
| image = self.aug_pipe.augment_image(image) | ||
|
|
||
| # resize the image to standard size | ||
| image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W'])) | ||
| image = image[:, :, ::-1] | ||
|
|
||
| # fix object's position and size | ||
| for obj in all_objs: | ||
| for attr in ['xmin', 'xmax']: | ||
| if jitter: obj[attr] = int(obj[attr] * scale - offx) | ||
| obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w) | ||
| obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0) | ||
|
|
||
| for attr in ['ymin', 'ymax']: | ||
| if jitter: obj[attr] = int(obj[attr] * scale - offy) | ||
|
|
||
| obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h) | ||
| obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0) | ||
|
|
||
| if jitter and flip > 0.5: | ||
| xmin = obj['xmin'] | ||
| obj['xmin'] = self.config['IMAGE_W'] - obj['xmax'] | ||
| obj['xmax'] = self.config['IMAGE_W'] - xmin | ||
|
|
||
| return image, all_objs |
| @@ -0,0 +1,176 @@ | ||
| import numpy as np | ||
| import os | ||
| import xml.etree.ElementTree as ET | ||
| import tensorflow as tf | ||
| import copy | ||
| import cv2 | ||
|
|
||
|
|
||
class BoundBox:
    """A detection box in center format: (x, y) center, (w, h) size.

    c is the objectness confidence; classes holds per-class scores. label and
    score are computed lazily and cached by get_label()/get_score().
    """

    def __init__(self, x, y, w, h, c=None, classes=None):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

        self.c = c
        self.classes = classes

        # -1 marks "not computed yet" for the lazy caches below
        self.label = -1
        self.score = -1

    def get_label(self):
        """Index of the highest-scoring class (cached after first call)."""
        if self.label == -1:
            self.label = np.argmax(self.classes)

        return self.label

    def get_score(self):
        """Score of the winning class (cached after first call)."""
        if self.score == -1:
            self.score = self.classes[self.get_label()]

        return self.score

    def get_position(self):
        """Return the (x, y, w, h) tuple.

        Bug fix: the original returned the bare names x, y, w, h, which
        raised NameError because they only exist as instance attributes.
        """
        return self.x, self.y, self.w, self.h
|
|
||
|
|
||
class WeightReader:
    """Sequential reader over a binary weight file holding float32 values.

    The first 4 values are treated as a header, so the read cursor starts
    (and resets) at offset 4.
    """

    def __init__(self, weight_file):
        self.offset = 4
        self.all_weights = np.fromfile(weight_file, dtype='float32')

    def read_bytes(self, size):
        """Return the next `size` float32 values and advance the cursor."""
        start = self.offset
        self.offset = start + size
        return self.all_weights[start:self.offset]

    def reset(self):
        """Rewind the cursor to just past the 4-value header."""
        self.offset = 4
|
|
||
|
|
||
def normalize(image):
    """Scale pixel values from the [0, 255] range into [0, 1]."""
    return image / 255.
|
|
||
|
|
||
def bbox_iou(box1, box2):
    """Intersection-over-union of two center-format boxes.

    box1/box2 expose .x, .y (center) and .w, .h (size). Returns a float;
    0.0 when the union area is zero (both boxes degenerate), which the
    original code would have turned into a ZeroDivisionError.
    """
    x1_min = box1.x - box1.w / 2
    x1_max = box1.x + box1.w / 2
    y1_min = box1.y - box1.h / 2
    y1_max = box1.y + box1.h / 2

    x2_min = box2.x - box2.w / 2
    x2_max = box2.x + box2.w / 2
    y2_min = box2.y - box2.h / 2
    y2_max = box2.y + box2.h / 2

    intersect_w = interval_overlap([x1_min, x1_max], [x2_min, x2_max])
    intersect_h = interval_overlap([y1_min, y1_max], [y2_min, y2_max])

    intersect = intersect_w * intersect_h

    union = box1.w * box1.h + box2.w * box2.h - intersect

    # guard: two zero-area boxes would otherwise divide by zero
    if union == 0:
        return 0.0

    return float(intersect) / union
|
|
||
|
|
||
def interval_overlap(interval_a, interval_b):
    """Length of the overlap between two 1-D intervals (0 if disjoint)."""
    a_lo, a_hi = interval_a
    b_lo, b_hi = interval_b

    if b_lo < a_lo:
        # b starts first: overlap runs from a's start, if they touch at all
        return 0 if b_hi < a_lo else min(a_hi, b_hi) - a_lo

    # a starts first (or together): overlap runs from b's start
    return 0 if a_hi < b_lo else min(a_hi, b_hi) - b_lo
|
|
||
|
|
||
def draw_boxes(image, boxes, labels):
    """Draw detection boxes and class labels onto `image` (mutated in place).

    boxes: BoundBox list with coordinates normalized to [0, 1] — they are
    scaled by the image dimensions here. labels maps class index -> name.
    Returns the same image array for convenience.
    """
    for box in boxes:
        # convert normalized center-format coordinates to pixel corners
        xmin = int((box.x - box.w / 2) * image.shape[1])
        xmax = int((box.x + box.w / 2) * image.shape[1])
        ymin = int((box.y - box.h / 2) * image.shape[0])
        ymax = int((box.y + box.h / 2) * image.shape[0])

        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
        cv2.putText(image,
                    labels[box.get_label()] + ' ' + str(box.get_score()),
                    (xmin, ymin - 13),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1e-3 * image.shape[0],
                    (0, 255, 0), 2)

    return image
|
|
||
|
|
||
def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class):
    """Turn a raw network output grid into a list of BoundBox.

    netout: (grid_h, grid_w, nb_box, 4+1+nb_class) array. NOTE: mutated in
    place (confidence is sigmoid'ed, class scores rescaled and thresholded).
    obj_threshold filters weak detections; nms_threshold is the per-class
    IOU cutoff for non-max suppression; anchors is the flat [w, h, ...] list.
    """
    grid_h, grid_w, nb_box = netout.shape[:3]

    boxes = []

    # decode the output by the network (in place)
    netout[..., 4] = sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_threshold

    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                # from 4th element onwards are confidence and class classes
                classes = netout[row, col, b, 5:]

                if classes.any():
                    # first 4 elements are x, y, w, and h
                    x, y, w, h = netout[row, col, b, :4]

                    x = (col + sigmoid(x)) / grid_w  # center position, unit: image width
                    y = (row + sigmoid(y)) / grid_h  # center position, unit: image height
                    w = anchors[2 * b + 0] * np.exp(w) / grid_w  # unit: image width
                    h = anchors[2 * b + 1] * np.exp(h) / grid_h  # unit: image height
                    confidence = netout[row, col, b, 4]

                    box = BoundBox(x, y, w, h, confidence, classes)

                    boxes.append(box)

    # suppress non-maximal boxes (greedy per-class NMS)
    for c in range(nb_class):
        sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes])))

        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]

            if boxes[index_i].classes[c] == 0:
                continue
            else:
                for j in range(i + 1, len(sorted_indices)):
                    index_j = sorted_indices[j]

                    # zero out the class score of overlapping weaker boxes
                    if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:
                        boxes[index_j].classes[c] = 0

    # remove the boxes which are less likely than a obj_threshold
    boxes = [box for box in boxes if box.get_score() > obj_threshold]

    return boxes
|
|
||
|
|
||
def sigmoid(x):
    """Logistic function, elementwise on scalars or numpy arrays."""
    return 1. / (np.exp(-x) + 1.)
|
|
||
|
|
||
def softmax(x, axis=-1, t=-100.):
    """Numerically-stabilized softmax along `axis`.

    The global max is subtracted first; if any shifted value falls below t,
    the whole array is rescaled so exp() keeps non-zero mass.
    """
    shifted = x - np.max(x)

    lowest = np.min(shifted)
    if lowest < t:
        shifted = shifted / lowest * t

    exps = np.exp(shifted)

    return exps / exps.sum(axis, keepdims=True)