# TensorFlow 2 implementation of MobileNetV2 and YOLOv2 from scratch
# Sources and Github repos used
https://github.com/jmpap/YOLOV2-Tensorflow-2.0/blob/master/Yolo_V2_tf_2.ipynb \
https://github.com/zzh8829/yolov3-tf2 \
https://github.com/zzxvictor/YOLO_Explained \


## 1. Define MobileNet Model

##### Necessary basic imports

In [2]:
import tensorflow as tf
import numpy as np
import cv2

In [3]:
from tensorflow.keras.layers import Conv2D, BatchNormalization, DepthwiseConv2D, AveragePooling2D
from tensorflow.keras.activations import relu, softmax


class BConv2D(tf.keras.Model):
    """Convolution followed by batch normalisation and a (leaky / capped) ReLU.

    ``conv`` selects the convolution type: ``'normal'`` builds a ``Conv2D``,
    ``'depth'`` builds a ``DepthwiseConv2D``.  All remaining positional and
    keyword arguments are forwarded to that layer.  ``alpha`` and
    ``max_value`` are handed to ``relu``; the defaults give ReLU6.
    """

    def __init__(self, *args, conv='normal', alpha=0., max_value=6, **kwargs):
        super().__init__()
        if conv not in ('normal', 'depth'):
            raise ValueError("conv parameter must be 'normal' or 'depth'")
        layer_cls = Conv2D if conv == 'normal' else DepthwiseConv2D
        self.conv = layer_cls(*args, **kwargs)
        self.batch = BatchNormalization()
        self.alpha = alpha
        self.max_value = max_value

    def call(self, x):
        """Apply convolution, batch normalisation and the activation to ``x``."""
        normalised = self.batch(self.conv(x))
        return relu(normalised, alpha=self.alpha, max_value=self.max_value)


class BottleneckBlock(tf.keras.Model):
    """MobileNetV2-style bottleneck: 1x1 expansion, depthwise conv, 1x1 projection.

    ``ex`` is the expansion factor: the expansion layer produces
    ``channels_in * ex`` channels.  ``stride`` is applied by the depthwise
    convolution only.  Extra keyword arguments (e.g. ``padding``) are
    forwarded to all three convolutions.  The projection is followed by batch
    normalisation and has no activation.
    """

    def __init__(self, channels_in, channels_out, kernel=3, ex=6, stride =1, **kwargs):
        super().__init__()

        expanded_channels = channels_in * ex
        self.exp = BConv2D(expanded_channels, 1, **kwargs)
        self.conv = BConv2D(kernel, conv='depth', strides=stride, **kwargs)
        self.proj = Conv2D(channels_out, 1, **kwargs)
        self.batch = BatchNormalization()

    def call(self, x):
        """Run ``x`` through expansion, depthwise conv, projection and batch norm."""
        expanded = self.exp(x)
        filtered = self.conv(expanded)
        return self.batch(self.proj(filtered))


class BottleneckSequence(tf.keras.Model):
    """Sequence of ``depth`` bottleneck blocks.

    The first block applies ``stride``; the ``depth - 2`` middle blocks keep
    the channel count and are wrapped in residual connections; the last block
    maps ``channels_in`` to ``channels_out``.

    Raises:
        ValueError: if ``depth`` is smaller than two (use ``BottleneckBlock``
            directly for a single block).
    """

    def __init__(self, channels_in, channels_out, depth, kernel=3, ex=6, stride=2, **kwargs):
        super().__init__()
        if depth < 2:
            raise ValueError("Minimum Depth must be two. For depth=1, use 'BottleneckBlock' instead")
        # The first and last block differ from the rest, so only depth - 2
        # blocks belong to the residual middle part.
        length = depth - 2
        self.initial = BottleneckBlock(channels_in, channels_in, kernel, ex, stride, **kwargs)
        if length >= 1:
            # Build exactly `length` middle blocks so the whole sequence
            # contains `depth` blocks (previously `depth` middle blocks were
            # created, i.e. depth + 2 blocks in total).
            self.middle = [BottleneckBlock(channels_in, channels_in, kernel, ex, **kwargs) for _ in range(length)]
        else:
            self.middle = None
        self.end = BottleneckBlock(channels_in, channels_out, kernel, ex, **kwargs)

    def call(self, x):
        """Apply the initial block, the residual middle blocks and the end block."""
        x = self.initial(x)
        if self.middle is not None:
            for layer in self.middle:
                # Residual connection; requires the block to preserve the
                # tensor shape (stride 1 and padding='same').
                x = layer(x) + x
        return self.end(x)


class MobileNetV2(tf.keras.Model):
    """MobileNetV2-style network built from the blocks defined above.

    Args:
        expansion_factor: expansion factor used by every bottleneck sequence
            and by the last bottleneck block.
        net_type: ``'base'`` returns three feature maps (deepest first) and
            skips pooling and softmax; ``'full'`` adds average pooling and a
            ``k``-way softmax classifier.  Compared case-insensitively.
        k: number of classes, only used when ``net_type`` is ``'full'``.
    """

    def __init__(self, expansion_factor=6, net_type='base', k=1): 
        super().__init__()

        same = dict(padding='same')
        self.type = net_type.lower()

        # Feature extraction layers.
        self.layer1 = BConv2D(32, 3, strides=2, **same)
        self.bottle2 = BottleneckBlock(16, 16, ex=1, **same)
        self.sequ3 = BottleneckSequence(16, 24, 2, ex=expansion_factor, **same)
        self.sequ4 = BottleneckSequence(24, 32, 3, ex=expansion_factor, **same)
        self.sequ5 = BottleneckSequence(32, 64, 4, ex=expansion_factor, **same)
        self.sequ6 = BottleneckSequence(64, 96, 3, ex=expansion_factor, stride=1, **same)
        self.sequ7 = BottleneckSequence(96, 160, 3, ex=expansion_factor, **same)
        self.bottle8 = BottleneckBlock(160, 320, ex=expansion_factor, **same)
        self.layer9 = BConv2D(1280, 1, **same)

        if self.type == 'full':
            # Classification head only: with this head the network classifies
            # but does not detect; swap it for a YOLO/SSD head for detection.
            self.avg10 = AveragePooling2D(pool_size=(7, 7))
            self.out = Conv2D(k, 1, **same)  # one output channel per class

    def call(self, x):
        """Return class probabilities (``'full'``) or three feature maps (otherwise)."""
        stem = self.bottle2(self.layer1(x))
        shallow = self.sequ4(self.sequ3(stem))
        middle = self.sequ6(self.sequ5(shallow))
        deep = self.layer9(self.bottle8(self.sequ7(middle)))

        if self.type != 'full':
            return deep, middle, shallow

        pooled = self.avg10(deep)
        return softmax(self.out(pooled))

## 2. Define the Yolo Model

In [4]:
from tensorflow.keras.layers import Conv2D, BatchNormalization, LeakyReLU, UpSampling2D
from tensorflow.keras.losses import binary_crossentropy, sparse_categorical_crossentropy
from tensorflow.keras.activations import sigmoid, softmax


class YoloDetectBlock(tf.keras.Model):
    """Five alternating 1x1 / 3x3 convolution blocks with leaky ReLU (alpha 0.1).

    The 1x1 layers output ``base`` channels, the 3x3 layers ``2 * base``.
    """

    def __init__(self, base):
        super().__init__()
        leaky = dict(alpha=0.1, max_value=None, padding='same')
        wide = base * 2

        self.l1 = BConv2D(base, 1, **leaky)
        self.l2 = BConv2D(wide, 3, **leaky)
        self.l3 = BConv2D(base, 1, **leaky)
        self.l4 = BConv2D(wide, 3, **leaky)
        self.l5 = BConv2D(base, 1, **leaky)

    def call(self, x):
        """Feed ``x`` through the five layers in order."""
        for layer in (self.l1, self.l2, self.l3, self.l4, self.l5):
            x = layer(x)
        return x


class YOLOVX(tf.keras.Model):
    """YOLO-style detection head that fuses three backbone feature maps.

    Each input is processed by its own ``YoloDetectBlock`` plus one 3x3
    convolution (with stride 1, 2 and 4 respectively, so all three end up on
    the same grid), the results are concatenated along the channel axis and a
    1x1 convolution predicts ``n_bb * (5 + classes)`` values per grid cell.

    Args:
        n_bb: number of bounding boxes (anchors) per grid cell.
        classes: number of object classes.
        iou_thresh: cells whose best IoU is below this value contribute to the
            no-object confidence loss.
        coeff_bb: weight of the xy / wh losses.
        coeff_obj: weight of the object confidence loss.
        coeff_nonObj: weight of the no-object confidence loss.

    The default anchors are two placeholder boxes; call ``set_anchors`` with
    ``n_bb`` (width, height) pairs when ``n_bb`` differs from two.
    """

    def __init__(self, n_bb=2, classes=2, iou_thresh = 0.5, coeff_bb=5, coeff_obj=5, coeff_nonObj=0.5):
        super(YOLOVX, self).__init__()
        # set params
        self.n_bb = n_bb
        self.classes = classes
        self.anchors = [(1, 2), (3, 4)] 
        self.iou_thresh = iou_thresh
        self.coeff_bb = coeff_bb
        self.coeff_obj = coeff_obj
        self.coeff_nonObj = coeff_nonObj

        self.b1_layer1 = YoloDetectBlock(512)
        self.b1_layer2 = BConv2D(512, 3, alpha=0.1, max_value=None, padding='same')

        self.b2_layer1 = YoloDetectBlock(256)
        self.b2_layer2 = BConv2D(256, 3, alpha=0.1, max_value=None, strides =2, padding='same')

        self.b3_layer1 = YoloDetectBlock(128)
        self.b3_layer2 = BConv2D(128, 3, alpha=0.1, max_value=None, strides = 4, padding='same')

        self.pred = Conv2D(n_bb * (5+classes), 1, padding='same')

    def call(self, input1, input2, input3):
        """Return raw predictions of shape (batch, h, w, n_bb, 5 + classes).

        ``input1`` is used at its own resolution, ``input2`` is down-sampled
        by 2 and ``input3`` by 4 before concatenation.
        """
        x3 = self.b3_layer1(input3)
        x3 = self.b3_layer2(x3)

        x2 = self.b2_layer1(input2)
        x2 = self.b2_layer2(x2)

        x1 = self.b1_layer1(input1)
        x1 = self.b1_layer2(x1)

        x = tf.concat([x1,x2,x3], axis=-1)
        pred = self.pred(x)

        # Let reshape infer the batch dimension: the static batch size is
        # None while the model is built or traced with a dynamic batch, and
        # tf.reshape does not accept None.
        _, h, w, *_ = pred.shape
        pred = tf.reshape(pred, [-1, h, w, self.n_bb, 4 + 1 + self.classes])
        return pred
    
    def set_anchors(self, anchors):
        """Replace the anchor boxes; expects ``n_bb`` (width, height) pairs."""
        self.anchors = anchors

    def __get_grid__(self, shape):
        """Return a float32 (1, h, w, 1, 2) tensor holding each cell's (x, y) index."""
        hIndex = tf.reshape(tf.range(start=0, limit=shape[0]), (shape[0], 1))
        hIndex = tf.tile(hIndex, [1, shape[1]])  # repeat the row index across all columns
        wIndex = tf.reshape(tf.range(start=0, limit=shape[1]), (1, shape[1]))
        wIndex = tf.tile(wIndex, [shape[0], 1])  # repeat the column index across all rows
        grid = tf.stack([wIndex, hIndex], axis=-1)
        grid = tf.reshape(grid, shape=(1, *shape, 1, 2))  # reshape the offset so that it can add to boxXY directly
        return tf.cast(grid, dtype=tf.float32)

    def __getIOU__(self, box1, box2):
        """Element-wise IoU of two (..., 4) tensors in (cx, cy, w, h) format."""
        mini = tf.math.maximum(box1[..., 0:2] - box1[..., 2:4] / 2, box2[..., 0:2] - box2[..., 2:4] / 2)
        maxi = tf.math.minimum(box1[..., 0:2] + box1[..., 2:4] / 2, box2[..., 0:2] + box2[..., 2:4] / 2)
        interWH = tf.math.maximum(maxi - mini, 0)
        interArea = interWH[..., 0] * interWH[..., 1]
        area1 = box1[..., 2] * box1[..., 3]
        area2 = box2[..., 2] * box2[..., 3]
        # Two empty boxes would give 0/0 = NaN; report an IoU of 0 instead.
        return tf.math.divide_no_nan(interArea, area1 + area2 - interArea)

    def process_output(self, output):
        """Decode raw predictions into boxes, objectness and class probabilities.

        ``output`` is already reshaped to (batch, h, w, n_bb, 4 + 1 + classes).

        Returns:
            box_xy: box centres in grid units (sigmoid offset + cell index).
            box_wh: box sizes, ``exp(raw) * anchor``.
            obj_score: sigmoid objectness, last dimension 1.
            class_pred: softmax over the class logits.
        """
        _, h, w, *_ = output.shape
        box_xy = sigmoid(output[..., :2])
        grid = self.__get_grid__((h, w))  # already float32
        anchors = tf.cast(tf.reshape(self.anchors, (1, 1, 1, self.n_bb, 2)), tf.float32)
        box_xy = grid + box_xy

        box_wh = tf.math.exp(output[..., 2:4])
        box_wh = box_wh * anchors

        obj_score = sigmoid(output[..., 4:5])
        class_pred = softmax(output[..., 5:])

        return box_xy, box_wh, obj_score, class_pred

    def getBB(self, box_xy, box_wh, scale=(32, 32)):
        """Convert centre/size boxes to (x1, y1, x2, y2) and multiply by ``scale``.

        ``scale`` is the (x, y) factor from grid units to image pixels.
        """
        x1y1 = box_xy - box_wh / 2  # top left
        x2y2 = box_xy + box_wh / 2  # bottom right
        bb = tf.concat([x1y1, x2y2], axis=-1)
        shape = tf.stack([*scale, *scale])
        shape = tf.reshape(shape, [1, 4])
        shape = tf.cast(shape, tf.float32)
        return bb * shape

    def filterBB(self, bb, obj_score, class_pred, maxBox=20, score_thresh=0.5, iou_thresh=0.5):
        """Drop low-confidence boxes of one image and run non-max suppression.

        The box score is ``obj_score * class_pred`` maximised over classes.

        Returns:
            (boxes, scores, classes) of the at most ``maxBox`` surviving boxes.
        """
        boxScore = obj_score * class_pred
        boxClass = tf.argmax(boxScore, axis=-1)
        boxScore = tf.math.reduce_max(boxScore, axis=-1)
        mask = boxScore >= score_thresh

        # filter out low-confidence boxes
        boxes = tf.boolean_mask(bb, mask)  # -> array[0...N]
        scores = tf.boolean_mask(boxScore, mask)
        classes = tf.boolean_mask(boxClass, mask)

        # perform nms
        idx = tf.image.non_max_suppression(boxes, scores, maxBox, iou_threshold=iou_thresh)
        boxes = tf.gather(boxes, idx)
        scores = tf.gather(scores, idx)
        classes = tf.gather(classes, idx)
        return boxes, scores, classes


    def raw2Box(self, output, maxBox=20, scoreThresh=0.5, iouThresh=0.5):
        """Yield (boxes, scores, classes) for every image of a raw output batch.

        This is a generator: it can be consumed only once.
        """
        # convert raw yolo output
        box_xy, box_wh, obj_scores, class_probs = self.process_output(output)
        # scale gridscale back to imagescale, return bb in shape (x1,y1,x2,y2)
        boxes = self.getBB(box_xy, box_wh)

        for bb, score, prob in zip(boxes, obj_scores, class_probs):
            # filter out low confidence boxes and do nms
            f_bb, f_score, f_prob = self.filterBB(bb, score, prob, maxBox=maxBox, score_thresh=scoreThresh, iou_thresh=iouThresh)
            yield f_bb, f_score, f_prob

    def loss(self, gt, output):
        """Compute the per-sample YOLO loss terms.

        Args:
            gt: labels of shape (batch, h, w, n_bb, 6) laid out as
                (x, y, w, h, objectness, class index).
            output: raw predictions as returned by ``call``.

        Returns:
            (xy_loss, wh_loss, obj_loss, class_loss), each of shape (batch,).
        """
        _, h, w, *_ = output.shape
        # Object mask, shape (batch, h, w, n_bb).
        mask = tf.cast(gt[...,4], tf.float32)

        # =========BB-Loss===============================
        # get transformed output
        box_xy, box_wh, obj_score, class_pred = self.process_output(output)
        grid = self.__get_grid__((h,w))

        # Bring the labels into the same space as the decoded predictions.
        # NOTE(review): the labels' w/h are multiplied by the anchors here,
        # i.e. they are treated as anchor-relative; confirm this matches how
        # the labels are generated during preprocessing.
        anchors = tf.cast(tf.reshape(self.anchors, (1, 1, 1, self.n_bb, 2)), tf.float32)
        gt_xy = gt[...,0:2] + grid
        gt_wh = gt[...,2:4] * anchors

        xy_loss = mask * self.coeff_bb * tf.reduce_sum(tf.square(gt_xy - box_xy), axis=-1)
        wh_loss = mask * self.coeff_bb * tf.reduce_sum(tf.square(tf.sqrt(gt_wh) - tf.sqrt(box_wh)), axis=-1)

        # reshape mask for further use
        mask = tf.expand_dims(mask, axis=-1)
        #===========Confidence-Loss=============================
        iou = self.__getIOU__(tf.concat([box_xy, box_wh], axis=-1), tf.concat([gt_xy, gt_wh], axis=-1))
        # Best IoU over the boxes of a cell; after expand_dims the shape is
        # (batch, h, w, 1, 1) and broadcasts over boxes.
        best_iou = tf.reduce_max(iou, axis=-1, keepdims=True)
        non_obj = tf.cast(tf.expand_dims(best_iou < self.iou_thresh, axis=-1), tf.float32)
        non_obj_loss = non_obj * (1.0 - mask) * self.coeff_nonObj * tf.square(0-obj_score)
        obj_loss = mask * self.coeff_obj * tf.square(obj_score - tf.expand_dims(iou, axis=-1))

        #==========Class-Loss=============================
        gt_class = tf.cast(gt[...,5], tf.int32)
        gt_class = tf.one_hot(gt_class, depth=self.classes)
        class_loss = mask * tf.square(gt_class-class_pred)
        

        # Sum every term over all axes except the batch axis.
        xy_loss = tf.reduce_sum(xy_loss, axis=[1,2,3])
        wh_loss = tf.reduce_sum(wh_loss, axis=[1,2,3])
        obj_loss = tf.reduce_sum(non_obj_loss+obj_loss, axis=[1,2,3,4])
        class_loss = tf.reduce_sum(class_loss, axis=[1,2,3,4])
        return xy_loss,wh_loss, obj_loss, class_loss

## 3. Build Final Stacked-Network

In [5]:
class Detector(tf.keras.Model):
    """Stacked network: frozen Keras MobileNetV2 backbone plus a ``YOLOVX`` head.

    The backbone is ``tf.keras.applications.MobileNetV2`` built for
    (96, 416, 3) inputs without its classification top.  Two additional
    models expose intermediate backbone activations ("block_13_expand_relu"
    and "block_6_expand_relu") so the head receives three feature maps.

    Args:
        expansion_factor: accepted for interface compatibility but not used,
            because the custom ``MobileNetV2`` backbone is commented out.
        n_bb: number of boxes per grid cell, forwarded to ``YOLOVX``.
        n_classes: number of classes, forwarded to ``YOLOVX``.
        iou_thresh, coeff_bb, coeff_obj, coeff_nonObj: loss settings,
            forwarded to ``YOLOVX``.
    """

    def __init__(self, expansion_factor, n_bb, n_classes, iou_thresh = 0.5,  coeff_bb=5, coeff_obj=5, coeff_nonObj=0.5):
        super(Detector, self).__init__()
        
        # Need to use pre-defined model due to different building API's (Subclassed vs Functional) -> cant load weights
        #self.backbone = MobileNetV2(expansion_factor=expansion_factor)
        # NOTE(review): no `weights` argument is passed, so the Keras default
        # applies (presumably pretrained weights) - confirm this is intended.
        self.backbone = tf.keras.applications.MobileNetV2(input_shape=(96, 416, 3), include_top=False, classes=1)
        self.detector = YOLOVX(n_bb=n_bb, classes=n_classes, iou_thresh = iou_thresh,
                               coeff_bb=coeff_bb, coeff_obj=coeff_obj,
                               coeff_nonObj=coeff_nonObj)
        # Sub-models sharing the backbone's input and layers; each one stops
        # at an intermediate activation.
        self.input3 = tf.keras.models.Model(inputs=self.backbone.input,
                                     outputs= self.backbone.get_layer("block_6_expand_relu").output)
        self.input2 = tf.keras.models.Model(inputs=self.backbone.input,
                                     outputs= self.backbone.get_layer("block_13_expand_relu").output)
        
        # Freeze the backbone and both feature taps; only the head is trained.
        self.backbone.trainable = False
        self.input3.trainable = False
        self.input2.trainable = False

    def call(self, input1):
        """Run the image batch through the backbone taps and the detection head."""
        # The three calls each evaluate the backbone from its input up to
        # their respective output layer.
        x1 = self.backbone(input1)
        x2 = self.input2(input1)
        x3 = self.input3(input1)
        return self.detector(x1,x2,x3)

    def loss(self, y_true, y_pred):
        """Delegate to ``YOLOVX.loss``; returns its four per-sample loss terms."""
        return self.detector.loss(y_true, y_pred)



## 4. Data Preprocessing

In [6]:
import json
import boto3
import io
from scipy import ndimage
import matplotlib.pyplot as plt

In [7]:
# Factors between annotation coordinates and the 416 x 96 network input.
# WIDTH (1613) is the annotation-space image width used for horizontal flips;
# 371 is presumably the matching image height - confirm against the data set.
SCALE_HEIGHT = 371.0 / 96.0
SCALE_WIDHT = 1613.0 / 416.0

# Side length of one grid cell in network-input pixels (416 / 13 == 96 / 3).
SCALE = 32
GRID_W = 13
GRID_H = 3

# Anchor boxes as rows of (width, height).
anchors = np.array([1.0, 2.0, 3.0, 4.0]).reshape(-1, 2)
anchors_count = anchors.shape[0]

# Module-level label templates; format_annotations builds its own copies.
detector_mask = np.zeros((GRID_H, GRID_W, anchors_count, 1))
matching_true_boxes = np.zeros((GRID_H, GRID_W, anchors_count, 6))
WIDTH = 1613


def scaleToImgSize(height,width,x,y):
    """Scale a box from annotation coordinates to network-input pixels.

    Each value is divided by the matching scale factor and truncated to int.

    Returns:
        (height, width, x, y) in the same order as the arguments.
    """
    scaled_height = int(height / SCALE_HEIGHT)
    scaled_width = int(width / SCALE_WIDHT)
    scaled_x = int(x / SCALE_WIDHT)
    scaled_y = int(y / SCALE_HEIGHT)
    return scaled_height, scaled_width, scaled_x, scaled_y

def format_annotations(img, annotations):
    """Resize an image and encode its annotations as a YOLO label grid.

    Args:
        img: image array/tensor; resized to 96 x 416 and cast to float32.
        annotations: iterable of dicts with the keys 'height', 'width',
            'left' and 'top' in annotation coordinates.

    Returns:
        (img, labels) where ``labels`` has shape
        (GRID_H, GRID_W, anchors_count, 6) and every assigned entry is
        (tx, ty, w, h, 1.0, 0.0): the centre offset inside its cell, the size
        in grid cells, the objectness flag and the class index.
    """
    img = tf.image.resize(img, (96,416)) 
    img = tf.cast(img, tf.float32)

    matching_true_boxes = np.zeros((GRID_H, GRID_W, anchors_count, 6))

    for p in annotations:
        height, width, x, y = scaleToImgSize(p['height'], p['width'], p['left'], p['top'])
        w = width / SCALE
        h = height / SCALE
        if w * h <= 0:  # degenerate box, nothing to label
            continue

        # Box centre in network-input pixels and the grid cell it falls into.
        bbx = x + width / 2
        bby = y + height / 2
        index_x = int(bbx // SCALE)
        index_y = int(bby // SCALE)
        # Skip boxes whose centre lies outside the grid: a negative index
        # would silently wrap around to the last row/column and a too large
        # one would raise IndexError.
        if not (0 <= index_x < GRID_W and 0 <= index_y < GRID_H):
            continue
        tx = (bbx - index_x * SCALE) / SCALE
        ty = (bby - index_y * SCALE) / SCALE

        # IoU between the box and every anchor, both shifted to the origin;
        # argmax returns the first best anchor.
        intersect = np.minimum(w, anchors[:, 0]) * np.minimum(h, anchors[:, 1])
        union = anchors[:, 0] * anchors[:, 1] + w * h - intersect
        ious = intersect / union
        best_anchor = int(np.argmax(ious))

        if ious[best_anchor] > 0:
            # x, y, w, h, objectness, class index
            matching_true_boxes[index_y, index_x, best_anchor] = [tx, ty, w, h, 1.0, 0.0]
    return img, matching_true_boxes

def get_data():
    """Load all annotated images plus a horizontally flipped copy of each.

    Reads one JSON record per non-blank line of
    ``annotations/annotations.json`` and the matching PNG from ``images/``.

    Returns:
        (images, labels): float32 arrays of shape (2 * N, 96, 416, 3) and
        (2 * N, GRID_H, GRID_W, anchors_count, 6) for N records.  The first N
        entries are the originals, entries N..2N-1 the flipped versions in
        the same order.
    """
    with open("annotations/annotations.json") as f:
        records = [json.loads(line) for line in f if line.strip()]

    # Size the arrays from the data: a fixed 4000 / 2000 layout returned
    # uninitialised rows for smaller data sets and broke for larger ones.
    n_records = len(records)
    images = np.empty((2 * n_records, 96, 416, 3), dtype=np.float32)
    labels = np.empty((2 * n_records, GRID_H, GRID_W, anchors_count, 6), dtype=np.float32)

    for count, record in enumerate(records):
        # 'source-ref' is an s3://bucket/key URI; only the key names the
        # local file (images are read from disk, which is faster than S3).
        s3_key = record['source-ref'].split('/')[3]
        annotations = record["bee-labeling-2k-batch-01"]["annotations"]

        originalImage = plt.imread("images/" + s3_key, format='png')

        img, boxes = format_annotations(originalImage, annotations)
        images[count] = img
        labels[count] = boxes

        # Flipped image: mirror every box around the vertical axis.  Work on
        # copies so the parsed annotations stay untouched.
        flippedImage = cv2.flip(originalImage, 1)
        flippedAnnotations = [
            dict(annotation, left=WIDTH - annotation['left'] - annotation['width'])
            for annotation in annotations
        ]
        imgFlipped, boxesFlipped = format_annotations(flippedImage, flippedAnnotations)
        images[count + n_records] = imgFlipped
        labels[count + n_records] = boxesFlipped

        if count % 100 == 0:
            print("read files: {}".format(count))

    return images, labels


### View the data (if needed)

In [None]:
#data_images, data_labels = get_data()
#data = tf.data.Dataset.from_tensor_slices((data_images, data_labels))

In [None]:
# Visual check of images and labels.  `data` must exist: run the two
# commented-out lines of the previous cell first.
for one_img, matching_true_boxes in data:
    one_img = np.array(one_img)
    matching_true_boxes = np.array(matching_true_boxes)
    n_rows, n_cols, n_anchors = matching_true_boxes.shape[:3]

    SCALE = 32
    for i in range(n_rows):
        for j in range(n_cols):
            for a in range(n_anchors):
                tx, ty, box_w, box_h, obj = matching_true_boxes[i, j, a, :5]
                if obj <= 0.0:
                    continue
                center_x = j*SCALE + tx*SCALE
                center_y = i*SCALE + ty*SCALE
                # Labels store the full width/height in grid cells, so the
                # corners are half of that away from the centre.
                half_w = box_w*SCALE / 2
                half_h = box_h*SCALE / 2

                upper_left = (int(center_x - half_w), int(center_y - half_h))
                under_right = (int(center_x + half_w), int(center_y + half_h))
                one_img = cv2.rectangle(one_img, upper_left, under_right, (0, 0, 255), 2)

    # One panel for the image, one per anchor mask and one for their sum.
    f, axes = plt.subplots(1, n_anchors + 2, figsize=(10, 10))
    # NOTE(review): imshow clips float RGB data to [0, 1]; if the images are
    # already in that range the *255 saturates the picture - confirm.
    axes[0].imshow(one_img*255)
    axes[0].set_title('image')

    for a in range(n_anchors):
        ax = axes[a + 1]
        ax.matshow(matching_true_boxes[:, :, a, 4])  # YOLO confidence value
        ax.set_title('mask{}'.format(a + 1))
        ax.xaxis.set_ticks_position('bottom')

    axes[-1].matshow(matching_true_boxes[:, :, :, 4].sum(axis=-1))  # YOLO confidence value
    axes[-1].set_title('summed mask')
    axes[-1].xaxis.set_ticks_position('bottom')

    f.tight_layout()
    plt.show()
    

## 5. Training

### 5.1 Utility Functions

In [8]:
import os
import time

def cast_img(img):
    """Multiply ``img`` by 255 and cast the result to ``tf.uint8``."""
    scaled = img * 255
    return tf.cast(scaled, dtype=tf.uint8)

def get_session_name():
    """Ask for a description on stdin and build a timestamped session name.

    The name has the form ``s<unix seconds>: <description>``; it is printed
    and returned.  The timestamp is taken before the prompt is shown.
    """
    started_at = int(time.time())
    label = input("Describe the session: ")
    session_name = "s%s: %s" % (started_at, label)
    print(session_name)
    return session_name

def load_fake_dataset():
    """Build a one-element dataset from a fixed JPEG and an all-zero label.

    Returns:
        A ``tf.data.Dataset`` batched with batch size 4 that yields the image
        resized to 96 x 416 and a zero label of shape (3, 13, 3, 7).
    """
    # Use a context manager so the file handle is closed after reading.
    with open("/content/drive/My Drive/tests/ich.jpg", 'rb') as image_file:
        encoded = image_file.read()
    img = tf.image.decode_jpeg(encoded, channels=3)
    img = tf.image.resize(img, (96,416))
    # shape grid, grid, nbb, (xywh, obj, classes)
    labels = tf.zeros([3, 13, 3, 4+1+2], tf.float32)
    data = tf.data.Dataset.from_tensor_slices(([img], [labels]))
    data = data.batch(4)
    return data

def create_output(results, image_batch):
    """Draw the detected boxes and their scores onto each image of a batch.

    Args:
        results: iterable of (boxes, scores, classes) per image, boxes as
            (x1, y1, x2, y2).
        image_batch: images in the same order as ``results``.

    Returns:
        The annotated images stacked into one tensor.  Images without any
        detection are passed through unchanged.
    """
    colour = (0, 0, 255)  # rgb
    annotated = []
    for frame, (raw_boxes, confidences, labels) in zip(image_batch, results):
        int_boxes = tf.cast(raw_boxes, tf.int32)
        for box, confidence, _ in zip(int_boxes, confidences, labels):
            top_left = (box[0], box[1])
            bottom_right = (box[2], box[3])
            text_origin = (box[0], box[1]-2)  # just above the box
            caption = str(round(confidence.numpy(), 2))
            frame = cv2.rectangle(np.array(frame), top_left, bottom_right, colour, 1)
            frame = cv2.putText(frame, caption, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.3, colour, 1)
        annotated.append(frame)
    return tf.stack(annotated)

def metric(gt_labels, pred_labels, threshold=0.5):
    """Compute per-image precision and recall of predicted boxes.

    Args:
        gt_labels: label batch of shape (batch, rows, cols, anchors, >=5)
            laid out as (tx, ty, w, h, objectness, ...) in grid units, where
            w/h are the full box size.
        pred_labels: iterable yielding (boxes, scores, classes) per image,
            boxes as (x1, y1, x2, y2) in pixels.
        threshold: a prediction matches a ground-truth box when their IoU is
            strictly greater than this value.

    Returns:
        (precisions, recalls): two lists with one float per image.  Each
        prediction can match at most one ground-truth box.
    """
    gt_array = np.asarray(gt_labels)
    print(gt_array.shape)

    SCALE = 32  # pixels per grid cell

    # Convert every labelled cell/anchor into pixel corner coordinates.
    gt_coords = []
    for gt_label in gt_array:
        rows, cols, n_anchors = gt_label.shape[:3]
        gt_xy = []
        for i in range(rows):
            for j in range(cols):
                for a in range(n_anchors):
                    tx, ty, box_w, box_h, obj = gt_label[i, j, a, :5]
                    if obj <= 0.0:
                        continue
                    center_x = j*SCALE + tx*SCALE
                    center_y = i*SCALE + ty*SCALE
                    # w/h are full sizes, so the corners lie half of them
                    # away from the centre.
                    half_w = box_w*SCALE / 2
                    half_h = box_h*SCALE / 2
                    gt_xy.append([int(center_x - half_w), int(center_y - half_h),
                                  int(center_x + half_w), int(center_y + half_h)])
        gt_coords.append(gt_xy)

    total_precision = []
    total_recall = []

    for gt_boxes, (boxes, scores, classes) in zip(gt_coords, pred_labels):  # per image
        # Truncate to integer pixels; reshape keeps an empty result 2-D.
        remaining = np.asarray(boxes).astype(np.int32).reshape(-1, 4)
        n_predictions = len(remaining)
        tp = 0
        fn = 0

        for gt_box in gt_boxes:
            gt_area = abs(gt_box[2] - gt_box[0]) * abs(gt_box[3] - gt_box[1])
            best_iou = threshold
            best_index = None

            for index, bb in enumerate(remaining):
                x1, y1, x2, y2 = (int(v) for v in bb)
                # Overlap = smaller bottom-right minus larger top-left.
                intersect_w = max(0, min(x2, gt_box[2]) - max(x1, gt_box[0]))
                intersect_h = max(0, min(y2, gt_box[3]) - max(y1, gt_box[1]))
                intersect = intersect_w * intersect_h
                union = gt_area + abs(x2 - x1) * abs(y2 - y1) - intersect
                if union <= 0:  # both boxes are empty
                    continue

                iou = intersect / union
                if iou > best_iou:
                    best_iou = iou
                    best_index = index

            if best_index is None:
                fn += 1
            else:
                tp += 1
                # A prediction may only be matched once.
                remaining = np.delete(remaining, best_index, 0)

        precision = tp / n_predictions if n_predictions else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0

        total_precision.append(precision)
        total_recall.append(recall)

    return total_precision, total_recall

def augment_image(image, label):
    """Randomly perturb contrast and brightness and add Gaussian noise.

    Works for a single image as well as for a batch.  The noise standard
    deviation is read from the module-level ``std``.

    Returns:
        (augmented image, unchanged label).
    """
    image = tf.image.random_contrast(image, 0.8, 1.2)
    image = tf.image.random_brightness(image, 0.2)
    # Draw the noise with the input's own shape: a fixed (96, 416, 3) shape
    # is broadcast over a batch, giving every image the same noise.
    noise = tf.random.normal(shape=tf.shape(image), mean=0.0, stddev=std, dtype=tf.float32)
    image = image + noise
    return image, label
    

### 5.2 Training Loop

In [9]:
def train(session_logs_path,epochs,batch_size,lr,iou_threshhold,coeff_bb,coeff_obj,coeff_nonObj,verbose=False):
    """Train the detector, evaluate it after every epoch and write summaries.

    Args:
        session_logs_path: directory for the TensorBoard summaries.
        epochs: number of passes over the training data.
        batch_size: batch size for training and testing.
        lr: learning rate of the RMSprop optimizer.
        iou_threshhold, coeff_bb, coeff_obj, coeff_nonObj: loss settings
            forwarded to ``Detector``.
        verbose: additionally plot prediction, mask, image and output every
            50 training steps.

    Reads the module-level ``expansion_factor``, ``number_bb``, ``classes``
    and ``checkpoint_prefix``.  A checkpoint is saved after every epoch and
    the average precision/recall of the evaluated test batches is printed at
    the end.
    """
    #fetch data, preproces...
    total_precision = []
    total_recall = []
    AUTOTUNE = tf.data.AUTOTUNE
    
    data_images, data_labels = get_data()
    data = tf.data.Dataset.from_tensor_slices((data_images, data_labels))
    # Fix the shuffle order: with the default reshuffling the take/skip split
    # below selects different samples each epoch, leaking test data into
    # training.
    data = data.shuffle(1000, reshuffle_each_iteration=False)
    
    # 800 test images, the rest is used for training.
    # NOTE(review): an image and its flipped copy can end up on different
    # sides of this split.
    test_data = data.take(800)
    # Training samples are still reshuffled every epoch.
    train_data = data.skip(800).shuffle(1000)
    
    test_data = test_data.batch(batch_size).prefetch(AUTOTUNE)
    train_data = train_data.batch(batch_size).map(augment_image, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)

    # ===============================================
    # Build the network

    bee_detector = Detector(expansion_factor=expansion_factor, n_bb=number_bb, n_classes=classes, 
                            iou_thresh=iou_threshhold, coeff_bb=coeff_bb, coeff_obj =coeff_obj,
                            coeff_nonObj = coeff_nonObj)

    bee_detector.detector.set_anchors([(1,2), (3,4)])
    bee_detector.backbone.trainable = False
    bee_detector.build((1, 96, 416, 3))
    bee_detector.summary()
    
    # ==========================================
    #optimizer, routines...

    optimizer = tf.keras.optimizers.RMSprop(lr)
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=bee_detector)
    
    # To continue from stored weights (e.g. for inference), restore a
    # checkpoint here:
    #   checkpoint.restore(<checkpoint path>).expect_partial()

    writer = tf.summary.create_file_writer(session_logs_path, max_queue=0)
    
    #======================================================
    # Training, testfunctions for eager mode
    
    @tf.function  
    def train_step(img, labels):
        """One optimisation step; returns the loss terms and the raw output."""
        with tf.GradientTape() as tape:
            output = bee_detector(img, training=True)
            # Python print: only executed while the function is traced.
            print("Outputshape: {}".format(output.shape))

            xy_loss, wh_loss, obj_loss, class_loss = bee_detector.loss(labels, output)
            # NOTE(review): training sums the loss over the batch while
            # test_step averages it, so the two totals are not comparable.
            loss = tf.reduce_sum(xy_loss+wh_loss+obj_loss+class_loss)

        gradient = tape.gradient(loss, bee_detector.trainable_weights)
        optimizer.apply_gradients(zip(gradient, bee_detector.trainable_weights))

        return (xy_loss, wh_loss, obj_loss, class_loss, loss), output

    @tf.function
    def test_step(img, labels):
        """Forward pass without weight update; returns loss terms and raw output."""
        output = bee_detector(img, training=False)
        xy_loss, wh_loss, obj_loss, class_loss = bee_detector.loss(labels, output)
        loss = tf.reduce_mean(xy_loss+wh_loss+obj_loss+class_loss)
        return (xy_loss, wh_loss, obj_loss, class_loss, loss), output


    s = np.int64(0)  # global training step
    j = np.int64(0)  # global test step
    for e in range(epochs):  # actual training

        for image_batch, label_batch in train_data:

            processed_batch = tf.keras.applications.mobilenet.preprocess_input(image_batch)

            loss, output = train_step(processed_batch, label_batch)
            
            s += 1
            if verbose:
                if s % 50 == 0:  # plot every 50 training steps

                    results = bee_detector.detector.raw2Box(output, iouThresh = 0.5, scoreThresh = 0.5)
                    sample = create_output(results, image_batch)[0,...]

                    f, (ax1, ax2, ax3, ax4) = plt.subplots(1,4, figsize=(10, 10))
                    ax1.matshow(tf.math.reduce_sum(output[0, :,:,:,4], axis=-1))
                    ax1.set_title('prediction')
                    ax1.xaxis.set_ticks_position('bottom')

                    ax2.matshow(tf.math.reduce_sum(label_batch[0, :,:,:,4], axis=-1)) 
                    ax2.set_title('mask')
                    ax2.xaxis.set_ticks_position('bottom')
                    
                    ax3.imshow(image_batch[0,:,:,:]) 
                    ax3.set_title('image')
                    
                    ax4.imshow(sample)
                    ax4.set_title("output")

                    f.tight_layout()
                    plt.show()

            if s % 50 == 0:  # log every 50 training steps

                results = bee_detector.detector.raw2Box(output, iouThresh = 0.3, scoreThresh= 0.3)
                final_image = create_output(results, image_batch)

                with writer.as_default():
                    print("Epoch: {}  Step: {}  Loss: {}".format(e, s, loss[4]))
                    tf.summary.scalar("train/xy-loss", tf.reduce_mean(loss[0]),step=s)
                    tf.summary.scalar("train/wh-loss", tf.reduce_mean(loss[1]), step=s)
                    tf.summary.scalar("train/obj-loss", tf.reduce_mean(loss[2]), step=s)
                    tf.summary.scalar("train/class-loss", tf.reduce_mean(loss[3]), step=s)
                    tf.summary.scalar("train/total-loss", loss[4], step=s)
                    tf.summary.image("train/output-image", final_image, max_outputs=3, step=s)
                    tf.summary.image("train/mask",tf.math.reduce_sum(label_batch[...,4],axis = -1,keepdims=True), step=s)
                    tf.summary.image("train/predict",(tf.math.reduce_sum(output[...,4],axis = -1,keepdims=True).numpy()*255).astype(np.uint8), step=s)
                    writer.flush()
        
        #=======================================================
        # test routine
        for test_image_batch, test_label_batch in test_data:
            
            processed_test_batch = tf.keras.applications.mobilenet.preprocess_input(test_image_batch)
            test_loss, test_output = test_step(processed_test_batch,  test_label_batch)
            j += 1
            
            
            if j % 50 == 0:  # evaluate and log every 50 test steps
                
                test_results = bee_detector.detector.raw2Box(test_output, scoreThresh = 0.3, iouThresh = 0.3)
                precision, recall = metric(test_label_batch, test_results)
                
                total_precision.append(precision)
                total_recall.append(recall)
                
                # raw2Box is a generator consumed by metric(); create a
                # fresh one for drawing.
                test_results = bee_detector.detector.raw2Box(test_output, scoreThresh = 0.3, iouThresh = 0.3)
                test_final_image = create_output(test_results, test_image_batch)
                
                with writer.as_default():
                    print("Epoch: {}  Step: {}  Test-Loss: {}".format(e, j, test_loss[4]))
                    tf.summary.scalar("test/xy-loss", tf.reduce_mean(test_loss[0]),step=j)
                    tf.summary.scalar("test/wh-loss", tf.reduce_mean(test_loss[1]), step=j)
                    tf.summary.scalar("test/obj-loss", tf.reduce_mean(test_loss[2]), step=j)
                    tf.summary.scalar("test/class-loss", tf.reduce_mean(test_loss[3]), step=j)
                    tf.summary.scalar("test/total-loss", test_loss[4], step=j)
                    tf.summary.image("test/output-image", test_final_image, max_outputs=3, step=j)
                    tf.summary.image("test/mask",tf.math.reduce_sum(test_label_batch[...,4],axis = -1,keepdims=True), step=j)
                    tf.summary.image("test/predict",(tf.math.reduce_sum(test_output[...,4],axis = -1,keepdims=True).numpy()*255).astype(np.uint8), step=j)
                    writer.flush()

        checkpoint.save(file_prefix=checkpoint_prefix)
        
    # Average over the per-image values that were actually collected; the
    # number of test batches is not the number of collected values.
    flat_precision = [prec for sub_list in total_precision for prec in sub_list]
    flat_recall = [rec for sub_list in total_recall for rec in sub_list]

    total_precision = sum(flat_precision) / len(flat_precision) if flat_precision else 0.0
    total_recall = sum(flat_recall) / len(flat_recall) if flat_recall else 0.0
    print("Average-total_precision:", total_precision)
    print("Average-total_recall:", total_recall)

## 5.3 Set Hyperparameters

In [10]:
# Optimisation settings
epochs = 400
batch_size = 8
lr = 1e-5

# Loss settings: IoU threshold for the no-object term and the loss weights
iou_threshhold = 0.5
coeff_bb = 5
coeff_obj = 5
coeff_nonObj = 0.5

# Standard deviation of the Gaussian noise used for augmentation
std = 1e-4

# Architecture settings
expansion_factor = 6
number_bb = 2
classes = 1


## Main

In [11]:
import tensorboard
import tensorflow.summary as summary

# S3 locations for the TensorBoard logs and the training checkpoints.
_bucket_uri = "s3://sagemaker-studio-2hudhs2eoc"

logs_path = _bucket_uri + "/Logs"

checkpoint_directory = _bucket_uri + "/Checkpoints"
# Every checkpoint file name starts with this prefix.
checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

In [12]:
if __name__ == '__main__':
    # Ask for a session description, then train with the hyperparameters
    # defined above and log into a session-specific sub-directory.
    session_name = get_session_name()
    session_logs_path = os.path.join(logs_path, session_name)
    train(
        session_logs_path,
        epochs,
        batch_size,
        lr,
        iou_threshhold,
        coeff_bb,
        coeff_obj,
        coeff_nonObj,
        verbose=False,
    )


Describe the session:  ggd


s1612477773: ggd
read files: 0
read files: 100
read files: 200
read files: 300
read files: 400
read files: 500
read files: 600
read files: 700
read files: 800
read files: 900
read files: 1000
read files: 1100
read files: 1200
read files: 1300
read files: 1400
read files: 1500
read files: 1600
read files: 1700
read files: 1800
read files: 1900
Model: "detector"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
mobilenetv2_1.00_224 (Functi (None, 3, 13, 1280)       2257984   
_________________________________________________________________
yolovx (YOLOVX)              multiple                  17733132  
_________________________________________________________________
model (Functional)           (None, 12, 52, 192)       65920     
_________________________________________________________________
model_1 (Functional)         (None, 6, 26, 576)        616256    
Total params: 19,991,116
Trainable params: 

NameError: name 'ap' is not defined