# Model Loading - Part 2 (Modular Code)

In [1]:
import os

# Step one directory up; the relative paths used by the following cells
# (e.g. the 'module' directory) are resolved from there.
os.chdir(os.pardir)

In [2]:
# Creating a module dir
# exist_ok=True makes the call idempotent and removes the check-then-create
# race of a separate os.path.exists() test.
module_dirpath = 'module'
os.makedirs(module_dirpath, exist_ok=True)

## Creating a Utility Script - utils.py

In [3]:
%%writefile module/utils.py
import numpy as np
import cv2

# Rescale any bounding box
def rescale_boxes(boxes, input_shape, output_shape):
    """
    Map bounding boxes from one (height, width) frame of reference to another.

    Columns 0 and 2 of the last axis are treated as horizontal values and
    columns 1 and 3 as vertical values.

    Parameters:
        boxes: An array containing the values of the bounding box (4 values on the last axis).
        input_shape: A tuple or list whose first two items are (height, width) of the source object.
        output_shape: A tuple or list whose first two items are (height, width) of the target object.

    Returns:
        A new float32 array with the rescaled boxes; the input array is not modified.
    """
    src_h, src_w = input_shape[0], input_shape[1]
    dst_h, dst_w = output_shape[0], output_shape[1]

    # Per-column extents, laid out to line up with the 4 box values
    src_extent = np.array([src_w, src_h, src_w, src_h])
    dst_extent = np.array([dst_w, dst_h, dst_w, dst_h])

    # Normalise by the source size (float32 result), then stretch in place to the target size
    normalised = np.divide(boxes, src_extent, dtype=np.float32)
    np.multiply(normalised, dst_extent, out=normalised)
    return normalised

# Convert bounding box from YOLO format (x_c, y_c, w, h) into Pascal VOC format (x1, y1, x2, y2)
def bbox_yolo_to_pascal(boxes):
    """
    Convert boxes from YOLO layout (x_c, y_c, w, h) to Pascal VOC layout (x1, y1, x2, y2).

    Parameters:
        boxes: An array whose last axis holds (x_c, y_c, w, h).

    Returns:
        A copy of `boxes` (same shape and dtype) with the first four values of
        the last axis replaced by (x1, y1, x2, y2).
    """
    center_x, center_y = boxes[..., 0], boxes[..., 1]
    half_w = boxes[..., 2] / 2
    half_h = boxes[..., 3] / 2

    converted = boxes.copy()
    converted[..., 0] = center_x - half_w  # x1
    converted[..., 1] = center_y - half_h  # y1
    converted[..., 2] = center_x + half_w  # x2
    converted[..., 3] = center_y + half_h  # y2
    return converted

# Clipping the bounding box values
def clip_bbox(boxes, height, width):
    """
    Clamp box coordinates so they stay inside an image or frame.

    Parameters:
        boxes: An array whose last axis holds (x1, y1, x2, y2).
        height: An int value of the height of an image or frame.
        width: An int value of the width of an image or frame.

    Returns:
        A copy of `boxes` with x values limited to [0, width] and y values
        limited to [0, height].
    """
    # Upper bound for each of the four coordinate columns: x1, y1, x2, y2
    upper_bounds = (width, height, width, height)

    clipped = boxes.copy()
    for column, upper in enumerate(upper_bounds):
        clipped[..., column] = np.clip(boxes[..., column], 0, upper)
    return clipped

# Computing the Intersection over Union of the bounding box.
def compute_iou(box, boxes):
    """
    Compute the Intersection over Union (IoU) of one box against many boxes.

    All boxes are expected in (x1, y1, x2, y2) layout.

    Parameters:
        box: An array containing the 4 values of a single bounding box.
        boxes: A 2-D array with one bounding box per row.

    Returns:
        iou: An array with one IoU value per row of `boxes`, in the range [0, 1]
             for well-formed boxes. Pairs whose union area is not positive
             (e.g. two zero-area boxes) get an IoU of 0 instead of NaN.
    """
    # Corners of the overlap rectangle between `box` and every row of `boxes`
    xmin = np.maximum(box[0], boxes[:, 0])
    ymin = np.maximum(box[1], boxes[:, 1])
    xmax = np.minimum(box[2], boxes[:, 2])
    ymax = np.minimum(box[3], boxes[:, 3])

    # Negative extents mean "no overlap", hence the clamp at 0
    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

    # Union = area(box) + area(boxes) - overlap
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - intersection_area

    # A zero union would produce 0/0 = NaN plus a RuntimeWarning; silence the
    # division and substitute 0 for those entries.
    with np.errstate(divide='ignore', invalid='ignore'):
        iou = intersection_area / union_area
    return np.where(union_area > 0, iou, 0)

# Computing Non Maximum Suppression on all the bounding box
def compute_nms(boxes, scores, iou_threshold):
    """
    Greedy Non Maximum Suppression over the predicted bounding boxes.

    The highest-scoring remaining box is kept, every remaining box whose IoU
    with it is not below `iou_threshold` is discarded, and the process repeats
    until no candidates are left.

    Parameters:
        boxes: An array containing the values of the bounding boxes, one per row.
        scores: An array containing the confidence score of each bounding box.
        iou_threshold: A float value used to suppress overlapping boxes.
                       Value should be within the range (0, 1).

    Returns:
        keep_boxes: A list with the indices (into `boxes` / `scores`) of the
                    boxes that survive, ordered by descending score.
    """
    # Candidate indices, best score first
    remaining = np.argsort(scores)[::-1]

    keep_boxes = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        keep_boxes.append(best)

        # Overlap of the winner with every other candidate
        overlaps = compute_iou(box=boxes[best, :], boxes=boxes[rest, :])

        # Only candidates that overlap less than the threshold stay in the race
        remaining = rest[overlaps < iou_threshold]

    return keep_boxes

# Compute sigmoid
def sigmoid(x):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-x)) in a numerically stable way.

    The exponential is only ever taken of a non-positive number, so large
    negative inputs no longer overflow (and no longer emit a RuntimeWarning).

    Parameters:
        x: An int, float or array.

    Returns:
        A scalar (for scalar input) or an array of the same shape as `x`
        with values in the range [0, 1].
    """
    values = np.asarray(x)

    # exp(-|x|) lies in (0, 1], so it can underflow to 0 but never overflow
    decay = np.exp(-np.abs(values))
    denom = 1 + decay

    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function
    result = np.where(values >= 0, 1 / denom, decay / denom)

    # [()] turns a 0-d result back into a scalar and leaves n-d arrays as they are
    return result[()]

# Drawing the mask prediction on the image or frame
def draw_masks(image, boxes, class_ids, class_list, mask_alpha=0.5, mask_maps=None):
    """
    Paint the predicted regions onto a copy of the image and blend it with the original.

    When `mask_maps` is None each bounding box is filled as a solid rectangle;
    otherwise only the pixels selected by the matching mask (inside the box)
    are coloured.

    Parameters:
        image: An array containing the values of the base image in RGB format.
        boxes: An array containing the values of the predicted bounding box in Pascal VOC format.
        class_ids: An array containing the values of the predicted classes indices.
        class_list: A list containing all the class names in proper order.
        mask_alpha: Default = 0.5, A float in range (0, 1) for opacity of the mask area.
        mask_maps: Default = None, An array containing the values of the mask area,
                   one full-size map per box.

    Returns:
        (masked_image, colors): A tuple containing the blended image array and the
        per-class colour table that was used.
    """
    # One colour per class; the fixed seed keeps the palette identical between calls
    rng = np.random.default_rng(3)
    colors = rng.uniform(0, 255, size=(len(class_list), 3))

    overlay = image.copy()
    for idx, (box, class_id) in enumerate(zip(boxes, class_ids)):
        left, top, right, bottom = box.astype(int)
        fill = colors[class_id]

        # No masks supplied: fill the whole bounding box
        if mask_maps is None:
            cv2.rectangle(overlay, (left, top), (right, bottom), fill, -1)
            continue

        # Masks supplied: recolour only the masked pixels inside the box
        region = (slice(top, bottom), slice(left, right))
        weight = mask_maps[idx][region][..., np.newaxis]
        overlay[region] = overlay[region] * (1 - weight) + weight * fill

    # Blend the painted copy over the untouched image with the requested opacity
    blended = cv2.addWeighted(overlay, mask_alpha, image, 1 - mask_alpha, 0)
    return blended, colors

# Drawing the bounding box and adding label text on the predicted image mask
def draw_detections(image, boxes, scores, class_ids, class_list, mask_alpha=0.5, mask_maps=None):
    """
    Draw masks (or filled boxes), box outlines and "<label> <score>%" captions.

    Parameters:
        image: An array containing the values of the base image in RGB format.
        boxes: An array containing the values of the predicted bounding box in Pascal VOC format.
        scores: An array containing the values of the confidence score for each predicted bounding box.
        class_ids: An array containing the values of the predicted classes indices.
        class_list: A list containing all the class names in proper order.
        mask_alpha: Default = 0.5, A float in range (0, 1) for opacity of the mask area.
        mask_maps: Default = None, An array containing the values of the mask area.

    Returns:
        An array containing the image with every prediction drawn on it.
    """
    # Font scale and stroke thickness both follow the shorter image side
    img_h, img_w = image.shape[:2]
    short_side = min(img_h, img_w)
    font_scale = short_side * 0.001
    stroke = int(short_side * 0.001)
    font = cv2.FONT_HERSHEY_SIMPLEX

    # Start from the image that already carries the masks / filled boxes
    canvas, colors = draw_masks(image, boxes, class_ids, class_list, mask_alpha, mask_maps)

    for box, score, class_id in zip(boxes, scores, class_ids):
        left, top, right, bottom = box.astype(int)
        color = colors[class_id]

        # Box outline
        cv2.rectangle(canvas, (left, top), (right, bottom), color, 2)

        # Caption and the pixel size it will occupy
        caption = f'{class_list[class_id]} {int(score * 100)}%'
        (text_w, text_h), _ = cv2.getTextSize(caption, font, font_scale, stroke)
        text_h = int(text_h * 1.2)

        # The label goes above the box unless the box sits too close to the top edge
        if top - 10 > 0:
            label_far_y, baseline_y = top - text_h, top
        else:
            label_far_y, baseline_y = top + 10 + text_h, top + 15

        # Filled background for the caption, then the caption itself in white
        cv2.rectangle(canvas, (left, top), (left + text_w, label_far_y), color, -1)
        cv2.putText(canvas, caption, (left, baseline_y), font, font_scale,
                    (255, 255, 255), stroke, cv2.LINE_AA)

    return canvas

Overwriting module/utils.py


## Creating an engine script - engine.py

In [4]:
%%writefile module/engine.py
import math
import cv2
import numpy as np
import onnxruntime
import matplotlib.pyplot as plt
from utils import rescale_boxes, bbox_yolo_to_pascal, clip_bbox, compute_nms, sigmoid, draw_detections

class YoloSegPredict:
    """
    Loads a YOLOv8 segmentation model exported to ONNX, runs it on an image and
    turns the raw outputs into boxes, scores, class ids and per-object masks.

    Parameters:
        model_path: A string to the path directing towards the model location.
        conf_threshold: A float in the range (0, 1) for the confidence scores.
        iou_threshold: A float in the range (0, 1) for the Non Maximum Suppression.
        num_masks: An int with the number of mask coefficients predicted per box by YOLOv8.

    Attributes set by the last call to segment_objects() (empty lists before that):
        boxes, scores, class_ids, mask_maps
    """
    def __init__(self, model_path, conf_threshold = 0.7, iou_threshold = 0.5, num_masks=32):
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.num_masks = num_masks

        # Prediction state. Defined up front so draw_bbox()/draw_masks() do not
        # hit an AttributeError when called before the first inference.
        self.boxes, self.scores, self.class_ids, self.mask_maps = [], [], [], []

        # Initializing the model
        self.initialize_model(model_path)

    def initialize_model(self, model_path):
        """Create the ONNX Runtime session and cache the model's metadata and I/O names."""
        # Providers are handed to ONNX Runtime in this order: CUDA first, then CPU
        EP_LIST = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        self.ort_session = onnxruntime.InferenceSession(model_path,
                                                        providers = EP_LIST)
        # Get meta data from the model
        self.get_meta_details()
        self.get_input_details()
        self.get_output_details()

    def get_meta_details(self):
        """
        Read the class names stored in the model's custom metadata.

        The 'names' entry is a string holding a Python dict literal. It is parsed
        with ast.literal_eval rather than eval so that a tampered model file
        cannot execute arbitrary code.

        Returns:
            class_list: The class names, in the order of the metadata dict.

        Raises:
            ValueError: If the 'names' entry is not a valid Python literal.
        """
        import ast  # local import keeps this method self-contained

        model_meta = self.ort_session.get_modelmeta()
        raw_names = model_meta.custom_metadata_map['names']
        try:
            class_dict = ast.literal_eval(raw_names)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                f"Model metadata 'names' is not a valid Python literal: {raw_names!r}"
            ) from err

        self.class_dict = class_dict
        self.class_list = list(class_dict.values())
        return self.class_list

    def get_input_details(self):
        """Cache the model's input names and the height/width of its first input."""
        model_inputs = self.ort_session.get_inputs()
        self.input_names = [model_input.name for model_input in model_inputs]
        # Indices 2 and 3 are read as height and width (i.e. an N, C, H, W layout is assumed)
        self.input_shape = model_inputs[0].shape
        self.input_height = self.input_shape[2]
        self.input_width = self.input_shape[3]

    def get_output_details(self):
        """Cache the names of the model's outputs."""
        model_outputs = self.ort_session.get_outputs()
        self.output_names = [model_output.name for model_output in model_outputs]

    def __call__(self, image):
        """Shortcut for segment_objects(image)."""
        return self.segment_objects(image)

    def segment_objects(self, image):
        """
        Run the full pipeline on one image.

        Parameters:
            image: An array containing the image (height, width, channels).

        Returns:
            A tuple (input_img_resized, boxes, scores, class_ids, mask_maps).
            `input_img_resized` is the image resized to the model input size;
            boxes and mask maps are expressed in the ORIGINAL image's coordinates.
        """
        # Prepare the image array as an input tensor.
        input_tensor, self.input_img_resized = self.prepare_input(image)

        # Perform inference on the image
        outputs = self.inference(input_tensor)

        # Extract prediction data: outputs[0] carries boxes/classes/mask coefficients,
        # outputs[1] carries the mask prototypes
        self.boxes, self.scores, self.class_ids, mask_pred = self.process_box_output(outputs[0])
        self.mask_maps = self.process_mask_output(mask_pred, outputs[1])

        return self.input_img_resized, self.boxes, self.scores, self.class_ids, self.mask_maps

    def prepare_input(self, image):
        """
        Resize and normalise an image into the tensor the model expects.

        Also records the original image height and width, which are needed later
        to map boxes and masks back onto the original image.

        Returns:
            (input_tensor, input_img_resized): a float32 tensor of shape
            (1, channels, input_height, input_width) scaled by 1/255, and the
            resized image it was built from.
        """
        # Getting image info
        self.image_height, self.image_width = image.shape[:2]

        # Resize input image to input size
        input_img_resized = cv2.resize(image, (self.input_width, self.input_height))

        # Normalise, move channels first (HWC -> CHW) and add the batch axis
        input_img = input_img_resized / 255.0
        input_img = input_img.transpose(2, 0, 1)
        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)

        return input_tensor, input_img_resized

    def inference(self, input_tensor):
        """Run the ONNX session on `input_tensor` and return all model outputs."""
        outputs = self.ort_session.run(self.output_names, {self.input_names[0]: input_tensor})

        return outputs

    def process_box_output(self, box_output):
        """
        Decode the detection output into boxes, scores, class ids and mask coefficients.

        Parameters:
            box_output: Array of shape (1, 4 + num_classes + num_masks, num_predictions).

        Returns:
            (boxes, scores, class_ids, mask_predictions) for the predictions that
            pass the confidence threshold and NMS. When nothing passes the
            confidence threshold the result is ([], [], [], empty array).
        """
        # Drop only the batch axis, so a single prediction keeps its own axis;
        # after the transpose there is one prediction per row
        predictions = np.squeeze(box_output, axis=0).T
        num_classes = box_output.shape[1] - self.num_masks - 4 # box data - mask data - box coords

        # Filter out confidence scores below threshold
        scores = np.max(predictions[:, 4:4+num_classes], axis=1)
        confident = scores > self.conf_threshold
        predictions = predictions[confident, :]
        scores = scores[confident]

        # Validating for no scores
        if len(scores) == 0:
            return [], [], [], np.array([])

        # Separating the box/class part from the mask coefficients
        box_predictions = predictions[..., :num_classes+4]
        mask_predictions = predictions[..., num_classes+4:]

        # Getting class with the highest confidence score
        class_ids = np.argmax(box_predictions[:, 4:], axis=1)

        # Getting the bounding box for all the objects
        boxes = self.extract_boxes(box_predictions)

        # Apply Non Maximum Suppression to suppress overlapping boxes
        indices = compute_nms(boxes=boxes,
                              scores=scores,
                              iou_threshold=self.iou_threshold)
        return boxes[indices], scores[indices], class_ids[indices], mask_predictions[indices]

    def extract_boxes(self, box_predictions):
        """
        Turn the first four prediction columns into clipped Pascal VOC boxes
        expressed in the original image's coordinates.
        """
        # Extract box from predictions
        boxes = box_predictions[:, :4]

        # Scale boxes to original image dimension
        boxes = rescale_boxes(boxes=boxes,
                              input_shape=(self.input_height, self.input_width),
                              output_shape=(self.image_height, self.image_width))

        # Convert the boxes to pascal voc format
        boxes = bbox_yolo_to_pascal(boxes=boxes)

        # Clipping the boxes range to the image limits
        boxes = clip_bbox(boxes=boxes,
                          height=self.image_height,
                          width=self.image_width)

        return boxes

    def process_mask_output(self, mask_predictions, mask_output):
        """
        Build one full-size binary mask per detected box.

        Relies on self.boxes, self.image_height and self.image_width, so it must
        run after prepare_input() and process_box_output().

        Parameters:
            mask_predictions: Mask coefficients, one row per kept box.
            mask_output: The model's mask prototype output.

        Returns:
            [] when there are no detections, otherwise an array of shape
            (num_boxes, image_height, image_width) holding 0/1 values. A box
            with zero width or height gets an all-zero map.
        """
        # if no mask prediction
        if mask_predictions.shape[0] == 0:
            return []

        mask_output = np.squeeze(mask_output)

        # Combine the prototypes with each box's coefficients -> one low-res mask per box
        num_mask, mask_height, mask_width = mask_output.shape
        masks = sigmoid(mask_predictions @ mask_output.reshape((num_mask, -1)))
        masks = masks.reshape((-1, mask_height, mask_width))

        # Rescale the boxes to match the mask size
        scale_boxes = rescale_boxes(boxes=self.boxes,
                                    input_shape=(self.image_height, self.image_width),
                                    output_shape=(mask_height, mask_width))

        # Mask map for each box and mask pair
        mask_maps = np.zeros((len(scale_boxes), self.image_height, self.image_width))
        # cv2.blur needs a kernel of at least 1x1; the ratio truncates to 0 when
        # the image is smaller than the mask grid
        blur_size = (max(1, int(self.image_width/mask_width)),
                     max(1, int(self.image_height/mask_height)))
        for i, (scale_box, box) in enumerate(zip(scale_boxes, self.boxes)):
            # Rounding the scaled boxes outwards
            scale_x1 = int(math.floor(scale_box[0]))
            scale_y1 = int(math.floor(scale_box[1]))
            scale_x2 = int(math.ceil(scale_box[2]))
            scale_y2 = int(math.ceil(scale_box[3]))

            # Rounding the base boxes outwards
            x1 = int(math.floor(box[0]))
            y1 = int(math.floor(box[1]))
            x2 = int(math.ceil(box[2]))
            y2 = int(math.ceil(box[3]))

            # Cropping the scaled mask
            scale_crop_mask = masks[i][scale_y1: scale_y2, scale_x1: scale_x2]

            # A zero-area box (e.g. collapsed by clipping) would make cv2.resize
            # fail; its mask map simply stays empty
            if scale_crop_mask.size == 0 or x2 <= x1 or y2 <= y1:
                continue

            # Resizing the crop to the box size in the image, smoothing and binarising
            crop_mask = cv2.resize(scale_crop_mask,
                                   (x2 - x1, y2 - y1),
                                   interpolation=cv2.INTER_CUBIC)
            crop_mask = cv2.blur(crop_mask, blur_size)
            crop_mask = (crop_mask > 0.5).astype(np.uint8)
            mask_maps[i, y1:y2, x1:x2] = crop_mask

        return mask_maps

    def draw_bbox(self, image, mask_alpha=0.5):
        """Draw the last predictions on `image` as filled, labelled bounding boxes (no masks)."""
        return draw_detections(image=image,
                               boxes=self.boxes,
                               scores=self.scores,
                               class_ids=self.class_ids,
                               class_list=self.class_list,
                               mask_alpha=mask_alpha)

    def draw_masks(self, image, mask_alpha=0.5):
        """Draw the last predictions on `image` as labelled bounding boxes plus masks."""
        return draw_detections(image=image,
                               boxes=self.boxes,
                               scores=self.scores,
                               class_ids=self.class_ids,
                               class_list=self.class_list,
                               mask_alpha=mask_alpha,
                               mask_maps=self.mask_maps)

Overwriting module/engine.py


## Creating a prediction script - predict.py

In [5]:
%%writefile module/predict.py
import cv2
import matplotlib.pyplot as plt
from engine import YoloSegPredict

def predict(image_array, 
            model_path, 
            conf_threshold=0.5, 
            iou_threshold=0.5, 
            num_masks=32, 
            mask_alpha=0.5, 
            task='segment'):
    """
    Load the model, run it on one image and draw the predictions on that image.
    
    Parameters:
        image_array: An array of the image.
        model_path: A string to the path directing towards the model location.
        conf_threshold=0.5: A float in the range (0, 1) for the confidence scores.
        iou_threshold=0.5: A float in the range (0, 1) for the Non Maximum Suppression.
        num_masks=32: An int that contains the value of the predicted mask value used by Yolov8.
        mask_alpha=0.5: A float for the predicted mask opacity on the image.
        task='segment': A string containing either 'segment' (boxes + masks) 
                        or 'detect' (filled boxes only).
    
    Returns: A dict containing:
        org_image: An array of the original image resized to the model's input size.
        result_image: An array of `image_array` with the predictions drawn on it. 
        boxes: An array containing the bounding box in Pascal VOC format.
        masks: An array containing the predicted objects mask area.
        classes: A list containing all the class names stored in the model.
        scores: An array containing the probability or confidence score. 
        class_ids: An array containing the indices of the detected object class/labels.
    
    Raises:
        ValueError: If `task` is neither 'segment' nor 'detect'.
    """
    # Fail fast on an unknown task, before the model is loaded and run
    if task not in ('segment', 'detect'):
        raise ValueError(f"task must be 'segment' or 'detect', got {task!r}")
    
    # Loading the model
    model = YoloSegPredict(model_path=model_path,
                           conf_threshold=conf_threshold,
                           iou_threshold=iou_threshold,
                           num_masks=num_masks)
    
    # Detecting objects in the image
    org_image, boxes, scores, class_ids, masks = model(image_array)
    
    # Class names were already read from the model metadata during initialisation
    classes = model.class_list
    
    # Drawing the predictions on the original image
    if task == 'segment':
        result_image = model.draw_masks(image=image_array, mask_alpha=mask_alpha)
    else:
        result_image = model.draw_bbox(image=image_array, mask_alpha=mask_alpha)
        
    return {'org_image': org_image, 
            'result_image': result_image, 
            'boxes': boxes, 
            'masks': masks, 
            'classes': classes, 
            'scores': scores, 
            'class_ids': class_ids} 

if __name__ == '__main__':
    # Sample inputs: one validation image and the exported ONNX weights
    image_array = plt.imread('../datasets/images/val/pizza/420409.jpg')
    model_path = '../models/yolov8_5class_10percent/best.onnx'

    # Run the prediction with the default thresholds and task
    results = predict(image_array, model_path)

    # Report every detected object (one entry per predicted mask)
    class_names = results['classes']
    detected_ids = results['class_ids']
    detected_scores = results['scores']
    for i, _ in enumerate(results['masks']):
        print(f"[INFO] Detected: {class_names[detected_ids[i]]} and the confidence score: {detected_scores[i]}.")

    # Show the annotated image without axes
    plt.imshow(results['result_image'])
    plt.axis(False)
    plt.show()

Overwriting module/predict.py
