In [2]:
# https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/401-object-detection-webcam/401-object-detection.ipynb

In [3]:
import urllib.request
urllib.request.urlretrieve(
    url='https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/main/notebooks/utils/notebook_utils.py',
    filename='notebook_utils.py'
)

('notebook_utils.py', <http.client.HTTPMessage at 0x7f7e8ae304f0>)

In [4]:
import collections
import tarfile
import time
from pathlib import Path

import cv2
import numpy as np
from IPython import display
import openvino as ov
from openvino.tools.mo.front import tf as ov_tf_front
from openvino.tools import mo

import notebook_utils as utils

In [5]:
import ipywidgets as widgets

core = ov.Core()

device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value='AUTO',
    description='Device:',
    disabled=False,
)

device

Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO')

In [6]:
model_path = "./models/quantized_model.xml"
# Read the network and corresponding weights from a file.
model = core.read_model(model=model_path)
# Compile the model for CPU (you can choose manually CPU, GPU etc.)
# or let the engine choose the best available device (AUTO).
compiled_model = core.compile_model(model=model, device_name=device.value)

# Get the input and output nodes.
input_layer = compiled_model.input(0)
output_layer = compiled_model.output(0)

# Get the input size.
height, width = list(input_layer.shape)[1:3]

In [29]:
classes = ['0']

# Colors for the classes above (Rainbow Color Map).
colors = cv2.applyColorMap(
    src=np.arange(0, 255, 255 / len(classes), dtype=np.float32).astype(np.uint8),
    colormap=cv2.COLORMAP_RAINBOW,
).squeeze()

def process_results(frame, results, thresh=0.6):
    # The size of the original frame.
    h, w = frame.shape[:2]
    # The 'results' variable is a [1, 1, 100, 7] tensor.

    results = results.squeeze()
    boxes = []
    labels = []
    scores = []
    # for _, label, score, xmin, ymin, xmax, ymax in results:
    #     # Create a box with pixels coordinates from the box with normalized coordinates [0,1].
    #     boxes.append(
    #         tuple(map(int, (xmin * w, ymin * h, (xmax - xmin) * w, (ymax - ymin) * h)))
    #     )
    #     labels.append(int(label))
    #     scores.append(float(score))
    for xmin, ymin, xmax, ymax in results:
        # print(xmin, ymin, xmax, ymax)
        # Create a box with pixels coordinates from the box with normalized coordinates [0,1].
        boxes.append(
            (xmin, ymin, xmax, ymax)
            # tuple(map(int, (xmin * w, ymin * h, (xmax - xmin) * w, (ymax - ymin) * h)))
        )
        scores.append(0)
        labels.append(0)
    # Apply non-maximum suppression to get rid of many overlapping entities.
    # See https://paperswithcode.com/method/non-maximum-suppression
    # This algorithm returns indices of objects to keep.
    # indices = cv2.dnn.NMSBoxes(
    #     bboxes=boxes, scores=scores, score_threshold=thresh, nms_threshold=0.6
    # )

    # # If there are no boxes.
    # if len(indices) == 0:
    #     return []
    

    #Filter detected objects.
    # return [(labels[idx], scores[idx], boxes[idx]) for idx in indices.flatten()]
    # print("RETURNING: ", boxes)
    return boxes 

def draw_boxes(frame, boxes):
    print("BOXES: ", boxes)
    for box in boxes:
        print("BOX: ", box)
        # Choose color for the label.
        # color = tuple(map(int, colors[label]))
        # Draw a box.
        x2 = box[0] + box[2]
        y2 = box[1] + box[3]
        cv2.rectangle(img=frame, pt1=box[:2], pt2=(x2, y2), color=(0, 0, 255), thickness=3)

        color = (0, 0, 255)
        # Draw a label name inside the box.
        cv2.putText(
            img=frame,
            text=f"vehicle",
            org=(box[0] + 10, box[1] + 30),
            fontFace=cv2.FONT_HERSHEY_COMPLEX,
            fontScale=frame.shape[1] / 1000,
            color=color,
            thickness=1,
            lineType=cv2.LINE_AA,
        )

    return frame


In [26]:
def addPadding(img, pad_width, pad_value):
  """
  Adds padding to an image represented as a NumPy array.

  Args:
      img: NumPy array representing the image.
      pad_width: The width of the padding to add on all sides of the image (integer).
      pad_value: The value to fill the padded pixels with (scalar or array).

  Returns:
      A new NumPy array with the padded image.
  """
  
  # Get the original image shape
  height, width, channels = img.shape

  # Create a new array with the padded dimensions (padding only on bottom)
  padded_img = np.full((height + pad_width, width, channels), pad_value)

  # Insert the original image into the padded array
  padded_img[:-pad_width, :] = img  # Efficiently copy frame excluding bottom padding

  return padded_img

In [27]:
# Main processing function to run object detection.
def run_object_detection(source=0, flip=False, use_popup=False, skip_first_frames=0):
    player = None
    try:
        # Create a video player to play with target fps.
        player = utils.VideoPlayer(
            source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames
        )
        # Start capturing.
        player.start()
        if use_popup:
            title = "Press ESC to Exit"
            cv2.namedWindow(
                winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE
            )

        processing_times = collections.deque()
        while True:
            # Grab the frame.
            frame = player.next()
            if frame is None:
                print("Source ended")
                break
            # If the frame is larger than full HD, reduce size to improve the performance.
            # scale = 1280 / max(frame.shape)
            # if scale < 1:
            #     frame = cv2.resize(
            #         src=frame,
            #         dsize=None,
            #         fx=scale,
            #         fy=scale,
            #         interpolation=cv2.INTER_AREA,
            #     )

            # # Resize the image and change dims to fit neural network input.
            # input_img = cv2.resize(
            #     src=frame, dsize=(width, height), interpolation=cv2.INTER_AREA
            # )
            # Create a batch of images (size = 1).
            pad_width = 8
            pad_value = (0, 0, 255)  # Blue color for padding (BGR format)
            input_img = addPadding(frame.copy(), pad_width, pad_value)
            input_img = input_img[np.newaxis, ...]

            input_img = input_img.reshape(1, 3, 1088, 1920)
            #Measure processing time.

            start_time = time.time()
            # Get the results.
            results = compiled_model([input_img])[output_layer]
            stop_time = time.time()

            # # Get poses from network results.
            boxes = process_results(frame=frame, results=results)

            print("PROCESS: ", boxes)

            # # Draw boxes on a frame.
            frame = draw_boxes(frame=frame, boxes=boxes)

            processing_times.append(stop_time - start_time)
            # Use processing times from last 200 frames.
            if len(processing_times) > 200:
                processing_times.popleft()

            _, f_width = frame.shape[:2]
            # Mean processing time [ms].
            processing_time = np.mean(processing_times) * 1000
            fps = 1000 / processing_time
            cv2.putText(
                    img=frame,
                    text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)",
                    org=(20, 40),
                    fontFace=cv2.FONT_HERSHEY_COMPLEX,
                    fontScale=f_width / 1000,
                    color=(0, 0, 255),
                    thickness=1,
                    lineType=cv2.LINE_AA,
                )

            # Use this workaround if there is flickering.
            if use_popup:
                cv2.imshow(winname=title, mat=frame)
                key = cv2.waitKey(1)
                # escape = 27
                if key == 27:
                    break
            else:
                # Encode numpy array to jpg.
                _, encoded_img = cv2.imencode(
                    ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]
                )
                # Create an IPython image.
                i = display.Image(data=encoded_img)
                # Display the image in this notebook.
                display.clear_output(wait=True)
                display.display(i)
    # ctrl-c
    except KeyboardInterrupt:
        print("Interrupted")
    # any different error
    except RuntimeError as e:
        print(e)
    finally:
        if player is not None:
            # Stop capturing.
            player.stop()
        if use_popup:
            cv2.destroyAllWindows()

In [30]:
USE_WEBCAM = False

video_file = "./media/day_thermal_camera.mp4"
cam_id = 0

source = cam_id if USE_WEBCAM else video_file

run_object_detection(source=source, flip=isinstance(source, int), use_popup=False)

PROCESS:  [(0.0, 0.0, 7.58299, 30.33196), (0.0, 0.0, 22.74897, 30.33196), (0.0, 0.0, 30.33196, 7.58299), (0.0, 0.0, 60.66392, 15.16598), (0.0, 0.0, 113.74485, 15.16598), (0.0, 0.0, 121.32784, 7.58299), (0.0, 0.0, 121.32784, 7.58299), (15.06676, 0.0, 113.74485, 7.58299), (45.20028, 0.0, 113.74485, 7.58299), (45.20028, 0.0, 136.49382, 7.58299), (60.26704, 0.0, 151.6598, 7.58299), (60.26704, 0.0, 151.6598, 7.58299), (75.3338, 0.0, 151.6598, 7.58299), (75.3338, 0.0, 174.40877, 7.58299), (90.40056, 0.0, 189.57475, 7.58299), (90.40056, 0.0, 197.15775, 7.58299), (105.46732, 0.0, 197.15775, 7.58299), (105.46732, 0.0, 219.90671, 7.58299), (105.46732, 0.0, 227.4897, 7.58299), (105.46732, 0.0, 235.0727, 7.58299), (135.60085, 0.0, 242.65569, 7.58299), (135.60085, 0.0, 257.82166, 7.58299), (135.60085, 0.0, 265.40466, 7.58299), (135.60085, 0.0, 272.98764, 7.58299), (150.6676, 0.0, 280.57065, 7.58299), (150.6676, 0.0, 288.15363, 7.58299), (165.73436, 0.0, 295.7366, 7.58299), (165.73436, 0.0, 295.7366

error: OpenCV(4.10.0) :-1: error: (-5:Bad argument) in function 'rectangle'
> Overload resolution failed:
>  - Can't parse 'pt1'. Sequence item with index 0 has a wrong type
>  - Can't parse 'pt1'. Sequence item with index 0 has a wrong type
>  - rectangle() missing required argument 'rec' (pos 2)
>  - rectangle() missing required argument 'rec' (pos 2)
