In [1]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

In [2]:
# turn the data URL returned by the browser into an OpenCV image
def js_to_image(js_reply):
  """
  Decode a webcam frame that arrives as a base64 data URL.

  Params:
          js_reply: string of the form '<header>,<base64 payload>' holding the encoded frame
  Returns:
          img: OpenCV BGR image
  """
  # the base64 payload is the segment that follows the header's comma
  encoded_frame = js_reply.split(',')[1]
  # expose the decoded bytes as a flat uint8 buffer, which is what imdecode expects
  frame_buffer = np.frombuffer(b64decode(encoded_frame), dtype=np.uint8)
  # IMREAD_COLOR (== 1) decodes to a 3-channel BGR image
  return cv2.imdecode(frame_buffer, cv2.IMREAD_COLOR)

In [3]:
# function to convert an RGB image array (e.g. a frame with the bounding box drawn on it) into a base64 PNG data URL to be overlaid on the video stream
def bbox_to_bytes(bbox_array):
  """
  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
                      It is handed to PIL as an 'RGB' image.
  Returns:
        bytes: 'data:image/png;base64,...' string holding the PNG-encoded image
  """
  # `import PIL` on its own does not load the PIL.Image submodule; importing it
  # explicitly keeps this function working even when no other library has
  # already imported PIL.Image as a side effect.
  from PIL import Image as PILImage

  # convert array into PIL image
  bbox_PIL = PILImage.fromarray(bbox_array, 'RGB')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # base64 output is pure ASCII, so it can be embedded directly in the data URL
  bbox_bytes = 'data:image/png;base64,{}'.format(b64encode(iobuf.getvalue()).decode('ascii'))

  return bbox_bytes

In [4]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the webcam-streaming JavaScript into the notebook output.

  The script defines stream_frame(label, imgData), the function that
  video_frame() invokes through eval_js:
    - on first use it builds the page elements (status label, <video>,
      overlay <img>, instruction text) and opens the camera via getUserMedia
      with facingMode "environment";
    - a non-empty label replaces the status text, and a non-empty imgData is
      positioned over the video element as an overlay image;
    - it then waits for the next animation frame, which draws the video into a
      640x480 canvas and resolves with a JPEG data URL (quality 0.8); the reply
      is an object with 'create', 'show' and 'capture' timings (milliseconds)
      and the frame under 'img';
    - clicking the video, the overlay or the instruction text sets the
      shutdown flag; a stream_frame call that starts while the flag is set
      stops the camera track, removes the elements and returns ''. If the flag
      is set while a capture is pending, that capture resolves with an empty
      'img' instead.

  Returns:
          None; the script is only displayed, nothing is returned to Python.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # displaying the Javascript object runs the script in this cell's output,
  # which makes stream_frame() available to later eval_js calls
  display(js)

def video_frame(label, bbox):
  """
  Send a status label and an overlay image to the browser and fetch the next webcam frame.

  Params:
          label: text/HTML for the status line ('' keeps the current label)
          bbox: data-URL image to draw over the video ('' keeps the current overlay)
  Returns:
          data: whatever the JavaScript stream_frame() call resolves to
                ('' after shutdown, otherwise an object with timings and the frame under 'img')
  """
  import json

  # json.dumps emits a quoted, fully escaped string literal that is also valid
  # JavaScript, so quotes, backslashes or newlines in the arguments can no
  # longer break (or inject code into) the evaluated call. str() keeps the
  # previous behaviour of always passing strings.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

In [5]:
!pip3 install torch==1.13 torchvision --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.1/890.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [6]:
!pip3 install yolox@git+https://github.com/braiansmarzaro/ByteTrack.git loguru thop lap cython_bbox --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m973.4 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for yolox (setup.py) ... [?25l[?25hdone
  Building wheel for lap (setup.py) ... [?25l[?25hdone
  Building wheel for cython_bbox (setup.py) ... [?25l[?25hdone


In [7]:
!pip3 install cython 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pycocotools (setup.py) ... [?25l[?25hdone


In [8]:
class ByteTrackerArguments:
    """Plain settings object passed to BYTETracker in place of a parsed-arguments namespace.

    Params:
            track_thresh: stored as ``track_thresh`` (presumably the detection-score
                          threshold the tracker uses — confirm against the ByteTrack source)
            track_buffer: stored as ``track_buffer`` (presumably how many frames a lost
                          track is kept — confirm against the ByteTrack source)
            match_thresh: stored as ``match_thresh`` (presumably the association
                          threshold — confirm against the ByteTrack source)
            mot20: stored as ``mot20``; defaults to True, the value this notebook
                   has always used
    """

    def __init__(self, track_thresh, track_buffer, match_thresh, mot20=True):
        # named parameters give a clear TypeError on a wrong call and allow
        # keyword use; positional calls with three values keep working
        self.track_thresh = track_thresh
        self.track_buffer = track_buffer
        self.match_thresh = match_thresh
        self.mot20 = mot20


args = ByteTrackerArguments(track_thresh=0.50, track_buffer=25, match_thresh=0.80)

In [9]:
from yolox.tracker.byte_tracker import BYTETracker

# A single tracker is built from the settings in `args` and reused for every
# frame by the capture loop (via tracker.update). It presumably keeps track
# state between calls, so re-run this cell to start over with a fresh tracker.
tracker = BYTETracker(args)

In [10]:
!pip3 install -U openmim --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.8/302.8 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.4/239.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.1/953.1 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m6.0 

In [11]:
!git clone https://github.com/open-mmlab/mmdetection.git

Cloning into 'mmdetection'...
remote: Enumerating objects: 38019, done.[K
remote: Counting objects:   0% (1/102)[Kremote: Counting objects:   1% (2/102)[Kremote: Counting objects:   2% (3/102)[Kremote: Counting objects:   3% (4/102)[Kremote: Counting objects:   4% (5/102)[Kremote: Counting objects:   5% (6/102)[Kremote: Counting objects:   6% (7/102)[Kremote: Counting objects:   7% (8/102)[Kremote: Counting objects:   8% (9/102)[Kremote: Counting objects:   9% (10/102)[Kremote: Counting objects:  10% (11/102)[Kremote: Counting objects:  11% (12/102)[Kremote: Counting objects:  12% (13/102)[Kremote: Counting objects:  13% (14/102)[Kremote: Counting objects:  14% (15/102)[Kremote: Counting objects:  15% (16/102)[Kremote: Counting objects:  16% (17/102)[Kremote: Counting objects:  17% (18/102)[Kremote: Counting objects:  18% (19/102)[Kremote: Counting objects:  19% (20/102)[Kremote: Counting objects:  20% (21/102)[Kremote: Counting objects:  21% 

In [12]:
# MMDetection 3.x (needed for the `pred_instances` result API and the v3.0
# checkpoint used in this notebook) depends on MMCV 2.x, which is published as
# `mmcv`; `mmcv-full` is the MMCV 1.x distribution and installs into the same
# `mmcv` package, so it must not be installed alongside it.
!mim install "mmengine" "mmcv>=2.0.0rc4,<2.2.0" "mmdet" --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.7/451.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.7/254.7 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
!wget -O /content/checkpoint.pth https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth

--2024-04-13 19:40:23--  https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth
Resolving download.openmmlab.com (download.openmmlab.com)... 47.246.20.217, 47.246.20.218, 47.246.20.225, ...
Connecting to download.openmmlab.com (download.openmmlab.com)|47.246.20.217|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57532893 (55M) [application/octet-stream]
Saving to: ‘/content/checkpoint.pth’


2024-04-13 19:40:28 (13.8 MB/s) - ‘/content/checkpoint.pth’ saved [57532893/57532893]



In [19]:
import torch
from mmdet.apis import inference_detector, init_detector

# NOTE(review): nothing visible in this notebook creates /content/rtmdet.py and
# this cell's execution count is not sequential — the config file has to be put
# in place (uploaded or copied) before this cell can run on a fresh runtime.
CONFIG_PATH = "/content/rtmdet.py"
CHECKPOINT_PATH = "/content/checkpoint.pth"
PERSON_LABEL = 0   # class index kept by the filter (presumably COCO "person" — confirm against the config)
MIN_SCORE = 0.60   # detections at or below this confidence are ignored
BOX_COLOR = (0, 255, 0)


def _largest_box(bboxes, scores):
    """Pick the detection with the largest positive area.

    Params:
            bboxes: (N, 4) numpy array of [x1, y1, x2, y2] boxes
            scores: (N,) numpy array of confidences aligned with ``bboxes``
    Returns:
            (bbox, score) of the largest box (first one on ties), or
            (None, None) when there is no box with a positive area
    """
    if len(bboxes) == 0:
        return None, None
    areas = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    best = int(areas.argmax())
    if areas[best] <= 0:
        return None, None
    return bboxes[best], scores[best]


def _centering_hint(bbox, frame_shape):
    """Build the coloured 'Move ...' message for the box centre's offset from the frame centre.

    Params:
            bbox: [x1, y1, x2, y2] box
            frame_shape: image shape, (height, width, ...)
    Returns:
            message: ANSI-coloured hint; the horizontal and vertical parts are
                     joined with ' and ', and a part is omitted when the box is
                     already centred on that axis
    """
    center_x = frame_shape[1] / 2
    center_y = frame_shape[0] / 2
    bbox_center_x = bbox[0] + (bbox[2] - bbox[0]) / 2
    bbox_center_y = bbox[1] + (bbox[3] - bbox[1]) / 2

    parts = []
    if bbox_center_x > center_x:
        parts.append("\033[92mMove left by {}\033[0m".format(round(bbox_center_x - center_x, 2)))
    elif bbox_center_x < center_x:
        parts.append("\033[92mMove right by {}\033[0m".format(round(center_x - bbox_center_x, 2)))
    if bbox_center_y > center_y:
        parts.append("\033[91mMove up by {}\033[0m".format(round(bbox_center_y - center_y, 2)))
    elif bbox_center_y < center_y:
        parts.append("\033[91mMove down by {}\033[0m".format(round(center_y - bbox_center_y, 2)))
    # joining avoids a dangling " and " when only one axis is off-centre
    return " and ".join(parts)


device = "cuda" if torch.cuda.is_available() else "cpu"
model = init_detector(CONFIG_PATH, CHECKPOINT_PATH, device=device)
video_stream()  # only displays the script; it returns nothing worth keeping
label_html = 'Capturing...'
image_html = ''

while True:
    js_reply = video_frame(label_html, image_html)
    # '' means the stream was shut down; an empty 'img' means the stop click
    # arrived while a capture was pending, so there is no frame to decode
    if not js_reply or not js_reply["img"]:
        break

    frame_bgr = js_to_image(js_reply["img"])

    # mmdet's inference pipeline takes ndarray frames in OpenCV's BGR order (any
    # channel swap is handled by the model's own data preprocessor), so the
    # detector gets the frame as decoded; RGB is only needed for the overlay.
    result = inference_detector(model, frame_bgr)

    pred = result.pred_instances
    keep = (pred.labels == PERSON_LABEL) & (pred.scores > MIN_SCORE)
    max_bbox, max_score = _largest_box(
        pred.bboxes[keep].cpu().numpy(),
        pred.scores[keep].cpu().numpy(),
    )
    if max_bbox is None:
        print("Could not track the person")
        continue

    # one detection row: x1, y1, x2, y2, score
    dets = np.array([[max_bbox[0], max_bbox[1], max_bbox[2], max_bbox[3], max_score]])
    online_targets = tracker.update(dets, frame_bgr.shape, frame_bgr.shape)
    # the tracker may return no active track for this frame; indexing [0]
    # unconditionally would raise IndexError
    if not online_targets:
        print("No person was detected")
        continue

    tracked_bbox = online_targets[0].tlbr
    tracked_id = online_targets[0].track_id

    # bbox_to_bytes expects RGB pixels
    img = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    top_left = (int(tracked_bbox[0]), int(tracked_bbox[1]))
    bottom_right = (int(tracked_bbox[2]), int(tracked_bbox[3]))
    cv2.rectangle(img, top_left, bottom_right, BOX_COLOR, 2)
    cv2.putText(img, f"ID: {tracked_id}", top_left, cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2)
    image_html = bbox_to_bytes(img)

    print(_centering_hint(max_bbox, img.shape))

Loads checkpoint by local backend from path: /content/checkpoint.pth
The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



<IPython.core.display.Javascript object>

Could not track the person
Could not track the person
[92mMove right by 0.01[0m and [91mMove up by 77.56[0m
[92mMove right by 11.34[0m and [91mMove up by 83.88[0m
[92mMove left by 10.14[0m and [91mMove down by 3.7[0m
[92mMove left by 91.2[0m and [91mMove down by 1.66[0m
[92mMove left by 163.48[0m and [91mMove down by 2.06[0m
[92mMove left by 163.9[0m and [91mMove down by 2.16[0m
[92mMove left by 124.48[0m and [91mMove down by 1.7[0m
[92mMove left by 138.67[0m and [91mMove down by 2.84[0m
[92mMove left by 15.79[0m and [91mMove up by 9.52[0m
[92mMove right by 90.76[0m and [91mMove down by 2.19[0m
[92mMove right by 128.62[0m and [91mMove down by 1.91[0m
[92mMove right by 135.76[0m and [91mMove up by 1.61[0m
Could not track the person
[92mMove right by 104.58[0m and [91mMove down by 1.94[0m
[92mMove right by 55.16[0m and [91mMove down by 0.74[0m
[92mMove right by 87.22[0m and [91mMove down by 1.41[0m
[92mMove right by 97.35[0m a