**To get access to the webcam in the Colab notebook**

In [None]:
 # Mount Google Drive at /content/gdrive; the model-loading cell further down
 # reads the YOLOv5 repo and the trained weights from /content/gdrive/MyDrive.
 from google.colab import drive
 drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Turn a base64 data URL received from the browser into an OpenCV image.

  Params:
          js_reply: data-URL string ("<header>,<base64 payload>") holding the webcam frame
  Returns:
          img: OpenCV BGR image decoded from the payload
  """
  # the base64 payload is the segment right after the first comma
  encoded_frame = js_reply.split(',')[1]
  # raw JPEG bytes viewed as a flat uint8 buffer, as cv2.imdecode expects
  raw_jpeg = np.frombuffer(b64decode(encoded_frame), dtype=np.uint8)
  # IMREAD_COLOR (== 1) always yields a 3-channel BGR image
  return cv2.imdecode(raw_jpeg, cv2.IMREAD_COLOR)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Encode an RGBA overlay array as a PNG data URL for the browser.

  Params:
          bbox_array: Numpy array (pixels) interpreted as an RGBA image; holds the
                      rectangles to overlay on the video stream.
  Returns:
          bytes: 'data:image/png;base64,...' string wrapping the PNG-encoded overlay
  """
  png_buffer = io.BytesIO()
  # render the array as a PNG straight into the in-memory buffer
  PIL.Image.fromarray(bbox_array, 'RGBA').save(png_buffer, format='png')
  # base64 output is plain ASCII, so decoding it as UTF-8 is lossless
  encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')

  return 'data:image/png;base64,' + encoded_png

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the JavaScript that drives the live webcam stream into the cell output.

  Nothing is captured by this call itself: it only displays a Javascript object
  that defines `stream_frame(label, imgData)` (plus its helpers) in the browser.
  Each call to `stream_frame`:
    * builds the DOM on first use (status label, <video>, overlay <img>, stop
      instructions) and requests the camera via getUserMedia with
      facingMode "environment";
    * writes `label` into the status element and, when `imgData` is non-empty,
      positions the overlay image over the video and shows `imgData` in it;
    * waits for the next animation frame, draws the video onto a 640x640 canvas
      and resolves with a dict {'create', 'show', 'capture', 'img'} where 'img'
      is a JPEG data URL (quality 0.8) and the other keys are timings in ms.
  Clicking the video, the overlay or the instruction text sets `shutdown`; the
  next `stream_frame` call then stops the camera, removes the DOM and returns ''.
  If `shutdown` is set while a capture is pending, that capture resolves with an
  empty 'img' string instead.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 640);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 640; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Displaying the object runs the script in the output frame, which is what
  # makes `stream_frame` callable from eval_js afterwards.
  display(js)
  
def video_frame(label, bbox):
  """
  Push a status label and an overlay to the browser and fetch the next webcam frame.

  Params:
          label: text/HTML shown in the status element of the stream widget
          bbox: overlay image as a data-URL string ('' leaves the overlay untouched)
  Returns:
          Whatever the JavaScript `stream_frame` call resolves to, as returned by eval_js.
  """
  # Local import: this cell runs before the notebook's own `import json`.
  import json

  # json.dumps produces properly quoted/escaped string literals, so quotes,
  # backslashes or newlines inside `label`/`bbox` can no longer break (or inject
  # code into) the evaluated expression. str() keeps the old "format anything
  # as a string" behaviour for non-string arguments.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
  """
  Show a webcam preview with a "Capture" button and save the captured frame as JPEG.

  Params:
          filename: path the JPEG bytes are written to
          quality: JPEG quality forwarded to canvas.toDataURL in the browser
  Returns:
          filename: the path that was written
  """
  capture_script = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(capture_script)

  # Blocks until the user clicks "Capture"; the reply is a JPEG data URL.
  data_url = eval_js(f'takePhoto({quality})')
  # Decode before touching the file so a malformed reply never leaves an empty file behind.
  jpeg_bytes = b64decode(data_url.split(',')[1])
  with open(filename, 'wb') as photo_file:
    photo_file.write(jpeg_bytes)
  return filename

In [None]:
import torch
model = torch.hub.load('/content/gdrive/MyDrive/DLAVproj/yolov5', 'custom', path='/content/gdrive/MyDrive/DLAVproj/best_weight.pt', source='local')  # local repo
!pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt
model2 = torch.hub.load('ultralytics/yolov5', 'yolov5s')

[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...
Collecting PyYAML>=5.3.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed PyYAML-6.0

[31m[1mrequirements:[0m 1 package updated per /content/gdrive/MyDrive/Checkpoint 1 V3/yolov5/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-4-25 torch 1.11.0+cu113 CUDA:0 (Tesla K80, 11441MiB)

Fusing layers... 
Model summary: 213 layers, 7012822 parameters, 0 gradients
Adding AutoShape... 


Collecting thop
  Downloading thop-0.0.31.post2005241907-py3-none-any.whl (8.7 kB)
Installing collected packages: thop
Successfully installed thop-0.0.31.post2005241907


Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /root/.cache/torch/hub/master.zip
[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...

[31m[1mrequirements:[0m 1 package updated per /content/gdrive/MyDrive/Checkpoint 1 V3/yolov5/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-4-25 torch 1.11.0+cu113 CUDA:0 (Tesla K80, 11441MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt to yolov5s.pt...


  0%|          | 0.00/14.1M [00:00<?, ?B/s]


Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [None]:
# Detection settings for the two YOLOv5 models loaded in the previous cell
model.max_det = 1  # maximum number of detections per image
model.conf = 0.7  # NMS confidence threshold of the custom model
model2.conf = 0.7  # same NMS confidence threshold for the second model
model2.classes = [0]  # keep class index 0 only — presumably "person" (the tracking cell filters on that name); TODO confirm

In [None]:
!pip install deep-sort-realtime

Collecting deep-sort-realtime
  Downloading deep_sort_realtime-1.2-py3-none-any.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 5.2 MB/s 
Installing collected packages: deep-sort-realtime
Successfully installed deep-sort-realtime-1.2


In [None]:
from deep_sort_realtime.deepsort_tracker import DeepSort

In [None]:
from google.colab.patches import cv2_imshow
import pdb
import json
import time
import PIL

tracker = DeepSort(max_age=30, nn_budget=70, override_track_class=None)

# Side length of the square frames; must match the 640x640 capture canvas
# that video_stream() creates in the browser.
FRAME_SIZE = 640


def _to_records(yolo_results):
    """Return the detections of the first image as a list of dicts (one per box)."""
    return json.loads(yolo_results.pandas().xyxy[0].to_json(orient="records"))


def _corner_box(record):
    """Return (xmin, ymin, xmax, ymax) of a detection record as ints."""
    return tuple(int(record.get(key)) for key in ('xmin', 'ymin', 'xmax', 'ymax'))


def _intersection_area(box_a, box_b):
    """Return the overlap area of two (xmin, ymin, xmax, ymax) boxes, 0 when disjoint."""
    width = min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])
    height = min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])
    # Clamp each side on its own: two negative extents (boxes disjoint on both
    # axes) would otherwise multiply into a bogus positive area.
    return max(0, width) * max(0, height)


def _person_boxes(records):
    """Yield (corner_box, confidence, class) for every record named 'person'."""
    for record in records:
        if record.get('name') == 'person':
            yield (_corner_box(record),
                   float(record.get('confidence')),
                   str(record.get('class')))


# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox_f = ''
xmin_best = ymin_best = xmax_best = ymax_best = 0
best_intersection = 0
init_detection = False   # True once a person overlapping the detected object was found
first_track = False      # True once the tracker has been updated at least once
target_id = None         # track id of the person selected during initialisation
detections = []          # tracker input: ([left, top, w, h], confidence, class) tuples


while True:
    js_reply = video_frame(label_html, bbox_f)
    # '' means the stream was shut down; an empty 'img' means shutdown was
    # clicked while a capture was pending. Either way there is no frame to decode.
    if not js_reply or not js_reply["img"]:
        break

    # convert JS response to OpenCV image (BGR) and to an RGB PIL image for the models
    img = js_to_image(js_reply["img"])
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_model = PIL.Image.fromarray(img_rgb, "RGB")

    # transparent RGBA overlay for this frame
    bbox_array = np.zeros([FRAME_SIZE, FRAME_SIZE, 4], dtype=np.uint8)

    if not init_detection:
        # --- Initialisation: find the object (custom model) and the person overlapping it most ---
        results = _to_records(model(img_model))
        results2 = _to_records(model2(img_model))

        detection = False
        for result in results:
            detection = True
            xmin, ymin, xmax, ymax = _corner_box(result)
            confidence = str(result.get('confidence'))
            bbox_array = cv2.rectangle(bbox_array, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)
            bbox_array = cv2.putText(bbox_array, "[{conf}]".format(conf=confidence),
                                     (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                     (255, 0, 0), 2)

        if detection:
            # Start from scratch every frame so a stale score from an earlier
            # frame cannot influence the choice.
            best_intersection = 0
            object_box = (xmin, ymin, xmax, ymax)
            for person_box, person_confidence, person_class in _person_boxes(results2):
                intersection = _intersection_area(object_box, person_box)
                # Compare against the best candidate so far (not merely the
                # previous one) so the largest overlap always wins.
                if intersection > best_intersection:
                    best_intersection = intersection
                    xmin_best, ymin_best, xmax_best, ymax_best = person_box
                    best_confidence = person_confidence
                    best_class = person_class

            if best_intersection != 0:
                bbox_array = cv2.rectangle(bbox_array, (xmin_best, ymin_best),
                                           (xmax_best, ymax_best), (0, 0, 255), 2)
                init_detection = True
                bbox = [xmin_best, ymin_best, xmax_best - xmin_best, ymax_best - ymin_best]
                # Seed the tracker with the selected person's own confidence
                # (as a float), not the object detector's stringified one.
                detections = [(bbox, best_confidence, best_class)]

    if init_detection:
        # --- Tracking: after the first update, feed every detected person to the tracker ---
        if first_track:
            results2 = _to_records(model2(img_model))
            detections = [
                ([left, top, right - left, bottom - top], person_confidence, person_class)
                for (left, top, right, bottom), person_confidence, person_class
                in _person_boxes(results2)
            ]

        # drop anything drawn during initialisation; only tracks are shown from here on
        bbox_array = np.zeros([FRAME_SIZE, FRAME_SIZE, 4], dtype=np.uint8)
        # detections: list of ( [left,top,w,h], confidence, detection_class ) tuples
        tracks = tracker.update_tracks(detections, frame=img)
        for track in tracks:
            track_id = track.track_id
            if target_id is None:
                # first track ever returned: the person chosen during initialisation
                target_id = track_id
            ltrb = track.to_ltrb()
            top_left = (int(ltrb[0]), int(ltrb[1]))
            bottom_right = (int(ltrb[2]), int(ltrb[3]))
            # target in blue, every other track in green
            colour = (0, 0, 255) if track_id == target_id else (0, 255, 0)
            bbox_array = cv2.rectangle(bbox_array, top_left, bottom_right, colour, 2, 1)
            bbox_array = cv2.putText(bbox_array, "[{id}]".format(id=track_id),
                                     (top_left[0], top_left[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                     (255, 0, 0), 2)
        first_track = True

    # make every drawn pixel opaque and leave the rest fully transparent
    bbox_array[:, :, 3] = (bbox_array.max(axis=2) > 0).astype(int) * 255

    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)

    # update bbox so the next frame gets the new overlay
    bbox_f = bbox_bytes

MobileNetV2 Embedder for Deep Sort initialised
- gpu enabled: True
- half precision: True
- max batch size: 16
- expects BGR: True
DeepSort Tracker initialised
- max age: 30
- appearance threshold: 0.2
- nms threshold: OFF
- max num of appearance features: 70
- overriding track class : No
- today given : No
- in-build embedder : Yes
- polygon detections : No


<IPython.core.display.Javascript object>