<a href="https://colab.research.google.com/github/HollowMike8/object-tracking-dlib/blob/main/single_object_tracking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run git without arguments (prints its usage text), then create an empty
# repository in the current working directory.
# NOTE(review): the `git clone` in the next cell does not appear to depend on
# this `git init` -- confirm before relying on or removing it.
!git
!git init

In [2]:
!git clone https://github.com/HollowMike8/object-tracking-dlib.git

Cloning into 'object-tracking-dlib'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 85 (delta 42), reused 32 (delta 8), pack-reused 0[K
Unpacking objects: 100% (85/85), done.


In [3]:
!pip install --upgrade imutils



In [4]:
%cd object-tracking-dlib/

/content/object-tracking-dlib


In [5]:
import os
import cv2
import dlib
import imutils
import datetime
import numpy as np
from google.colab.patches import cv2_imshow

In [6]:
import sys

# absolute location of the cloned repository
path_dir: str = r"/content/object-tracking-dlib"

# put the repository first on the module search path so that the two
# imports below can be resolved from it
sys.path[:0] = [path_dir]

import single_object_config as soc
from centroidtracker import CentroidTracker

In [27]:
# initiate the count of frames processed so far
totalFrames = 0

# refresh rate for object detection (the detector runs every N frames)
refresh_rate = 60

# initialize the CentroidTracker; no objects are known yet
objects = None
ct = CentroidTracker(maxDisappeared=40, maxDistance=40)

# the video writer and the dlib tracker are created later, inside the frame loop
writer = tracker = None

# open the input video (race.mp4)
vs = cv2.VideoCapture(os.path.join(soc.input_dir, "race.mp4"))

# **mobilenet_ssd**

In [None]:
# class names mobilenet_ssd was trained on; the list position is the class
# index used to look up a detection's label
CLASSES = (
    "background aeroplane bicycle bird boat bottle bus car cat chair cow "
    "diningtable dog horse motorbike person pottedplant sheep sofa train "
    "tvmonitor"
).split()

In [None]:
# resolve the two mobilenet_ssd caffe files (network definition, then weights)
prototxt_path, model_path = (
    os.path.join(soc.cnn_caffe_dir, file_name)
    for file_name in ("MobileNetSSD_deploy.prototxt",
                      "MobileNetSSD_deploy.caffemodel")
)

# build the OpenCV dnn network from them
net = cv2.dnn.readNetFromCaffe(prototxt_path, model_path)

In [None]:
# Single-object tracking with mobilenet_ssd + dlib.
#
# Every `refresh_rate` frames the detector is run and, when a detection of
# class `soc.label` passes `soc.thres_confidence`, the dlib correlation tracker
# is (re)started on its box. On all other frames the tracker alone follows the
# object. Annotated frames are written to race_dlib.avi and displayed inline.
#
# Relies on vs, writer, tracker, ct, objects, refresh_rate and totalFrames from
# the initialization cell, and on net / CLASSES from the mobilenet_ssd cells.
start_time = datetime.datetime.now()

# initialize list to capture the bounding box coordinates
rects = []

try:
  # loop over the frames in the input video
  while True:
    (grab, frame) = vs.read()

    # to break out of loop after the end of video
    if not grab:
      break

    # resize the frame, then convert from BGR to RGB for dlib tracker
    frame = imutils.resize(frame, width=600)
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # writing the video
    if writer is None:
      fourcc = cv2.VideoWriter_fourcc(*"MJPG")
      writer = cv2.VideoWriter(os.path.join(soc.output_dir, "race_dlib.avi"),
                               fourcc, 30, (frame.shape[1], frame.shape[0]), True)

    # object detection (every N frames, and on every frame for as long as no
    # tracker exists yet -- otherwise a rejected first detection would leave
    # `tracker` as None and crash the tracking branch below)
    if totalFrames % refresh_rate == 0 or tracker is None:
      (h, w) = frame.shape[:2]
      blob = cv2.dnn.blobFromImage(frame, 0.007843, (w, h), 127.5)
      net.setInput(blob)
      detections = net.forward()

      # detections are indexed as [0, 0, i, field] below, so the number of
      # detections is the size of axis 2 (len(detections) is only axis 0)
      has_detections = detections.shape[2] > 0

      # find the index of the detection with the largest confidence (single obj)
      if has_detections and objects is None:
        i = np.argmax(detections[0, 0, :, 2])
        conf = detections[0, 0, i, 2]
        label = CLASSES[int(detections[0, 0, i, 1])]

      # find the index of the previously existing single obj detection
      elif has_detections:
        for i in range(0, detections.shape[2]):
          conf = detections[0, 0, i, 2]
          label = CLASSES[int(detections[0, 0, i, 1])]

          temp = detections[0, 0, i, 3:7]*np.array([w, h, w, h])
          (startX, startY, endX, endY) = temp.astype("int")

          if label == soc.label:
            rects = []
            objects_old = objects.copy()
            rects.append((startX, startY, endX, endY))
            objects = ct.update(rects)

            # check the new rect is already detected/tracked single obj:
            # an unchanged entry for ID 0 means this rect was not taken as an
            # update of it, so keep looking; otherwise stop at this detection
            # NOTE(review): assumes the tracked object keeps ID 0 in `objects`
            # -- confirm against CentroidTracker (a missing key raises KeyError)
            if (objects_old[0] == objects[0]).all():
              continue
            else:
              break

          elif label == 'background':
            label = soc.label

      # `has_detections` guards against reusing a stale i/conf/label when the
      # network returned no detections for this frame
      if has_detections and conf > soc.thres_confidence and label == soc.label:
        # compute the bounding box coordinates
        box = detections[0, 0, i, 3:7]*np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")

        # construct the dlib correlation tracker using bounding box coordinates
        tracker = dlib.correlation_tracker()
        rect = dlib.rectangle(startX, startY, endX, endY)
        tracker.start_track(img, rect)

        # draw the bounding box rectangle and label in the frame
        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
        cv2.putText(frame, label, (startX, startY-15), cv2.FONT_HERSHEY_SIMPLEX,
                    0.45, (0, 255, 0), 2)

        # empty the rect list and update the centroid/centroids
        rects = []
        rects.append((startX, startY, endX, endY))
        objects = ct.update(rects)

    # object tracking (only reached once a tracker has been started)
    else:
      tracker.update(img)
      pos = tracker.get_position()

      # unpack the position object
      startX = int(pos.left())
      startY = int(pos.top())
      endX = int(pos.right())
      endY = int(pos.bottom())

      # draw the bounding box rectangle and label in the frame
      cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
      cv2.putText(frame, label, (startX, startY-15), cv2.FONT_HERSHEY_SIMPLEX,
                  0.45, (0, 255, 0), 2)

      # empty the rect list and update the centroid/centroids
      rects = []
      rects.append((startX, startY, endX, endY))
      objects = ct.update(rects)

    # write the sketched frame
    if writer is not None:
      writer.write(frame)

    # show the output frame
    cv2_imshow(frame)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
      break

    # update the totalFrames processed
    totalFrames += 1

  end_time = datetime.datetime.now()
  elapsed_time = (end_time-start_time).total_seconds()
  print("Elapsed time: {:.2f}".format(elapsed_time))
  print("Approx. FPS: {:.2f}".format(totalFrames/elapsed_time))

finally:
  # release the writer and the capture even when a frame raised an error, so
  # the output file is finalized and the input video is not left open
  if writer is not None:
    writer.release()

  # do a bit of cleanup
  cv2.destroyAllWindows()
  vs.release()

# **yolo-coco**

In [None]:
# download the files if required

# change the dir to location where downloads save
%cd /content/object-tracking-dlib/yolo-coco/

!wget "https://pjreddie.com/media/files/yolov3.weights"

!wget "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"

# !wget "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"

#change the dir back to root dir
%cd /content/object-tracking-dlib/

In [9]:
# load the COCO class labels for YOLO model (one label per line of coco.names)
labels_path = os.path.join(path_dir, "yolo-coco", "coco.names")
# `with` closes the file handle as soon as the labels have been read
with open(labels_path, "r") as labels_file:
  labels = labels_file.read().strip().split("\n")

# assign random colours to all COCO class labels (fixed seed, so the same
# colours are produced on every run)
np.random.seed(42)
colors = np.random.randint(0, 255, size=(len(labels), 3), dtype = "uint8")

In [10]:
# resolve the yolov3 network definition and weights inside <repo>/yolo-coco
configPath, weightsPath = (
    os.path.join(path_dir, "yolo-coco", file_name)
    for file_name in ("yolov3.cfg", "yolov3.weights")
)

# build the OpenCV dnn network; this rebinds `net` to the yolov3 model
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)

In [None]:
# Single-object tracking with yolov3 + dlib.
#
# Every `refresh_rate` frames yolov3 is run; among the "soc.label" detections
# kept by non-max suppression one box is chosen and the dlib correlation
# tracker is (re)started on it. On all other frames the tracker alone follows
# the object. Annotated frames are written to race_dlib_yolo.avi.

# (Re)initialize the shared capture / writer / tracker state. The mobilenet_ssd
# cell reads the video to its end and releases both `vs` and `writer`, so
# without this reset the loop below would end on its first read and no output
# video would be created when the notebook is run top to bottom.
vs = cv2.VideoCapture(os.path.join(soc.input_dir, "race.mp4"))
tracker = None
writer = None
ct = CentroidTracker(maxDisappeared=40, maxDistance=40)
objects = None
totalFrames = 0

# initialize list to capture the bounding box coordinates
rects = []

# names of the *output* layers we need from yolo; they do not change between
# frames, so they are looked up once (also avoids indexing the result of
# getUnconnectedOutLayers(), whose shape differs between OpenCV versions)
layer_names = net.getUnconnectedOutLayersNames()

try:
  # loop over the frames in the input video
  while True:
    (grab, frame) = vs.read()

    # to break out of loop after the end of video
    if not grab:
      break

    # resize the frame, then convert from BGR to RGB for dlib tracker
    frame = imutils.resize(frame, width=600)
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # writing the video
    if writer is None:
      fourcc = cv2.VideoWriter_fourcc(*"MJPG")
      writer = cv2.VideoWriter(os.path.join(soc.output_dir, "race_dlib_yolo.avi"),
                               fourcc, 30, (frame.shape[1], frame.shape[0]), True)

    # object detection (every N frames, and on every frame for as long as no
    # tracker exists yet, so a first frame without a usable detection does not
    # leave the tracking branch with `tracker` set to None)
    if totalFrames % refresh_rate == 0 or tracker is None:
      (h, w) = frame.shape[:2]

      # corner-form boxes (for drawing/tracking), [x, y, w, h] boxes (the form
      # cv2.dnn.NMSBoxes expects), confidences and class IDs
      boxes = []
      nms_boxes = []
      confidences = []
      classIDs = []

      # construct a blob from the input image and then perform a forward pass
      blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True,
                                   crop=False)
      net.setInput(blob)
      layerOutputs = net.forward(layer_names)

      # loop over each layer of the outputs
      for output in layerOutputs:
        # loop over the detections in each output
        for detection in output:
          scores = detection[5:]
          classID = np.argmax(scores)
          confidence = scores[classID]

          # consider only predictions with confidence > threshold
          if confidence > soc.yolo_thres_confidence:
            # scale the bounding box parameters
            box = detection[0:4] * np.array([w, h, w, h])
            (centerX, centerY, width, height) = box.astype("int")

            # corner points of this candidate; kept in their own names so that
            # startX..endY still hold the last tracked box if no person is found
            boxStartX = int(centerX - (width/2))
            boxStartY = int(centerY - (height/2))
            boxEndX = int(centerX + (width/2))
            boxEndY = int(centerY + (height/2))

            boxes.append([boxStartX, boxStartY, boxEndX, boxEndY])
            nms_boxes.append([boxStartX, boxStartY, int(width), int(height)])
            confidences.append(float(confidence))
            classIDs.append(classID)

      # apply non-max suppression with threshold IoU= 0.3 and
      # threshold confidence=soc.yolo_thres_confidence
      idxs = cv2.dnn.NMSBoxes(nms_boxes, confidences, soc.yolo_thres_confidence,
                              0.3)

      # capture all detections with the class label soc.label, keyed by
      # confidence; reset on every detection step so that results of an
      # earlier step are never reused
      person_list = {}
      if len(idxs) > 0:
        person_list = {confidences[i]:boxes[i]
                       for i in np.array(idxs).flatten()
                       if labels[classIDs[i]]==soc.label}

      # once an object is known, startX..endY hold its last box and can be used
      # to re-initiate the tracker even when this detection step finds nothing
      have_box = objects is not None

      # find the detection with the largest confidence (single obj)
      if person_list and objects is None:
        label = soc.label
        max_conf = max(person_list.keys())
        (startX, startY, endX, endY) = person_list[max_conf]
        have_box = True

      # find the previously existing single obj among the detections
      elif person_list:
        label = soc.label
        rects_old = rects

        for conf in person_list:
          (startX, startY, endX, endY) = person_list[conf]

          rects = []
          objects_old = objects.copy()
          rects.append((startX, startY, endX, endY))
          objects = ct.update(rects)

          # check the new rect is already detected/tracked single obj:
          # an unchanged entry for ID 0 means this rect was not taken as an
          # update of it, so fall back to the previous box and keep looking
          # NOTE(review): assumes the tracked object keeps ID 0 in `objects`
          # -- confirm against CentroidTracker (a missing key raises KeyError)
          if (objects_old[0] == objects[0]).all():
            (startX, startY, endX, endY) = rects_old[0]
            continue
          else:
            break

      if have_box:
        # construct the dlib correlation tracker using bounding box coordinates
        tracker = dlib.correlation_tracker()
        rect = dlib.rectangle(startX, startY, endX, endY)
        tracker.start_track(img, rect)

        # draw the bounding box rectangle and label in the frame
        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
        cv2.putText(frame, label, (startX, startY-15), cv2.FONT_HERSHEY_SIMPLEX,
                    0.45, (0, 255, 0), 2)

        # empty the rect list and update the centroid/centroids
        rects = []
        rects.append((startX, startY, endX, endY))
        objects = ct.update(rects)

    # object tracking (only reached once a tracker has been started)
    else:
      tracker.update(img)
      pos = tracker.get_position()

      # unpack the position object
      startX = int(pos.left())
      startY = int(pos.top())
      endX = int(pos.right())
      endY = int(pos.bottom())

      # draw the bounding box rectangle and label in the frame
      cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
      cv2.putText(frame, label, (startX, startY-15), cv2.FONT_HERSHEY_SIMPLEX,
                  0.45, (0, 255, 0), 2)

      # empty the rect list and update the centroid/centroids
      rects = []
      rects.append((startX, startY, endX, endY))
      objects = ct.update(rects)

    # write the sketched frame
    if writer is not None:
      writer.write(frame)

    # show the output frame
    cv2_imshow(frame)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
      break

    # update the totalFrames processed
    totalFrames += 1

finally:
  # release the writer and the capture even when a frame raised an error, so
  # the output file is finalized and the input video is not left open
  if writer is not None:
    writer.release()

  # do a bit of cleanup
  cv2.destroyAllWindows()
  vs.release()

# **Notes**
1. Object detection is performed once in every 60 frames

2. Object detections (except the initial one) use the additional criterion of checking whether the new detection is close to the previous bounding box (from tracking)

3. mobilenet_ssd fails in some detection steps (for refresh_rate = 30 frames) due to occlusion

4. Unsuccessful detection steps are skipped and tracking continues as before
  1. The tracker is re-initiated, but with the last successful bounding box

5. yolov3 fails in some detection steps (for refresh_rate = 30 frames) possibly due to occlusion