# Implementing centroid tracking with OpenCV

In [1]:
# import the necessary packages
from scipy.spatial import distance as dist
from collections import OrderedDict
import numpy as np
 
class CentroidTracker():
    """Track objects across frames by matching bounding-box centroids.

    Every tracked object gets a unique integer ID. On each call to
    ``update`` the centroids of the new bounding boxes are matched to
    the centroids of the objects tracked so far using Euclidean
    distance. Objects that stay unmatched for more than
    ``maxDisappeared`` consecutive frames are dropped.

    Attributes:
        nextObjectID: ID that will be given to the next registered object.
        objects: ``OrderedDict`` mapping object ID -> centroid ``(x, y)``.
        disappeared: ``OrderedDict`` mapping object ID -> number of
            consecutive frames the object has gone unmatched.
        maxDisappeared: number of consecutive unmatched frames an object
            may accumulate before it is deregistered.
    """

    def __init__(self, maxDisappeared=50):
        """Create an empty tracker.

        Args:
            maxDisappeared: maximum number of consecutive frames an
                object may be marked as "disappeared" before it is
                deregistered.
        """
        # initialize the next unique object ID along with two ordered
        # dictionaries used to keep track of mapping a given object
        # ID to its centroid and number of consecutive frames it has
        # been marked as "disappeared", respectively
        self.nextObjectID = 0
        self.objects = OrderedDict()
        self.disappeared = OrderedDict()

        # store the number of maximum consecutive frames a given
        # object is allowed to be marked as "disappeared" until we
        # need to deregister the object from tracking
        self.maxDisappeared = maxDisappeared

    def register(self, centroid):
        """Start tracking ``centroid`` under the next available object ID."""
        self.objects[self.nextObjectID] = centroid
        self.disappeared[self.nextObjectID] = 0
        self.nextObjectID += 1

    def deregister(self, objectID):
        """Stop tracking ``objectID``.

        Raises:
            KeyError: if ``objectID`` is not currently tracked.
        """
        # the ID has to be removed from both bookkeeping dictionaries
        del self.objects[objectID]
        del self.disappeared[objectID]

    def _markDisappeared(self, objectID):
        """Count one more missed frame for ``objectID``.

        The object is deregistered once its counter exceeds
        ``maxDisappeared``.
        """
        self.disappeared[objectID] += 1
        if self.disappeared[objectID] > self.maxDisappeared:
            self.deregister(objectID)

    def update(self, rects):
        """Update the tracker with the detections of the current frame.

        Args:
            rects: sequence of bounding boxes, each unpackable as
                ``(startX, startY, endX, endY)``. May be empty.

        Returns:
            The tracker's own ``objects`` dictionary (object ID ->
            centroid). It is the live internal mapping, not a copy.
        """
        # no detections at all: every tracked object missed this frame
        if len(rects) == 0:
            # iterate over a snapshot of the keys because
            # _markDisappeared() may delete entries while we loop
            for objectID in list(self.disappeared.keys()):
                self._markDisappeared(objectID)

            # return early as there are no centroids or tracking info
            # to update
            return self.objects

        # derive the integer centroid of every input bounding box
        inputCentroids = np.zeros((len(rects), 2), dtype="int")
        for (i, (startX, startY, endX, endY)) in enumerate(rects):
            cX = int((startX + endX) / 2.0)
            cY = int((startY + endY) / 2.0)
            inputCentroids[i] = (cX, cY)

        # if we are currently not tracking any objects there is nothing
        # to match against, so every input centroid is a new object
        if len(self.objects) == 0:
            for centroid in inputCentroids:
                self.register(centroid)
            return self.objects

        # otherwise, we are currently tracking objects so we need to
        # try to match the input centroids to existing object centroids
        objectIDs = list(self.objects.keys())
        objectCentroids = list(self.objects.values())

        # D[r, c] is the Euclidean distance between tracked object r
        # and input centroid c
        D = dist.cdist(np.array(objectCentroids), inputCentroids)

        # order the rows (tracked objects) by the distance to their
        # nearest input centroid, closest first
        rows = D.min(axis=1).argsort()

        # for each row, the column index of its nearest input centroid,
        # arranged in the same order as `rows`
        cols = D.argmin(axis=1)[rows]

        # keep track of which rows and columns have been matched
        usedRows = set()
        usedCols = set()

        for (row, col) in zip(rows, cols):
            # each object and each input centroid can take part in at
            # most one match
            if row in usedRows or col in usedCols:
                continue

            # update the centroid of the matched object and reset its
            # disappeared counter
            objectID = objectIDs[row]
            self.objects[objectID] = inputCentroids[col]
            self.disappeared[objectID] = 0

            usedRows.add(row)
            usedCols.add(col)

        # compute the row and column indexes that were NOT matched
        unusedRows = set(range(0, D.shape[0])).difference(usedRows)
        unusedCols = set(range(0, D.shape[1])).difference(usedCols)

        # Both leftovers are handled regardless of which side of D is
        # larger: two objects competing for the same nearest input leave
        # an unmatched object AND an unmatched input in the same frame.
        # Handling only one side would either drop a detection or keep a
        # lost object from ever ageing out. Iterating in sorted order
        # makes the assignment of new IDs deterministic.
        for row in sorted(unusedRows):
            self._markDisappeared(objectIDs[row])

        for col in sorted(unusedCols):
            self.register(inputCentroids[col])

        # return the set of trackable objects
        return self.objects

On Lines 2-4 we import our required packages and modules — distance , OrderedDict , and numpy .

Our ***CentroidTracker***  class is defined on Line 6. The constructor accepts a single parameter, the maximum number of consecutive frames a given object has to be lost/disappeared for until we remove it from our tracker (Line 7).

The ***constructor*** builds four instance variables:

* nextObjectID : A counter used to assign unique IDs to each object (Line 12). In the case that an object leaves the frame and does not come back for maxDisappeared  frames, a new (next) object ID would be assigned.
  
  
* objects : A dictionary that utilizes the object ID as the key and the centroid (x, y)-coordinates as the value (Line 13).
  
  
* disappeared : Maintains the number of consecutive frames (value) for which a particular object ID (key) has been marked as “lost” (Line 14).
  
  
* maxDisappeared : The number of consecutive frames an object is allowed to be marked as “lost/disappeared” until we deregister the object.

The ***register***  method is defined on Line 21. It is responsible for adding new objects to our tracker: it accepts a centroid  and then adds it to the objects  dictionary using the next available object ID.

The number of times an object has disappeared is initialized to 0  in the disappeared  dictionary (Line 25).

Finally, we increment the nextObjectID  so that if a new object comes into view, it will be associated with a unique ID (Line 26).

Just like we can add new objects to our tracker, we also need the ability to remove old ones that have been lost or have disappeared from the input frames.

The ***deregister***  method is defined on Line 28. It simply deletes the objectID  in both the objects  and disappeared  dictionaries, respectively (Lines 31 and 32).

The ***update*** method, defined on Line 34, accepts a list of bounding box rectangles, presumably from an object detector (Haar cascade, HOG + Linear SVM, SSD, Faster R-CNN, etc.). The rects  parameter is assumed to be a list in which each element is a tuple with this structure: (startX, startY, endX, endY) .

If there are no detections, we’ll loop over all object IDs and increment their disappeared  count (Lines 37-41). We’ll also check if we have reached the maximum number of consecutive frames a given object has been marked as missing. If that is the case we need to remove it from our tracking systems (Lines 46 and 47). Since there is no tracking info to update, we go ahead and return  early on Line 51.

Otherwise, we have quite a bit of work to do over the next seven code blocks in the update  method:

On Line 54 we’ll initialize a NumPy array to store the centroids for each rect .

Then, we loop over the bounding box rectangles (Line 57), compute the centroid of each one, and store it in the inputCentroids  array (Lines 59-61).

If there are currently no objects we are tracking, we’ll register each of the new objects:

Otherwise, we need to update any existing object (x, y)-coordinates based on the centroid location that minimizes the Euclidean distance between them:

The updates to existing tracked objects take place beginning at the else  on Line 72. The goal is to track the objects and to maintain correct object IDs — this process is accomplished by computing the Euclidean distances between all pairs of objectCentroids  and inputCentroids , followed by associating object IDs that minimize the Euclidean distance.

Inside of the else block beginning on Line 72, we will:

* Grab objectIDs  and objectCentroid  values (Lines 74 and 75).


* Compute the distance between each pair of existing object centroids and new input centroids (Line 81). The output NumPy array shape of our distance map D  will be (# of object centroids, # of input centroids) .


* To perform the matching we must (1) Find the smallest value in each row, and (2) Sort the row indexes based on the minimum values (Line 88). We then perform a related step for the columns: for each row we find the index of the column that holds that row’s smallest value, and we order those column indexes using the sorted row indexes (Line 93). Our goal is to have the index values with the smallest corresponding distance at the front of the lists.


Then

* Initialize two sets to determine which row and column indexes we have already used (Lines 98 and 99). Keep in mind that a set is similar to a list but it contains only unique values.


* Then we loop over the combinations of (row, col)  index tuples (Line 103) in order to update our object centroids:

    * If we’ve already used either this row or column index, ignore it and continue  to loop (Lines 107 and 108).
    
    * Otherwise, we have found an input centroid that:
        * Has the smallest Euclidean distance to an existing centroid
        * And has not been matched with any other object
        * In that case, we update the object centroid (Lines 113-115) and make sure to add the row  and col  to their respective usedRows  and usedCols  sets
        
        
There are likely row and column indexes that we have NOT examined yet, i.e. indexes that are missing from our usedRows  and usedCols  sets:

So we must determine which centroid indexes we haven’t examined yet and store them in two new convenient sets ( unusedRows  and unusedCols ) on Lines 124 and 125.

Our final check handles any objects that have become lost or if they’ve potentially disappeared:


To finish up:

* If the number of object centroids is greater than or equal to the number of input centroids (Line 131):
    * We need to verify if any of these objects are lost or have disappeared by looping over unused row indexes if any (Line 133).
    * In the loop, we will:
        * Increment their disappeared  count in the dictionary (Line 137).
        * Check if the disappeared  count exceeds the maxDisappeared  threshold (Line 142), and, if so we’ll deregister the object (Line 143).
        
Otherwise, the number of input centroids is greater than the number of existing object centroids, so we have new objects to register and track:

We loop over the unusedCols  indexes (Line 149) and we register each new centroid (Line 150). Finally, we’ll return the set of trackable objects to the calling method (Line 153).

# Understanding the centroid tracking distance relationship

Our centroid tracking implementation was quite long, and admittedly, the most confusing aspect of the algorithm is Lines 81-93.

If you’re having trouble following along with what that code is doing you should consider opening a Python shell and performing the following experiment:

In [2]:
from scipy.spatial import distance as dist
import numpy as np
# Small experiment showing the distance matrix that the matching step of
# the tracker works on.
# seed the global NumPy RNG so the random centroids below are reproducible
np.random.seed(42)
# two "existing" object centroids, one (x, y) pair per row
objectCentroids = np.random.uniform(size=(2, 2))
# three "new" input centroids, one (x, y) pair per row
centroids = np.random.uniform(size=(3, 2))
# D[r, c] is the distance between object centroid r and input centroid c,
# so D has shape (2, 3)
D = dist.cdist(objectCentroids, centroids) # Euclidean distance between the pairs
# bare expression: displays the matrix in an interactive session
D

array([[0.82421549, 0.32755369, 0.33198071],
       [0.72642889, 0.72506609, 0.17058938]])

# Implementing the object tracking driver script

Now that we have implemented our CentroidTracker  class, let’s put it to work with an object tracking driver script.

The driver script is where you can use your own preferred object detector, provided that it produces a set of bounding boxes. This could be a Haar Cascade, HOG + Linear SVM, YOLO, SSD, Faster R-CNN, etc. For this example script, I’m making use of OpenCV’s deep learning face detector, but feel free to make your own version of the script which implements a different detector.

Inside this script, we will:

* Work with a live VideoStream  object to grab frames from your webcam


* Load and utilize OpenCV’s deep learning face detector


* Instantiate our CentroidTracker  and use it to track face objects in the video stream


* And display our results which includes bounding boxes and object ID annotations overlaid on the frames

In [None]:
# import the necessary packages
from tracker.centroidtracker import CentroidTracker
from imutils.video import VideoStream
import numpy as np
import argparse
import imutils
import time
import cv2
 
# Driver script: detect faces in a live webcam stream with OpenCV's DNN
# module, feed the bounding boxes to the CentroidTracker and draw the
# boxes, centroids and object IDs on every frame.

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--prototxt", required=True,
    help="path to Caffe 'deploy' prototxt file")
ap.add_argument("-m", "--model", required=True,
    help="path to Caffe pre-trained model")
ap.add_argument("-c", "--confidence", type=float, default=0.5,
    help="minimum probability to filter weak detections")
args = vars(ap.parse_args())

# initialize our centroid tracker and frame dimensions
ct = CentroidTracker()
(H, W) = (None, None)

# load our serialized model from disk
print("[INFO] loading model...")
net = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])

# initialize the video stream and allow the camera sensor to warmup
print("[INFO] starting video stream...")
vs = VideoStream(src=0).start()
time.sleep(2.0)

# the try/finally guarantees that the windows are closed and the camera
# is released even if the loop is left through an exception or Ctrl+C
try:
    # loop over the frames from the video stream
    while True:
        # read the next frame from the video stream
        frame = vs.read()

        # the stream may hand back None when no frame could be grabbed
        # (e.g. the camera is unavailable); resizing None would raise,
        # so stop the loop instead
        if frame is None:
            break

        # resize the frame to a fixed width
        frame = imutils.resize(frame, width=400)

        # if the frame dimensions are None, grab them
        if W is None or H is None:
            (H, W) = frame.shape[:2]

        # construct a blob from the frame, pass it through the network,
        # obtain our output predictions, and initialize the list of
        # bounding box rectangles
        blob = cv2.dnn.blobFromImage(frame, 1.0, (W, H),
            (104.0, 177.0, 123.0))
        net.setInput(blob)
        detections = net.forward()
        rects = []

        # loop over the detections
        for i in range(0, detections.shape[2]):
            # filter out weak detections by ensuring the predicted
            # probability is greater than a minimum threshold
            if detections[0, 0, i, 2] > args["confidence"]:
                # scale the box coordinates by the frame size and
                # convert them to integers once, then record the box
                # for the tracker
                box = detections[0, 0, i, 3:7] * np.array([W, H, W, H])
                box = box.astype("int")
                rects.append(box)

                # draw a bounding box surrounding the object so we can
                # visualize it
                (startX, startY, endX, endY) = box
                cv2.rectangle(frame, (startX, startY), (endX, endY),
                    (0, 255, 0), 2)

        # update our centroid tracker using the computed set of bounding
        # box rectangles
        objects = ct.update(rects)

        # loop over the tracked objects
        for (objectID, centroid) in objects.items():
            # draw both the ID of the object and the centroid of the
            # object on the output frame
            text = "ID {}".format(objectID)
            cv2.putText(frame, text, (centroid[0] - 10, centroid[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.circle(frame, (centroid[0], centroid[1]), 4, (0, 255, 0), -1)

        # show the output frame
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
finally:
    # do a bit of cleanup
    cv2.destroyAllWindows()
    vs.stop()

First, we specify our imports. Most notably we’re using the CentroidTracker  class that we just reviewed. We’re also going to use VideoStream  from imutils  and OpenCV.

We have three command line arguments which are all related to our deep learning face detector:

* --prototxt : The path to the Caffe “deploy” prototxt.


* --model : The path to the pre-trained Caffe model.


* --confidence : Our probability threshold to filter weak detections. I found that a default value of 0.5  is sufficient.

The prototxt and model files come from OpenCV’s repository.

***Note:*** In case you missed it at the start of this section, I’ll repeat that you can use any detector you wish. As an example, we’re using a deep learning face detector which produces bounding boxes. Feel free to experiment with other detectors, just be sure that you have capable hardware to keep up with the more complex ones (some may run best with a GPU, but this face detector can easily run on a CPU).

Next, let’s perform our initializations:

* Instantiate our CentroidTracker , ct  (Line 21). Recall from the explanation in the previous section that this object has three methods: (1) register , (2) deregister , and (3) update . We’re only going to use the update  method as it will register and deregister objects automatically. We also initialize H  and W  (our frame dimensions) to None  (Line 22).


* Load our serialized deep learning face detector model from disk using OpenCV’s DNN module (Line 26).


* Start our VideoStream , vs  (Line 30). With vs  handy, we’ll be able to capture frames from our camera in our next while  loop. We’ll allow our camera 2.0  seconds to warm up (Line 31).

Now let’s begin our while  loop and start tracking face objects:

We loop over frames and resize  them to a fixed width (while preserving aspect ratio) on Lines 34-47. Our frame dimensions are grabbed as needed (Lines 40 and 41).

Then we pass the frame through the CNN object detector to obtain predictions and object locations (Lines 46-49).

We initialize a list of rects , our bounding box rectangles on Line 50.

From there, let’s process the detections:

We loop over the detections beginning on Line 53. If the detection exceeds our confidence threshold, indicating a valid detection, we:

* Compute the bounding box coordinates and append them to the rects  list (Lines 59 and 60)


* Draw a bounding box around the object (Lines 64-66)


Finally, let’s call update  on our centroid tracker object, ct :

The ct.update  call on Line 70 handles the heavy lifting in our simple object tracker with Python and OpenCV script.

We would be done here and ready to loop back to the top if we didn’t care about visualization.

But that’s no fun!

On Lines 73-79 we display the centroid as a filled in circle and the unique object ID number text. Now we’ll be able to visualize the results and check to see if our CentroidTracker  properly keeps track of our objects by associating the correct IDs with the objects in the video stream.

We’ll display the frame on Line 82 until the quit key (“q”) has been pressed (Lines 83-87). If the quit key is pressed, we simply  break  and perform cleanup (Lines 87-91).