### OpenCV + Caffe Face Detection Video ###

In [3]:
# ref: https://www.pyimagesearch.com/2018/02/26/face-detection-with-opencv-and-deep-learning/

In [1]:
import numpy as np
import time
import cv2
import matplotlib.pyplot as plt

### Setup Functions ###

In [2]:
def caffe_config_net(proto, model):
    """Build a network from a pair of Caffe files via OpenCV's dnn module.

    Parameters
    ----------
    proto : path of the network definition file handed to
        cv2.dnn.readNetFromCaffe as its first argument.
    model : path of the trained weights file handed to
        cv2.dnn.readNetFromCaffe as its second argument.

    Returns
    -------
    The object produced by cv2.dnn.readNetFromCaffe(proto, model).
    """
    return cv2.dnn.readNetFromCaffe(proto, model)

In [3]:
def caffe_processing(img, net, input_size=(300, 300), mean=(104.0, 177.0, 123.0)):
    """Run one forward pass of `net` on `img` and time it.

    Parameters
    ----------
    img : image/frame array accepted by cv2.resize.
    net : network object exposing setInput(blob) and forward().
    input_size : (width, height) the image is resized to before being
        turned into a blob. Defaults to the 300x300 used originally.
    mean : per-channel values passed to cv2.dnn.blobFromImage as its mean
        argument. Defaults to the values used originally.

    Returns
    -------
    detections : whatever net.forward() returns for this input.
    process_time : elapsed seconds (float) spent on blob creation plus the
        forward pass.
    """
    # perf_counter is monotonic and high-resolution; time.time() is wall-clock
    # and can be too coarse for timing a single inference.
    start = time.perf_counter()

    # Modify img to input format (scale factor 1.0, mean subtraction):
    resized = cv2.resize(img, input_size)
    blob = cv2.dnn.blobFromImage(resized, 1.0, input_size, mean)

    # Pass to net:
    net.setInput(blob)
    detections = net.forward()

    process_time = time.perf_counter() - start

    return detections, process_time

In [4]:
def caffe_detect(img, net, des_threshold):
    """Detect faces in `img`, draw them on it, and return their boxes.

    The array returned by `caffe_processing` is read as
    detections[0, 0, i, :], where element 2 is the confidence of detection
    `i` and elements 3..6 are the box corners (startX, startY, endX, endY)
    expressed as fractions of the image width/height.

    Parameters
    ----------
    img : image array whose first two dimensions are (height, width).
        Boxes and labels are drawn onto this same array.
    net : network forwarded unchanged to `caffe_processing`.
    des_threshold : minimum confidence for a detection to be kept.

    Returns
    -------
    img : the array that was passed in, now annotated.
    locations : float array of shape (k, 5) with one row
        [face_id, startX, startY, endX, endY] per kept detection, where
        face_id is the 1-based index of the detection in the network
        output. Shape is (0, 5) when nothing passes the threshold.
    process_time : seconds reported by `caffe_processing`.
    """
    (H, W) = img.shape[:2]

    detections, process_time = caffe_processing(img, net)

    # Fractions -> pixels, and the largest valid pixel index on each axis.
    scale = np.array([W, H, W, H])
    upper = np.array([W - 1, H - 1, W - 1, H - 1])

    # Collect rows in a list and build the array once, instead of growing
    # an array with vstack on every hit.
    rows = []

    for i in range(detections.shape[2]):
        # extract the confidence (i.e., probability) associated with the prediction
        confidence = detections[0, 0, i, 2]
        # filter out weak detections by ensuring the `confidence` is
        # greater than the minimum confidence
        if confidence >= des_threshold:
            # compute the (x, y)-coordinates of the bounding box, clipped to
            # the frame so callers never receive negative or out-of-range
            # pixel coordinates
            box = np.clip(detections[0, 0, i, 3:7] * scale, 0, upper)
            # plain Python ints for the drawing calls and the result rows
            startX, startY, endX, endY = (int(v) for v in box)

            rows.append([i + 1, startX, startY, endX, endY])

            # draw the bounding box of the face along with the associated probability
            text = "Face{}: {:.2f}%".format((i+1), confidence * 100)
            # keep the label inside the frame when the box touches the top edge
            y = startY - 10 if startY - 10 > 10 else startY + 10
            cv2.rectangle(img, (startX, startY), (endX, endY), (0, 0, 255), 2)
            cv2.putText(img, text, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 255), 1)

    # reshape keeps the result two-dimensional even when `rows` is empty
    locations = np.array(rows, dtype=float).reshape(-1, 5)

    return img, locations, process_time

### Face Detection on Video: ###

In [39]:
## Fast Caffe files from: https://github.com/vinuvish/Face-detection-with-OpenCV-and-deep-learning/tree/master/models

In [5]:
# Load the Caffe network from its definition file and its weights file
# (both paths are passed to caffe_config_net, which calls cv2.dnn.readNetFromCaffe):
caffe_proto = 'deploy.prototxt.txt'
caffe_model = 'res10_300x300_ssd_iter_140000.caffemodel'
net = caffe_config_net(caffe_proto, caffe_model)

In [6]:
# Define video file (this path is handed to cv2.VideoCapture in the playback cell):
video = 'people.mp4'

# Define threshold (caffe_detect keeps only detections with confidence >= this value):
des_threshold = 0.4

In [8]:
# Load Video:
cap = cv2.VideoCapture(video)
video_fps = cap.get(cv2.CAP_PROP_FPS)

# Target duration of one frame. 0.0 disables throttling when the source
# reports no usable frame rate, which also avoids dividing by zero.
frame_period = 1.0 / video_fps if video_fps > 0 else 0.0

# Defined up front so the summary prints at the end still work when no
# frame is ever processed (e.g. the video could not be read).
process_time = None
locations = np.empty((0, 5))

try:
    while True:

        # perf_counter: monotonic, high-resolution clock for interval timing
        start = time.perf_counter()

        success, frame = cap.read()

        if not success:
            print('Server OFF')
            break

        frame = cv2.resize(frame, (600,500))

        # Keyboard Controls:
        # waitKey also services the GUI event loop; mask to the low byte so
        # the result can be compared with ord(). "no key" (-1) becomes 255.
        key = cv2.waitKey(1) & 0xFF

        if key == ord('k'):
            break

        #######################################

        frame, locations, process_time = caffe_detect(frame, net, des_threshold)

        #######################################

        # small epsilon keeps 1/frame_time finite
        frame_time = (time.perf_counter() - start) + 0.0001

        if frame_time < frame_period:
            # Running faster than the source: wait only for the REMAINDER of
            # the frame period so playback matches the video's own rate.
            time.sleep(frame_period - frame_time)
            fps = video_fps
        else:
            fps = np.floor(1/frame_time)

        cv2.putText(frame, "FPS: {}".format(fps), (30,20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        cv2.imshow("Output", frame)
finally:
    # Always free the capture and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

print('Caffe Process time:', process_time)
print('Last locations:', locations)

Caffe Process time: 0.06248283386230469
Last locations: [[  1. 241.  91. 288. 177.]
 [  2. 101. 104. 144. 184.]
 [  3. 431. 100. 488. 187.]]
