In [1]:
import cv2
import numpy as np 

In [7]:
# Locations of the Caffe network description and of its trained weights.
prototxt = './MobileNetSSD_deploy.prototxt'
caffemodel = './MobileNetSSD_deploy.caffemodel'

# Build the OpenCV DNN network object from the two files above.
net = cv2.dnn.readNetFromCaffe(prototxt, caffemodel)


In [10]:
# Labels of the network, keyed by integer class id:
# id 0 is the background class, ids 1-20 are the 20 object classes.
class_names = dict(enumerate((
    'background',
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
)))

In [20]:
def _to_pixel(fraction, size):
    """Convert a fractional coordinate to a pixel index clamped to [0, size - 1]."""
    return min(max(int(fraction * size), 0), max(size - 1, 0))


def processes_detection(image, detections, confidence_threshold=0.4):
    """Draw a bounding box and a text label on `image` for each confident detection.

    Parameters
    ----------
    image : array-like
        Frame to annotate. ``image.shape[0]`` is used as the height and
        ``image.shape[1]`` as the width. The cv2 drawing calls modify it in place.
    detections : array-like
        Indexed as ``detections[0, 0, i, :]``; row ``i`` is read as
        ``[_, class_id, confidence, x1, y1, x2, y2]`` where the four
        coordinates are treated as fractions of the image width/height.
    confidence_threshold : float, optional
        Only detections whose confidence is strictly greater than this value
        are drawn. Defaults to 0.4.

    Returns
    -------
    None

    Notes
    -----
    Labels are looked up in the module-level ``class_names`` mapping; a
    detection whose class id is not in that mapping gets a box but no label.
    """
    box_color = (0, 255, 0)
    text_color = (0, 0, 0)
    box_thickness = 2
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    # A single thickness is used both to measure and to draw the text so the
    # filled background sized by getTextSize matches what putText renders.
    text_thickness = 2

    # Loop-invariant: the frame size does not change between detections.
    height, width = image.shape[:2]

    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        # Written as "not >" so a NaN confidence is skipped, like any low score.
        if not confidence > confidence_threshold:
            continue

        class_id = int(detections[0, 0, i, 1])

        # Scale the fractional corners straight to frame pixels (one rounding
        # step only) and clamp them so boxes and labels stay inside the frame.
        left = _to_pixel(detections[0, 0, i, 3], width)
        top = _to_pixel(detections[0, 0, i, 4], height)
        right = _to_pixel(detections[0, 0, i, 5], width)
        bottom = _to_pixel(detections[0, 0, i, 6], height)

        cv2.rectangle(image, (left, top), (right, bottom), box_color, box_thickness)

        if class_id in class_names:
            label = class_names[class_id] + ": " + str(confidence)
            (label_w, label_h), _ = cv2.getTextSize(label, font, font_scale, text_thickness)
            # Keep the label fully visible when the box touches the top edge.
            label_y = max(top, label_h)
            cv2.rectangle(image, (left, label_y - label_h),
                          (left + label_w, label_y), box_color, cv2.FILLED)
            cv2.putText(image, label, (left, label_y), font, font_scale,
                        text_color, text_thickness)
    

In [24]:
# Run detection on frames from the default camera until 'q' is pressed.
capture = cv2.VideoCapture(0)
try:
    if not capture.isOpened():
        raise RuntimeError("Could not open video capture device 0")

    while True:
        ret, frame = capture.read()
        if not ret:
            # No frame was delivered; stop instead of spinning forever.
            break

        # Create the blob with a size of (300,300), mean subtraction values (127.5, 127.5, 127.5)
        # and also a scalefactor of 0.007843:
        blob = cv2.dnn.blobFromImage(frame, 0.007843, (300, 300), (127.5, 127.5, 127.5))
        # Feed the input blob to the network, perform inference and get the output:
        net.setInput(blob)
        detections = net.forward()
        # Draws boxes and labels directly onto `frame`.
        processes_detection(frame, detections)

        cv2.imshow('webcam', frame)
        if cv2.waitKey(20) & 0xFF == ord('q'):
            # Keep a copy of the last annotated frame in `image`.
            image = frame.copy()
            break
finally:
    # Always free the camera and close the window, even on error or interrupt.
    capture.release()
    cv2.destroyAllWindows()
    # NOTE(review): a short waitKey lets the GUI process the close request;
    # some HighGUI backends appear to need this -- confirm on the target platform.
    cv2.waitKey(1)

