Use TensorFlow Hub’s SSD MobileNet V2 model to perform object detection on a live webcam feed with OpenCV, drawing bounding boxes and class labels on each frame.

In [32]:
import tensorflow as tf
import tensorflow_hub as hub
import cv2
import numpy as np

In [33]:
# SSD MobileNet V2 object detector, fetched from TensorFlow Hub.
# This model is trained on the COCO dataset.
MODEL_HANDLE = "https://tfhub.dev/tensorflow/ssd_mobilenet_v2/2"
detector = hub.load(MODEL_HANDLE)

In [34]:
# COCO label map, indexed by class id (ids 0-90, i.e. 91 entries).
# Ids with no name in this map are filled with the "???" placeholder so that
# list positions stay aligned with the class ids used for lookup.
_coco_names = {
    1: "person", 2: "bicycle", 3: "car", 4: "motorcycle", 5: "airplane",
    6: "bus", 7: "train", 8: "truck", 9: "boat", 10: "traffic light",
    11: "fire hydrant", 13: "stop sign", 14: "parking meter", 15: "bench",
    16: "bird", 17: "cat", 18: "dog", 19: "horse", 20: "sheep", 21: "cow",
    22: "elephant", 23: "bear", 24: "zebra", 25: "giraffe", 27: "backpack",
    28: "umbrella", 31: "handbag", 32: "tie", 33: "suitcase", 34: "frisbee",
    35: "skis", 36: "snowboard", 37: "sports ball", 38: "kite",
    39: "baseball bat", 40: "baseball glove", 41: "skateboard",
    42: "surfboard", 43: "tennis racket", 44: "bottle", 46: "wine glass",
    47: "cup", 48: "fork", 49: "knife", 50: "spoon", 51: "bowl",
    52: "banana", 53: "apple", 54: "sandwich", 55: "orange", 56: "broccoli",
    57: "carrot", 58: "hot dog", 59: "pizza", 60: "donut", 61: "cake",
    62: "chair", 63: "couch", 64: "potted plant", 65: "bed",
    67: "dining table", 70: "toilet", 72: "tv", 73: "laptop", 74: "mouse",
    75: "remote", 76: "keyboard", 77: "cell phone", 78: "microwave",
    79: "oven", 80: "toaster", 81: "sink", 82: "refrigerator", 84: "book",
    85: "clock", 86: "vase", 87: "scissors", 88: "teddy bear",
    89: "hair drier", 90: "toothbrush",
}
labels = [_coco_names.get(class_id, "???") for class_id in range(91)]

In [35]:
# Open capture device 0 (typically the default webcam) as the frame source.
CAMERA_INDEX = 0
cap = cv2.VideoCapture(CAMERA_INDEX)

In [None]:
# Real-time detection loop: grab a frame, run the detector, draw the boxes
# that survive non-maximum suppression, and show the annotated frame until
# 'q' is pressed or the camera stops delivering frames.
# Relies on `cap`, `detector` and `labels` created in the cells above.

INPUT_SIZE = (320, 320)   # size the frame is resized to before inference
MAX_BOXES = 20            # upper bound on boxes kept by NMS per frame
IOU_THRESHOLD = 0.5       # overlap above which the lower-scored box is dropped
SCORE_THRESHOLD = 0.6     # detections scoring below this are discarded
BOX_COLOR = (0, 255, 0)   # BGR colour used for both rectangle and text
WINDOW_TITLE = 'Real-Time Object Detection'

try:
    # After a previous run released the capture, read() would just return
    # False and the loop would exit silently; fail loudly instead.
    if not cap.isOpened():
        raise RuntimeError("Webcam is not open; re-run the cell that creates `cap`.")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess frame: OpenCV delivers BGR, the frame is converted to RGB.
        # tf.image.resize yields floats, so cast back to uint8 and add a
        # leading batch dimension.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        resized = tf.image.resize(rgb_frame, INPUT_SIZE)
        input_tensor = tf.expand_dims(tf.cast(resized, dtype=tf.uint8), 0)

        # Run detection; convert only the three outputs that are used rather
        # than every tensor the model returns.
        detections = detector(input_tensor)
        boxes = detections['detection_boxes'][0].numpy()
        scores = detections['detection_scores'][0].numpy()
        classes = detections['detection_classes'][0].numpy().astype(int)

        h, w, _ = frame.shape

        # Apply Non-Maximum Suppression (also filters by score threshold).
        selected_indices = tf.image.non_max_suppression(
            boxes=boxes,
            scores=scores,
            max_output_size=MAX_BOXES,
            iou_threshold=IOU_THRESHOLD,
            score_threshold=SCORE_THRESHOLD
        ).numpy()

        for i in selected_indices:
            box = boxes[i]
            score = scores[i]
            class_id = classes[i]

            # Safer label access: ids outside the map (and id 0) become "N/A".
            if 0 < class_id < len(labels):
                label = labels[class_id]
            else:
                label = "N/A"

            # Boxes are fractions of the frame size; scale them to pixel
            # coordinates of the original (un-resized) frame.
            ymin, xmin, ymax, xmax = box
            left, top = int(xmin * w), int(ymin * h)
            right, bottom = int(xmax * w), int(ymax * h)

            # Keep the caption on-screen: put it above the box when there is
            # room, otherwise just inside the top edge of the box.
            text_y = top - 10 if top - 10 > 10 else top + 20
            text_origin = (max(left, 0), text_y)

            # Draw bounding box and label
            cv2.rectangle(frame, (left, top), (right, bottom), BOX_COLOR, 2)
            cv2.putText(frame, f"{label}: {score:.2f}", text_origin,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, BOX_COLOR, 2)

        # Display result
        cv2.imshow(WINDOW_TITLE, frame)

        # Exit on pressing 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, including when the loop
    # is aborted by an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
