In [None]:
# !pip install tensorflow tensorflow-hub
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import cv2
import tensorflow as tf
import tensorflow_hub as hub
import torch
import time
import numpy as np

In [None]:
# Lookup table from integer class id to a human-readable label name.
# process_results_mobilenet uses it to turn the model's predicted class id
# into the text drawn next to each bounding box.
# The ids are not contiguous: 12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 have
# no entry, so indexing with an arbitrary id in 1-90 can raise KeyError.
# NOTE(review): presumably the COCO id numbering emitted by the TF Hub
# SSD MobileNet detector - confirm against the model documentation.
coco_label_map = {
    1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane',
    6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light',
    11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench',
    16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow',
    22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack',
    28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee',
    35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat',
    40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket',
    44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon',
    51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli',
    57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair',
    63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet',
    72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cell phone',
    78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator',
    84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear',
    89: 'hair drier', 90: 'toothbrush'
}

In [None]:
def load_models():
    """
    Fetch the two object detectors compared in this notebook.

    Returns
    -------
    mobilenet_model : tensorflow.Module
        SSD MobileNet v2 detector loaded from TensorFlow Hub.
    yolo_model : torch.nn.Module
        YOLOv5s detector loaded through ``torch.hub`` from the
        ``ultralytics/yolov5`` repository.

    Examples
    --------
    >>> mobilenet_model, yolo_model = load_models()
    """
    # Model locations are named up front so the two load calls read symmetrically.
    tfhub_handle = "https://tfhub.dev/tensorflow/ssd_mobilenet_v2/2"
    yolo_repo, yolo_variant = 'ultralytics/yolov5', 'yolov5s'

    ssd_detector = hub.load(tfhub_handle)
    yolo_detector = torch.hub.load(yolo_repo, yolo_variant)
    return ssd_detector, yolo_detector

In [None]:
def resize_frame(frame, width=None, height=None):
    """
    Resize a video frame to the requested width and/or height.

    When only one of ``width`` / ``height`` is given, the other dimension is
    derived from the frame so the aspect ratio is preserved. When both are
    given, the frame is resized to exactly ``(width, height)`` and the aspect
    ratio is NOT preserved. When neither is given, the input frame is
    returned unchanged (the same object, not a copy).

    Parameters
    ----------
    frame : numpy.ndarray
        The input video frame; its first two axes are read as (height, width).
    width : int, optional
        The desired width of the resized frame. ``None`` or ``0`` means
        "not specified": the width is then derived from ``height``.
    height : int, optional
        The desired height of the resized frame. ``None`` or ``0`` means
        "not specified": the height is then derived from ``width``.

    Returns
    -------
    resized_frame : numpy.ndarray
        The resized video frame, or ``frame`` itself if no size was requested.

    Raises
    ------
    ValueError
        If ``width`` or ``height`` is negative.

    Examples
    --------
    >>> resized_frame = resize_frame(frame, width=640)
    """
    for label, value in (("width", width), ("height", height)):
        if value is not None and value < 0:
            raise ValueError(f"{label} must be non-negative, got {value!r}")

    # Treat 0 like None. The truthiness checks below would otherwise match no
    # branch and leave the result undefined.
    if not width and not height:
        return frame

    h, w = frame.shape[:2]
    if width and height:
        # Both dimensions are specified: use them verbatim.
        target_size = (width, height)
    elif width:
        # Only width is specified, derive height to maintain aspect ratio.
        ratio = width / float(w)
        # Clamp to 1 so extreme ratios never request a zero-sized image.
        target_size = (width, max(1, int(h * ratio)))
    else:
        # Only height is specified, derive width to maintain aspect ratio.
        ratio = height / float(h)
        target_size = (max(1, int(w * ratio)), height)

    return cv2.resize(frame, target_size)

In [None]:
def process_results_yolo(frame, model):
    """
    Run the YOLO model on a frame and draw the selected detections on it.

    Only detections whose class id is 0 or 2 are drawn. The frame is
    annotated in place and also returned.

    Parameters
    ----------
    frame : numpy.ndarray
        The input video frame in OpenCV's BGR channel order.
    model : torch.nn.Module
        The YOLO model; calling it must return a results object that
        supports ``.pandas().xyxy``.

    Returns
    -------
    frame : numpy.ndarray
        The frame with drawn bounding boxes and labels.

    Examples
    --------
    >>> processed_frame = process_results_yolo(frame, yolo_model)
    """
    # OpenCV delivers BGR frames, but the YOLOv5 hub model expects RGB for
    # numpy input. Convert a copy for inference and keep drawing on the
    # original BGR frame so the displayed colours stay correct.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    detections = model(rgb_frame)

    # Keep only class ids 0 and 2. In YOLOv5's 0-based COCO labels these are
    # 'person' and 'car'; the label set has no clothing class.
    selected_detections = detections.pandas().xyxy[0]
    selected_detections = selected_detections[selected_detections['class'].isin([0, 2])]

    for _, detection in selected_detections.iterrows():
        x1, y1, x2, y2 = (int(detection[column]) for column in ('xmin', 'ymin', 'xmax', 'ymax'))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        text = f"{detection['name']}: {detection['confidence']:.2f}"
        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall off the frame.
        text_y = y1 - 10 if y1 - 10 > 10 else y1 + 15
        cv2.putText(frame, text, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return frame

In [None]:
def process_results_mobilenet(frame, model, min_confidence=0.5):
    """
    Run the MobileNet model on a frame and draw its detections on it.

    The frame is annotated in place and also returned.

    Parameters
    ----------
    frame : numpy.ndarray
        The input video frame in OpenCV's BGR channel order.
    model : tensorflow.Module
        The MobileNet model; calling it must return a dict with the keys
        ``num_detections``, ``detection_boxes``, ``detection_scores`` and
        ``detection_classes``.
    min_confidence : float, optional
        Detections scoring at or below this value are ignored. Default is 0.5.

    Returns
    -------
    frame : numpy.ndarray
        The frame with drawn bounding boxes and labels.

    Examples
    --------
    >>> processed_frame = process_results_mobilenet(frame, mobilenet_model)
    """
    # OpenCV delivers BGR frames, but the detector expects RGB. Convert a
    # copy for inference and keep drawing on the original BGR frame.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    input_tensor = tf.convert_to_tensor(rgb_frame)[tf.newaxis, ...]
    detections = model(input_tensor)

    # Read only the outputs that are used, without mutating the model's dict.
    # np.ravel copes with num_detections being a scalar or a 1-element tensor.
    num_detections = int(np.ravel(detections['num_detections'])[0])
    detection_boxes = detections['detection_boxes'][0, :num_detections].numpy()
    detection_scores = detections['detection_scores'][0, :num_detections].numpy()
    detection_classes = detections['detection_classes'][0, :num_detections].numpy().astype(np.int32)

    frame_height, frame_width = frame.shape[:2]

    for i in range(num_detections):
        confidence = detection_scores[i]
        # Skip weak predictions.
        if confidence <= min_confidence:
            continue

        class_id = int(detection_classes[i])
        # The label table has gaps, so fall back to the raw id rather than
        # raising KeyError in the middle of a video stream.
        class_name = coco_label_map.get(class_id, f'id {class_id}')

        # Boxes are normalised (ymin, xmin, ymax, xmax); scale to pixels.
        y1, x1, y2, x2 = detection_boxes[i]
        y_top_left = int(y1 * frame_height)
        x_top_left = int(x1 * frame_width)
        y_bottom_right = int(y2 * frame_height)
        x_bottom_right = int(x2 * frame_width)

        # Draw the bounding box around the detected object.
        cv2.rectangle(frame, (x_top_left, y_top_left), (x_bottom_right, y_bottom_right), (0, 255, 0), 2)

        # Add class label and confidence on a filled background above the box.
        label = f'{class_name}: {confidence:.2f}'
        (label_width, label_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(frame, (x_top_left, y_top_left - label_height - 10),
                      (x_top_left + label_width, y_top_left - 10), (0, 0, 0), cv2.FILLED)
        cv2.putText(frame, label, (x_top_left, y_top_left - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

    return frame

In [None]:
def process_results(frame, model, model_type='tf'):
    """
    Run the selected detector on a frame and return the annotated frame.

    The input is first resized to a width of 640 pixels (aspect ratio
    preserved) to reduce computation. Detections are drawn on that resized
    copy, which is what gets returned.

    Parameters
    ----------
    frame : numpy.ndarray
        The input video frame.
    model : object
        The model to use for detection (MobileNet or YOLO).
    model_type : str, optional
        The type of the model: ``'mobilenet'`` (alias ``'tf'``) or
        ``'yolo'`` (alias ``'torch'``). Default is ``'tf'``.

    Returns
    -------
    frame : numpy.ndarray
        The resized frame with drawn bounding boxes and labels.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.

    Examples
    --------
    >>> processed_frame = process_results(frame, mobilenet_model, model_type='mobilenet')
    """
    # Framework names are accepted as aliases so the default ('tf') and
    # callers passing 'torch' select a detector instead of doing nothing.
    if model_type in ('mobilenet', 'tf'):
        use_mobilenet = True
    elif model_type in ('yolo', 'torch'):
        use_mobilenet = False
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'mobilenet'/'tf' or 'yolo'/'torch'"
        )

    # Resize frame to reduce computation.
    resized_frame = resize_frame(frame, width=640, height=None)

    if use_mobilenet:
        annotated = process_results_mobilenet(resized_frame, model)
    else:
        annotated = process_results_yolo(resized_frame, model)

    # The helpers draw in place, so fall back to resized_frame if a helper
    # returns None. Returning the untouched input would discard the boxes.
    return resized_frame if annotated is None else annotated

In [None]:
def benchmark_model(model, model_type, frame, iterations=10):
    """
    Measure the average time ``process_results`` takes on a single frame.

    Parameters
    ----------
    model : object
        The model to benchmark (MobileNet or YOLO).
    model_type : str
        The type of the model, forwarded to ``process_results``.
    frame : numpy.ndarray
        The input video frame.
    iterations : int, optional
        The number of iterations to run the benchmark. Default is 10.

    Returns
    -------
    avg_time : float
        The average processing time per frame in seconds.

    Raises
    ------
    ValueError
        If ``iterations`` is less than 1.

    Examples
    --------
    >>> avg_time = benchmark_model(yolo_model, 'yolo', frame, iterations=10)
    """
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations!r}")

    total_time = 0.0
    for _ in range(iterations):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        process_results(frame, model, model_type)
        total_time += time.perf_counter() - start_time

    return total_time / iterations

In [None]:
# Live webcam demo.
# Keys: q = quit, m = use MobileNet, y = use YOLO,
#       b = benchmark both models on the current frame.
mobilenet_model, yolo_model = load_models()
current_model = mobilenet_model
current_model_type = 'mobilenet'

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down rather than just stopping this cell.
    raise RuntimeError("Error: Could not open camera.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Fixed 640x320 working size (does not preserve the camera's aspect ratio).
        resized_frame = resize_frame(frame, width=640, height=320)

        annotated_frame = process_results(resized_frame, current_model, current_model_type)

        # Display the frame (one window; waitKey below lets it repaint).
        cv2.imshow('Object Detection', annotated_frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('m'):
            current_model = mobilenet_model
            current_model_type = 'mobilenet'
        elif key == ord('y'):
            current_model = yolo_model
            current_model_type = 'yolo'
        elif key == ord('b'):
            # Benchmark both detectors on the same clean (un-annotated) frame,
            # using the model-type names that process_results documents.
            benchmark_mobilenet = benchmark_model(mobilenet_model, 'mobilenet', resized_frame)
            benchmark_yolo = benchmark_model(yolo_model, 'yolo', resized_frame)
            print(f"MobileNet Model Average Time: {benchmark_mobilenet} seconds")
            print(f"YOLO Model Average Time: {benchmark_yolo} seconds")
finally:
    # Always free the camera and close windows, even if a detector raised.
    cap.release()
    cv2.destroyAllWindows()