In [1]:
%reload_ext autoreload
%autoreload 2

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
import torch.nn as nn
import torch.nn.functional as F
import cam_util

from models import *
import utils
import random

In [2]:
# Checks GPU availability
assert(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print("Number of GPUs: {}".format(num_gpus))
for i in range(num_gpus):
    print("GPU #{}: {}".format(i, torch.cuda.get_device_name(i)))

Number of GPUs: 1
GPU #0: GeForce GTX 1050 Ti with Max-Q Design


In [3]:
# Defaults to CPU if GPU is not available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device to be used: {}".format(device))

Device to be used: cuda


In [24]:
# OpenCV variables
# Choose camera
cam_id = 0
# Speech variables
with_speech = False

In [25]:
cam_array = cam_util.get_camera_array(verbose=True)

Found 0 camera(s)


In [14]:
# YOLO paths and configuration variables
model_def = "config/yolov3.cfg"
weights_path = "weights/yolov3.weights"
class_path = "data/coco.names"
# img size on which to run inference
img_size = 416 # should be 416 as this is the size YOLO was trained on
# confidence threshold for class consideration
conf_thres = 0.9
nms_thres = 0.5

In [15]:
# Get COCO classes
classes = utils.load_classes(class_path) 
print(np.sort(classes))

['aeroplane' 'apple' 'backpack' 'banana' 'baseball bat' 'baseball glove'
 'bear' 'bed' 'bench' 'bicycle' 'bird' 'boat' 'book' 'bottle' 'bowl'
 'broccoli' 'bus' 'cake' 'car' 'carrot' 'cat' 'cell phone' 'chair' 'clock'
 'cow' 'cup' 'diningtable' 'dog' 'donut' 'elephant' 'fire hydrant' 'fork'
 'frisbee' 'giraffe' 'hair drier' 'handbag' 'horse' 'hot dog' 'keyboard'
 'kite' 'knife' 'laptop' 'microwave' 'motorbike' 'mouse' 'orange' 'oven'
 'parking meter' 'person' 'pizza' 'pottedplant' 'refrigerator' 'remote'
 'sandwich' 'scissors' 'sheep' 'sink' 'skateboard' 'skis' 'snowboard'
 'sofa' 'spoon' 'sports ball' 'stop sign' 'suitcase' 'surfboard'
 'teddy bear' 'tennis racket' 'tie' 'toaster' 'toilet' 'toothbrush'
 'traffic light' 'train' 'truck' 'tvmonitor' 'umbrella' 'vase'
 'wine glass' 'zebra']


In [16]:
# Load model from file
# Initiate model
model = Darknet(model_def).to(device)
if weights_path.endswith(".weights"):
    # Load darknet weights
    model.load_darknet_weights(weights_path)
else:
    # Load checkpoint weights
    model.load_state_dict(torch.load(weights_path))
# Put model on desired device
model = model.to(device)
# Put model in evaluation mode
model.eval()
print("Model loaded from: {}".format(weights_path))

Model loaded from: weights/yolov3.weights


In [17]:
def run_YOLO_on_frame(frame, model, conf_thres=0.9, nms_thres=0.5, colors=None):
    detections = None
    unique_labels = None
    bbox_colors = None
    if colors is None:
        colors = [cmap(i) for i in np.linspace(0, 1, 20)]
    # Convert webcam frame to Torch batch
    img_batch = cam_util.webcam_frame_to_torch_batch(frame, BGR_TO_RGB=True, pad_to_square=True, img_size=img_size)
    # Throw image on chosen device
    img_batch = img_batch.to(device)
    # Perform inference on network
    outputs = model(img_batch)
    # Eliminate low confidence detections
    detections = non_max_suppression(outputs, conf_thres=conf_thres, nms_thres=nms_thres)
    # Detections is a list, but we know it only contains detections for one input image
    detections = detections[0]
    if detections is not None:
        # Rescale bounding boxes to current frame img size
        detections = utils.rescale_boxes(detections, img_size, frame_webcam.shape[:2])
        # Get uniquely predicted labels
        unique_labels = detections[:, -1].cpu().unique()
        # Get number of predicted classes
        n_cls_preds = len(unique_labels)
        # Get bounding boxes' colors
        bbox_colors = random.sample(colors, n_cls_preds)
    return detections, unique_labels, bbox_colors

In [18]:
def show_YOLO_detections_on_frame(frame, detections, bbox_colors, unique_labels):
    for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections: 
        cls_str = classes[int(cls_pred)]
        # Get bbox coordinates, width and height
        left = int(x1.item())  
        bottom = int(y1.item())
        right = int(x2.item() )
        top = int(y2.item()) 
        box_w = right - left
        box_h = top - bottom
        # Get bbox color
        bbox_label = "{} {:.2f}".format(cls_str, cls_conf.item())
        color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
        cv2.putText(frame, bbox_label, (left, bottom), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255))
        cv2.rectangle(frame, (left, bottom), (right, top), (0, 0, 255), 2)

In [19]:
# Possible bounding-box colors
cmap = plt.get_cmap("tab20b")
colors = [cmap(i) for i in np.linspace(0, 1, 20)]
# Interval on which to run YOLO inference
yolo_frame_interval = 10
# Get a reference to a video capturer
video_capture = cv2.VideoCapture(cam_array[cam_id])
#video_capture2 = cv2.VideoCapture(cam_array[1])
# loop until ESC key is pressed
frame_idx = 0
detections = None
detections2 = None
while not cv2.waitKey(33) == 27:
    frame_idx += 1
    if frame_idx > 1000:
        frame_idx = 1
    # Grab a single frame of video
    _, frame_webcam = video_capture.read()
    #_, frame_webcam2 = video_capture2.read()
    # predict for current downsized frame    
    start = time.time()
    # Only run inference if we are at the right interval
    if frame_idx % yolo_frame_interval == 0:
        detections, unique_labels, bbox_colors = run_YOLO_on_frame(frame_webcam, model, colors=colors, conf_thres=conf_thres)
        #detections2, unique_labels2, bbox_colors2 = run_YOLO_on_frame(frame_webcam2, model, colors=colors)
    # Calculate elapsed time (also in FPS)
    end = time.time()
    elapsed_sec = end - start        
    if elapsed_sec < 0.0417:
        fps = 'MAX'
    else:
        fps = str(int(1/elapsed_sec))
    # Display frame inference time
    time_lapse_label = 'Inference time: {:.0f} ms or {} fps'.format(elapsed_sec*1000, fps)
    cv2.putText(frame_webcam, time_lapse_label, (0, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
    # Display latest inferred detections
    if detections is not None:
        show_YOLO_detections_on_frame(frame_webcam, detections, bbox_colors, unique_labels)
    #if detections2 is not None:
    #    show_YOLO_detections_on_frame(frame_webcam2, detections2, bbox_colors2, unique_labels2, speech_engine=speech_engine)
    # Display the resulting image
    cv2.imshow('OPenCV YOLO 1', frame_webcam)
    #cv2.imshow('OPenCV YOLO 2', frame_webcam2)
# Release handle to the webcam
video_capture.release()
cv2.destroyAllWindows()

IndexError: list index out of range