In [None]:
import torch
import cv2
import yolov5
import onnx
import torch.onnx as torch_onnx

# Load the pretrained YOLOv5-small checkpoint.
model = yolov5.load("yolov5s.pt")

# Detection/NMS settings stored as attributes on the loaded model object.
nms_settings = {
    "conf": 0.25,          # NMS confidence threshold
    "iou": 0.45,           # NMS IoU threshold
    "agnostic": False,     # NMS class-agnostic
    "multi_label": False,  # NMS multiple labels per box
    "max_det": 50,         # maximum number of detections per image
}
for setting_name, setting_value in nms_settings.items():
    setattr(model, setting_name, setting_value)

# Export is done on the CPU with the model in evaluation mode.
model.to("cpu")
model.eval()

# Dummy input with the same layout as a real input batch:
# one image, 3 channels, 640x640 pixels.
x = torch.randn(1, 3, 640, 640, requires_grad=True)

# Destination of the exported ONNX file.
onnx_file_name = "yolov5s.onnx"

# Keyword options for the exporter, kept in one place so they are easy to scan.
export_options = {
    "export_params": True,         # store the trained weights inside the model file
    "opset_version": 11,           # ONNX opset to target
    "do_constant_folding": True,   # fold constant sub-graphs as an optimization
    "input_names": ["input"],      # name given to the model's input tensor
    "output_names": ["output"],    # name given to the model's output tensor
    # Axis 0 of both tensors is exported as a variable-length batch dimension.
    "dynamic_axes": {
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
}

# Trace the model with the dummy input and write the ONNX file.
torch.onnx.export(model, x, onnx_file_name, **export_options)


In [20]:
# Quantize the exported ONNX model with onnxruntime's dynamic quantization.
import inspect

import onnx
import onnxruntime
from onnxruntime.quantization import QuantType
from onnxruntime.quantization.quantize import quantize_dynamic

# Paths of the float model (input) and the quantized model (output).
onnx_file_name = 'yolov5s.onnx'
onnx_model_quantized_path = 'yolov5s_qt.onnx'

# Load the float model into memory. quantize_dynamic below is given the file
# path, not this object, so `onnx_model` is only here for inspection.
onnx_model = onnx.load(onnx_file_name)

# Quantization parameters passed on every onnxruntime version.
quantize_options = {
    "per_channel": False,
    "reduce_range": False,
    "weight_type": QuantType.QUInt8,
}

# `optimize_model` is not accepted by every onnxruntime release; passing it
# where it is missing raises TypeError. Only forward it when the installed
# quantize_dynamic actually declares the parameter.
if "optimize_model" in inspect.signature(quantize_dynamic).parameters:
    quantize_options["optimize_model"] = True

# Read the float model from disk and write the quantized model next to it.
quantize_dynamic(
    model_input=onnx_file_name,
    model_output=onnx_model_quantized_path,
    **quantize_options,
)

In [21]:
# Run the quantized ONNX model on a live video stream and draw the detections
import cv2
import numpy as np
import onnxruntime as rt
import torchvision

# Class names indexed by class id (position in the list == class id).
# Laid out five per row; the trailing comment gives the id range of each row.
coco_labels = [
    "person", "bicycle", "car", "motorcycle", "airplane",                    # 0-4
    "bus", "train", "truck", "boat", "traffic light",                        # 5-9
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",           # 10-14
    "cat", "dog", "horse", "sheep", "cow",                                   # 15-19
    "elephant", "bear", "zebra", "giraffe", "backpack",                      # 20-24
    "umbrella", "handbag", "tie", "suitcase", "frisbee",                     # 25-29
    "skis", "snowboard", "sports ball", "kite", "baseball bat",              # 30-34
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",  # 35-39
    "wine glass", "cup", "fork", "knife", "spoon",                           # 40-44
    "bowl", "banana", "apple", "sandwich", "orange",                         # 45-49
    "broccoli", "carrot", "hot dog", "pizza", "donut",                       # 50-54
    "cake", "chair", "couch", "potted plant", "bed",                         # 55-59
    "dining table", "toilet", "tv", "laptop", "mouse",                       # 60-64
    "remote", "keyboard", "cell phone", "microwave", "oven",                 # 65-69
    "toaster", "sink", "refrigerator", "book", "clock",                      # 70-74
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",            # 75-79
]

# Side length of the square image tensor fed to the model (frames are resized
# to INPUT_SIZE x INPUT_SIZE before inference).
INPUT_SIZE = 640
# Number of highest-objectness candidates drawn per model output.
TOP_K = 1

# Load the quantized ONNX model.
sess = rt.InferenceSession("yolov5s_qt.onnx")
# The input name belongs to the model, so look it up once instead of per frame.
input_name = sess.get_inputs()[0].name

# Open the video stream
cap = cv2.VideoCapture(0)  # 0 for default camera, or "path_to_video_file" for a video file

# try/finally guarantees the capture device and the preview window are released
# even if inference or drawing raises part-way through the loop.
try:
    while True:
        # Capture frame-by-frame; stop when no frame could be read.
        ret, frame = cap.read()
        if not ret:
            break

        # debug: substitute a still image here instead of the camera frame,
        # e.g. frame = cv2.imread("path/to/image.jpg")

        # Factors that map model-input pixel coordinates back onto this frame.
        # They are derived from the actual frame size, so any capture
        # resolution works (not only 640x480).
        frame_h, frame_w = frame.shape[:2]
        scale_x = frame_w / INPUT_SIZE
        scale_y = frame_h / INPUT_SIZE

        # Preprocess: resize to the model input size, BGR -> RGB, scale to
        # [0, 1], reorder HWC -> CHW, add a batch dimension, cast to float32.
        input_img = cv2.cvtColor(cv2.resize(frame, (INPUT_SIZE, INPUT_SIZE)), cv2.COLOR_BGR2RGB)
        input_img = input_img / 255.0
        input_img = np.transpose(input_img, (2, 0, 1))
        input_img = np.expand_dims(input_img, 0).astype(np.float32)

        # Perform inference; `outputs` holds every output tensor of the model.
        outputs = sess.run(None, {input_name: input_img})

        for output in outputs:
            # The parsing below indexes each output as (batch, candidates, values)
            # with columns [0:4] = box, [4] = objectness, [5:] = class scores.
            # Skip tensors that do not have that shape, or have no candidates,
            # instead of failing on them.
            if output.ndim != 3 or output.shape[1] == 0 or output.shape[2] < 6:
                continue

            boxes = output[0, :, :4]
            objectness = output[0, :, 4]
            class_probs = output[0, :, 5:]

            # Keep the TOP_K candidates with the highest objectness score.
            top_indices = np.argsort(objectness)[-TOP_K:]
            top_boxes = boxes[top_indices]
            top_classes = np.argmax(class_probs[top_indices], axis=1)

            # Draw bounding boxes and class labels on the frame
            for box, class_id in zip(top_boxes, top_classes):
                # The box is used as (center x, center y, width, height) in
                # model-input pixels; convert to corner form and rescale both
                # axes to the original frame.
                cx, cy, w, h = box
                x1 = int((cx - w / 2) * scale_x)
                y1 = int((cy - h / 2) * scale_y)
                x2 = int((cx + w / 2) * scale_x)
                y2 = int((cy + h / 2) * scale_y)

                # Draw the bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw the class label just above the box
                label = coco_labels[class_id]
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36,255,12), 2)

        # Display the resulting frame
        cv2.imshow("frame", frame)

        # Break the loop on "q" key press
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # When everything is done (or on error), release the capture and close windows
    cap.release()
    cv2.destroyAllWindows()
