Reference:
https://opencv-tutorial.readthedocs.io/en/latest/yolo/yolo.html

In [None]:
import pandas as pd

class DetectionOutput:
    """Accumulate object-detection rows and export them as a CSV table.

    Each call to ``addData`` appends one value to every per-column list.
    ``makeDF`` turns those lists into a pandas DataFrame whose column order
    is frame, xmax, xmin, ymax, ymin, confidence, classID, and ``writeCSV``
    saves that table to disk.
    """

    def __init__(self):
        # Holds a plain list until makeDF() replaces it with a DataFrame.
        self.dataFrame = []

        # Column headers of the exported table.
        (self.frame, self.xmax, self.xmin, self.ymax, self.ymin) = (
            "frame", "xmax", "xmin", "ymax", "ymin",
        )
        self.conf, self.classID = "confidence", "classID"

        # One list per column; index i across all lists forms row i.
        self.frameList, self.xmaxList, self.xminList = [], [], []
        self.ymaxList, self.yminList = [], []
        self.confList, self.classIDList = [], []

    def addData(self, frame, xmax, xmin, ymax, ymin, confidence, classID):
        """Append one detection row; values are stored exactly as given."""
        columns = (
            self.frameList, self.xmaxList, self.xminList, self.ymaxList,
            self.yminList, self.confList, self.classIDList,
        )
        row = (frame, xmax, xmin, ymax, ymin, confidence, classID)
        for column, value in zip(columns, row):
            column.append(value)

    def makeDF(self):
        """Rebuild ``self.dataFrame`` from the rows collected so far."""
        headers = (
            self.frame, self.xmax, self.xmin, self.ymax,
            self.ymin, self.conf, self.classID,
        )
        columns = (
            self.frameList, self.xmaxList, self.xminList, self.ymaxList,
            self.yminList, self.confList, self.classIDList,
        )
        # zip keeps the header/column pairing and the insertion order.
        self.dataFrame = pd.DataFrame(dict(zip(headers, columns)))

    def writeCSV(self, fileName):
        """Write all collected rows to ``fileName + '.csv'``.

        The DataFrame is rebuilt first, so the file always reflects every
        row added up to this call. The index column is not written.
        """
        self.makeDF()
        target = fileName + '.csv'
        self.dataFrame.to_csv(target, index=False, encoding='utf-8', sep=',')

In [2]:
import numpy as np
import time
import cv2
import imutils
from imutils.video import FPS

# Run configuration. OUTPUT_NAME is the stem of the annotated video
# (OUTPUT_FILE) and is also passed to DetectionOutput.writeCSV.
OUTPUT_NAME = 'output2'
INPUT_FILE = '../fish2_Trim.mp4'
OUTPUT_FILE = OUTPUT_NAME + '.mp4'
LABELS_FILE = 'train/obj.names'
CONFIG_FILE = 'cfg/yolov4-obj2.cfg'
WEIGHTS_FILE = 'yolov4-obj2_best.weights'
CONFIDENCE_THRESHOLD = 0.3

# Frame height/width used to scale boxes; set from the first frame read.
H = None
W = None

d_output = DetectionOutput()

# Load the class labels, one per line. This and the network are loaded
# before any output file is created, so a missing labels/config/weights
# file fails without leaving a stray output video behind.
with open(LABELS_FILE) as labels_file:
    LABELS = labels_file.read().strip().split("\n")

# set random (but reproducible) colors for labels and bounding boxes
np.random.seed(4)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")

# load the YOLO network model with config and weights file
net = cv2.dnn.readNetFromDarknet(CONFIG_FILE, WEIGHTS_FILE)

# determine only the *output* layer names that we need from YOLO.
# getUnconnectedOutLayers() yields 1-based indices; depending on the OpenCV
# version they come back as a flat or an Nx1 array, so flatten first.
ln = net.getLayerNames()
ln = [ln[i - 1] for i in np.asarray(net.getUnconnectedOutLayers()).flatten()]

# capture input video
video_capture = cv2.VideoCapture(INPUT_FILE)
if not video_capture.isOpened():
    # Fail loudly instead of silently producing an empty output video.
    raise IOError("could not open input video: {}".format(INPUT_FILE))

# get input video's frame size
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_size = (frame_width, frame_height)

# get input video's fps
input_fps = video_capture.get(cv2.CAP_PROP_FPS)

# "mp4v" for .mp4 output; use "MJPG" instead when writing .avi
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(OUTPUT_FILE, fourcc, input_fps, frame_size, True)

# number of frames successfully read so far
cnt = 0

# Start the throughput timer last so that model loading time is not
# counted in the reported elapsed time / FPS.
fps = FPS().start()

# Iterate through video frames: run YOLO on each frame, draw the surviving
# detections, record them in d_output, and append the frame to the output
# video. The CSV is written once in the `finally` clause, so it is saved
# when the video ends, when the user quits with "q", or if an error
# interrupts the loop, instead of being rewritten after every frame.
try:
    while True:
        ok, image = video_capture.read()
        if not ok:
            break
        # Count only frames that were actually read; the first frame is 1.
        cnt += 1
        print("Frame number", cnt)

        # transform image into a blob
        blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        if W is None or H is None:
            (H, W) = image.shape[:2]
        layerOutputs = net.forward(ln)

        # initialize our lists of detected bounding boxes, confidences, and
        # class IDs, respectively
        boxes = []
        confidences = []
        classIDs = []

        # loop over each of the layer outputs
        for output in layerOutputs:
            # Each row is one detection: 4 box values, 1 objectness value,
            # then one score per class. Pick the best class of every row
            # at once and keep only rows above the confidence threshold.
            scores = output[:, 5:]
            best_classes = np.argmax(scores, axis=1)
            best_scores = scores[np.arange(scores.shape[0]), best_classes]
            keep = best_scores > CONFIDENCE_THRESHOLD

            for detection, classID, confidence in zip(
                    output[keep], best_classes[keep], best_scores[keep]):
                # scale the bounding box coordinates back relative to the
                # size of the image, keeping in mind that YOLO actually
                # returns the center (x, y)-coordinates of the bounding
                # box followed by the boxes' width and height
                box = detection[0:4] * np.array([W, H, W, H])
                (centerX, centerY, width, height) = box.astype("int")

                # use the center (x, y)-coordinates to derive the top
                # and left corner of the bounding box
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))

                # update our list of bounding box coordinates, confidences,
                # and class IDs
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                classIDs.append(classID)

        # apply non-maxima suppression to suppress weak, overlapping
        # bounding boxes.
        # NOTE(review): the 4th argument is the NMS overlap threshold; it
        # reuses CONFIDENCE_THRESHOLD here - confirm that is intended.
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD,
                                CONFIDENCE_THRESHOLD)

        # ensure at least one detection exists
        if len(idxs) > 0:
            print("object detected")
            # loop over the indexes we are keeping
            for i in np.asarray(idxs).flatten():
                # extract the bounding box coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])

                color = [int(c) for c in COLORS[classIDs[i]]]

                cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
                text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
                cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, color, 2)

                # confidence is stored as a 4-decimal string
                d_output.addData(cnt, x + w, x, y + h, y,
                                 format(confidences[i], ".4f"), classIDs[i])

        # show the output image
        cv2.imshow("output", image)
        writer.write(cv2.resize(image, frame_size))
        fps.update()
        key = cv2.waitKey(1) & 0xFF
        # if q is pressed terminate program
        if key == ord("q"):
            break
        # if space bar is pressed pause until any key is pressed
        if key == 32:
            cv2.waitKey()
finally:
    # Save every detection collected so far, exactly once.
    d_output.writeCSV(OUTPUT_NAME)

# Stop the throughput timer and report the totals.
fps.stop()

print(f"[INFO] elapsed time: {fps.elapsed():.2f}")
print(f"[INFO] approx. FPS: {fps.fps():.2f}")

# close the preview window(s)
cv2.destroyAllWindows()

# release the video writer first, then the input capture
print("[INFO] cleaning up...")
for handle in (writer, video_capture):
    handle.release()

FileNotFoundError: [Errno 2] No such file or directory: 'train/obj.names'