In [1]:
# importing essential libraries
import numpy as np
import imutils
import time
import cv2
import os

In [2]:
# --- base configuration: every value a reader may want to tune lives here ---
# relative path of the video file that detection is run on
INPUT_VID = "videos/airplane.mp4"
# minimum class score a detection must exceed to be kept
CONFIDENCE = 0.5
# overlap threshold handed to non-maxima suppression
THRESH = 0.3

In [3]:
# Change the working directory to the project folder so that the relative
# paths used in this notebook ("yolo-coco/...", "videos/...") resolve.
# NOTE(review): this is a hard-coded absolute path, presumably a Google Drive
# mount inside Colab — adjust it when running in another environment.
%cd /content/drive/MyDrive/My Projects/YOLO-pretrained-model

/content/drive/MyDrive/My Projects/YOLO-pretrained-model


In [4]:
# loading the COCO class labels that the YOLO model was trained on
labelsPath = os.path.join("yolo-coco" , "coco.names")
# read the label file inside a context manager so the file handle is closed
# right away, strip surrounding whitespace and split into one label per line
with open(labelsPath) as labelsFile:
  LABELS = labelsFile.read().strip().split("\n")

# one random colour (3 uint8 channels) per label; the fixed seed keeps the
# colours identical between runs. Note that randint's upper bound is
# exclusive, so channel values lie in [0, 254].
np.random.seed(42)
COLORS = np.random.randint(0 , 255 , size = (len(LABELS) , 3) , dtype = "uint8")

In [5]:
# locations of the pretrained YOLOv3 weights and of the network config file
weightsPath = os.path.join("yolo-coco" , "yolov3.weights")
configPath = os.path.join("yolo-coco" , "yolov3.cfg")

# loading the pretrained yolo model using the opencv dnn module
net = cv2.dnn.readNetFromDarknet(configPath , weightsPath)

# determine only the *output* layer names that we need from yolo.
# getUnconnectedOutLayers() gives 1-based layer indices; depending on the
# OpenCV version they arrive either as an (N, 1) array or as a flat (N,)
# array, so the result is flattened first — indexing each entry with
# `i[0]` fails on the flat form.
layerNames = net.getLayerNames()
outLayerIds = np.asarray(net.getUnconnectedOutLayers()).flatten()
ln = [layerNames[i - 1] for i in outLayerIds]

In [6]:
# loading the video file and initializing the frame dimensions
vs = cv2.VideoCapture(INPUT_VID)
# fail early with a clear message instead of silently processing zero frames
if not vs.isOpened():
  raise IOError("Could not open input video: {}".format(INPUT_VID))
# the writer is created lazily once the first frame (and its size) is known
writer = None
(W , H) = (None , None)

# determine the total number of frames in input video file
try:
  # the frame-count property lives under a different name in OpenCV 2.x
  prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
      else cv2.CAP_PROP_FRAME_COUNT
  total = int(vs.get(prop))
  print("Total number of frames in the video is : " , total)

# if we can not get the total # of frames in the
# video, the code below will run; `Exception` (rather than a bare except)
# lets KeyboardInterrupt / SystemExit propagate
except Exception:
  print("Could not determine # of frames in video")
  total = -1

Total number of frames in the video is :  187


In [7]:
# looping over frames in the input video file.
# The loop runs inside try/finally so that the capture and the writer are
# released even if processing fails part-way, and so that a video yielding
# no frames (writer never created) does not crash on `writer.release()`.
try:
  while True:
    # reading the next frame from the video file
    (grabbed , frame) = vs.read()

    # if the frame was not grabbed then we end
    # the loop
    if not grabbed:
      break
    # if the frame dimensions are empty, grab them
    if W is None or H is None:
      (H , W) = frame.shape[:2]

    # before feeding the frame to the yolo to get the bounding boxes and
    # probabilities we preprocess it: scale pixel values by 1/255, resize
    # to 416x416 and swap the R and B channels
    blob = cv2.dnn.blobFromImage(frame , 1 / 255.0 , (416 , 416) ,
                                swapRB = True , crop = False)
    # performing a forward pass through the yolo, timing only the inference
    net.setInput(blob)
    start = time.time()
    layerOutputs = net.forward(ln)
    end = time.time()

    # initializing the list of detected bounding boxes, confidences
    # and class IDs
    boxes = []
    confidences = []
    classIDs = []
    # looping over each of the layer outputs
    for output in layerOutputs:
      # looping over each of the detections
      for detection in output:
        # extracting the class ID and the confidence
        # for the current object detection
        scores = detection[5:]
        classID = np.argmax(scores)
        confidence = scores[classID]

        if confidence > CONFIDENCE:
          # YOLO returns the (x,y) center coordinates of the ROI followed by
          # its width and height, so we scale them to the size of the frame
          box = detection[0:4] * np.array([W , H , W , H])
          (centerX , centerY , width , height) = box.astype("int")

          # using the (x , y) center coordinates to calculate the top left
          # corner coordinate of the bounding box
          x = int(centerX - (width / 2))
          y = int(centerY - (height / 2))
          boxes.append([x , y , int(width) , int(height)])
          confidences.append(float(confidence))
          # store a plain int so it indexes LABELS / COLORS like any index
          classIDs.append(int(classID))
    # using the cv2 non-maxima suppression function to remove redundant
    # overlapping bounding boxes
    idxs = cv2.dnn.NMSBoxes(boxes , confidences , CONFIDENCE , THRESH)

    # ensuring that at least one detection exists
    if len(idxs) > 0:
      # np.asarray copes with NMSBoxes returning either an array or a
      # plain sequence of indices
      for i in np.asarray(idxs).flatten():
        # extracting the bounding box coordinates
        (x , y) = (boxes[i][0] , boxes[i][1])
        (w , h) = (boxes[i][2], boxes[i][3])

        # drawing the bounding box rectangle(s) and the label above it
        color = [int(c) for c in COLORS[classIDs[i]]]
        cv2.rectangle(frame , (x ,y) , (x + w , y + h) ,
                      color , 3)
        text = "{} : {:.3f}".format(LABELS[classIDs[i]] , confidences[i])
        cv2.putText(frame , text , (x , y - 5) ,
                    cv2.FONT_HERSHEY_SIMPLEX , 0.5 , color , 2)

    # check if the video writer is None (true only for the first frame)
    if writer is None:
      # write at the source frame rate so playback speed is preserved;
      # fall back to 20 fps when the container does not report a usable rate
      fps = vs.get(cv2.CAP_PROP_FPS)
      if not np.isfinite(fps) or fps <= 0:
        fps = 20.0

      # initializing our video writer
      fourcc = cv2.VideoWriter_fourcc(*"MP4V")
      writer = cv2.VideoWriter("airplane_final.mp4" , fourcc , fps ,
                    (frame.shape[1] , frame.shape[0]) , True)

      # some information on processing single frame; the whole-video figure
      # is an estimate extrapolated from this first frame's inference time
      if total > 0:
        elap = (end - start)
        print("A single frame took {:.3f} seconds.".format(elap))
        print("The whole video took {:.3f} seconds.".format(elap * total))

    # writing the output frame to disk
    writer.write(frame)
finally:
  # the writer only exists if at least one frame was read
  if writer is not None:
    writer.release()
  vs.release()

A single frame took 2.496 seconds.
The whole video took 466.792 seconds.
