In [1]:
from tkinter import *
from tkinter import filedialog
import numpy as np
import cv2
import os
import imutils

In [2]:
nms_threshold = 0.3      # Non-maximum suppression (NMS) overlap threshold used to separate overlapping predictions
min_confidence = 0.2     # Threshold on the confidence score returned by the model above which a prediction is considered true
videoPath = "video_1.mp4"   # Sample video path. NOTE(review): not read by any cell visible here; the GUI passes the user-selected path instead - confirm before removing

In [3]:
def openFile():
    """Ask the user for a video file and show the chosen path in the GUI.

    Opens a file-selection dialog limited to .mp4 and .avi files. The dialog
    starts in the current working directory rather than in an absolute path
    that only exists on one particular machine.

    On a successful selection the path is stored in ``root.filename`` and
    displayed in ``lblVideoPath``. If the dialog is cancelled (an empty
    result is returned), the previous valid selection - or an empty string
    when there is none - is restored in the label, so a cancel never wipes
    out a file that was already chosen.
    """
    filetypes = (
        ("mp4 files", "*.mp4"),
        ("avi files", "*.avi"),
    )

    selected = filedialog.askopenfilename(
        initialdir=os.getcwd(),
        title="Select a video",
        filetypes=filetypes,
    )

    if selected:
        root.filename = selected

    # root.filename is only ever assigned a real selection above; fall back
    # to "" when no file has been chosen yet.
    lblVideoPath.config(text=getattr(root, "filename", ""))

In [4]:
def run():
    """Validate the GUI inputs and start pedestrian detection.

    Reads the minimum confidence score from ``entConfSc`` and the video path
    from ``lblVideoPath``:

    * a value that cannot be parsed as a float shows the "Invalid input"
      message in ``lblMsg``;
    * a value outside ``0.2 <= x < 1.0`` shows the range message in ``lblMsg``;
    * a path that is not an existing file shows "Please choose a file" in
      ``lblVideoPath``. Checking for an existing file (instead of only a
      non-empty label) prevents that hint text itself from being passed on
      as a video path on the next click;
    * otherwise ``start_detection`` is called with the module-level
      ``nms_threshold``, the entered confidence and the selected path.

    Only the float conversion is guarded by the "Invalid input" handler, so
    failures inside ``start_detection`` are no longer misreported as a bad
    number; their own message is shown in ``lblMsg`` instead.
    """
    try:
        num = float(entConfSc.get())
    except ValueError:
        lblMsg.config(text="Invalid input. Please enter a number between 0.2 - 1.0 \n(1.0 is not included)")
        return

    if not 0.2 <= num < 1.0:
        lblMsg.config(text="The number entered must in between 0.2 - 1.0 \n(1.0 is not included)")
        return

    lblMsg.config(text=" ")

    # check path
    video_path = lblVideoPath["text"]
    if not os.path.isfile(video_path):
        lblVideoPath.config(text="Please choose a file")
        return

    try:
        start_detection(nms_threshold, num, video_path)
    except Exception as exc:
        # Surface the real cause (missing model files, OpenCV errors, ...)
        lblMsg.config(text="Detection failed: " + str(exc))

In [5]:
def pedestrian_detection_YOLO(image, model, layer_name, nms_threshold, min_confidence, personidz=0):
    """Run one forward pass of the network on a frame and return person detections.

    Parameters:
        image: frame to analyse; its first two shape entries are used as
            (height, width) to scale the predicted boxes.
        model: network object providing ``setInput`` and ``forward``.
        layer_name: output layer names handed to ``model.forward``.
        nms_threshold: overlap threshold forwarded to ``cv2.dnn.NMSBoxes``.
        min_confidence: a detection is kept only when its best class score is
            strictly greater than this value; it is also forwarded to
            ``cv2.dnn.NMSBoxes`` as the score threshold.
        personidz: class index that counts as a person (default 0).

    Returns:
        A list of ``(confidence, (x1, y1, x2, y2), (centerX, centerY))``
        tuples, one per detection that survives non-maxima suppression.
        The list is empty when nothing is detected.
    """
    frame_h, frame_w = image.shape[:2]

    # blob = Binary Large OBject; the pre-processed input tensor for the net
    blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    model.setInput(blob)
    # one output array per requested layer; each row is a single detection
    layer_outputs = model.forward(layer_name)

    boxes, centroids, confidences = [], [], []
    # the first four detection values are scaled by the frame size below
    frame_scale = np.array([frame_w, frame_h, frame_w, frame_h])

    for layer_output in layer_outputs:
        for detection in layer_output:
            class_scores = detection[5:]
            best_class = np.argmax(class_scores)
            score = class_scores[best_class]

            # skip everything that is not a sufficiently confident PERSON
            if best_class != personidz:
                continue
            if not (score > min_confidence):
                continue

            scaled = detection[0:4] * frame_scale
            (centerX, centerY, width, height) = scaled.astype("int")

            # top-left corner of the bounding box, derived from its centre
            left = int(centerX - (width / 2))
            top = int(centerY - (height / 2))

            boxes.append([left, top, int(width), int(height)])
            centroids.append((centerX, centerY))
            confidences.append(float(score))

    # non-maxima suppression drops weak boxes that overlap a stronger one
    kept = cv2.dnn.NMSBoxes(boxes, confidences, min_confidence, nms_threshold)

    results = []
    if len(kept) == 0:
        return results

    for idx in kept.flatten():
        left, top = boxes[idx][0], boxes[idx][1]
        box_w, box_h = boxes[idx][2], boxes[idx][3]
        # (probability, (x1, y1, x2, y2), centroid)
        results.append((confidences[idx], (left, top, left + box_w, top + box_h), centroids[idx]))

    return results

In [6]:
def start_detection(nms_threshold, min_confidence, videoPath):
    """Play a video and draw YOLO person detections on every frame.

    Loads the class labels from "coco.names" and the network from
    "yolov4-tiny.cfg" / "yolov4-tiny.weights" (all resolved relative to the
    current working directory), then reads frames from ``videoPath`` until
    the video ends or the Esc key is pressed. Each frame is resized to a
    width of 700, passed to ``pedestrian_detection_YOLO`` and displayed with
    a green box and the confidence value for every detection.

    Parameters:
        nms_threshold: overlap threshold forwarded to the detector.
        min_confidence: minimum confidence forwarded to the detector.
        videoPath: value handed to ``cv2.VideoCapture`` (pass 0 to use a
            webcam).

    Raises:
        OSError: if "coco.names" cannot be opened.
        ValueError: if the labels file has no "person" entry.

    The capture device and the display window are released even when an
    error occurs while frames are being processed.
    """
    labelsPath = "coco.names"
    # close the labels file deterministically instead of leaking the handle
    with open(labelsPath) as labels_file:
        LABELS = labels_file.read().strip().split("\n")
    # the class index does not change between frames; look it up once
    person_id = LABELS.index("person")

    weights_path = "yolov4-tiny.weights"
    config_path = "yolov4-tiny.cfg"

    model = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    # run OpenCV on GPU
    model.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
    model.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

    all_layer_names = model.getLayerNames()
    # Flatten so the 1-based indices can be used whether OpenCV returns them
    # as a flat array or as an Nx1 array (this differs between versions).
    out_layer_ids = np.array(model.getUnconnectedOutLayers()).flatten()
    layer_name = [all_layer_names[i - 1] for i in out_layer_ids]

    # pass 0 if want to use webcam
    cap = cv2.VideoCapture(videoPath)

    try:
        # read all the frames from the video, stop when the video/frame ends
        while True:
            # grabbed - a boolean indicating if the frame was successfully read
            (grabbed, image) = cap.read()
            if not grabbed:
                break

            # resize the image without changing the ratio
            image = imutils.resize(image, width=700)
            results = pedestrian_detection_YOLO(
                image, model, layer_name, nms_threshold, min_confidence, personidz=person_id
            )

            for res in results:
                # draw the bounding box: (image, topLeft, bottomRight, colour, thickness)
                cv2.rectangle(image, (res[1][0], res[1][1]), (res[1][2], res[1][3]), (0, 255, 0), 2)
                # put the confidence value at the top-left corner of the box
                cv2.putText(image, str(round(res[0], 6)), (res[1][0], res[1][1]),
                            cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0), 2)

            cv2.imshow("Pedestrian Detection - YOLO", image)   # display the image in a window

            # mask to the low byte so the comparison does not depend on any
            # extra bits a platform may set in the returned key code
            key = cv2.waitKey(1) & 0xFF
            if key == 27:   # break when esc key is pressed
                break
    finally:
        cap.release()             # release the capture
        cv2.destroyAllWindows()   # close the window

In [7]:
# ---- Main window -----------------------------------------------------------
root = Tk()
root.title('Pedestrian Detection - YOLO')
root.geometry("500x350")  # width x height

# ---- Row 0/1: minimum confidence input and its validation message ----------
lblConfidenceSc = Label(root, text="Minimum Confidence Score: \n(0.2 <= x < 1.0) ", justify=RIGHT)
lblConfidenceSc.grid(row=0, column=0, padx=(30, 10), pady=(20, 0), sticky='e')

entConfSc = Entry(root, borderwidth=5)
entConfSc.grid(row=0, column=1, padx=(0, 0), pady=(20, 0), sticky='w')

# Option names are written out in full (wraplength, sticky); the previous
# spellings "wraplengt" and "stick" were truncated forms of those names.
lblMsg = Label(root, text=" ", justify=LEFT, wraplength=300)
lblMsg.grid(row=1, column=1, padx=(0, 0), pady=(0, 10), sticky='w')

# ---- Row 2/3: video file chooser and the chosen path -----------------------
lblVideoFile = Label(root, text="Choose a video file: ")
lblVideoFile.grid(row=2, column=0, padx=(30, 10), pady=(20, 0), sticky='e')

btnGetFile = Button(root, text="Browse", command=openFile)
btnGetFile.grid(row=2, column=1, padx=(0, 0), pady=(20, 0), sticky='w')

lblMsgFile = Label(root, text="File chosen: ", justify=RIGHT)
lblMsgFile.grid(row=3, column=0, padx=(0, 10), pady=(0, 20), sticky='e')

lblVideoPath = Label(root, text="", justify=LEFT, wraplength=300)
lblVideoPath.grid(row=3, column=1, padx=(0, 0), pady=(0, 20), sticky='w')

# ---- Row 4: start button ---------------------------------------------------
btnRun = Button(root, text="Run", width="8", command=run)  # command=functionName
btnRun.grid(row=4, column=0, padx=(30, 0), pady=(30, 10), sticky='e')

# sticky = 'e': Align to right.
# sticky = 'w': Align to left.
# sticky = 'n': Align to top.
# sticky = 's': Align to bottom.

# Blocks until the window is closed.
root.mainloop()