In [2]:
import cv2
import math
import time
import numpy as np
import argparse
import warnings
import OpCV_Utils

import os
import sys
sys.path.append('C:\\Users\\MATHEUS DANTAS PEREI\\Desktop\\DeepSortYOLOv5\\DeepSORT_YOLOv5_Pytorch\\')
sys.path.append('C:\\Users\\MATHEUS DANTAS PEREI\\Desktop\\DeepSortYOLOv5\\DeepSORT_YOLOv5_Pytorch\\deep_sort\\deep\\checkpoint\\')

from yolov5.utils.general import (check_img_size, non_max_suppression, scale_coords, xyxy2xywh)
from yolov5.utils.torch_utils import select_device, time_synchronized
from yolov5.utils.datasets import letterbox

from utils_ds.parser import get_config
from utils_ds.draw import draw_boxes
from deep_sort import build_tracker

import torch
import torch.backends.cudnn as cudnn
cudnn.benchmark = True

from IPython.display import clear_output

In [3]:
# Pose Detection Method:
def mpPoseDetection(img, pose, mp_pose, mp_drawing, mp_drawing_styles):

    # Get results:
    results = pose.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # Get dims:
    H, W, _ = img.shape

    if not results.pose_landmarks:
        return np.zeros(img.shape, dtype=np.uint8)
        
    # Create copy:
    annotated_img  = np.zeros(img.shape, dtype=np.uint8)
    
    # Draw pose landmarks on the image.
    mp_drawing.draw_landmarks(annotated_img,
                              results.pose_landmarks,
                              mp_pose.POSE_CONNECTIONS,
                              landmark_drawing_spec = mp_drawing_styles.get_default_pose_landmarks_style())
    
    return annotated_img

In [4]:
def image_track(im0, 
                detector,
                deepsort,
                img_size,
                device,
                conf_thres,
                iou_thres,
                classes):
    """
        Deep Sort Tracking for YOLOv5 Inference method
    """
    
    # preprocess ************************************************************
    # Padded resize
    img = letterbox(im0, new_shape=img_size)[0]
    
    # Convert:
    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x416x416
    img = np.ascontiguousarray(img)

    # numpy to tensor
    img = torch.from_numpy(img).to(device)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    s = '%gx%g ' % img.shape[2:]    # print string

    # Detection time *********************************************************
    # Inference
    t1 = time_synchronized()
    with torch.no_grad():
        pred = detector(img)[0]  # list: bz * [ (#obj, 6)]

    # Apply NMS and filter object other than person (cls:0)
    pred = non_max_suppression(pred, conf_thres, iou_thres,
                               classes=classes)
    t2 = time_synchronized()

    # get all obj ************************************************************
    det = pred[0]  # for video, bz is 1
    if det is not None and len(det):  # det: (#obj, 6)  x1 y1 x2 y2 conf cls

        # Rescale boxes from img_size to original im0 size
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

        # Print results. statistics of number of each obj
        for c in det[:, -1].unique():
            n = (det[:, -1] == c).sum()  # detections per class
            s += '%g %ss, ' % (n, names[int(c)])  # add to string

        bbox_xywh = xyxy2xywh(det[:, :4]).cpu()
        confs = det[:, 4:5].cpu()

        # ****************************** deepsort ****************************
        outputs = deepsort.update(bbox_xywh, confs, im0)
        # (#ID, 5) x1,y1,x2,y2,track_ID
    else:
        outputs = torch.zeros((0, 5))

    t3 = time.time()
    return outputs, t2-t1, t3-t2

###########################################################################################################################

In [5]:
def build_detector(device, config_deepsort, weights):
    
    # ***************************** initialize DeepSORT **********************************
    cfg = get_config()
    cfg.merge_from_file(config_deepsort)

    use_cuda = device != 'cpu' and torch.cuda.is_available()
    deepsort = build_tracker(cfg, use_cuda=use_cuda)

    # ***************************** initialize YOLO-V5 **********************************
    detector = torch.load(weights, map_location=device)['model'].float()  # load to FP32
    detector.to(device).eval()

    names = detector.module.names if hasattr(detector, 'module') else detector.names
    
    return detector, deepsort, names

In [6]:
def mouseCallback(event, x, y, flags, param):
    
    global index
    global outputs
    global resize
    global frame_size
               
    if event == cv2.EVENT_LBUTTONDOWN:
        
        for detection in outputs:
            
            # Get bbox and crop it:
            x1 = detection[0]
            y1 = detection[1]
            x2 = detection[2]
            y2 = detection[3]
                        
            xt = 2*int(x*frame_size[1]/resize[0])
            yt = 2*int(y*frame_size[0]/resize[1])
            
            if (x1 < xt < x2) and (y1 < yt < y2):
                index = detection[4]
                
    if event == cv2.EVENT_LBUTTONUP:

        for detection in outputs:
            
            # Get bbox and crop it:
            x1 = detection[0]
            y1 = detection[1]
            x2 = detection[2]
            y2 = detection[3]
            
            xt = 2*int(x*frame_size[1]/resize[0])
            yt = 2*int(y*frame_size[0]/resize[1])
            
            if (x1 < xt < x2) and (y1 < yt < y2):
                index = detection[4]
    pass

In [18]:
# Select Device and define weights and deep sort paths:
device  = select_device('0')
weights = 'yolov5s6.pt'
config_deepsort = 'C:\\Users\\MATHEUS DANTAS PEREI\\Desktop\\DeepSortYOLOv5\\DeepSORT_YOLOv5_Pytorch\\configs\\deep_sort.yaml'

# Load YOLOv5 model and build detector:
detector, deepsort, names = build_detector(device, config_deepsort, weights)

# Global YOLOv5 inference parameters:
conf_thres = 0.35
iou_thres  = 0.10
classes    = [0]

# Config Mediapipe Pose Estimation Objects:
import mediapipe as mp

mp_drawing        = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_pose           = mp.solutions.pose

# Config:
pose = mp_pose.Pose(static_image_mode        = True,
                    model_complexity         = 1,
                    enable_segmentation      = False,
                    min_detection_confidence = 0.10)

# Set which frequency detection models will process frames:
frame_interval      = 2
frame_interval_pose = 1

# Load Video:
video_path     = 'football.mp4'
cap            = cv2.VideoCapture(video_path)

# Get text Scale:
success, frame = cap.read()
img_size       = (max(frame.shape[:2]), (max(frame.shape[:2])))
img_size       = (640, 640)
text_scale     = max(1, frame.shape[1] // 1600)

# Auxiliars:
idx_frame = 0
last_out  = None
yolo_time, sort_time, avg_fps = [], [], []
pose_output = np.zeros_like(frame)
pose_det    = np.zeros((200, 400)).astype('uint8')
crop_show   = np.zeros_like(frame)
crop        = np.zeros_like(frame)

# Wants Pose Estimation:
detect_pose = True

# Config Window:
cv2.namedWindow('Detection:')
cv2.setMouseCallback('Detection:', mouseCallback)
resize     = (900, 700)
frame_size = frame.shape[:2]

index   = -1000
outputs = []

# Video Loop:
while True:
    
     # Load pitch:
    pitch_img = cv2.imread('football_pitch.jpg')
    pitch_img = cv2.resize(pitch_img, frame.shape[:2])
    pitch_img = cv2.rotate(pitch_img, cv2.cv2.ROTATE_90_CLOCKWISE)

    # Start timer and read video frames:
    t0 = time.time()
    success, frame = cap.read()
    
    # Check if was read properly:
    if success:
        
        # Make a frame copy and get black board auxiliar:
        frame_copy  = frame.copy()

    # Check if video ends:
    if not success:
        
        # Repeat video:
        cap = cv2.VideoCapture(video_path)
        success, frame = cap.read()
        
        # Make a frame copy and get black board auxiliar:
        frame_copy  = frame.copy()

    #####################################################################################################
    # Apply Deep Sort Track:
    if idx_frame % frame_interval == 0:
                
        outputs, _, _ = image_track(frame, 
                                      detector,
                                      deepsort,
                                      img_size,
                                      device,
                                      conf_thres,
                                      iou_thres,
                                      classes)
        
        # Update last output detection:
        last_out = outputs
    
    # Use prediction of the previous frames:
    else: outputs = last_out  
        
    #####################################################################################################
    # For each YOLOv5 Detection after Deep Sort, draw bbox and apply pose estimation:
    if len(outputs) > 0:

        bbox_xyxy  = outputs[:, :4]
        identities = outputs[:, -1]
        frame = draw_boxes(frame, bbox_xyxy, identities)  # BGR

        ##############################################################################################
        # Use Mediapipe Pose detection applying to YOLOv5 detected objects:
        if idx_frame % 5*frame_interval == 0:
            pose_output = np.zeros_like(frame)
        
        for detection in outputs:
            
            # Get bbox and crop it:
            w    = max(0, detection[2]-detection[0])
            h    = max(0, detection[3]-detection[1])
            x    = max(0, int(detection[0] - w/2))
            y    = max(0, int(detection[1] - h/2))
                                    
            # Plot pitch:
            fx = pitch_img.shape[1]/frame.shape[1]
            fy = pitch_img.shape[0]/frame.shape[0]
            
            cX = int(min(int(detection[0] + w/2), frame.shape[1])*fx)
            cY = int(min(int(detection[1] + h/2), frame.shape[0])*fy)
                        
            cv2.circle(pitch_img, (cX, cY), 5, (255,0,0), thickness = -1, lineType = cv2.LINE_AA)
            cv2.putText(pitch_img, f'Player: {detection[4]}',
                       (max(0, cX-5), max(0, cY-8)), cv2.FONT_HERSHEY_PLAIN, text_scale, (80, 0, 0), thickness=2)
            
            if detection[4] == index:
                
                # Bbox will be twice as big as the original detection:
                crop = frame_copy[y:y+2*h, x:x+2*w, :]
                cv2.rectangle(frame, (detection[0], detection[1]), (detection[2], detection[3]), [255, 255, 255], 3) 
                cv2.circle(pitch_img, (cX, cY), 6, (0,0,255), thickness = -1, lineType = cv2.LINE_AA)
                
                # If frame is correct:
                if idx_frame % frame_interval_pose == 0:

                    if detect_pose:
                        pose_det      = mpPoseDetection(crop, pose, mp_pose, mp_drawing, mp_drawing_styles)
                        pose_output[y:y+2*h, x:x+2*w, :] = pose_det

                        cv2.putText(pose_output, f'Player: {detection[4]}',
                        (x, y), cv2.FONT_HERSHEY_PLAIN, text_scale, (255, 0, 0), thickness=1)
                
    ##############################################################################################
    # Display FPS:
    t1 = time.time()
    avg_fps.append(t1 - t0)
    cv2.putText(frame, f'{len(avg_fps) / sum(avg_fps)}',
            (20, 20 + text_scale), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2)
        
    # Keyboard Controls:
    key = cv2.waitKey(1) or 0xff   
    if key == ord('k'): break
    if key == ord('p'): detect_pose = not detect_pose
        
    # Define output img:
    crop = cv2.resize(crop, (200, 400))
    crop = np.hstack([crop, cv2.resize(pose_det, (200, 400))])
    crop_show[:crop.shape[0], :crop.shape[1],:] = crop
    output_img = np.hstack([frame, pose_output])
    output_2   = np.hstack([crop_show, cv2.resize(pitch_img, (frame.shape[1], frame.shape[0]))])
    output_img = np.vstack([output_img, output_2])
    
    # Show output:
    cv2.imshow('Detection:', cv2.resize(output_img, resize))
    
    # Update frame counter:
    idx_frame += 1
    if idx_frame == 3*frame_interval: idx_frame = 0
    
    # Clear output:
    clear_output(wait=False)
    
#######################################################################################################
# Release Video:
cv2.destroyAllWindows()
cap.release()

  self.update(yaml.load(fo.read()))
Loading weights from C:\Users\MATHEUS DANTAS PEREI\Desktop\DeepSortYOLOv5\DeepSORT_YOLOv5_Pytorch\deep_sort\deep\checkpoint\ckpt.t7... Done!


Using CUDA device0 _CudaDeviceProperties(name='NVIDIA GeForce MX450', total_memory=2047MB)



ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 3 dimension(s) and the array at index 1 has 2 dimension(s)