# Dataset processing

## I use the dataset from https://motchallenge.net/data/MOT16/ , because it contains our train images with det.txt files. The det.txt files can be converted to YOLO.txt for detecting.

In [1]:
import os
import shutil
import numpy as np

In [2]:
root = os.getcwd()

In [3]:
root

'E:\\jupyterdoc\\yolov5-6.1'

In [4]:
def convert(imgWidth, imgHeight, left, top, width, height):
    x = (left + width / 2.0) / imgWidth
    y = (top + height / 2.0) / imgHeight
    w = width / imgWidth
    h = height / imgHeight
    return ('%.6f'%x, '%.6f'%y, '%.6f'%w, '%.6f'%h) 

In [5]:
def get_image_and_label(filepath,new_filepath,namedoc,seqLength,imgWidth,imgHeight):
    det_path = filepath + 'det/det.txt'
    dets = np.loadtxt(det_path,delimiter=',')
    for det in dets:
        frame_id, _, left, top, width, height = int(det[0]), det[1], det[2], det[3], det[4], det[5]
        box = convert(imgWidth, imgHeight, left, top, width, height)
        if '-' in ''.join(box) or float(box[0]) > 1.0 or float(box[1]) > 1.0 or float(box[2]) > 1.0 or float(box[3]) > 1.0:
            continue
        image_name = namedoc + '-' + '%06d'%frame_id + '.jpg' 
        label_name = namedoc + '-' + '%06d'%frame_id + '.txt' 
        oldimgpath = filepath + 'img1/'+ '%06d'%frame_id + '.jpg' 
        if frame_id <= seqLength // 2: 
            newimgpath = new_filepath + 'images/train/'+ image_name 
            labelpath = new_filepath + 'labels/train/' + label_name 
        else: 
            newimgpath = new_filepath + 'images/val/'+ image_name 
            labelpath = new_filepath + 'labels/val/'+ label_name 
        shutil.copyfile(str(oldimgpath), str(newimgpath)) 
        with open(labelpath, 'a') as f: 
            f.write(f'0 {box[0]} {box[1]} {box[2]} {box[3]}\n')
    print('Dataset has processed')
    return

In [6]:
get_image_and_label(root+'/MOT16/train/MOT16-09/',root+ '/VOCdevkit/','test_1',525,1920,1080)

Dataset has processed


In [7]:
get_image_and_label(root+'/MOT16/train/MOT16-02/',root+ '/VOCdevkit/','test_2',600,1920,1080)

Dataset has processed


# use anaconda prompt to run (python train.py --batch 8 --epoch 10) to get our weight file 'Best.pt'. You can directly use the weight file to detect.

<img src = './input_order.png'>

<img src = './train_yolo.png'>

# Use YOLO with Best.pt and person.yaml to detect pedestrains

In [1]:
import Ipynb_importer

In [2]:
import matplotlib.pyplot as plt
from skimage.measure import compare_ssim

In [3]:
import os
import sys
from pathlib import Path
import numpy as np
import cv2
import torch
import torch.backends.cudnn as cudnn

In [4]:
from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync
 
#导入letterbox
from utils.augmentations import Albumentations, augment_hsv, copy_paste, letterbox, mixup, random_perspective

In [5]:
root = os.getcwd()

In [6]:
root

'E:\\jupyterdoc\\yolov5-6.1'

In [7]:
# This part of the code is quoted from detect.py in YOLOV5, the link is https://github.com/ultralytics/yolov5/releases/tag/v6.1
# and I have rewritten this part of the code so that it can return the coordinates of the pedestrian border
# I rewrite the detect.py as a function to return (x,y,w,h) 
weights= root + '/runs/train/exp1/weights/best.pt'  
source=root + '/data/images'  
data= root + '/data/person.yaml'  
 
imgsz=(640, 640) 
conf_thres=0.25  
iou_thres=0.45  
max_det=1000  
device='0'  
classes=None  
agnostic_nms=False  
augment=False  
visualize=False  
half=False  
dnn=False  
 
 

device = select_device(device)
 

model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)
stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
imgsz = check_img_size(imgsz, s=stride)  
 
# Half

half &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16 supported on limited backends with CUDA
if pt or jit:
    model.model.half() if half else model.model.float()
 
 
def detect(img):
    # Dataloader
    
    dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
 
    # Run inference
    
    model.warmup(imgsz=(1, 3, *imgsz), half=half)  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0
 
    
    im0 = img
    # Padded resize
    im = letterbox(im0, imgsz, stride, auto=pt)[0]
    # Convert
    im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
    im = np.ascontiguousarray(im)
    t1 = time_sync()
    im = torch.from_numpy(im).to(device)
    im = im.half() if half else im.float()  # uint8 to fp16/32
    im /= 255  # 0 - 255 to 0.0 - 1.0
    if len(im.shape) == 3:
        im = im[None]  # expand for batch dim
    t2 = time_sync()
    dt[0] += t2 - t1
 
    # Inference
    
    pred = model(im, augment=augment, visualize=visualize)
    t3 = time_sync()
    dt[1] += t3 - t2
 
    
    pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
    dt[2] += time_sync() - t3
 
    
    detections=[]
    
    # Process predictions
    for i, det in enumerate(pred):  # per image 
        seen += 1
       
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()
            # Write results
            
            for *xyxy, conf, cls in reversed(det):
                xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4))).view(-1).tolist()
                xywh = [round(x) for x in xywh]
                xywh = [xywh[0] - xywh[2] // 2, xywh[1] - xywh[3] // 2, xywh[2],
                        xywh[3]]  # （left，top，w，h）
 
                cls = names[int(cls)]
                conf = float(conf)
                detections.append({'position': xywh})
    
    return detections
 

YOLOv5  2022-2-22 torch 1.12.0 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)

Fusing layers... 
Model Summary: 224 layers, 7053910 parameters, 0 gradients


In [8]:
path = root + '/test/1/000001.jpg'
img = cv2.imread(path)


detect(img)[0]['position']

[1621, 440, 50, 169]

In [9]:
save = detect(img)

In [10]:
len(save)

12

In [11]:
def get_data(filepath):
    file_dir = os.path.join(filepath)
    if os.path.exists(file_dir):
        files = os.listdir(file_dir)
    return files

# If you want to test your own images, you can load your images in test/1 folder.

In [12]:
test_1_path = root + '/test/1/'
test_2_path = root + '/test/2/'
test_1 = get_data(test_1_path)
test_2 = get_data(test_2_path)

In [13]:
# get all pedestrains position in i-th image 
def cal_img(train_path,train,i):
    vlist = []
    path = train_path + train[i]
    img = cv2.imread(path)
    save = detect(img)
    for j in range(len(save)):
        vlist.append(save[j]['position'])
    return vlist

In [14]:
def NMS(boxes, threshold):
    if len(boxes) == 0:
        return []
    
    boxes = np.array(boxes).astype("float")

    x1 = boxes[:,0]  
    y1 = boxes[:,1]
    w1 = boxes[:,2]  
    h1 = boxes[:,3]  
    x2 = x1 + w1  
    y2 = y1 + h1  
    
    area = (w1 + 1) * (h1 + 1)  
    temp = []
    
    idxs = np.argsort(h1)
    
    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        temp.append(i)   
        
        x1_m = np.maximum(x1[i], x1[idxs[:last]])
        y1_m = np.maximum(y1[i], y1[idxs[:last]])  
        
        x2_m = np.minimum(x2[i], x2[idxs[:last]])
        y2_m = np.minimum(y2[i], y2[idxs[:last]])
       

        w = np.maximum(0, x2_m - x1_m + 1)
        h = np.maximum(0, y2_m - y1_m + 1)
       
        over = (w * h) / area[idxs[:last]]
        
        idxs = np.delete(idxs, np.concatenate(([last],  
            np.where(over > threshold)[0])))  

    return boxes[temp].astype("int")

## find min distance 

In [15]:
import math

In [16]:
def find_distance(test_path,test_):
    save_list = []
    for i in range(len(test_)):
        
        
        if i == 0:
            get_poi = cal_img(test_path,test_,0)
            save = NMS(get_poi, threshold=0.3)
            person = save
           
        else:
            get_poi = cal_img(test_path,test_,i)
            save = NMS(get_poi, threshold=0.3)
            
            
            for j in range(len(save)):
                
                center_now = [(save[j][0] + save[j][0] + save[j][2])/2,(save[j][1] + save[j][1] + save[j][3])/2]
                
                min = 999999
                
                for k in range(len(person)):
                    
                    center_bf = [(person[k][0] + person[k][0] + person[k][2])/2,(person[k][1] + person[k][1] + person[k][3])/2]
                    
                    absdis = math.sqrt((center_now[0] - center_bf[0])**2 + (center_now[1] - center_bf[1])**2)
                    if absdis < min:
                        
                        min = absdis
                
                save_list.append(int(min))
            person = save
                     
            
    return save_list

In [17]:
num_li = find_distance(test_1_path,test_1)

In [18]:
max(num_li)

821

In [19]:
list_len = [0] * (max(num_li)+1)
x_l = []

In [20]:
for i in range(len(num_li)):
    indexadd = num_li[i]
    list_len[indexadd] += 1
for i in range(len(list_len)):
    x_l.append(i)

In [21]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('Qt5Agg')

In [22]:
# coding=utf-8

 
plt.figure(figsize=(20, 10), dpi=100)

plt.plot(x_l, list_len)
plt.xlabel("distance")
plt.ylabel("number in this distance")
plt.show(block=True)

In [23]:
# find whether is a group
def find_group(now_list):
    group_p = []
    
    for i in range(0,len(now_list)-1):
        center_i = [(now_list[i][0]+now_list[i][0]+now_list[i][2])/2,(now_list[i][1]+now_list[i][1]+now_list[i][3])/2]
                
        for j in range(i+1,len(now_list)):
            center_j = [(now_list[j][0]+now_list[j][0]+now_list[j][2])/2,(now_list[j][1]+now_list[j][1]+now_list[j][3])/2]
            dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
            if dis < 150:
                group_p.append([now_list[i],now_list[j]])
    
    final_group = []
    for i in range(len(group_p)):
        if group_p[i][0][0] <= group_p[i][1][0]:
            x = group_p[i][0][0]
        if group_p[i][0][0] > group_p[i][1][0]:
            x = group_p[i][1][0]
        if group_p[i][0][1] <= group_p[i][1][1]:
            y = group_p[i][1][1]
        if group_p[i][0][1] > group_p[i][1][1]:
            y = group_p[i][0][1]
        
        w = group_p[i][0][2] + group_p[i][1][2]
        
        h = group_p[i][0][3] + group_p[i][1][3]
        
        final_group.append([x,y,w,h])
    
    
    return final_group

# For task 1 ,2 and 3. When you use this function, you need to select the area you like on the first image, so that the statistics of pedestrians in the area will be realized in the subsequent video processing. I used waitkey() to help you see each image. If you want to achieve the effect of continuous playback, you can press and hold any key to achieve it.

In [24]:
def showtask_1and2and3(test_path,test_):
    def each_pop(a,b):
        if len(a)!=0:
            now = a.pop(0)
            now_c = b.pop(0)
            label.append(now)
            color.append(now_c)
            return each_pop(a,b)
        else:
            return
    
    point_size = 1
    thickness = 2
    label = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
    color = [(130,220,255), (250,135,115), (0,255,255), (102,205,170), (138,43,226), 
         (205,51,51), (102,205,0), (238,118,33), (193,255,193), (238,18,137), 
         (3,3,3), (247,247,247), (238,162,173), (255,153,18), (61,89,171), (128,138,135), (220,20,60),
        (0,100,0), (202,255,112), (154,50,205), (155,205,155), (148,0,211), (0,104,139),(0,100,50), (100,255,112), (154,150,205), (15,205,155), (48,0,211), (0,104,13),(0,14,139)]
    now_person = []
    now_number = []
    now_color = []
    save_two_point = []
    save_one_point = []
    save_two_color = []
    save_one_color = []
    
   
    for_mp4 = []
    
    total_person_number = 0
    
    select_region = cv2.imread(test_path+test_[0])
    r = cv2.selectROI('roi',select_region , False, False )
    
    for i in range(len(test_)):
        
        region_number = 0
        img = cv2.imread(test_path+test_[i])
        get_poi = cal_img(test_path,test_,i)
        save = NMS(get_poi, threshold=0.3)
        now_person_number = len(save)
        if len(now_person) == 0:
            
            now_im = cv2.rectangle(img, (r[0], r[1]), (r[0] + r[2], r[1] + r[3]), (173,255,47), 2)
            now_im = cv2.putText(now_im,'{}'.format('region'),(r[0], r[1]),cv2.FONT_HERSHEY_SIMPLEX,0.75,(255,255,0), 2)
            
            for k in range(len(save)):
                now_pont_center = [(save[k][0]+save[k][0]+save[k][2])/2,(save[k][1]+save[k][1]+save[k][3])/2]
                
                now_person.append(save[k])
                now_number.append(label.pop(0))
                now_color.append(color.pop(0))
                save_one_point.append(now_pont_center)
                save_one_color.append(now_color[-1])
            
            
            for j in range(len(now_person)):
                [x,y,w,h] = now_person[j]
                now_im = cv2.rectangle(img, (x, y), (x + w, y + h), now_color[j], 2)
            

                if x+w/2 > r[0] and y + h/2 > r[1] and x+w/2 < r[0]+ r[2] and y + h/2 < r[1]+r[3]:
                    region_number += 1
            
            for w in range(len(save_one_point)):
                now_im = cv2.circle(now_im, (int(save_one_point[w][0]),int(save_one_point[w][1])), point_size, save_one_color[w], thickness)
                
            
            toge = find_group(now_person)
            toge = NMS(toge, threshold=0.3)
            for p in range(len(toge)):
                [x,y,w,h] = toge[p]
                now_im = cv2.rectangle(now_im, (x, y), (x + w, y + h),(247,186,11), 2)
            
                
                
            
            total_person_number += len(now_person) 
            cv2.putText(img, "Count of Unique pedestrain: {}".format(total_person_number), (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 255), 2)
            cv2.putText(img, "Total pedestrain: {}".format(now_person_number), (20, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
            cv2.putText(img, "pedestrain in region: {}".format(region_number), (20, 140), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 255), 2)
            
            cv2.putText(img, "group number: {}".format(len(toge)), (20, 190), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (100, 0, 255), 2)
            cv2.putText(img, "alone person: {}".format(now_person_number-len(toge)), (20, 240), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 100, 100), 2)
            
            for_mp4.append(now_im)
            
            cv2.imshow('test',now_im)
            cv2.waitKey()
        else:
            update_person = []
            update_number = []
            update_color = []
            return_number = []
            return_color = []
            
            # Set four rectangles on the four sides of the picture
            highlight = []
            right_box = [1823, 1, 97, 1079]
            left_box = [3, 1, 123, 1079]
            top_box = [3, 2, 1917, 70]
            bot_box = [1, 984, 1919, 96]
            
            
            # Determine whether the center point of the pedestrian border is within these four rectangular boxes
            for k in range(len(save)):
                now_pont_center = [(save[k][0]+save[k][0]+save[k][2])/2,(save[k][1]+save[k][1]+save[k][3])/2]
                if now_pont_center[0] >= left_box[0] and now_pont_center[0] <= left_box[0] + left_box[2] and now_pont_center[1] >= left_box[1] and now_pont_center[1] <= left_box[1] + left_box[3]:
                    highlight.append(save[k])
                if now_pont_center[0] >= right_box[0] and now_pont_center[0] <= right_box[0] + right_box[2] and now_pont_center[1] >= right_box[1] and now_pont_center[1] <= right_box[1] + right_box[3]:
                    highlight.append(save[k])
                if now_pont_center[0] >= top_box[0] and now_pont_center[0] <= top_box[0] + top_box[2] and now_pont_center[1] >= top_box[1] and now_pont_center[1] <= top_box[1] + top_box[3]:
                    highlight.append(save[k])
                if now_pont_center[0] >= bot_box[0] and now_pont_center[0] <= bot_box[0] + bot_box[2] and now_pont_center[1] >= bot_box[1] and now_pont_center[1] <= bot_box[1] + bot_box[3]:
                    highlight.append(save[k])
             
                
                
                
                now_mid_r = save[k][0] + save[k][2]/2
                now_mid_c = save[k][1] + save[k][3]/2
                min = 999999999
                for j in range(len(now_person)):
                    bf_r = now_person[j][0] + now_person[j][2]/2
                    bf_c = now_person[j][1] + now_person[j][3]/2
                    if (now_mid_r-bf_r)**2+(now_mid_c-bf_c)**2 < min:
                        min = (now_mid_r-bf_r)**2+(now_mid_c-bf_c)**2
                        index = j
                if min**(1/2) < 150:
                    update_person.append(save[k])
                    update_number.append(now_number[index])
                    update_color.append(now_color[index])
                    bf_point = [(now_person[index][0] + now_person[index][0] + now_person[index][2])/2,(now_person[index][1] + now_person[index][1] + now_person[index][3])/2]
                    save_two_point.append([now_pont_center,bf_point])
                    save_two_color.append(update_color[-1])
                else:
                    update_person.append(save[k])
                    update_number.append(label.pop(0))
                    update_color.append(color.pop(0))
                    save_one_point.append(save[k])
                    save_one_color.append(update_color[-1])
                    #highlight.append(save[k])
                    
                    
                    
    
                    total_person_number += 1
            for q in range(len(now_color)):
                if now_color[q] not in update_color:
                    return_number.append(now_number[q])
                    return_color.append(now_color[q])
                    #highlight.append(now_person[q])
            each_pop(return_number,return_color)
            now_person = update_person
            now_number = update_number
            now_color = update_color
            
            now_im = cv2.rectangle(img, (r[0], r[1]), (r[0] + r[2], r[1] + r[3]), (173,255,47), 2)
            now_im = cv2.putText(now_im,'{}'.format('region'),(r[0], r[1]),cv2.FONT_HERSHEY_SIMPLEX,0.75,(255,255,0), 2)
            
            
            for j in range(len(now_person)):
                [x,y,w,h] = now_person[j]
                now_im = cv2.rectangle(img, (x, y), (x + w, y + h), now_color[j], 2)
            
                if x+w/2 > r[0] and y + h/2 > r[1] and x+w/2 < r[0]+ r[2] and y + h/2 < r[1]+r[3]:
                    region_number += 1
            
            for w in range(len(save_two_point)):
                
                now_im = cv2.line(now_im,(int(save_two_point[w][0][0]),int(save_two_point[w][0][1])),(int(save_two_point[w][1][0]),int(save_two_point[w][1][1])),save_two_color[w],thickness)
                
            for w in range(len(save_one_point)):
                now_im = cv2.circle(now_im, (int(save_one_point[w][0]),int(save_one_point[w][1])), point_size, save_one_color[w], thickness)

            
            
            toge = find_group(now_person)
            toge = NMS(toge, threshold=0.3)
            for p in range(len(toge)):
                [x,y,w,h] = toge[p]
                now_im = cv2.rectangle(now_im, (x, y), (x + w, y + h),(247,186,11), 2)
                now_im = cv2.putText(now_im,'{}'.format('Group'),(int(x),int(y)),cv2.FONT_HERSHEY_SIMPLEX,0.5,(247,186,11), 2)
            
            for p in range(len(highlight)):
                [x,y,w,h] = highlight[p]
                now_im = cv2.rectangle(now_im, (x, y), (x + w, y + h),(255,255,0), 2)
                now_im = cv2.putText(now_im,'{}'.format('HIGHLIGHT'),(int(x),int(y)),cv2.FONT_HERSHEY_SIMPLEX,0.75,(255,255,0), 2)
            
            
                
                
            

            cv2.putText(img, "Count of Unique pedestrain: {}".format(total_person_number), (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 255), 2)
            cv2.putText(img, "Total pedestrain: {}".format(now_person_number), (20, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
            cv2.putText(img, "pedestrain in region: {}".format(region_number), (20, 140), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 255), 2)
            
            cv2.putText(img, "group number: {}".format(len(toge)), (20, 190), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (100, 0, 255), 2)
            cv2.putText(img, "alone person: {}".format(now_person_number-len(toge)), (20, 240), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 100, 100), 2)
            
            for_mp4.append(now_im)
            
            cv2.imshow('test',now_im)
            cv2.waitKey()   
    cv2.destroyAllWindows()
    return for_mp4










In [25]:
for_mp4 = showtask_1and2and3(test_1_path,test_1)

  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)
  dis = ((center_i[0]-center_j[0])**2 - (center_i[1]-center_j[1])**2)**(1/2)


# Convert image to video in output folder.

In [28]:
fps = 30
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(filename=root+ '/output/test.mp4', fourcc=fourcc, fps=fps, frameSize=(1920,1080))  # 图片实际尺寸，不然生成的视频会打不开
for i in range(len(for_mp4)):
  
    video_writer.write(for_mp4[i])
video_writer.release()