# 1. Necessary Libraries

In [1]:
# copy the files for 360 video analysis
!git clone https://github.com/cuppp1998/360_object_tracking.git

Cloning into '360_object_tracking'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 92 (delta 0), reused 2 (delta 0), pack-reused 89 (from 1)[K
Receiving objects: 100% (92/92), 8.25 MiB | 15.56 MiB/s, done.
Resolving deltas: 100% (29/29), done.


In [1]:
cd /content/360_object_tracking/

/content/360_object_tracking


In [3]:
# download and update some files
!gdown 'https://drive.google.com/uc?export=download&id=1CYatcNRqc3ec2FakItcGU8EWSYhDNh42' -O ./deep_sort/sort/detection.py
!gdown 'https://drive.google.com/uc?export=download&id=1sMsAbca5l8qtysV-czmMkmik4PwdbOD4' -O ./deep_sort/deep/checkpoint/

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1CYatcNRqc3ec2FakItcGU8EWSYhDNh42
From (redirected): https://drive.google.com/uc?export=download&id=1CYatcNRqc3ec2FakItcGU8EWSYhDNh42&confirm=t&uuid=860ba8dd-762c-4e78-a491-27eebbc879fc
To: /content/360_object_tracking/deep_sort/sort/detection.py
100% 1.47k/1.47k [00:00<00:00, 7.85MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1sMsAbca5l8qtysV-czmMkmik4PwdbOD4
To: /content/360_object_tracking/deep_sort/deep/checkpoint/ckpt.t7
100% 46.0M/46.0M [00:00<00:00, 62.2MB/s]


In [2]:
# install some libraries
!pip install ultralytics torch torchvision ipdb matplotlib numpy scipy pyyaml pandas seaborn pillow deepface opencv-python



In [3]:
# install detectron2
!pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-mwxfo9zb
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-mwxfo9zb
  Resolved https://github.com/facebookresearch/detectron2.git to commit 3a161f234e12a56eb9f31fbdbd1972cc2280bed7
  Preparing metadata (setup.py) ... [?25l[?25hdone


# 2. 360 Video Object Tracking Framework

## Define Functions for Improving Object Detections

### 1. Projection Transformation from Equirectangular to Perspective View

* input_img: The input image which to be represented by the multidimensional matrix.

* FOV: The field of view of the sub-images.

* THETAs: A list that contains the theta of each sub-image (its length should be equal to the number of sub-images).

* PHIs: A list that contains the phi of each sub-image (its length should be equal to the number of sub-images).

* output_height, output_width: Height and Width of the output images (both should be the same).

In [4]:
### import the Perspective and Equirectangular libraries ###
import lib.Equirec2Perspec as E2P
import lib.Perspec2Equirec as P2E
import lib.multi_Perspec2Equirec as m_P2E


def equir2pers(input_img, FOV, THETAs, PHIs, output_height, output_width) :
    ### load the equirectangular image ###
    equ = E2P.Equirectangular(input_img)

    ### outputs save directory ###
    output_dir = "./output_sub/"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    ### define maps that define the projection from equirectangular to perspective ###
    lon_maps = []
    lat_maps = []
    imgs = []  # output images



    for i in range(len(PHIs)): # for each sub-image
        img1, lon_map1, lat_map1 = equ.GetPerspective(FOV, THETAs[i], PHIs[i], output_height, output_width)

        ### save the outputs ##
        output1 = output_dir + str(i) + ".png"
        cv2.imwrite(output1, img1)
        lon_maps.append(lon_map1)
        lat_maps.append(lat_map1)
        imgs.append(img1)

    return lon_maps, lat_maps, imgs

### 2. Project the Bounding-Boxes on the Sub-Images Back to the Original Image

Also return the bounding-boxes whose Left/Right borders are tangent to a border of the sub-image which are required to be merged


* bboxes: a list of bounding-boxes in [x_min y_min x_max y_max] format.

* lon_map_original, lat_map_original: Map matrix obtained through the projection transformation from equirectangular to perspective (i.e., lon_maps, lat_maps returned by equir2pers() ).

* classes, scores: list of classes and scores predicted by the object detection model.

* interval: A value that determines the pixel interval used for calculating the corresponding coordinate point of the bounding-box in the sub-images. Smaller interval means higher accuracy.

* num_of_subimage: Serial number of the current sub-image (i.e., 0, 1, 2, or 3, see paper).

* input_video_width, input_video_height: Width and Height of the input video.

* num_of_subimages: The total number of sub-images, should be 4 by default.

* threshold_of_boundary: Threshold to determine whether the Left/Right border of a bounding box is tangent to the border of the sub-image (i.e., distance > threshold_of_boundary).

* is_split_image2: Whether to split the bboxes into two across the centre line of the sub-image, default to True.

In [5]:
def reproject_bboxes(
    bboxes,
    lon_map_original,
    lat_map_original,
    classes,
    scores,
    interval,
    num_of_subimage,
    input_video_width,
    input_video_height,
    num_of_subimages,
    threshold_of_boundary,
    is_split_image2=True) :

    ### lists for storing the new bboxes, classes, and scores after reprojection ###
    new_bboxes = []
    new_classes = []
    new_scores = []

    ### variables that store the index of the bboxes from the new_bboxes list ###
    ### which coincide with the Left/Right boundaries of the sub-image ###
    left_boundary_box = None
    right_boundary_box = None

    ### calculate the degree of overlap between each pair of adjacent sub-images ###
    ### if the number of sub-images is 4, this will result to 30 ###
    overlaped_degree = (num_of_subimages * 120 - 360) / num_of_subimages

    ### calculate which subimage to be splited into two parts ###
    ### if the number of sub-images is 4, this will result to image 2 ###
    num_of_splited_subimage = num_of_subimages / 2

    index = 0

    ### number of pixels occupied by (overlaped_degree/2) degrees on the sub-image ###
    margin = int(lon_map_original.shape[0] / 120 * (overlaped_degree / 2))



    for bbox, class1, score in zip(bboxes, classes, scores) :
        ### get the coordinates of the top-left point and the right-bottom point ###
        left_top_x = int(bbox[0])
        left_top_y = int(bbox[1])
        right_bottom_x = int(bbox[2])
        right_bottom_y = int(bbox[3])

        ### only reproject bboxes that are not fully inside the overlapping area, and their y-values are less than 70 degress ###
        ### specific to the problem in which otherwise, the backpack of the cyclist will be incorrectly detected as a car ###
        if (margin <= ((left_top_x + right_bottom_x) / 2)
            <= (lon_map_original.shape[0] - margin)
            and left_top_y <= lon_map_original.shape[0] / 120 * 70 ) :

            ### for an a*b sub image, the size of lon_map and lat_map is (a-1)*(b-1), when the right_bottom_x or the right_bottom_y equals to a or b ###
            ### to get the corresponding value in lon_map and lat_map (which represent he corresponding position on the original image) we have to subtract them by 1 ###
            if right_bottom_x == lon_map_original.shape[0] :
                right_bottom_x -= 1
            if right_bottom_y == lon_map_original.shape[1] :
                right_bottom_y -= 1

            ### check if the bbox coincides with the Left/Right boundaries of the sub-image ###
            ### if yes, assign its index to left_boundary_box/right_boundary_box ###
            ### if the bbox is large (>subimage size/5), use the threshold to do the judgement ###
            if (right_bottom_x - left_top_x) * (right_bottom_y - left_top_y)  <  lon_map_original.shape[0] * lon_map_original.shape[0] / 5 :

                if left_top_x <= threshold_of_boundary :
                    left_boundary_box = index
                if right_bottom_x >= lon_map_original.shape[0] - threshold_of_boundary :
                    right_boundary_box = index

            ### if the bbox is small (<=subimage size/5), set the threshold a little bit larger ###
            ### this is arbitrarily determined through experiments ###
            else :
                if left_top_x <= (threshold_of_boundary + 15 * int(lon_map_original.shape[0] / 640) ) :
                    left_boundary_box = index
                if right_bottom_x >= lon_map_original.shape[0] - (threshold_of_boundary + 15 * int(lon_map_original.shape[0] / 640) ) :
                    right_boundary_box = index


            ### lists to store the corresponding x and y coordinates of each point on the bbox on the original image ###
            xs = []
            ys = []


            ### if the current sub-image is the one which crosses the boundary (e.g., image 2, when the number of sub-image is 4) ###
            ### and the current bbox is across the center line ###
            if (num_of_subimage == num_of_splited_subimage
                and left_top_x <= int(lon_map_original.shape[0] / 2) - 1
                and right_bottom_x >= int(lon_map_original.shape[0] / 2) ) :

                ### lists to store the x coordinates of each point on the Left/Right part of the bbox on the original image ###
                xs_left = []
                xs_right = []

                ### calculation for the left and right borders ###
                for i in range(left_top_y, right_bottom_y, interval) :
                    ### left border ###
                    x = int(round(lon_map_original[i, left_top_x]))
                    y = int(round(lat_map_original[i, left_top_x]))
                    xs.append(x)
                    ys.append(y)
                    xs_left.append(x)
                    ### right border ###
                    x = int(round(lon_map_original[i, right_bottom_x]))
                    y = int(round(lat_map_original[i, right_bottom_x]))
                    xs.append(x)
                    ys.append(y)
                    xs_right.append(x)

                ### calculation for the left part of the top and bottom borders ###
                for i in range(left_top_x, int(lon_map_original.shape[0] / 2) - 1, interval) :
                    x = int(round(lon_map_original[left_top_y, i]))
                    y = int(round(lat_map_original[left_top_y, i]))
                    xs.append(x)
                    ys.append(y)
                    xs_left.append(x)
                    x = int(round(lon_map_original[right_bottom_y, i]))
                    y = int(round(lat_map_original[right_bottom_y, i]))
                    xs.append(x)
                    ys.append(y)
                    xs_left.append(x)

                ### calculation for the right part of the top and bottom borders ###
                for i in range(int(lon_map_original.shape[0] / 2), right_bottom_x, interval) :
                    x = int(round(lon_map_original[left_top_y, i]))
                    y = int(round(lat_map_original[left_top_y, i]))
                    xs.append(x)
                    ys.append(y)
                    xs_right.append(x)
                    x = int(round(lon_map_original[right_bottom_y, i]))
                    y = int(round(lat_map_original[right_bottom_y, i]))
                    xs.append(x)
                    ys.append(y)
                    xs_right.append(x)



                ymax = max(ys)
                ymin = min(ys)
                xmin_left = min(xs_left)
                xmax_right = max(xs_right)

                ### if it is needed to split the bbox into two parts, create two bboxes with the MBRs of the left and right part seperately ###
                if is_split_image2 == True :
                    new_bboxes.append([xmin_left, ymin, input_video_width, ymax])
                    new_bboxes.append([0, ymin, xmax_right, ymax])
                    new_classes.append(int(class1))
                    new_classes.append(int(class1))
                    new_scores.append(score)
                    new_scores.append(score)
                    index += 2

                ### if not, create one bbox which extends outside the right boundary ###
                else :
                    new_bboxes.append([xmin_left, ymin, input_video_width + xmax_right, ymax])
                    new_classes.append(int(class1))
                    new_scores.append(score)
                    index += 1

            ### if the current sub-image is not the one which crosses the boundary ###
            else :
                ### in case the interval is set larger than the length of the border, if so, set it as the length of the short side of the bbox ###
                if (right_bottom_x - left_top_x < interval or right_bottom_y - left_top_y < interval) :
                    interval = min(right_bottom_x - left_top_x, right_bottom_y - left_top_y)

                ### get the corresponding coordinates of each point on the boundary on the original image ###
                for i in range(left_top_y, right_bottom_y, interval):
                    x = int(round(lon_map_original[i, left_top_x]))
                    y = int(round(lat_map_original[i, left_top_x]))
                    xs.append(x)
                    ys.append(y)
                    x = int(round(lon_map_original[i, right_bottom_x]))
                    y = int(round(lat_map_original[i, right_bottom_x]))
                    xs.append(x)
                    ys.append(y)
                for i in range(left_top_x, right_bottom_x, interval):
                    x = int(round(lon_map_original[left_top_y, i]))
                    y = int(round(lat_map_original[left_top_y, i]))
                    xs.append(x)
                    ys.append(y)
                    x = int(round(lon_map_original[right_bottom_y, i]))
                    y = int(round(lat_map_original[right_bottom_y, i]))
                    xs.append(x)
                    ys.append(y)

                ### create one bbox with the MBR ###
                xmax = max(xs)
                xmin = min(xs)
                ymax = max(ys)
                ymin = min(ys)
                new_bboxes.append([xmin, ymin, xmax, ymax])
                new_classes.append(int(class1))
                new_scores.append(score)
                index += 1

    return new_bboxes, new_classes, new_scores, left_boundary_box, right_boundary_box

### 3. Match the Serial Number of the Sub-Images with the Serial Number of the Boundaries

* number_of_subimage: The serial number (0,1,2,3) of the sub-image.

In [6]:
def number_of_left_and_right_boundary(number_of_subimage) :
    if number_of_subimage == 0 :
        return [2, 5]
    elif number_of_subimage == 1 :
        return [4, 7]
    elif number_of_subimage == 2 :
        return [6, 1]
    else :
        return [0, 3]

### 4. Merge the Bounding-Boxes of the Objects Which are Shown in Several Sub-Images

Bounding-boxes that are needed to be merged are categorised into:

1. Objects crossing two sub-images.

2. Objects crossing at least three sub-images.


* bboxes_all: List of bounding-boxes after projection to the original image.

* classes_all, scores_all: Lists of categories and scores of the bounding-boxes.

* width, height: Width and Height of the original images

* bboxes_boundary: A list whose length is 8, and the Nth value represents the index of the bounding-box that is tangent to the Nth boundary.

The following functions are also defined.

* weighted_average_score(): Used to calculate the weighted average score of several bounding boxes.

* class_with_largest_score(): When the bboxes to merge are of different categories, this function is used to choose the class with the largest weighted score as the class of the new bbox.

* MBR_bboxes(): Calculate the MBR of several bboxes.

In [7]:
def merge_bbox_across_boundary(bboxes_all,classes_all,scores_all,width,height,bboxes_boundary):

    ### a list to store the indeces of the bboxes that are to be deleted after we merge them ###
    bboxes_to_delete=[]

    ### first delete the bboxes which are on the boundary and are totally in the overlapped areas ###
    names = locals()
    for i in range(0,8,1):
        if bboxes_boundary[i] !=None:
            ### although the overlapped area is 30 degree in width, set the threshold as 40, since it produces better performances ###
            if (bboxes_all[bboxes_boundary[i]][2]-bboxes_all[bboxes_boundary[i]][0]) <= int(width/360*40):
                bboxes_to_delete.append(bboxes_boundary[i])
                bboxes_boundary[i] = None

    ### Assign each value in the array to 8 variables ###
    bboxes_boundary1=bboxes_boundary[0]
    bboxes_boundary2=bboxes_boundary[1]
    bboxes_boundary3=bboxes_boundary[2]
    bboxes_boundary4=bboxes_boundary[3]
    bboxes_boundary5=bboxes_boundary[4]
    bboxes_boundary6=bboxes_boundary[5]
    bboxes_boundary7=bboxes_boundary[6]
    bboxes_boundary8=bboxes_boundary[7]

    ### if the object crosses all 4 overlapped areas (12 34 56 78) ###
    if bboxes_boundary1!=None and bboxes_boundary2!=None and bboxes_boundary3!=None and bboxes_boundary4!=None and bboxes_boundary5!=None and bboxes_boundary6!=None and bboxes_boundary7!=None and bboxes_boundary8!=None and (bboxes_boundary1==bboxes_boundary4) and (bboxes_boundary3==bboxes_boundary6) and (bboxes_boundary5==bboxes_boundary8) :
        bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]]))
        classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1],classes_all[bboxes_boundary3],classes_all[bboxes_boundary5],classes_all[bboxes_boundary7]]))
        scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]])])
        bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2,bboxes_boundary3,bboxes_boundary4,bboxes_boundary5,bboxes_boundary6,bboxes_boundary7,bboxes_boundary8])
    else :
        ### if the object crosses 3 overlapped areas (12 34 56) ###
        if bboxes_boundary1!=None and bboxes_boundary2!=None and bboxes_boundary3!=None and bboxes_boundary4!=None and bboxes_boundary5!=None and bboxes_boundary6!=None and (bboxes_boundary1==bboxes_boundary4) and (bboxes_boundary3==bboxes_boundary6) :
            bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]]))
            classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1],classes_all[bboxes_boundary3],classes_all[bboxes_boundary5]]))
            scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5]])])
            bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2,bboxes_boundary3,bboxes_boundary4,bboxes_boundary5,bboxes_boundary6])

            ### if another object crosses the remaining overlapped area (78) ###
            if bboxes_boundary7!=None and bboxes_boundary8!=None :
                bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary7],bboxes_all[bboxes_boundary8]]))
                classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary8],classes_all[bboxes_boundary7]]))
                scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]])])
                bboxes_to_delete.extend([bboxes_boundary7,bboxes_boundary8])


        ### if the object crosses 3 overlapped areas (34 56 78) ###
        if bboxes_boundary3!=None and bboxes_boundary4!=None and bboxes_boundary5!=None and bboxes_boundary6!=None and bboxes_boundary7!=None and bboxes_boundary8!=None and (bboxes_boundary3==bboxes_boundary6) and (bboxes_boundary5==bboxes_boundary8) :
            bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]]))
            classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary4],classes_all[bboxes_boundary3],classes_all[bboxes_boundary5],classes_all[bboxes_boundary7]]))
            scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]])])
            bboxes_to_delete.extend([bboxes_boundary3,bboxes_boundary4,bboxes_boundary5,bboxes_boundary6,bboxes_boundary7,bboxes_boundary8])

            ### if another object crosses the remaining overlapped area (12) ###
            if bboxes_boundary1!=None and bboxes_boundary2!=None :
                bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]]))
                classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1]]))
                scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]])])
                bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2])

        else :
            ### if the object crosses 2 overlapped areas (12 34) ###
            if bboxes_boundary1!=None and bboxes_boundary2!=None and bboxes_boundary3!=None and bboxes_boundary4!=None and (bboxes_boundary1==bboxes_boundary4) :
                bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3]]))
                classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1],classes_all[bboxes_boundary3]]))
                scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1],scores_all[bboxes_boundary3]])])
                bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2,bboxes_boundary3,bboxes_boundary4])

                ### if another object crosses the remaining overlapped area (56) ###
                if bboxes_boundary5!=None and bboxes_boundary6!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary6]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5]],[classes_all[bboxes_boundary6],classes_all[bboxes_boundary5]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5]])])
                    bboxes_to_delete.extend([bboxes_boundary5,bboxes_boundary6])

                ### if another object crosses the remaining overlapped area (78) ###
                if bboxes_boundary7!=None and bboxes_boundary8!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary7],bboxes_all[bboxes_boundary8]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary8],classes_all[bboxes_boundary7]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]])])
                    bboxes_to_delete.extend([bboxes_boundary7,bboxes_boundary8])

            ### if the object crosses 2 overlapped areas (34 56) ###
            if bboxes_boundary3!=None and bboxes_boundary4!=None and bboxes_boundary5!=None and bboxes_boundary6!=None and (bboxes_boundary3==bboxes_boundary6) :
                bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]]))
                classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5]],[classes_all[bboxes_boundary4],classes_all[bboxes_boundary3],classes_all[bboxes_boundary5]]))
                scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3],scores_all[bboxes_boundary5]])])
                bboxes_to_delete.extend([bboxes_boundary3,bboxes_boundary4,bboxes_boundary5,bboxes_boundary6])

                ### if another object crosses the remaining overlapped area (12) ###
                if bboxes_boundary1!=None and bboxes_boundary2!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]])])
                    bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2])

                ### if another object crosses the remaining overlapped area (78) ###
                if bboxes_boundary7!=None and bboxes_boundary8!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary7],bboxes_all[bboxes_boundary8]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary8],classes_all[bboxes_boundary7]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]])])
                    bboxes_to_delete.extend([bboxes_boundary7,bboxes_boundary8])


            ### if the object crosses 2 overlapped areas (56 78) ###
            if bboxes_boundary5!=None and bboxes_boundary6!=None and bboxes_boundary7!=None and bboxes_boundary8!=None and (bboxes_boundary5==bboxes_boundary8) :
                bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]]))
                classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary6],classes_all[bboxes_boundary5],classes_all[bboxes_boundary7]]))
                scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5],scores_all[bboxes_boundary7]])])
                bboxes_to_delete.extend([bboxes_boundary5,bboxes_boundary6,bboxes_boundary7,bboxes_boundary8])

                ### if another object crosses the remaining overlapped area (12) ###
                if bboxes_boundary1!=None and bboxes_boundary2!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]])])
                    bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2])

                ### if another object crosses the remaining overlapped area (34) ###
                if bboxes_boundary3!=None and bboxes_boundary4!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary4]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3]],[classes_all[bboxes_boundary4],classes_all[bboxes_boundary3]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3]])])
                    bboxes_to_delete.extend([bboxes_boundary3,bboxes_boundary4])

            else :
                ### if the object crosses 1 overlapped area (12) ###
                if bboxes_boundary1!=None and bboxes_boundary2!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]],[classes_all[bboxes_boundary2],classes_all[bboxes_boundary1]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary2],bboxes_all[bboxes_boundary1]],[scores_all[bboxes_boundary2],scores_all[bboxes_boundary1]])])
                    bboxes_to_delete.extend([bboxes_boundary1,bboxes_boundary2])

                ### if the object crosses 1 overlapped area (34) ###
                if bboxes_boundary3!=None and bboxes_boundary4!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary3],bboxes_all[bboxes_boundary4]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3]],[classes_all[bboxes_boundary4],classes_all[bboxes_boundary3]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary4],bboxes_all[bboxes_boundary3]],[scores_all[bboxes_boundary4],scores_all[bboxes_boundary3]])])
                    bboxes_to_delete.extend([bboxes_boundary3,bboxes_boundary4])

                ### if the object crosses 1 overlapped area (56) ###
                if bboxes_boundary5!=None and bboxes_boundary6!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary5],bboxes_all[bboxes_boundary6]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5]],[classes_all[bboxes_boundary6],classes_all[bboxes_boundary5]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary6],bboxes_all[bboxes_boundary5]],[scores_all[bboxes_boundary6],scores_all[bboxes_boundary5]])])
                    bboxes_to_delete.extend([bboxes_boundary5,bboxes_boundary6])

                ### if the object crosses 1 overlapped area (78) ###
                if bboxes_boundary7!=None and bboxes_boundary8!=None :
                    bboxes_all.extend(MBR_bboxes([bboxes_all[bboxes_boundary7],bboxes_all[bboxes_boundary8]]))
                    classes_all.append(class_with_largest_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]],[classes_all[bboxes_boundary8],classes_all[bboxes_boundary7]]))
                    scores_all.extend([weighted_average_score([bboxes_all[bboxes_boundary8],bboxes_all[bboxes_boundary7]],[scores_all[bboxes_boundary8],scores_all[bboxes_boundary7]])])
                    bboxes_to_delete.extend([bboxes_boundary7,bboxes_boundary8])

    ### delete the boxes that have been merged from the lists ###
    bboxes_to_delete=list(set(bboxes_to_delete))
    bboxes_to_delete.sort(reverse=True)
    for i in bboxes_to_delete :
        bboxes_all.pop(i)
        classes_all.pop(i)
        scores_all.pop(i)

    return bboxes_all, classes_all, scores_all

In [8]:
def weighted_average_score(bboxes,scores) :
    sum=0
    sum_area=0
    for bbox,score in zip(bboxes,scores) :
        area=(bbox[3]-bbox[1])*(bbox[2]-bbox[0])
        sum+=score*area
        sum_area+=area
    return float(sum/sum_area)

In [9]:
def class_with_largest_score(bboxes,scores,classes) :
    sum_area=0
    score_multi_area=[]
    for bbox,score in zip(bboxes,scores) :
        area=(bbox[3]-bbox[1])*(bbox[2]-bbox[0])
        score_multi_area.append(area*score)
        sum_area+=area
    weighted_score = [i / sum_area for i in score_multi_area]
    return classes[weighted_score.index(max(weighted_score))]

In [10]:
def MBR_bboxes(bboxes) :
    xs=[]
    ys=[]
    for bbox in bboxes :
        xs.append(bbox[0])
        xs.append(bbox[2])
        ys.append(bbox[1])
        ys.append(bbox[3])
    return [[min(xs),min(ys),max(xs),max(ys)]]

### 5. Additional Functions for Improving Object Detection

* filter_classes(): Pre-trained YOLO and Faster RCNN output detection results according to categories in COCO. Here, only parts of them are used.

* xyxy2xcycwh(): A function used to transform the output from [x1,y1,x2,y2] format to [x_centre, y_centre, width, height].

* convert_y_to_D(): A function to calculate the real-world distance to the camera D using the y bottom value of the bounding box.

* draw_bboxes(): A function to annotate the image frame with the results.

In [11]:
def filter_classes(bboxes_all, classes_all, scores_all, class_needed):
    bboxes_all = bboxes_all.tolist()
    classes_all = classes_all.tolist()
    scores_all = scores_all.tolist()
    # remove the bboxes which are not belong to the needed classes from the lists
    for i in range(len(classes_all), 0, -1):
        if classes_all[i - 1] not in class_needed:
            bboxes_all.pop(i - 1)
            classes_all.pop(i - 1)
            scores_all.pop(i - 1)
    return bboxes_all, classes_all, scores_all

In [12]:
def xyxy2xcycwh(bboxes) :
    bboxes_new = []
    for bbox in bboxes :
        bboxes_new.append( [(bbox[0]+bbox[2])/2,
                            (bbox[1]+bbox[3])/2,
                            (bbox[2] - bbox[0]) ,
                            (bbox[3] - bbox[1]) ] )
    return bboxes_new

In [13]:
def convert_y_to_D(y_bottom, image_height, camera_height) :
    beta = (y_bottom/image_height * np.pi) - (np.pi/2)
    D = camera_height/np.tan(beta)

    return D

In [14]:
from panoramic_detection.draw_output import classid2name

def draw_bboxes(img, bbox, track_classes, track_scores, video_width, identities, distances, dominant_emotions, offset=(0, 0)) :
    # for each object, draw the bbox and label
    for i, box in enumerate(bbox) :
        x1, y1, x2, y2 = [int(i) for i in box]
        x1 += offset[0]
        x2 += offset[0]
        y1 += offset[1]
        y2 += offset[1]
        bbox_size = None
        # box text and bar
        id = int(identities[i]) if identities is not None else 0
        color = (0,204,0)

        label_top = (str(id) + " " + classid2name(track_classes[i]) + " " +
                     str(round(track_scores[i] * 100, 1)) + "%" )
        label_bottom = ("D=" + str(np.round(distances[i], decimals=2)) + "m" + " " +
                        str(dominant_emotions[i]) )

        t_size = cv2.getTextSize(label_top, cv2.FONT_HERSHEY_PLAIN, 2, 2)[0]




        # if the bbox is totally in the image frame
        if x2 <= video_width:
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 3)
            cv2.rectangle(img, (x1, y1), (x1 + t_size[0] + 15, y1 + t_size[1] + 15), color, -1 )
            cv2.putText(img,label_top, (x1, y1 + t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, [255, 255, 255], 2, )
            cv2.putText(img,label_bottom, (x1, y2 +t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, color, 2, )

        # if the bbox crosses the boundary of the video
        else:
            cv2.rectangle(img, (x1, y1), (video_width, y2), color, 3)
            # plot the right part
            cv2.rectangle(img, (x1, y1), (x1 + t_size[0] + 15, y1 + t_size[1] + 15), color, -1 )
            cv2.putText(img,label_top, (x1, y1 + t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, [255, 255, 255], 2, )
            cv2.putText(img,label_bottom, (x1, y2 + t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, color, 2, )
            # plot the left part
            cv2.rectangle(img, (0, y1), (x2 - video_width, y2), color, 3)
            cv2.rectangle(img, (0, y1), (0 + t_size[0] + 15, y1 + t_size[1] + 15), color, -1 )
            cv2.putText(img,label_top, (0, y1 + t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, [255, 255, 255], 2, )
            cv2.putText(img,label_bottom, (0, y2 + t_size[1] + 4), cv2.FONT_HERSHEY_SIMPLEX,
                        1, color, 2, )




    return img

## Define the Processes for the Improved Object Detection on One Frame

Using the functions defined previously, the process of the improved object detection on one frame is defined by the function predict_one_frame().

* FOV: The field of view of the sub-images.

* THETAs: A list that contains the theta of each sub-image whose length should be the same as the number of sub-images.

* PHIs: A list that contains the Phi of each sub-image whose length should be the same as the number of sub-images.

* im: The image on which to do the object detection.

* predictor: YOLOv5 or Faster RCNN object detection model.

* video_width, video_height: Width and Height of the input image frame.

* sub_image_width: Width (or height) of the sub-images.

* classes_to_detect: Index numbers of the classes to detect according to COCO, default to [0, 1, 2, 3, 5, 7, 9].

* split_image2: Whether to split the bboxes across the centre line of sub-image 2 into two when reprojecting the bboxes back into the original image using reproject_bboxes().

In [15]:
import time

from detectron2.layers import batched_nms
import torch
import torchvision




def predict_one_frame(FOV, THETAs, PHIs, im, predictor, video_width, video_height,
                      sub_image_width, classes_to_detect=[0, 1, 2, 3, 5, 7, 9],
                      split_image2=True) :

    ### check processing speed, record the current time first ###
    time1 = time.time()













    # =============================================================================
    # =======================IMPROVED-OBJECT-DETECTOR==============================
    # =============================================================================

    ### split the frame into 4 sub images (of perspective projection), get the maps and the output images ###
    lon_maps, lat_maps, subimgs = equir2pers( im, FOV, THETAs, PHIs, sub_image_width, sub_image_width)

    ### lists for storing the detection results from all sub-images ###
    bboxes_all = []
    classes_all = []
    scores_all = []

    ### list for storing the index of the bounding boxes that intersect with the boundaries of the sub-images ###
    bboxes_boundary = [None] * 8

























    #---------------------------------YOLO----------------------------------


    ### for each sub image, first change the color from BGR to RGB ###
    for i in range(len(subimgs)) :
        subimgs[i] = cv2.cvtColor(subimgs[i], cv2.COLOR_BGR2RGB)

    ### YOLO supports detecting several images at the same time, so input all the sub images at once to the predictor ###
    #results = predictor(subimgs, size=sub_image_width)  # includes NMS
    results = predictor(subimgs, imgsz=sub_image_width)  # includes NMS


    """
    # --------  if you want to save and check the detail of the results on each sub image, run the code below  ----------
    # results.save()
    # --------  end of this part  ----------
    """


    for i in range(len(subimgs)) : # for each sub-image

        bboxes  = results[i].boxes.xyxy.tolist()
        classes = list(map(int, results[i].boxes.cls.tolist()))
        scores  = results[i].boxes.conf.tolist()

        """
        ### Originally, YOLO outputs the positions using the relative coordinates [0-1], so transform the output format by multiplying by the width/height of the sub-image ###
        bboxes = (results.xyxyn[i].cpu().numpy()[:, 0:4]  *  [sub_image_width,sub_image_width,sub_image_width,sub_image_width] ).tolist()
        classes = list(map(int, results.xyxyn[i].cpu().numpy()[:, 5].tolist()))
        scores = results.xyxyn[i].cpu().numpy()[:, 4].tolist()
        """




        ### for each bbox in the current sub image, reproject it to the original image ###
        (reprojected_bboxes,classes,scores,left_boundary_box,right_boundary_box) = reproject_bboxes(bboxes, lon_maps[i], lat_maps[i], classes, scores,
                                                                                                            10, i, video_width, video_height, len(subimgs),
                                                                                                            sub_image_width / 640 * 20,
                                                                                                            split_image2)

        ### get the indeces of the bboxes that intersect the boundaries of the sub-images ###
        if left_boundary_box != None :
            bboxes_boundary[number_of_left_and_right_boundary(i)[0] ] = left_boundary_box + len(bboxes_all)
        if right_boundary_box != None :
            bboxes_boundary[number_of_left_and_right_boundary(i)[1] ] = right_boundary_box + len(bboxes_all)

        ### add the bboxes after reprojection to the previous lists ###
        bboxes_all = bboxes_all + reprojected_bboxes
        classes_all = classes_all + classes
        scores_all = scores_all + scores





















    ### merge the boxes that goes across the boundaries using merge_bbox_across_boundary() ###
    bboxes_all, classes_all, scores_all = merge_bbox_across_boundary(bboxes_all, classes_all, scores_all, video_width, video_height, bboxes_boundary)

    ### do NMS on the output bboxes again to get the indeces of the boxes that should be kept ###
    keep = batched_nms( torch.tensor(bboxes_all),
                            torch.tensor(scores_all),
                            torch.tensor(classes_all), 0.3)

    ### only keep the instances of the classes we need (person, bike, car, motorbike, bus, truck, traffic light by default) ###
    bboxes_all, classes_all, scores_all = filter_classes( torch.tensor(bboxes_all)[keep],
                                                              torch.tensor(classes_all)[keep],
                                                              torch.tensor(scores_all)[keep],
                                                              classes_to_detect)








































    ### record the current time again and calculate the running time ###
    time2 = time.time()
    # print(time2 - time1)

    return bboxes_all, classes_all, scores_all

# 3. The Main Body of the Code

In [16]:
cd /content/360_object_tracking/

/content/360_object_tracking


In [17]:
### get a sample 360 video ###
!gdown 'https://drive.google.com/uc?export=download&id=1BhtGx4YQG2Fx5SJRZqUFZ4qD6doDsPa1' -O /content/demo.mp4

Downloading...
From: https://drive.google.com/uc?export=download&id=1BhtGx4YQG2Fx5SJRZqUFZ4qD6doDsPa1
To: /content/demo.mp4
  0% 0.00/4.05M [00:00<?, ?B/s]100% 4.05M/4.05M [00:00<00:00, 163MB/s]


## Object Tracking on Panoramic Videos Using the Improved Object Detection Models

In [18]:
import os
import numpy as np
import cv2

from collections import defaultdict

from deep_sort.deep_sort import DeepSort

from ultralytics import YOLO

from deepface import DeepFace
from google.colab.patches import cv2_imshow
import tensorflow as tf # Import tensorflow




input_video_path = "/content/demo.mp4" # path of the input video
output_video_path = "/content/result_tracked.mp4" # path for the output video


prevent_different_classes_match = True # whether to use the support for multiple categories in DeepSORT
match_across_boundary = True # whether to use the support for boundary continuity in DeepSORT
classes_to_detect = [0] # index numbers of the categories to detect in the COCO dataset [0, 1, 2, 3, 5, 7, 9]

FOV = 120 # the field of view of the sub-images
THETAs = [0, 90, 180, 270] # contains the theta of each sub-image whose length should be the same as the number of sub-images
PHIs = [-10, -10, -10, -10] # contains the Phi of each sub-image whose length should be the same as the number of sub-images

sub_image_width = 640 # width (or height) of the sub-images
score_threshold = 0.6 # threshold for the confidence score
nms_threshold = 0.45 # threshold for the Non Maximum Supression


cH = 1.2 # camera height (assumed)


# force TensorFlow to use CPU for DeepFace analysis
tf.config.set_visible_devices([], 'GPU') # hide GPUs from TensorFlow


vid_log = defaultdict(list) # to log the input video information
db_log  = defaultdict(list) # to log bbox, distances, and dominant emotions









### load the pretrained detection model ###
model = YOLO('yolo11n.pt')


### read the input panoramic video in equirectangular projection ###
video_capture = cv2.VideoCapture(input_video_path)
### warn if the input path is not right ###
if (video_capture.isOpened()==False) :
    print('Cannot open the video file.')
### if path is right, get the info on video width, height, frame count and fps ###
else :
    video_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = int(round(video_capture.get(cv2.CAP_PROP_FPS)))
    # fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    outputfile = cv2.VideoWriter(output_video_path, fourcc, video_fps, (video_width, video_height))
### print the video info ###
print("The input video is "+str(video_width)+' in width and '+str(video_height)+" in height.")
### save input video information ###
vid_log["info"] = {"x_extent": video_width, "y_extent": video_height,
                   "frame_count": video_frame_count, "fps": video_fps}











### create a deepsort instance using the pre-trained feature extraction model ###
deepsort = DeepSort('./deep_sort/deep/checkpoint/ckpt.t7', use_cuda=torch.cuda.is_available())




### the number of the current frame ###
frame_num = 1









### for each image frame in the video ###
#while video_capture.grab() :
while video_capture.isOpened() :

    time1 = time.time()

    ### get the image frame ###
    #_,im = video_capture.retrieve()
    success, im = video_capture.read()
    if not success :
        break

    ### get the predictions on the current frame ###
    bboxes_all, classes_all, scores_all = predict_one_frame(FOV, THETAs, PHIs,
                                                                im, model, video_width, video_height,
                                                                sub_image_width, classes_to_detect,
                                                                not match_across_boundary)









    ### convert the bboxes from [x,y,x,y] to [xc,yc,w,h] ###
    bboxes_all_xcycwh = xyxy2xcycwh(bboxes_all)









    ### update deepsort and get the tracking results ###
    track_outputs = deepsort.update( np.array(bboxes_all_xcycwh), np.array(classes_all),
                                         np.array(scores_all),
                                         im, prevent_different_classes_match, match_across_boundary)









    ### plot the results on the video and save them ###
    if len(track_outputs) > 0 :
        bbox_xyxy = track_outputs[:, :4]
        track_classes = track_outputs[:, 4]
        track_scores = track_outputs[:, 5]
        identities = track_outputs[:, -1]

        # get distances and emotions
        distances_to_camera = []
        dominant_emotions = []
        for bbox in bbox_xyxy :
            x1,y1,x2,y2 = map(int, bbox)
            # calculate distance
            D = convert_y_to_D(y2, video_height, cH)
            distances_to_camera.append(D)
            # determine the dominant emotion
            yq = int(np.round(y1+(y2-y1)/4, decimals=0))
            face = im[y1:yq, x1:x2]
            # use tf.config.set_visible_devices to ensure subsequent TensorFlow ops (e.g., DeepFace.analyze) run on CPU.
            analysis = DeepFace.analyze(face, actions=['emotion'], enforce_detection=False)
            emotion = analysis[0]['dominant_emotion']
            dominant_emotions.append(emotion)

            #cv2_imshow(face)



        im = draw_bboxes(im, bbox_xyxy, track_classes, track_scores, video_width, identities, distances_to_camera, dominant_emotions)




        # for saving into .json
        db_log[frame_num] = {'ids': identities.tolist(),
                                 'x': [item[0] for item in bbox_xyxy], 'y': [item[1] for item in bbox_xyxy],
                                 'w': [item[2]-item[0] for item in bbox_xyxy],
                                 'h': [item[3]-item[1] for item in bbox_xyxy],
                                 'cls': track_classes.tolist(),
                                 'D': np.array(distances_to_camera).tolist(), 'emotion': dominant_emotions}



        #cv2_imshow(im)



    outputfile.write(im)














    ### show the current FPS ###
    time2 = time.time()
    if frame_num%5==0 :
        print(frame_num,'/',video_frame_count)
        print(str(1/(time2-time1))+' fps')

    frame_num += 1




### release the input and output videos ###
video_capture.release()
outputfile.release()
cv2.destroyAllWindows()

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
25-06-19 04:06:12 - Directory /root/.deepface has been created
25-06-19 04:06:12 - Directory /root/.deepface/weights has been created
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 375MB/s]


The input video is 1280 in width and 720 in height.

0: 640x640 1 person, 1 tie, 5 chairs, 2 dining tables, 1 tv, 14.0ms
1: 640x640 1 bench, 13 chairs, 14.0ms
2: 640x640 1 person, 1 train, 3 chairs, 1 dining table, 14.0ms
3: 640x640 1 person, 3 benchs, 1 surfboard, 4 chairs, 1 dining table, 14.0ms
Speed: 4.2ms preprocess, 14.0ms inference, 84.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 tie, 6 chairs, 2 dining tables, 1 tv, 5.8ms
1: 640x640 1 bench, 13 chairs, 5.8ms
2: 640x640 1 person, 4 chairs, 1 dining table, 5.8ms
3: 640x640 1 person, 2 benchs, 1 surfboard, 5 chairs, 5.8ms
Speed: 2.7ms preprocess, 5.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 tie, 7 chairs, 2 dining tables, 1 tv, 5.8ms
1: 640x640 1 bench, 14 chairs, 5.8ms
2: 640x640 1 person, 3 chairs, 1 dining table, 5.8ms
3: 640x640 1 person, 2 benchs, 1 surfboard, 5 chairs, 5.8ms
Speed: 2.9ms preprocess, 5.8ms inference, 1.3ms postprocess per image a

Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /root/.deepface/weights/facial_expression_model_weights.h5
100%|██████████| 5.98M/5.98M [00:00<00:00, 365MB/s]



0: 640x640 1 person, 1 tie, 5 chairs, 2 dining tables, 1 tv, 5.8ms
1: 640x640 1 bench, 16 chairs, 1 potted plant, 5.8ms
2: 640x640 1 person, 3 chairs, 1 dining table, 5.8ms
3: 640x640 1 person, 2 benchs, 1 surfboard, 3 chairs, 5.8ms
Speed: 3.1ms preprocess, 5.8ms inference, 2.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 ties, 5 chairs, 2 dining tables, 1 tv, 5.8ms
1: 640x640 1 bench, 14 chairs, 1 potted plant, 5.8ms
2: 640x640 1 person, 3 chairs, 1 dining table, 5.8ms
3: 640x640 1 person, 2 benchs, 1 surfboard, 7 chairs, 1 dining table, 5.8ms
Speed: 2.6ms preprocess, 5.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)
5 / 449
2.2350739428664 fps

0: 640x640 1 person, 1 tie, 5 chairs, 2 dining tables, 1 tv, 5.8ms
1: 640x640 1 bench, 15 chairs, 1 potted plant, 5.8ms
2: 640x640 1 person, 3 chairs, 1 dining table, 5.8ms
3: 640x640 1 person, 2 benchs, 1 surfboard, 5 chairs, 5.8ms
Speed: 2.5ms preprocess, 5.8ms inference, 1.6ms postprocess p

## Saving the log file

In [23]:
import json

with open("/content/out.json", 'w') as j_out :
    json.dump(db_log, j_out)

with open("/content/vid.json", 'w') as vij_out :
    json.dump(vid_log, vij_out)