In [1]:
import cv2 as cv
import time
import numpy as np
import csv

In [2]:
# Network definition and trained weights handed to cv.dnn.readNetFromCaffe further down
protoFile = "CMU-Perceptual-Computing-Lab openpose master models/pose/coco/pose_deploy_linevec.prototxt"
weightsFile = "CMU-Perceptual-Computing-Lab openpose master models/pose/coco/pose_iter_440000.caffemodel"

# COCO output format, one name per keypoint channel (the trailing background channel is left out)
keypointsMapping = [
    'Nose', 'Neck',
    'R-Sho', 'R-Elb', 'R-Wr',
    'L-Sho', 'L-Elb', 'L-Wr',
    'R-Hip', 'R-Knee', 'R-Ank',
    'L-Hip', 'L-Knee', 'L-Ank',
    'R-Eye', 'L-Eye', 'R-Ear', 'L-Ear',
]
nPoints = len(keypointsMapping)

# NOTE: POSE_PAIRS, mapIdx and colors below are assigned again by the next cell,
# so when the notebook is run top to bottom these limb-only values are replaced.

# Limb-only skeleton (arms and legs); each entry is a pair of indices into keypointsMapping
POSE_PAIRS = [
    [2, 3], [3, 4],      # right arm
    [5, 6], [6, 7],      # left arm
    [8, 9], [9, 10],     # right leg
    [11, 12], [12, 13],  # left leg
]

# Channels of the network output that hold the part affinity field (PAF) for each
# entry of POSE_PAIRS, e.g. POSE_PAIRS[0] == [2, 3] reads its PAF from channels (33, 34).
mapIdx = [
    [33, 34], [35, 36],
    [41, 42], [43, 44],
    [21, 22], [23, 24],
    [27, 28], [29, 30],
]

# Drawing colour for each entry of POSE_PAIRS (same order)
colors = [
    [0, 255, 255], [0, 100, 255],
    [0, 255, 255], [0, 100, 255],
    [255, 200, 100], [255, 0, 255],
    [255, 200, 100], [255, 0, 255],
]

In [3]:
# Full skeleton, one row per limb:
#   (pair of keypoint indices, pair of PAF channels in the network output, drawing colour)
# e.g. the limb (1, 2) reads its part affinity field from output channels (31, 32),
# and (1, 5) from (39, 40).
_SKELETON = [
    ([1, 2],   [31, 32], [0, 100, 255]),
    ([1, 5],   [39, 40], [0, 100, 255]),
    ([2, 3],   [33, 34], [0, 255, 255]),
    ([3, 4],   [35, 36], [0, 100, 255]),
    ([5, 6],   [41, 42], [0, 255, 255]),
    ([6, 7],   [43, 44], [0, 100, 255]),
    ([1, 8],   [19, 20], [0, 255, 0]),
    ([8, 9],   [21, 22], [255, 200, 100]),
    ([9, 10],  [23, 24], [255, 0, 255]),
    ([1, 11],  [25, 26], [0, 255, 0]),
    ([11, 12], [27, 28], [255, 200, 100]),
    ([12, 13], [29, 30], [255, 0, 255]),
    ([1, 0],   [47, 48], [0, 0, 255]),
    ([0, 14],  [49, 50], [255, 0, 0]),
    ([14, 16], [53, 54], [200, 200, 0]),
    ([0, 15],  [51, 52], [255, 0, 0]),
    ([15, 17], [55, 56], [200, 200, 0]),
]

# Split the table into the three parallel lists the rest of the notebook indexes by limb.
# The entries stay plain lists because POSE_PAIRS entries are used for NumPy fancy indexing.
POSE_PAIRS = [limb[0] for limb in _SKELETON]
mapIdx = [limb[1] for limb in _SKELETON]
colors = [limb[2] for limb in _SKELETON]

In [4]:
def getKeypoints(probMap, threshold=0.1):
    """Locate the candidate positions of one body part in its confidence map.

    The map is blurred with a 3x3 Gaussian, thresholded, and split into blobs
    with cv.findContours; the strongest smoothed response inside each blob
    becomes one candidate.

    Args:
        probMap: 2-D confidence map for a single keypoint type.
        threshold: smoothed values must exceed this to belong to a blob.

    Returns:
        list of (x, y, prob) tuples, one per blob, where prob is read from the
        unsmoothed probMap at the peak location. Several entries mean several
        candidates (e.g. several people) were found,
        eg: [(140, 106, 0.696484), (339, 83, 0.6892066)]
    """
    smoothed = cv.GaussianBlur(probMap, (3, 3), 0, 0)
    binary = np.uint8(smoothed > threshold)

    # each contour outlines one blob of above-threshold responses
    blobs, _ = cv.findContours(binary, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)

    def peakOf(blob):
        # zero out everything outside the blob, then take the location of the maximum
        region = cv.fillConvexPoly(np.zeros(binary.shape), blob, 1)
        _, _, _, peak = cv.minMaxLoc(smoothed * region)
        return peak + (probMap[peak[1], peak[0]],)

    return [peakOf(blob) for blob in blobs]

In [5]:
def getValidPairs(output, width, height, detected_keypoints):
    """Find valid connections between the different joints of all persons present.

    For every limb in POSE_PAIRS, each candidate of the first joint is matched with
    the candidate of the second joint whose connecting segment agrees best with the
    limb's part affinity field (PAF).

    Args:
        output: network output; output[0, c] is read as the 2-D map of channel c.
        width, height: size the PAF channels are resized to. Keypoint coordinates
            are used directly as indices into the resized maps.
        detected_keypoints: one list per keypoint type, each entry being
            (x, y, prob, keypoint_id).

    Returns:
        (valid_pairs, invalid_pairs):
        valid_pairs has one entry per limb: an (n, 3) array with rows
            [id of joint A, id of joint B, mean PAF score], or [] when either joint
            type has no candidates;
        invalid_pairs lists the indices of limbs that got [].
    """
    valid_pairs = []
    invalid_pairs = []
    n_interp_samples = 10  # points sampled along each candidate limb
    paf_score_th = 0.1     # a sample counts as aligned when its score exceeds this
    conf_th = 0.7          # fraction of samples that must be aligned

    # loop for every POSE_PAIR
    for k in range(len(mapIdx)):
        # A->B constitute a limb; the two channels are the two components of its PAF
        pafA = cv.resize(output[0, mapIdx[k][0], :, :], (width, height))
        pafB = cv.resize(output[0, mapIdx[k][1], :, :], (width, height))

        # Candidates for the first and second joint of the limb
        candA = detected_keypoints[POSE_PAIRS[k][0]]
        candB = detected_keypoints[POSE_PAIRS[k][1]]

        # If no keypoints are detected for either joint the limb cannot be formed
        if len(candA) == 0 or len(candB) == 0:
            invalid_pairs.append(k)
            valid_pairs.append([])
            continue

        valid_pair = np.zeros((0, 3))
        for a in candA:
            max_j = -1  # stays -1 until some candidate B is accepted
            maxScore = -1
            for j, b in enumerate(candB):
                # Unit vector d_ij pointing from A to B
                d_ij = np.subtract(b[:2], a[:2])
                norm = np.linalg.norm(d_ij)
                if not norm:
                    # coincident points have no direction to compare against
                    continue
                d_ij = d_ij / norm

                # p(u): evenly spaced points on the segment A->B, rounded to pixels
                xs = np.rint(np.linspace(a[0], b[0], num=n_interp_samples)).astype(int)
                ys = np.rint(np.linspace(a[1], b[1], num=n_interp_samples)).astype(int)

                # L(p(u)): PAF vector at every sample (maps are indexed [row, col])
                paf_interp = np.stack([pafA[ys, xs], pafB[ys, xs]], axis=1)

                # E: projection of the PAF onto the limb direction
                paf_scores = np.dot(paf_interp, d_ij)
                avg_paf_score = paf_scores.sum() / len(paf_scores)

                # Valid when enough samples are aligned with the PAF; keep the best B
                aligned = np.count_nonzero(paf_scores > paf_score_th) / n_interp_samples
                if aligned > conf_th and avg_paf_score > maxScore:
                    max_j = j
                    maxScore = avg_paf_score

            # Append the connection to the list
            if max_j >= 0:
                valid_pair = np.append(valid_pair, [[a[3], candB[max_j][3], maxScore]], axis=0)

        # Append the detected connections to the global list
        valid_pairs.append(valid_pair)
    return valid_pairs, invalid_pairs

In [6]:
def getPersonwiseKeypoints(valid_pairs, invalid_pairs, keypoints_list):
    """Group the limb connections into one row of keypoints per person.

    Args:
        valid_pairs: per-limb arrays with rows [id of joint A, id of joint B, score].
        invalid_pairs: indices of limbs to skip.
        keypoints_list: array indexed by keypoint id; column 2 is read as the
            keypoint's own score.

    Returns:
        (n_people, 19) array. Columns 0-17 hold the id of each keypoint assigned to
        the person (-1 when none), the last column is the accumulated score.
    """
    people = -1 * np.ones((0, 19))

    for limb in range(len(mapIdx)):
        if limb in invalid_pairs:
            continue

        connections = valid_pairs[limb]
        idsA = connections[:, 0]
        idsB = connections[:, 1]
        jointA, jointB = np.array(POSE_PAIRS[limb])

        for n in range(len(connections)):
            # first existing person that already owns this limb's joint A, if any
            owner = next((p for p in range(len(people)) if people[p][jointA] == idsA[n]), None)

            if owner is not None:
                # extend that person with joint B and add B's score plus the limb score
                people[owner][jointB] = idsB[n]
                people[owner][-1] += keypoints_list[idsB[n].astype(int), 2] + connections[n][2]
            elif limb < 17:
                # nobody owns joint A yet: start a new person from this connection
                newcomer = -1 * np.ones(19)
                newcomer[jointA] = idsA[n]
                newcomer[jointB] = idsB[n]
                # scores of both keypoints plus the limb score
                newcomer[-1] = sum(keypoints_list[connections[n, :2].astype(int), 2]) + connections[n][2]
                people = np.vstack([people, newcomer])
    return people

In [7]:
def save_file(vid_num, df):
    """Write the per-frame rows of one video to imp/videos/data/<vid_num>_hpe.csv.

    Args:
        vid_num: video number, used to build the file name.
        df: iterable of rows; each row is an iterable of values for one frame.

    The target directory must already exist; an existing file is overwritten.
    """
    # newline='' is required by the csv module: the writer emits its own '\r\n'
    # terminators, and without it platforms that translate '\n' write '\r\r\n',
    # which shows up as a blank line after every row.
    with open(f'imp/videos/data/{vid_num}_hpe.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(df)

In [8]:
resize_factor = 8       # frames are shrunk by this factor before inference
out_format = "csv"      # csv, video, or imshow
frame_stride = 2        # only analyze every second frame (ie 30fps, assuming a 60fps source)
threshold = 0.1         # minimum confidence-map value for a keypoint candidate
min_person_score = 10   # people whose accumulated score is not above this are dropped
fourcc = cv.VideoWriter_fourcc(*"MJPG")

net = cv.dnn.readNetFromCaffe(protoFile, weightsFile)
# DNN_TARGET_CPU is a *target* constant, so it belongs to setPreferableTarget
# (passing it to setPreferableBackend only selected the default backend by accident).
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

for vid_num in range(1, 7):
    video_path = f'imp/videos/videos_0/{vid_num}.mov'
    cap = cv.VideoCapture(video_path)
    if not cap.isOpened():
        # otherwise a missing/unreadable video would silently produce an empty CSV
        print(f"Could not open {video_path}, skipping")
        cap.release()
        continue

    frame_width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
    frame_rate = cap.get(cv.CAP_PROP_FPS)
    num_frames = int(cap.get(cv.CAP_PROP_FRAME_COUNT))
    width = int(frame_width/resize_factor)
    height = int(frame_height/resize_factor)
    frame_size = (width, height)

    out_vid = None
    if out_format == "video":
        # One file per video (a fixed name was overwritten by every later video).
        # Only every frame_stride-th frame is written, so the playback rate is scaled to match.
        out_vid = cv.VideoWriter(f"{vid_num}_hpe.avi", fourcc, frame_rate / frame_stride, frame_size)
    df = []

    frame_i = 0
    try:
        while True:
            # skip frames without decoding them, then decode the one to analyze
            for skipped in range(frame_stride - 1):
                cap.grab()
            ret, frame = cap.read()
            if not ret:
                break

            # some containers report a frame count of 0; avoid dividing by it
            percent = int(100*frame_i/num_frames) if num_frames > 0 else 0
            print(frame_i, "/", num_frames, ": ", percent, "%")

            image1 = cv.resize(frame, (width, height))
            inpBlob = cv.dnn.blobFromImage(image1, 1.0 / 255, (width, height),
                                           (0, 0, 0), swapRB=False, crop=False)
            net.setInput(inpBlob)
            output = net.forward()

            # Candidates per keypoint type; every candidate gets a running id that is
            # also its row number in keypoints_list
            detected_keypoints = []
            keypoint_rows = []
            keypoint_id = 0
            for part in range(nPoints):
                probMap = cv.resize(output[0, part, :, :], (width, height))
                keypoints_with_id = []
                for keypoint in getKeypoints(probMap, threshold):
                    keypoints_with_id.append(keypoint + (keypoint_id,))
                    keypoint_rows.append(keypoint)
                    keypoint_id += 1
                detected_keypoints.append(keypoints_with_id)
            # (x, y, prob) of every candidate; built once instead of one vstack per keypoint
            keypoints_list = np.array(keypoint_rows, dtype=float).reshape(-1, 3)

            valid_pairs, invalid_pairs = getValidPairs(output, width, height, detected_keypoints)
            # Each row: indices into keypoints_list per keypoint (-1 = missing), then the score
            personwiseKeypoints = getPersonwiseKeypoints(valid_pairs, invalid_pairs, keypoints_list)
            # Filter out any person whose score is not above min_person_score
            personwiseKeypoints = personwiseKeypoints[personwiseKeypoints[:, -1] > min_person_score]

            # Personwise (x, y) coordinates: drop the score column and the per-keypoint
            # probability, keep [-1, -1] for keypoints a person does not have
            person_ids = personwiseKeypoints[:, :-1].astype(int)
            has_keypoint = person_ids != -1
            personwise_coords = np.full(person_ids.shape + (2,), -1.0)
            personwise_coords[has_keypoint] = keypoints_list[person_ids[has_keypoint], :2]

            # average x position of each person over the keypoints that do exist
            personwise_avg_x = (np.where(has_keypoint, personwise_coords[:, :, 0], 0).sum(axis=1)
                                / has_keypoint.sum(axis=1))

            # keypoint_coords always has shape (2, nPoints, 2):
            # index 0 = left player, index 1 = right player, all -1 when that player is missing
            nobody = np.full((nPoints, 2), -1.0)
            if len(personwise_coords) == 0:
                keypoint_coords = np.stack([nobody, nobody])
            elif len(personwise_coords) == 1:
                # A single person goes into the slot of the half of the frame they stand in.
                # (Previously the blank was inserted on the person's own side, which put a
                # person on the left into the right-player slot and vice versa.)
                if personwise_avg_x[0] < width/2:
                    keypoint_coords = np.stack([personwise_coords[0], nobody])
                else:
                    keypoint_coords = np.stack([nobody, personwise_coords[0]])
            else:
                # sort people by average x position and keep the leftmost and rightmost
                sorted_avg_x = personwise_avg_x.argsort()
                keypoint_coords = personwise_coords[[sorted_avg_x[0], sorted_avg_x[-1]]]

            if out_format == "video" or out_format == "imshow":
                frameClone = image1.copy()
                # Iterate through each person (2 people)
                for person_keypoints in keypoint_coords:
                    # Draw every limb whose two end points were both detected
                    for i, keypoint_i in enumerate(POSE_PAIRS):
                        A, B = person_keypoints[keypoint_i].astype(int)
                        if -1 in A or -1 in B:
                            continue
                        cv.line(frameClone, (int(A[0]), int(A[1])), (int(B[0]), int(B[1])),
                                colors[i], 3, cv.LINE_AA)
                if out_format == "imshow":
                    cv.imshow("Detected Pose" , frameClone)
                    cv.waitKey(1)
                else:
                    out_vid.write(frameClone)
            else:
                # Flatten to one row of nPoints*4 values so it can be cleanly stored in .csv
                df.append(keypoint_coords.flatten())
            frame_i += frame_stride
    finally:
        # release the capture (and writer) even if processing a frame raised
        cap.release()
        if out_vid is not None:
            out_vid.release()

    if out_format == "imshow":
        cv.destroyAllWindows()
        cv.waitKey(1)
    elif out_format == "csv":
        save_file(vid_num, df)

0 / 9043.0 :  0 %
2 / 9043.0 :  0 %
4 / 9043.0 :  0 %
6 / 9043.0 :  0 %
8 / 9043.0 :  0 %
10 / 9043.0 :  0 %
12 / 9043.0 :  0 %
14 / 9043.0 :  0 %
16 / 9043.0 :  0 %
18 / 9043.0 :  0 %
20 / 9043.0 :  0 %
22 / 9043.0 :  0 %
24 / 9043.0 :  0 %
26 / 9043.0 :  0 %
28 / 9043.0 :  0 %
30 / 9043.0 :  0 %
32 / 9043.0 :  0 %
34 / 9043.0 :  0 %
36 / 9043.0 :  0 %
38 / 9043.0 :  0 %
40 / 9043.0 :  0 %
42 / 9043.0 :  0 %
44 / 9043.0 :  0 %
46 / 9043.0 :  0 %
48 / 9043.0 :  0 %
50 / 9043.0 :  0 %
52 / 9043.0 :  0 %
54 / 9043.0 :  0 %
56 / 9043.0 :  0 %
58 / 9043.0 :  0 %
60 / 9043.0 :  0 %
62 / 9043.0 :  0 %
64 / 9043.0 :  0 %
66 / 9043.0 :  0 %
68 / 9043.0 :  0 %
70 / 9043.0 :  0 %
72 / 9043.0 :  0 %
74 / 9043.0 :  0 %
76 / 9043.0 :  0 %
78 / 9043.0 :  0 %
80 / 9043.0 :  0 %
82 / 9043.0 :  0 %
84 / 9043.0 :  0 %
86 / 9043.0 :  0 %
88 / 9043.0 :  0 %
90 / 9043.0 :  0 %
92 / 9043.0 :  1 %
94 / 9043.0 :  1 %
96 / 9043.0 :  1 %
98 / 9043.0 :  1 %
100 / 9043.0 :  1 %
102 / 9043.0 :  1 %
104 / 9043.0 : 

- image size reduced by factor of 5 => 7.58892982006073 seconds per frame
- image size reduced by factor of 6 => 5.060316634178162 seconds per frame
- image size reduced by factor of 7 => 4.076976895332336 seconds per frame
- image size reduced by factor of 8 => 3.0399099349975587 seconds per frame
- image size reduced by factor of 9 => 2.424585461616516 seconds per frame
- image size reduced by factor of 10 => 2.02267644405365 seconds per frame