# CPSC 479/579 Final: Video-Based Motion Capture #

## Setup

In the cells below, we import the required packages and add the project's `main/` and `common/` directories to the module search path.

In [1]:
import sys
import os
import os.path as osp
import argparse
import numpy as np
import cv2
import math
import torch
import torchvision.transforms as transforms
from torch.nn.parallel.data_parallel import DataParallel
import torch.backends.cudnn as cudnn

sys.path.append('main/')
from PoseConfig import p_cfg
from RootConfig import r_cfg
from PoseModel import get_pose_net
from RootModel import get_root_net
from PoseDataset import p_generate_patch_image
from RootDataset import r_generate_patch_image
sys.path.append('common/')
from utils.PoseUtils import p_process_bbox, p_pixel2cam
from utils.PoseVis import p_vis_keypoints, p_vis_3d_multiple_skeleton
from utils.RootUtils import r_process_bbox, r_pixel2cam
from utils.RootVis import r_vis_keypoints, r_vis_3d_skeleton

In [2]:
# Consolidate overlapping bounding boxes
def consolidate_bounding_boxes(boxes, threshold=0.5):
    """Merge overlapping bounding boxes into their common enclosing box.

    Boxes are processed in order. The first remaining box acts as a seed and
    is grouped with every later box whose IoU with the seed exceeds
    ``threshold``; the group is then replaced by the smallest box enclosing
    all of its members. Grouping is a single greedy pass: candidates are
    compared against the seed only, not against the merged result.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` boxes, with ``(x, y)`` the box
            origin and ``(w, h)`` its extents. The input is not modified.
        threshold: IoU value that two boxes must exceed to be merged.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per merged group.
    """
    def iou(box1, box2):
        # Intersection over union (IoU) of two (x, y, w, h) boxes.
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[0] + box1[2], box2[0] + box2[2])
        y2 = min(box1[1] + box1[3], box2[1] + box2[3])
        intersection = max(0, x2 - x1) * max(0, y2 - y1)
        area1 = box1[2] * box1[3]
        area2 = box2[2] * box2[3]
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0

    # Work on a private copy so the caller's sequence is left untouched.
    remaining = list(boxes)
    consolidated = []
    while remaining:
        seed = remaining[0]
        group = [seed]
        unmatched = []
        for candidate in remaining[1:]:
            if iou(seed, candidate) > threshold:
                group.append(candidate)
            else:
                unmatched.append(candidate)
        remaining = unmatched

        # Enclosing box of the group: extremes over both edges of every member.
        x_coords = [b[0] for b in group] + [b[0] + b[2] for b in group]
        y_coords = [b[1] for b in group] + [b[1] + b[3] for b in group]
        x_min, y_min = min(x_coords), min(y_coords)
        x_max, y_max = max(x_coords), max(y_coords)
        consolidated.append((x_min, y_min, x_max - x_min, y_max - y_min))
    return consolidated

# Loaded YOLO networks, keyed by (weights_path, config_path), so the weights
# are read from disk once instead of on every detection call.
_YOLO_NET_CACHE = {}


def _get_yolo_net(weights_path, config_path):
    """Return the YOLO network for the given files, loading it on first use."""
    key = (weights_path, config_path)
    if key not in _YOLO_NET_CACHE:
        _YOLO_NET_CACHE[key] = cv2.dnn.readNet(weights_path, config_path)
    return _YOLO_NET_CACHE[key]


def bbox_extractor(image, confidence_threshold=0.5,
                   weights_path="yolov3.weights", config_path="yolov3.cfg"):
    """Detect people in an image and return their consolidated bounding boxes.

    Args:
        image: Image array as returned by ``cv2.imread``.
        confidence_threshold: Class score a detection must exceed to be kept.
        weights_path: Path to the YOLO weights file.
        config_path: Path to the YOLO network configuration file.

    Returns:
        A list of ``(x, y, w, h)`` boxes in image pixels after overlapping
        detections have been merged by ``consolidate_bounding_boxes``.
        Coordinates are not clamped, so boxes may extend past the image.

    Raises:
        ValueError: If ``image`` is ``None`` (e.g. a failed ``cv2.imread``).
    """
    if image is None:
        raise ValueError("bbox_extractor received no image (None)")

    net = _get_yolo_net(weights_path, config_path)

    # Get image dimensions
    (height, width) = image.shape[:2]

    # Define the neural network input
    blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Perform forward propagation
    output_layer_name = net.getUnconnectedOutLayersNames()
    output_layers = net.forward(output_layer_name)

    # Initialize list of detected people
    people = []

    # Loop over the output layers, then over the detections in each
    for output in output_layers:
        for detection in output:
            # Entries 0-3 are the normalised box (cx, cy, w, h); entries from
            # index 5 onwards are the per-class scores.
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]

            # Only class 0 is kept; the surrounding code treats it as
            # "person" (presumably the label order of the stock YOLOv3
            # weights - confirm if different weights are used).
            if class_id != 0 or not confidence > confidence_threshold:
                continue

            # Scale the normalised box to pixels
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)

            # Convert centre-based box to origin-corner coordinates
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)

            people.append((x, y, w, h))

    return consolidate_bounding_boxes(people)

# Run the person detector on the sample image and save a copy with the
# detected boxes drawn on it.
image = cv2.imread("input.jpg")
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image: input.jpg")
people = bbox_extractor(image)
print(people)
# Draw bounding boxes around the people
for (x, y, w, h) in people:
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Save the image with bounding boxes
cv2.imwrite("output_with_bounding_boxes.jpg", image)


[(133, -37, 824, 1696), (-8, 413, 385, 961)]


True

In [None]:
# Loaded RootNet models, keyed by snapshot path, so the checkpoint is read
# from disk once instead of on every call (e.g. once per video frame).
_ROOT_NET_CACHE = {}


def _load_root_net(model_path):
    """Build RootNet and load its snapshot, reusing the model on later calls."""
    if model_path not in _ROOT_NET_CACHE:
        if not osp.exists(model_path):
            raise FileNotFoundError('Cannot find model at ' + model_path)
        model = DataParallel(get_root_net(r_cfg, False))
        ckpt = torch.load(model_path, map_location=torch.device('cpu'))
        model.load_state_dict(ckpt['network'])
        model.eval()
        _ROOT_NET_CACHE[model_path] = model
    return _ROOT_NET_CACHE[model_path]


def extract_root_depth_from_image(original_img, vis = False):
    """Estimate the root-joint depth of every person detected in an image.

    People are located with ``bbox_extractor``; each detection is cropped,
    normalised and passed through RootNet.

    Args:
        original_img: Image array as returned by ``cv2.imread``.
        vis: When true, write ``output_root_2d_<n>.jpg`` for each person,
            showing the network input patch with the predicted root marked.

    Returns:
        A list with one depth value per detected person (the third component
        of the RootNet output, printed as millimetres), in detection order.

    Raises:
        FileNotFoundError: If the RootNet snapshot file is missing.
    """
    r_cfg.set_args('n')
    cudnn.benchmark = True

    # snapshot load (cached after the first call)
    model = _load_root_net('./snapshot_RootNet.pth.tar')

    # prepare input image
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=r_cfg.pixel_mean, std=r_cfg.pixel_std)])
    original_img_height, original_img_width = original_img.shape[:2]

    # prepare bbox for each human
    bbox_list = bbox_extractor(original_img)

    # normalized camera intrinsics
    focal = [1500, 1500] # x-axis, y-axis

    roots = []
    # for cropped and resized human image, forward it to RootNet
    for n, raw_bbox in enumerate(bbox_list):
        bbox = r_process_bbox(np.array(raw_bbox), original_img_width, original_img_height)
        img, _ = r_generate_patch_image(original_img, bbox, False, 0.0)
        img = transform(img)[None,:,:,:]

        # k = sqrt(real_w * real_h * fx * fy / (bbox_w * bbox_h))
        k_value = np.array([math.sqrt(r_cfg.bbox_real[0]*r_cfg.bbox_real[1]*focal[0]*focal[1]/(bbox[2]*bbox[3]))]).astype(np.float32)
        # Shape (1, 1, 1), as before, but without routing the array through a
        # Python list (which PyTorch converts slowly).
        k_value = torch.from_numpy(k_value)[None, None, :]

        # forward
        with torch.no_grad():
            root_3d = model(img, k_value) # x,y: pixel, z: root-relative depth (mm)
        img = img[0].cpu().numpy()
        root_3d = root_3d[0].cpu().numpy()

        # save output in 2D space (x,y: pixel)
        if vis:
            # Undo the normalisation on the channel-first patch
            vis_img = img.copy()
            vis_img = vis_img * np.array(r_cfg.pixel_std).reshape(3,1,1) + np.array(r_cfg.pixel_mean).reshape(3,1,1)
            vis_img = vis_img.astype(np.uint8)
            # Reverse the channel axis, then move channels last for OpenCV
            vis_img = vis_img[::-1, :, :]
            vis_img = np.transpose(vis_img,(1,2,0)).copy()
            # Rescale the predicted root from output-shape to input-shape units
            vis_root = np.zeros((2))
            vis_root[0] = root_3d[0] / r_cfg.output_shape[1] * r_cfg.input_shape[1]
            vis_root[1] = root_3d[1] / r_cfg.output_shape[0] * r_cfg.input_shape[0]
            cv2.circle(vis_img, (int(vis_root[0]), int(vis_root[1])), radius=5, color=(0,255,0), thickness=-1, lineType=cv2.LINE_AA)
            cv2.imwrite('output_root_2d_' + str(n) + '.jpg', vis_img)

        print('Root joint depth: ' + str(root_3d[2]) + ' mm')
        roots.append(root_3d[2])
    return roots

# Estimate root depths for the sample image, saving the per-person 2D overlays.
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
if original_img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError('Could not read image: ' + img_path)
extract_root_depth_from_image(original_img, True)

>>> Using CPU
Root joint depth: 1932.1128 mm
Root joint depth: 2763.571 mm


[1932.1128, 2763.571]

In [5]:
# Loaded PoseNet models, keyed by (snapshot path, joint count), so the
# checkpoint is read from disk once instead of on every call.
_POSE_NET_CACHE = {}


def _load_pose_net(model_path, joint_num):
    """Build PoseNet and load its snapshot, reusing the model on later calls."""
    key = (model_path, joint_num)
    if key not in _POSE_NET_CACHE:
        if not osp.exists(model_path):
            raise FileNotFoundError('Cannot find model at ' + model_path)
        model = DataParallel(get_pose_net(p_cfg, False, joint_num))
        ckpt = torch.load(model_path, map_location=torch.device('cpu'))
        model.load_state_dict(ckpt['network'])
        model.eval()
        _POSE_NET_CACHE[key] = model
    return _POSE_NET_CACHE[key]


def extract_pose(original_img, vis = False):
    """Estimate 2D and 3D poses for every person detected in an image.

    People are located with ``bbox_extractor``, their root depths come from
    ``extract_root_depth_from_image``, and each cropped person is passed
    through PoseNet.

    Args:
        original_img: Image array as returned by ``cv2.imread``.
        vis: When true, write the 2D skeleton overlay to
            ``output_pose_2d.jpg`` and show the 3D skeletons via
            ``p_vis_3d_multiple_skeleton``.

    Returns:
        A tuple ``(output_pose_2d_list, output_pose_3d_list)`` with one array
        per detected person: the joint x/y positions mapped back to the
        original image, and the joints after ``p_pixel2cam``.

    Raises:
        FileNotFoundError: If the PoseNet snapshot file is missing.
    """
    p_cfg.set_args('n')
    cudnn.benchmark = True

    # MuCo joint set; the tuple index of each name is the joint index used in
    # ``skeleton`` below.
    joints_name = ('Head_top', 'Thorax', 'R_Shoulder', 'R_Elbow', 'R_Wrist', 'L_Shoulder', 'L_Elbow', 'L_Wrist', 'R_Hip', 'R_Knee', 'R_Ankle', 'L_Hip', 'L_Knee', 'L_Ankle', 'Pelvis', 'Spine', 'Head', 'R_Hand', 'L_Hand', 'R_Toe', 'L_Toe')
    joint_num = len(joints_name)
    skeleton = ( (0, 16), (16, 1), (1, 15), (15, 14), (14, 8), (14, 11), (8, 9), (9, 10), (10, 19), (11, 12), (12, 13), (13, 20), (1, 2), (2, 3), (3, 4), (4, 17), (1, 5), (5, 6), (6, 7), (7, 18) )

    # snapshot load (cached after the first call)
    model = _load_pose_net('./snapshot_PoseNet.pth.tar', joint_num)

    # prepare input image
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=p_cfg.pixel_mean, std=p_cfg.pixel_std)])
    original_img_height, original_img_width = original_img.shape[:2]

    # prepare bbox
    bbox_list = bbox_extractor(original_img)
    root_depth_list = extract_root_depth_from_image(original_img) # obtain this from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE/tree/master/demo)
    # Both lists come from separate detector runs on the same image and must
    # line up one-to-one.
    assert len(bbox_list) == len(root_depth_list)
    person_num = len(bbox_list)

    # normalized camera intrinsics
    focal = [1500, 1500] # x-axis, y-axis
    princpt = [original_img_width/2, original_img_height/2] # x-axis, y-axis

    # for each cropped and resized human image, forward it to PoseNet
    output_pose_2d_list = []
    output_pose_3d_list = []
    for raw_bbox, root_depth in zip(bbox_list, root_depth_list):
        bbox = p_process_bbox(np.array(raw_bbox), original_img_width, original_img_height)
        img, img2bb_trans = p_generate_patch_image(original_img, bbox, False, 1.0, 0.0, False)
        img = transform(img)[None,:,:,:]

        # forward
        with torch.no_grad():
            pose_3d = model(img) # x,y: pixel, z: root-relative depth (mm)

        # inverse affine transform (restore the crop and resize)
        pose_3d = pose_3d[0].cpu().numpy()
        pose_3d[:,0] = pose_3d[:,0] / p_cfg.output_shape[1] * p_cfg.input_shape[1]
        pose_3d[:,1] = pose_3d[:,1] / p_cfg.output_shape[0] * p_cfg.input_shape[0]
        pose_3d_xy1 = np.concatenate((pose_3d[:,:2], np.ones_like(pose_3d[:,:1])),1)
        # Extend the 2x3 affine to 3x3 so it can be inverted
        img2bb_trans_001 = np.concatenate((img2bb_trans, np.array([0,0,1]).reshape(1,3)))
        pose_3d[:,:2] = np.dot(np.linalg.inv(img2bb_trans_001), pose_3d_xy1.transpose(1,0)).transpose(1,0)[:,:2]
        output_pose_2d_list.append(pose_3d[:,:2].copy())

        # root-relative discretized depth -> absolute continuous depth
        pose_3d[:,2] = (pose_3d[:,2] / p_cfg.depth_dim * 2 - 1) * (p_cfg.bbox_3d_shape[0]/2) + root_depth
        pose_3d = p_pixel2cam(pose_3d, focal, princpt)
        output_pose_3d_list.append(pose_3d.copy())

    if vis:
        # visualize 2d poses
        vis_img = original_img.copy()
        for n in range(person_num):
            vis_kps = np.zeros((3,joint_num))
            vis_kps[0,:] = output_pose_2d_list[n][:,0]
            vis_kps[1,:] = output_pose_2d_list[n][:,1]
            vis_kps[2,:] = 1
            vis_img = p_vis_keypoints(vis_img, vis_kps, skeleton)
        cv2.imwrite('output_pose_2d.jpg', vis_img)

        # visualize 3d poses
        vis_kps = np.array(output_pose_3d_list)
        p_vis_3d_multiple_skeleton(vis_kps, np.ones_like(vis_kps), skeleton, 'output_pose_3d (x,y,z: camera-centered. mm.)')

    return output_pose_2d_list, output_pose_3d_list

# Run the full pose pipeline on the sample image with visualisation enabled.
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
if original_img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError('Could not read image: ' + img_path)
output_pose_2d_list, output_pose_3d_list = extract_pose(original_img, True)

>>> Using CPU
>>> Using CPU
Root joint depth: 1932.1128 mm
Root joint depth: 2763.571 mm


  ax.scatter(kpt_3d[n, i1, 0], kpt_3d[n, i1, 2], -kpt_3d[n, i1, 1], c=colors[l], marker='o')
  ax.scatter(kpt_3d[n, i2, 0], kpt_3d[n, i2, 2], -kpt_3d[n, i2, 1], c=colors[l], marker='o')


In [44]:
import cv2

def process_video(input_video_path, output_video_path, fallback_fps=30.0):
    """Run pose extraction on every frame of a video and save the overlay.

    Each frame is passed to ``extract_pose`` with visualisation enabled; the
    2D overlay it writes to ``output_pose_2d.jpg`` is read back and appended
    to the output video.

    Args:
        input_video_path: Path of the video to read.
        output_video_path: Path of the ``mp4v``-encoded video to write.
        fallback_fps: Frame rate to use when the input does not report a
            positive one.

    Returns:
        None. Prints an error and returns early if the input cannot be opened
        or the output cannot be created.
    """
    # Open the input video
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        print("Error: Could not open video.")
        return

    out = None
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print( length )

        # Get video properties
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the playback speed of the output.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also catches NaN
            fps = fallback_fps
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')

        # Create a VideoWriter object for the output video
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            print("Error: Could not open output video for writing.")
            return

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Apply pose extraction and visualization
            extract_pose(frame, vis=True)

            # Load the frame with the pose skeleton applied.
            # NOTE(review): this round-trips every frame through a lossy JPEG
            # on disk; drawing from the returned 2D poses would avoid that.
            vis_frame = cv2.imread('output_pose_2d.jpg')
            if vis_frame is None:
                # Overlay could not be read back; keep the video in sync by
                # writing the unannotated frame instead.
                vis_frame = frame

            # Write the processed frame to the output video
            out.write(vis_frame)
    finally:
        # Release resources even if a frame fails part-way through
        cap.release()
        if out is not None:
            out.release()

    print(f"Processed video saved to {output_video_path}")

# Example usage: run pose extraction on every frame of input_video.mp4 and
# write the result to output_video.mp4.
process_video('input_video.mp4', 'output_video.mp4')

460
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
principal points: (540.0, 960.0)
Root joint depth: 4922.419 mm
focal length: (1500, 1500)
principal points: (540.0, 960.0)
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
principal points: (540.0, 960.0)
Root joint depth: 4921.6626 mm
focal length: (1500, 1500)
principal points: (540.0, 960.0)
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
principal points: (540.0, 960.0)
Root joint depth: 4742.263 mm
focal length: (1500, 1500)
principal points: (540.0, 960.0)
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
principal points: (540.0, 960.0)
Root joint depth: 4815.551 mm
focal length: (1500, 1500)
principal points: (540.0, 960.0)
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
principal points: (540.0, 960.0)
Root joint depth: 4718.9663 mm
focal length: (1500, 1500)
principal points: (540.0, 960.0)
>>> Using GPU: 0
>>> Using GPU: 0
1
focal length: (1500, 1500)
p