# Pose detection from camera feed

This notebook takes video clips and writes the detected body keypoint positions over time to text files.

Included in the docker container: 
* Setup instructions from here: https://github.com/NVIDIA-AI-IOT/trt_pose
* PyTorch installation: https://forums.developer.nvidia.com/t/pytorch-for-jetpack-4-4-l4t-r32-4-3-in-jetson-xavier-nx/141455
* TensorRT

Code is adapted from https://spyjetson.blogspot.com/2019/12/jetsonnano-human-pose-estimation-using.html

In [31]:
import json
import trt_pose.coco
import trt_pose.models
import torch
import torch2trt
from torch2trt import TRTModule
import time, sys
import cv2
import PIL.Image, PIL.ImageDraw, PIL.ImageFont
import numpy as np

import torchvision.transforms as transforms
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects
import argparse
import os.path
import glob

In [12]:
# Skeleton segments drawn by draw_keypoints, in drawing order:
# (start keypoint index, end keypoint index, RGB line colour).
_SKELETON_LINKS = (
    (16, 14, (51, 51, 204)),    # Rankle -> Rknee
    (14, 12, (51, 51, 204)),    # Rknee -> Rhip
    (12, 11, (51, 51, 204)),    # Rhip -> Lhip
    (11, 13, (51, 51, 204)),    # Lhip -> Lknee
    (13, 15, (51, 51, 204)),    # Lknee -> Lankle
    (10, 8, (255, 255, 51)),    # Rwrist -> Relbow
    (8, 6, (255, 255, 51)),     # Relbow -> Rshoulder
    (6, 5, (255, 255, 0)),      # Rshoulder -> Lshoulder
    (5, 7, (51, 255, 51)),      # Lshoulder -> Lelbow
    (7, 9, (51, 255, 51)),      # Lelbow -> Lwrist
    (6, 12, (153, 0, 51)),      # Rshoulder -> Rhip
    (5, 11, (153, 0, 51)),      # Lshoulder -> Lhip
    (0, 2, (219, 0, 219)),      # nose -> Reye
    (2, 4, (219, 0, 219)),      # Reye -> Rear
    (0, 1, (219, 0, 219)),      # nose -> Leye
    (1, 3, (219, 0, 219)),      # Leye -> Lear
    (0, 17, (255, 255, 0)),     # nose -> neck
)


def draw_keypoints(img, key):
    """Draw the body skeleton described by `key` onto `img`.

    Args:
        img: image exposing `.size` as (width, height) and accepted by
            `PIL.ImageDraw.Draw`; it is drawn on in place.
        key: sequence indexed by keypoint number. Each entry has the form
            (index, y, x) as built by `get_keypoint`, where y and x are
            fractions of the image height / width, or None when the
            keypoint was not detected.

    Returns:
        The same `img` object, for convenience.
    """
    thickness = 5
    w, h = img.size
    draw = PIL.ImageDraw.Draw(img)

    def located(kp):
        # Test against None explicitly: a truthiness test would wrongly
        # reject a coordinate of exactly 0.0 and the keypoint index 0.
        return kp[1] is not None and kp[2] is not None

    for start, end, colour in _SKELETON_LINKS:
        if located(key[start]) and located(key[end]):
            draw.line(
                [
                    round(key[start][2] * w), round(key[start][1] * h),
                    round(key[end][2] * w), round(key[end][1] * h),
                ],
                width=thickness,
                fill=colour,
            )
    return img

In [13]:
def get_keypoint(humans, hnum, peaks):
    """Collect the keypoints of one detected person.

    hnum: 0 based human index.

    Returns a list with one tuple per body part, in part order:
    (part_index, y, x) when the part was found, where y and x are floats
    that are later multiplied by the image height and width, or
    (part_index, None, None) when the part has no associated peak.
    """
    person = humans[0][hnum]
    kpoint = []
    for part in range(person.shape[0]):
        peak_index = int(person[part])
        if peak_index < 0:
            # A negative entry means this part was not assigned a peak.
            kpoint.append((part, None, None))
            continue
        # found[0] is the height coordinate, found[1] the width coordinate.
        found = peaks[0][part][peak_index]
        kpoint.append((part, float(found[0]), float(found[1])))
    return kpoint

In [14]:
# Load the human-pose task description from the trt_pose checkout next to
# this notebook. Its 'keypoints' and 'skeleton' entries are used below to
# size the model and build the topology.
with open('../trt_pose/tasks/human_pose/human_pose.json', 'r') as f:
    human_pose = json.load(f)

In [15]:
# Convert the pose description into the topology object that is later passed
# to ParseObjects / DrawObjects.
topology = trt_pose.coco.coco_category_to_topology(human_pose)

In [16]:
# Number of keypoints and number of skeleton links in the pose description;
# both are passed to the model constructor below.
num_parts = len(human_pose['keypoints'])
num_links = len(human_pose['skeleton'])

In [17]:
# Model configuration: paths of the original PyTorch weights and of the
# TensorRT-optimised weights, the network itself (on the GPU, in eval mode),
# and the input resolution frames are resized to before inference.
print('------ model = resnet--------')
MODEL_WEIGHTS = '../trt_pose/tasks/human_pose/resnet18_baseline_att_224x224_A_epoch_249.pth'
OPTIMIZED_MODEL = '../trt_pose/tasks/human_pose/resnet18_baseline_att_224x224_A_epoch_249_trt.pth'
# The second argument is twice the number of skeleton links.
model = trt_pose.models.resnet18_baseline_att(num_parts, 2 * num_links).cuda().eval()
WIDTH = 224
HEIGHT = 224

------ model = resnet--------


In [18]:
# All-zero dummy input of shape (1, 3, HEIGHT, WIDTH) on the GPU; used as the
# example input for the TensorRT conversion and for the timing loop below.
data = torch.zeros((1, 3, HEIGHT, WIDTH)).cuda()

In [19]:
# Build the TensorRT-optimised weights only when they are not on disk yet;
# the result is saved so later runs can skip the conversion.
if not os.path.exists(OPTIMIZED_MODEL):
    print('-- Converting TensorRT models. This may takes several minutes...')
    model.load_state_dict(torch.load(MODEL_WEIGHTS))
    model_trt = torch2trt.torch2trt(
        model,
        [data],
        fp16_mode=True,
        max_workspace_size=1<<25,
    )
    torch.save(model_trt.state_dict(), OPTIMIZED_MODEL)

In [20]:
# Load the TensorRT-optimised weights from disk into a fresh TRTModule; this
# is the model called by execute() and execute_2().
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(OPTIMIZED_MODEL))

<All keys matched successfully>

In [21]:
# Time 50 forward passes of the optimised model on the dummy input. The CUDA
# stream is synchronised before the loop and again before t1 is taken.
# NOTE(review): t0 and t1 are recorded but no throughput is computed or
# printed in this cell.
t0 = time.time()
torch.cuda.current_stream().synchronize()
for i in range(50):
    y = model_trt(data)
torch.cuda.current_stream().synchronize()
t1 = time.time()

In [23]:
# Per-channel mean and standard deviation (kept on the GPU) that preprocess()
# subtracts / divides by to normalise input images.
# NOTE(review): these look like the usual ImageNet statistics — confirm they
# match what the model was trained with.
mean = torch.Tensor([0.485, 0.456, 0.406]).cuda()
std = torch.Tensor([0.229, 0.224, 0.225]).cuda()
device = torch.device('cuda')

In [32]:
def preprocess(image):
    """Turn a BGR frame into a normalised, batched tensor on the GPU.

    The frame is converted from BGR to RGB, wrapped in a PIL image, turned
    into a tensor on the CUDA device, normalised in place with the
    module-level `mean` / `std`, and returned with a leading batch
    dimension added.

    Note: this also (re)binds the module-level `device` on every call.
    """
    global device
    device = torch.device('cuda')
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    tensor = transforms.functional.to_tensor(PIL.Image.fromarray(rgb_frame))
    tensor = tensor.to(device)
    # In-place per-channel normalisation; mean/std broadcast over H and W.
    tensor.sub_(mean[:, None, None])
    tensor.div_(std[:, None, None])
    return tensor[None, ...]

def execute(img, src, t):
    """Run pose estimation on one frame, annotate `src` and write it out.

    Args:
        img: BGR frame handed to `preprocess` and then to the model.
        src: frame that is annotated in place (keypoint markers, keypoint
            indices, FPS text) and then written to `out_video`.
        t: reference timestamp on the `time.time()` scale; the reported FPS
            is 1 / (now - t).

    NOTE(review): `X_compress`, `Y_compress` and `out_video` are read as
    globals but are not defined in the visible part of this notebook; they
    must exist before this function is called.
    """
    color = (0, 255, 0)
    data = preprocess(img)
    cmap, paf = model_trt(data)
    cmap, paf = cmap.detach().cpu(), paf.detach().cpu()
    counts, objects, peaks = parse_objects(cmap, paf)
    fps = 1.0 / (time.time() - t)
    for i in range(counts[0]):
        for index, y_rel, x_rel in get_keypoint(objects, i, peaks):
            # Undetected keypoints carry None; compare against None so that
            # a coordinate of exactly 0.0 is still drawn.
            if y_rel is None or x_rel is None:
                continue
            x = round(x_rel * WIDTH * X_compress)
            y = round(y_rel * HEIGHT * Y_compress)
            cv2.putText(src , "%d" % int(index), (x + 5, y),  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 1)
            # Draw the marker after the label so the marker stays on top.
            cv2.circle(src, (x, y), 3, color, 2)
    print("FPS:%f "%(fps))

    cv2.putText(src , "FPS: %f" % (fps), (20, 20),  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
    out_video.write(src)

In [55]:
def execute_2(img, org, count, outpath):
    """Run pose estimation on one frame and append the result to a text file.

    One tab-separated line is appended to `outpath`:
    timestamp, keypoint list, `count`, network FPS.

    Args:
        img: BGR frame handed to `preprocess` and then to the model.
        org: passed through unchanged and returned.
        count: frame counter written into the output line.
        outpath: path of the text file that is opened in append mode.

    Returns:
        `org`, unchanged.
    """
    start = time.time()
    data = preprocess(img)
    cmap, paf = model_trt(data)
    cmap, paf = cmap.detach().cpu(), paf.detach().cpu()
    end = time.time()
    counts, objects, peaks = parse_objects(cmap, paf)

    # Defaults for frames in which nobody is detected; without them the
    # write below would fail on unbound names.
    kpoint = []
    ti = time.time()
    # When several people are detected only the last one's keypoints are
    # kept, as each iteration overwrites the previous result.
    for i in range(counts[0]):
        kpoint = get_keypoint(objects, i, peaks)
        ti = time.time()

    # FPS of preprocessing + inference + copy back to the CPU only.
    netfps = 1 / (end - start)
    # Context manager so the handle is closed after every frame instead of
    # leaking one open file per call.
    with open(outpath, "a") as f:
        f.write(f"{ti}\t{kpoint}\t{count}\t{netfps}\n")
    return org

In [36]:
# Module-level parser used by execute() and execute_2() to turn the model
# outputs (cmap, paf) into counts, objects and peaks.
parse_objects = ParseObjects(topology)

In [56]:
def run_pose(video_path, video_name):
    """Run pose estimation on every frame of one video clip.

    The clip `video_path + video_name + ".mp4"` is read frame by frame; each
    frame is resized to WIDTH x HEIGHT and passed to `execute_2`, which
    appends one line per frame to "../keypoints/<video_name>.txt".

    Args:
        video_path: directory prefix of the clip, including the trailing
            separator (it is concatenated, not joined).
        video_name: clip file name without the ".mp4" extension.

    If the clip cannot be opened, "Camera Open Error" is printed and the
    process exits via `sys.exit(0)`.
    """
    cap = cv2.VideoCapture(video_path+video_name+".mp4")
    # VideoCapture returns an object even when opening fails, so the
    # open state has to be queried explicitly.
    if not cap.isOpened():
        print("Camera Open Error")
        sys.exit(0)

    count = 1
    outpath = "../keypoints/"+video_name+".txt"
    try:
        # Every frame, including the first one, is processed; no frame is
        # consumed up front just to probe the clip.
        while cap.isOpened():
            ret_val, dst = cap.read()
            if not ret_val:
                print("Frame Read End", video_name)
                break
            img = cv2.resize(dst, dsize=(WIDTH, HEIGHT), interpolation=cv2.INTER_AREA)
            pilimg = PIL.Image.fromarray(cv2.cvtColor(dst, cv2.COLOR_BGR2RGB))
            execute_2(img, pilimg, count, outpath)
            count += 1
    finally:
        # Release the capture even if processing a frame raises.
        cv2.destroyAllWindows()
        cap.release()

In [27]:
# Directory prefix of the input clips; it is concatenated directly with file
# names, so the trailing slash is required.
video_path = "../data/"

In [57]:
# Collect the names of all .mp4 clips under video_path, without the prefix
# and without the extension, in the form run_pose() expects.
cliplist = glob.glob(video_path + '*.mp4')
# Strip the prefix and the extension by position: str.replace would also
# remove any further occurrence of either string inside a file name.
cliplist = [x[len(video_path):-len(".mp4")] for x in cliplist]

In [61]:
# Process every clip found above; run_pose prints "Frame Read End <name>"
# when a clip has been read to the end.
for clip in cliplist:
    run_pose(video_path, clip)

Frame Read End 28_kkw_front_3
Frame Read End 23_kkw_front_2
Frame Read End 11_kkw_front_7
Frame Read End 40_front_kw_4
Frame Read End 41_kkw_front_12
Frame Read End 18_front_kw_2
Frame Read End 25_kkw_front_4
Frame Read End 25_kkw_front_12
Frame Read End 16_kkw_front_3
Frame Read End 38_kkw_front2_4
Frame Read End 35_kkw_front_10
Frame Read End 27_kkw_front_12
Frame Read End 13_kkw_front_5
Frame Read End 20_kkw_front_2
Frame Read End 10_kkw_front_9
Frame Read End 16_kkw_front_10
Frame Read End 15_kkw_front_4
Frame Read End 27_kkw_front_1
Frame Read End 27_kkw_front_7
Frame Read End 35_kkw_front_4
Frame Read End 26_kkw_front_1
Frame Read End 38_kkw_front_3
Frame Read End 29_kkw_front_2
Frame Read End 31_kkw_front_2
Frame Read End 13_kkw_front_3
Frame Read End 24_kkw_front_4
Frame Read End 35_kkw_front_11
Frame Read End 29_kkw_front_6
Frame Read End 29_kkw_front_12
Frame Read End 25_kkw_front_9
Frame Read End 20_kkw_front_3
Frame Read End 11_kkw_front_2
Frame Read End 23_kkw_front_10
Fra