In [1]:
# Change the kernel's working directory to the project checkout. Later cells
# use paths relative to it ("./checkpoints/...", "./results/...") and import
# the project packages (datasets, utilities, models) — presumably resolved
# from this directory; confirm if the project is installed another way.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
%cd /home/junfeng/Documents/WorkSpace/SummerProject/COMP0073_2023_Junfeng

/home/junfeng/Documents/WorkSpace/SummerProject/COMP0073_2023_Junfeng


In [2]:
import numpy as np
import torch
import torch.nn as nn
import json

from datasets.posetrack21 import PoseTrack21
from utilities.utilities import keypoints_to_mask, HiddenPrints, normalize_heatmaps, ReLu, count_positive, get_mean_average_acc
from mmpose.evaluation import pose_pck_accuracy, keypoint_pck_accuracy
from mmpose.codecs import UDPHeatmap

from models.PoseEstimate import PoseEstimate
from models.backbone import ResNet50, ViTEncoder, VideoViTEncoder, FusionVit
from models.neck import UnPatch
from models.head import DeconvHead, MaskHead, ResMaskHead, SimpleHead

In [3]:
# Location of the PoseTrack21 data on this machine.
dataset_root_dir = "/home/junfeng/datasets/PoseTrack21"

# Build the train and test splits from the same root directory.
dataset_train = PoseTrack21(root_dir=dataset_root_dir, set="train")
dataset_test = PoseTrack21(root_dir=dataset_root_dir, set="test")

# Report the size of each split (train first, then test).
for split_dataset in (dataset_train, dataset_test):
    print(len(split_dataset))

593
170


In [4]:
# Assemble the single-frame pose estimator from its three parts
# (ViTEncoder backbone -> UnPatch neck -> DeconvHead) and put it in eval mode.
device = "cuda" if torch.cuda.is_available() else "cpu"
pretrained_path = "./checkpoints/vitpose_base_coco_aic_mpii.pth"
encoder = ViTEncoder(pretrained_path=pretrained_path)
# UnPatch is the project neck imported from models.neck. torch.nn has no
# `UnPatch` attribute, so the previous `nn.UnPatch()` raised AttributeError
# on a fresh kernel.
neck = UnPatch()
head = DeconvHead(pretrained_path=pretrained_path)
model = PoseEstimate(encoder=encoder, neck=neck, head=head)

# Checkpoint path; it is also recorded in the results JSON written later.
model_path = "./checkpoints/Single_Frame_ViT_Deconv_01.pth"
# Loading the checkpoint is currently disabled, so the model only carries
# whatever the encoder/head constructors set up from `pretrained_path`
# (presumably the "untrained" baseline — confirm). Uncomment to evaluate the
# checkpoint at `model_path` instead.
# model_dict = torch.load(model_path, map_location=device)
# model.load_state_dict(model_dict["model_state_dict"])
# epoch = model_dict["epoch"]
# print(f"Loaded model from {model_path} at epoch {epoch}")

# HiddenPrints presumably silences anything printed while the model is moved
# to the device and switched to evaluation mode.
with HiddenPrints():
    model.to(device)
    model.eval()

Loaded model from ./checkpoints/Fusion_Deconv_Best_02.pth at epoch 1600


In [5]:
# Evaluate the model on the test split, accumulating per-keypoint PCK
# statistics twice: once directly on heatmaps (pose_pck_accuracy) and once on
# keypoints decoded from the predicted heatmaps with the UDP codec
# (keypoint_pck_accuracy).
dataset = dataset_test

# Running per-keypoint totals for 17 keypoints. P accumulates
# count_positive(acc) and TP accumulates ReLu(acc) — presumably the number of
# valid evaluations and the summed accuracy per keypoint; confirm against
# utilities.utilities.
TP, P = [0] * 17, [0] * 17
TP_udp, P_udp = [0] * 17, [0] * 17

udp = UDPHeatmap(input_size=(192, 256), heatmap_size=(48, 64))
# Single-row normalisation factor handed to keypoint_pck_accuracy.
normalize = np.tile(np.array([[256, 192]]), (1, 1))
thr = 0.05

with torch.no_grad():
    for video_idx in range(len(dataset)):
        for sample in dataset.get_for_eval(video_idx):
            (video, video_transformed, expanded_bbox, keypoints,
             keypoints_transformed, heatmaps, image_id, track_id) = sample
            mask = keypoints_to_mask(keypoints)

            # Add a leading batch dimension to the model input and to the
            # ground-truth heatmaps; the metrics work on NumPy arrays.
            model_input = video_transformed.to(device).unsqueeze(0)
            gt_heatmaps = heatmaps.unsqueeze(0).cpu().numpy()
            pred_heatmaps = model.predict(model_input).cpu().numpy()

            # Heatmap-based PCK.
            acc, _, _ = pose_pck_accuracy(pred_heatmaps, gt_heatmaps, mask, thr=thr)
            P = [P[k] + count_positive(value) for k, value in enumerate(acc)]
            TP = [TP[k] + ReLu(value) for k, value in enumerate(acc)]

            # Keypoint-based PCK on UDP-decoded predictions.
            pred_keypoints = udp.decode(pred_heatmaps.squeeze())[0]
            gt_keypoints = keypoints_transformed[:, :2].numpy()
            acc_udp, _, _ = keypoint_pck_accuracy(
                pred=pred_keypoints, gt=gt_keypoints, mask=mask, thr=thr,
                norm_factor=normalize)
            P_udp = [P_udp[k] + count_positive(value) for k, value in enumerate(acc_udp)]
            TP_udp = [TP_udp[k] + ReLu(value) for k, value in enumerate(acc_udp)]

            # Overwrite the same console line with the current video index.
            print(f"video: {video_idx} / {len(dataset)}", end="\r")

video: 169 / 170

In [6]:
# Summarise the accumulated statistics and save them as JSON.

# Per-keypoint accuracy is TP / P, reported as 0 where P is zero.
acc_keypoints = [tp / p if p != 0 else 0 for tp, p in zip(TP, P)]
acc_keypoints_udp = [tp / p if p != 0 else 0 for tp, p in zip(TP_udp, P_udp)]

# NOTE(review): model_path is recorded here even though the checkpoint-loading
# lines in the model cell are commented out — confirm this is intended.
results = dict(
    model_path=model_path,
    P=P,
    TP=TP,
    P_udp=P_udp,
    TP_udp=TP_udp,
    mean_acc=get_mean_average_acc(TP, P),
    mean_acc_udp=get_mean_average_acc(TP_udp, P_udp),
    acc_keypoints=acc_keypoints,
    acc_keypoints_udp=acc_keypoints_udp,
)

with open("./results/Single_Frame_ViT_Deconv_Untrained.json", "w") as results_file:
    json.dump(results, results_file)