In [None]:
import os
from os import listdir
import pandas as pd
import numpy as np
import glob
import cv2
import json
from os.path import expanduser
import splitfolders
import shutil
from define_path import Def_Path

from tqdm import tqdm

import torch 
import torchvision
from torchvision import models
from torchvision.models.detection.rpn import AnchorGenerator
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import torchvision.transforms as T
from torchvision.transforms import functional as F
from torchsummary import summary

from sklearn.model_selection import train_test_split

import albumentations as A # Library for augmentations

import matplotlib.pyplot as plt 
from PIL import Image

import transforms, utils, engine, train
from utils import collate_fn
from engine import train_one_epoch, evaluate


# Report GPU memory statistics for CUDA device 0.
# The torch.cuda queries below raise on machines without a usable GPU, so they are
# guarded; the rest of the notebook falls back to the CPU in that case.
if torch.cuda.is_available():
    t = torch.cuda.get_device_properties(0).total_memory  # total device memory (bytes)
    print(t)
    torch.cuda.empty_cache()

    r = torch.cuda.memory_reserved(0)  # memory reserved by the caching allocator
    print(r)
    a = torch.cuda.memory_allocated(0)  # memory currently occupied by tensors
    print(a)
    # f = r-a  # free inside reserved
else:
    print("CUDA is not available; skipping GPU memory report")

In [None]:
# Dataset locations.
# Def_Path is kept so the home directory can be generalized: users can change their
# parent path without typing their own home directory (see the alternatives below).
path = Def_Path()

# Alternative parent folders used previously:
# parent_path =  path.home + "/Workspace/WPI/Summer2023/ws/duc_repo/src/panda_test/" + "data/kp_test_images/"
# parent_path =  path.home + "/Workspace/WPI/Summer2023/ws/duc_repo/src/panda_test/" + "data/sim_marker/"
parent_path = "/home/jc-merlab/Pictures/Data/"

# Alternative (date-stamped) dataset root:
# root_dir = parent_path + path.year + "-" + path.month + "-" + path.day + "/"
root_dir = f"{parent_path}occ_panda_physical_dataset/"

print(root_dir)

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# torch.cuda.set_per_process_memory_fraction(0.9, 0)
print(device)

In [None]:
def train_transform():
    """Build the augmentation pipeline used to diversify the training images.

    Returns:
        An ``A.Compose`` pipeline that always applies a random 90-degree rotation
        followed by a random brightness/contrast change. Keypoints are handled in
        ``xy`` format and bounding boxes in ``pascal_voc`` format; box labels are
        passed through the ``bboxes_labels`` field.
    """
    augmentations = [
        # Random rotation of an image by 90 degrees zero or more times
        A.RandomRotate90(p=1),
        # Random change of brightness & contrast
        A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.2, brightness_by_max=True, always_apply=False, p=1),
    ]

    # Keypoint formats: https://albumentations.ai/docs/getting_started/keypoints_augmentation/
    keypoint_params = A.KeypointParams(format='xy')
    # Bboxes must carry labels: https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/
    bbox_params = A.BboxParams(format='pascal_voc', label_fields=['bboxes_labels'])

    return A.Compose(
        [A.Sequential(augmentations, p=1)],
        keypoint_params=keypoint_params,
        bbox_params=bbox_params,
    )

In [None]:
def train_test_split(src_dir):
    """Split the dataset in ``src_dir`` into train / val / test folders.

    The ``*.jpg`` files and the annotation ``*.json`` files found directly in
    ``src_dir`` are staged into temporary ``images`` and ``annotations``
    sub-folders, ``splitfolders.ratio`` splits them 70/20/10 (seed 42) into
    ``<src_dir>/split_folder_output``, and the staging folders are removed again.
    JSON files ending in ``_vel.json`` or ``_combined.json`` are not copied.

    NOTE(review): this function shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook.

    Args:
        src_dir: Directory holding the raw images and annotations (with or
            without a trailing path separator).

    Returns:
        Path of the ``split_folder_output`` directory.
    """
    # os.path.join works whether or not src_dir ends with a separator.
    dst_dir_img = os.path.join(src_dir, "images")
    dst_dir_anno = os.path.join(src_dir, "annotations")
    output = os.path.join(src_dir, "split_folder_output")

    # exist_ok avoids FileExistsError when only one staging folder is left over
    # from an interrupted earlier run.
    os.makedirs(dst_dir_img, exist_ok=True)
    os.makedirs(dst_dir_anno, exist_ok=True)

    try:
        for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
            shutil.copy(jpgfile, dst_dir_img)

        for jsonfile in glob.iglob(os.path.join(src_dir, "*.json")):
            if not jsonfile.endswith(("_vel.json", "_combined.json")):
                shutil.copy(jsonfile, dst_dir_anno)

        print(output)

        splitfolders.ratio(src_dir,  # The location of dataset
                           output=output,  # The output location
                           seed=42,  # The number of seed
                           ratio=(.7, .2, .1),  # The ratio of split dataset
                           group_prefix=None,  # If your dataset contains more than one file like ".jpg", ".pdf", etc
                           move=False  # If you choose to move, turn this into True
                           )
    finally:
        # Always remove the staging folders, even when copying or splitting fails.
        shutil.rmtree(dst_dir_img, ignore_errors=True)
        shutil.rmtree(dst_dir_anno, ignore_errors=True)

    return output
    

In [None]:
class ClassDataset(Dataset):
    """Dataset of robot images with per-joint bounding boxes and keypoints.

    ``root`` must contain an ``images`` folder and an ``annotations`` folder.
    Image and annotation files are paired by their position in the sorted
    directory listings, so both folders must list matching files in the same order.

    Each annotation JSON is read for two keys: ``bboxes`` and ``keypoints``.
    Every keypoint entry is indexed as ``[x, y, visibility]``.

    Args:
        root: Folder holding the ``images`` and ``annotations`` sub-folders.
        transform: Optional albumentations pipeline called with ``image``,
            ``bboxes``, ``bboxes_labels`` and ``keypoints``.
        demo: When True, ``__getitem__`` additionally returns the untransformed
            image and target (useful for visualization).
    """

    # One name per annotated joint; handed to albumentations as the per-box label field.
    JOINT_NAMES = (
        'base_joint', 'joint2', 'joint3', 'joint4', 'joint5',
        'joint6', 'joint7', 'joint8', 'joint9',
    )

    def __init__(self, root, transform=None, demo=False):
        self.root = root
        self.transform = transform
        self.demo = demo  # Use demo=True if you need transformed and original images
        self.imgs_files = sorted(os.listdir(os.path.join(root, "images")))
        self.annotations_files = sorted(os.listdir(os.path.join(root, "annotations")))

    @staticmethod
    def _make_target(bboxes, keypoints, labels, idx):
        """Pack boxes, keypoints and labels into the target dict of tensors."""
        boxes = torch.as_tensor(bboxes, dtype=torch.float32)
        if boxes.numel() == 0:
            # Keep a 2-D shape so the column indexing below works for empty targets.
            boxes = boxes.reshape(0, 4)
        return {
            "boxes": boxes,
            "labels": torch.as_tensor(labels, dtype=torch.int64),  # all objects are joint positions
            "image_id": torch.tensor([idx]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros(len(boxes), dtype=torch.int64),
            "keypoints": torch.as_tensor(keypoints, dtype=torch.float32),
        }

    def __getitem__(self, idx):
        """Return ``(img, target)`` or, in demo mode, ``(img, target, img_original, target_original)``.

        Raises:
            FileNotFoundError: If the image file cannot be read by OpenCV.
            ValueError: If the transform returns a different number of keypoints
                than it was given (e.g. keypoints dropped by the augmentation).
        """
        img_path = os.path.join(self.root, "images", self.imgs_files[idx])
        annotations_path = os.path.join(self.root, "annotations", self.annotations_files[idx])

        img_original = cv2.imread(img_path)
        if img_original is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img_original = cv2.cvtColor(img_original, cv2.COLOR_BGR2RGB)

        with open(annotations_path) as f:
            data = json.load(f)
        bboxes_original = data['bboxes']
        keypoints_original = data['keypoints']

        # All objects are keypoints on the robot: one named box / numeric label per joint.
        bboxes_labels_original = list(self.JOINT_NAMES)
        labels = list(range(1, len(self.JOINT_NAMES) + 1))

        if self.transform:
            # albumentations wants a flat list of [x, y] keypoints, so drop the
            # visibility flag and flatten the per-object nesting:
            # [[obj1_kp1, obj1_kp2], [obj2_kp1]] -> [obj1_kp1, obj1_kp2, obj2_kp1]
            keypoints_original_flattened = [kp[0:2] for obj in keypoints_original for kp in obj]

            transformed = self.transform(image=img_original, bboxes=bboxes_original, bboxes_labels=bboxes_labels_original, keypoints=keypoints_original_flattened)
            img = transformed['image']
            bboxes = transformed['bboxes']

            keypoints_transformed = np.array(transformed['keypoints']).reshape(-1, 2).tolist()
            if len(keypoints_transformed) != len(keypoints_original_flattened):
                raise ValueError(
                    f"Transform returned {len(keypoints_transformed)} keypoints for "
                    f"{len(keypoints_original_flattened)} inputs in {annotations_path}"
                )

            # Rebuild the per-object nesting from the original layout and re-attach
            # each keypoint's original visibility flag.
            transformed_iter = iter(keypoints_transformed)
            keypoints = [
                [next(transformed_iter) + [kp[2]] for kp in obj]
                for obj in keypoints_original
            ]
        else:
            img, bboxes, keypoints = img_original, bboxes_original, keypoints_original

        # Convert everything into torch tensors.
        target = self._make_target(bboxes, keypoints, labels, idx)
        img = F.to_tensor(img)

        if not self.demo:
            return img, target

        # The untransformed sample is only needed (and only built) in demo mode.
        target_original = self._make_target(bboxes_original, keypoints_original, labels, idx)
        img_original = F.to_tensor(img_original)
        return img, target, img_original, target_original

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.imgs_files)
    
    

In [None]:
# Build a demo loader over the "test" split (demo=True also yields the
# untransformed sample) and pull a single batch for visualization.
KEYPOINTS_FOLDER_TRAIN = f"{train_test_split(root_dir)}/test"
dataset = ClassDataset(
    KEYPOINTS_FOLDER_TRAIN,
    transform=train_transform(),
    demo=True,
)
data_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

iterator = iter(data_loader)
batch = next(iterator)

# Uncomment to inspect the raw batch contents:
# print(batch[2])
# print("Original targets:\n", batch[3], "\n\n")
# print("Transformed targets:\n", batch[1])

In [None]:
# Keypoint class ids -> names (currently only referenced by commented-out labelling code).
keypoints_classes_ids2names = {0: 'base_joint', 1: 'joint2', 2: 'joint3', 3: 'joint4', 4: 'joint5', 5: 'joint6',\
                              6:'joint7', 7:'joint8', 8:'panda_finger_1', 9:'panda_finger_2'}

def visualize(image, bboxes, keypoints, image_original=None, bboxes_original=None, keypoints_original=None):
    """Draw boxes and keypoints on an image and show it with matplotlib.

    Args:
        image: Image array to annotate (it is copied, the caller's array is untouched).
        bboxes: Iterable of ``[x1, y1, x2, y2]`` integer boxes.
        keypoints: Iterable of objects, each a list of ``[x, y]`` integer keypoints.
        image_original: Optional untransformed image. When given, the original and
            the transformed image are shown side by side.
        bboxes_original: Boxes for ``image_original`` (``None`` is treated as empty).
        keypoints_original: Keypoints for ``image_original`` (``None`` is treated as empty).

    Returns:
        The annotated image when only ``image`` is shown; ``None`` in side-by-side mode.
    """
    fontsize = 18

    def _annotate(canvas, boxes, objects, radius, thickness):
        # A single up-front copy protects the caller's array and yields the
        # contiguous buffer OpenCV needs; all drawing then happens in place.
        canvas = canvas.copy()
        for bbox in boxes:
            cv2.rectangle(canvas, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        for kps in objects:
            for kp in kps:
                cv2.circle(canvas, tuple(kp), radius, (255, 0, 0), thickness)
        return canvas

    image = _annotate(image, bboxes, keypoints, radius=2, thickness=10)

    if image_original is None:
        plt.figure(figsize=(40, 40))
        plt.imshow(image)
        return image

    # Missing annotations for the original image simply mean "draw nothing".
    image_original = _annotate(
        image_original,
        bboxes_original if bboxes_original is not None else [],
        keypoints_original if keypoints_original is not None else [],
        radius=5,
        thickness=2,
    )

    f, ax = plt.subplots(1, 2, figsize=(40, 20))

    ax[0].imshow(image_original)
    ax[0].set_title('Original image', fontsize=fontsize)

    ax[1].imshow(image)
    ax[1].set_title('Transformed image', fontsize=fontsize)

    return None
        
# Convert the sampled demo batch (transformed + original sample) into uint8 images
# and integer pixel coordinates, then show both side by side.
image = (batch[0][0].permute(1,2,0).numpy() * 255).astype(np.uint8)
bboxes = batch[1][0]['boxes'].detach().cpu().numpy().astype(np.int32).tolist()

# Keep only (x, y) of every keypoint; the visibility flag is not drawn.
keypoints = [
    [kp[:2] for kp in kps]
    for kps in batch[1][0]['keypoints'].detach().cpu().numpy().astype(np.int32).tolist()
]

image_original = (batch[2][0].permute(1,2,0).numpy() * 255).astype(np.uint8)
bboxes_original = batch[3][0]['boxes'].detach().cpu().numpy().astype(np.int32).tolist()

keypoints_original = [
    [kp[:2] for kp in kps]
    for kps in batch[3][0]['keypoints'].detach().cpu().numpy().astype(np.int32).tolist()
]

visualize(image, bboxes, keypoints, image_original, bboxes_original, keypoints_original)

In [None]:
def get_model(num_keypoints, weights_path=None, num_classes=None):
    """Build a Keypoint R-CNN (ResNet-50 FPN) with a custom anchor generator.

    Args:
        num_keypoints: Number of keypoints predicted per detected object.
        weights_path: Optional path to a saved ``state_dict`` to load into the model.
        num_classes: Number of box classes *including* the background class.
            When omitted it is read from the checkpoint's box classifier if
            ``weights_path`` is given, otherwise it defaults to 10 (background +
            the nine joint labels 1..9 produced by ``ClassDataset``). The previous
            hard-coded value of 8 is too small for labels up to 9.

    Returns:
        The constructed model (left on the CPU; move it with ``model.to(device)``).
    """
    # Load onto the CPU first so GPU-saved checkpoints also open on CPU-only machines.
    state_dict = torch.load(weights_path, map_location="cpu") if weights_path else None

    if num_classes is None:
        cls_weight = None
        if state_dict is not None:
            cls_weight = state_dict.get("roi_heads.box_predictor.cls_score.weight")
        # One classifier row per class, so the checkpoint dictates the head size.
        num_classes = cls_weight.shape[0] if cls_weight is not None else 10

    anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(
        weights=None,  # no pretrained detection head
        weights_backbone=torchvision.models.ResNet50_Weights.IMAGENET1K_V1,  # ImageNet backbone
        num_keypoints=num_keypoints,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
    )

    if state_dict is not None:
        model.load_state_dict(state_dict)

    return model

In [None]:
# Instantiate an untrained model (no checkpoint) and display its architecture.
num_keypoints = 9
model = get_model(num_keypoints=num_keypoints)
model

In [None]:
# Train one fresh model per (batch size, epoch count) combination and save each one.
total_keypoints = 9

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

split_folder_path = train_test_split(root_dir)

KEYPOINTS_FOLDER_TRAIN = split_folder_path + "/train"
KEYPOINTS_FOLDER_VAL = split_folder_path + "/val"
KEYPOINTS_FOLDER_TEST = split_folder_path + "/test"

# Only the training split is augmented.
dataset_train = ClassDataset(KEYPOINTS_FOLDER_TRAIN, transform=train_transform(), demo=False)
dataset_val = ClassDataset(KEYPOINTS_FOLDER_VAL, transform=None, demo=False)
dataset_test = ClassDataset(KEYPOINTS_FOLDER_TEST, transform=None, demo=False)

# Hyper-parameter grid.
batch_sizes = [1]
epochs_lst = [25, 30]

v = 2  # version tag embedded in the checkpoint file name

# Create the checkpoint folder up front so a missing directory cannot make
# torch.save fail only after a full training run has finished.
MODEL_DIR = "/home/jc-merlab/Pictures/Data/trained_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# The evaluation loaders do not depend on the grid, so build them once.
data_loader_val = DataLoader(dataset_val, batch_size=1, shuffle=False, collate_fn=collate_fn)
data_loader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate_fn)

for b_size in batch_sizes:
    data_loader_train = DataLoader(dataset_train, batch_size=b_size, shuffle=True, collate_fn=collate_fn)

    for epochs in epochs_lst:
        # Every configuration starts from a freshly initialised model.
        model = get_model(num_keypoints=total_keypoints)
        model.to(device)

        params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)
        num_epochs = epochs

        for epoch in range(num_epochs):
            train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=1000)
            lr_scheduler.step()

        # The whole model object is pickled (not just the state_dict) because the
        # inference cells reload it with torch.load(PATH).
        PATH = f"{MODEL_DIR}/kprcnn_plan_b{b_size}_e{epochs}_v{v}_sim.pth"
        torch.save(model, PATH)

#     evaluate(model, data_loader_val, device)


In [None]:
# Save model weights after training
# torch.save(model.state_dict(), 'keypointsrcnn_weights_120.pth')
# torch.save(model, '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_ld_b1_e30_v3.pth')
            
# torch.save(model, PATH)

In [None]:
# Rebuild the validation loader from a dated split folder under parent_path.
# NOTE(review): this replaces the KEYPOINTS_FOLDER_VAL / dataset_val / data_loader_val
# defined by the training cell, which point at the split created under root_dir.
KEYPOINTS_FOLDER_VAL = f"{parent_path}split_folder_output-2023-07-14/val"
dataset_val = ClassDataset(
    KEYPOINTS_FOLDER_VAL,
    transform=None,
    demo=False,
)
data_loader_val = DataLoader(
    dataset_val,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)
iterator = iter(data_loader_val)
len(data_loader_val)

In [None]:
# Run the `evaluate` helper imported from the project's `engine` module on the
# current model, using the validation loader and the selected device.
evaluate(model, data_loader_val, device)

In [None]:
# Reload the model saved by the training cell and switch it to inference mode.
# weights_path = '/home/user/Workspace/WPI/Summer2023/ws/duc_repo/src/panda_test/data/trained_models/keypointsrcnn_weights_ld_b1_e25_v9.pth'
weights_path = PATH
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
# NOTE(review): the file holds a whole pickled model; recent PyTorch releases may
# need weights_only=False passed to torch.load for this to work.
model = torch.load(weights_path, map_location=device).to(device)
model.eval()

In [None]:
# Run the model on every validation sample and show the filtered detections in an
# OpenCV window, one image at a time (press any key to advance).
for i, (images, targets) in enumerate(data_loader_val, start=1):
    images = [img.to(device) for img in images]
    with torch.no_grad():
        outputs = model(images)

    image_rgb = (images[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
    scores = outputs[0]['scores'].detach().cpu().numpy()

    high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
    post_nms_idxs = torchvision.ops.nms(outputs[0]['boxes'][high_scores_idxs], outputs[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

    keypoints = [
        [list(map(int, kp[:2])) for kp in kps]
        for kps in outputs[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]
    bboxes = [
        list(map(int, bbox.tolist()))
        for bbox in outputs[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]

    img = visualize(image_rgb, bboxes, keypoints)
#     cv2.imwrite("/home/jc-merlab/Pictures/Data/video_results_01/out_image_" + str(i) + ".jpg", img)

    # The dataset yields RGB images, while cv2.imshow interprets arrays as BGR.
    cv2.imshow(f'image{i}', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    cv2.waitKey(0)
    cv2.destroyWindow(f'image{i}')

    # Calculate loss and metrics for evaluation here.
        


In [None]:
# Load a saved model and run it on the next four batches of the validation iterator.
weights_path = '/home/user/Workspace/WPI/Summer2023/ws/duc_repo/src/panda_test/data/trained_models/keypointsrcnn_weights_ld_b1_e25_v1.pth'
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
model = torch.load(weights_path, map_location=device).to(device)
model.eval()

# Each batch is an (images, targets) pair produced by the loader behind `iterator`.
(images1, targets1), (images2, targets2), (images3, targets3), (images4, targets4) = [
    next(iterator) for _ in range(4)
]

# Move every image to the inference device; the model expects a list of tensors.
images1, images2, images3, images4 = [
    [img.to(device) for img in imgs]
    for imgs in (images1, images2, images3, images4)
]

with torch.no_grad():
    output1, output2, output3, output4 = [
        model(imgs) for imgs in (images1, images2, images3, images4)
    ]

# print("Predictions: \n", output1)



In [None]:
# Show the raw model output for the first of the sampled validation batches.
print("Predictions: \n", output1)


In [None]:
# NOTE(review): this loop header was left unfinished (no colon, no body), which made
# the cell a SyntaxError. It is completed as a no-op so the notebook runs end to end.
for i, (images, targets) in enumerate(data_loader_val):
    pass  # TODO: add the per-batch processing this loop was meant to perform

In [None]:
# Run the trained model on every frame of a video, time each forward pass and
# write the annotated frames to disk.
import time

weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_ld_b1_e25_v2.pth'
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
model = torch.load(weights_path, map_location=device).to(device)
# model = get_model(num_keypoints=6, weights_path=weights_path)
# model.load_state_dict(torch.load('keypointsrcnn_weights.pth'))
model.eval()

output_dir = "/home/jc-merlab/Pictures/Data/video_results_01/"
os.makedirs(output_dir, exist_ok=True)  # cv2.imwrite does not create missing folders

# Create a VideoCapture object and read from input file
# If the input is the camera, pass 0 instead of the video file name
cap = cv2.VideoCapture('/home/jc-merlab/Pictures/Data/inference_data/test_video_3d.avi')

# Check if camera opened successfully
if not cap.isOpened():
    print("Error opening video stream or file")

i = 0
while cap.isOpened():
    # Capture frame-by-frame
    print(i)
    ret, frame = cap.read()
    if not ret:
        break

    # OpenCV decodes frames as BGR, whereas ClassDataset feeds the model RGB images.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = [F.to_tensor(frame_rgb).to(device)]  # the model expects a list of CxHxW tensors

    with torch.no_grad():
        start = time.time()
        output = model(image)
        stop = time.time()
    print("time", (stop - start))

    image_np = (image[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
    scores = output[0]['scores'].detach().cpu().numpy()

    high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
    post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

    # First keep the detections above the score threshold ([high_scores_idxs]),
    # then keep those that survive NMS ([post_nms_idxs]).
    keypoints = [
        [list(map(int, kp[:2])) for kp in kps]
        for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]
    bboxes = [
        list(map(int, bbox.tolist()))
        for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]

    img = visualize(image_np, bboxes, keypoints)
    plt.close('all')  # visualize opens one figure per call; avoid piling them up per frame

    # Convert back to BGR, the channel order cv2.imwrite expects.
    cv2.imwrite(output_dir + "out_image_" + str(i) + ".jpg", cv2.cvtColor(img, cv2.COLOR_RGB2BGR))

    i = i+1

cap.release()

# Closes all the frames
cv2.destroyAllWindows()

In [None]:
# Post-process the prediction for the first sampled batch and draw it.
image1 = (images1[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
scores1 = output1[0]['scores'].detach().cpu().numpy()

# Indexes of boxes with scores > 0.7
high_scores_idxs = np.where(scores1 > 0.7)[0].tolist()
# Indexes of boxes left after applying NMS (iou_threshold=0.3)
post_nms_idxs = torchvision.ops.nms(
    output1[0]['boxes'][high_scores_idxs],
    output1[0]['scores'][high_scores_idxs],
    0.3,
).cpu().numpy()

# Selection happens in two steps: [high_scores_idxs] keeps detections above the
# score threshold, then [post_nms_idxs] keeps those that survive NMS.
keypoints = [
    [list(map(int, kp[:2])) for kp in kps]
    for kps in output1[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
]

print(keypoints)

bboxes = [
    list(map(int, bbox.tolist()))
    for bbox in output1[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
]

image = visualize(image1, bboxes, keypoints)






In [None]:
# Post-process the prediction for the second sampled batch and draw it.
image2 = (images2[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
scores = output2[0]['scores'].detach().cpu().numpy()

# Indexes of boxes with scores > 0.7
high_scores_idxs = np.where(scores > 0.7)[0].tolist()
# Indexes of boxes left after applying NMS (iou_threshold=0.3)
post_nms_idxs = torchvision.ops.nms(
    output2[0]['boxes'][high_scores_idxs],
    output2[0]['scores'][high_scores_idxs],
    0.3,
).cpu().numpy()

# Selection happens in two steps: [high_scores_idxs] keeps detections above the
# score threshold, then [post_nms_idxs] keeps those that survive NMS.
keypoints = [
    [list(map(int, kp[:2])) for kp in kps]
    for kps in output2[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
]

bboxes = [
    list(map(int, bbox.tolist()))
    for bbox in output2[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
]

visualize(image2, bboxes, keypoints)

In [None]:
# Post-process the prediction for the third sampled batch and draw it.
image3 = (images3[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
scores3 = output3[0]['scores'].detach().cpu().numpy()

# Threshold this sample's own scores (scores3); the previous code reused the
# `scores` array left behind by the second sample's cell.
high_scores_idxs = np.where(scores3 > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
post_nms_idxs = torchvision.ops.nms(output3[0]['boxes'][high_scores_idxs], output3[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

# Selection happens in two steps: [high_scores_idxs] keeps detections above the
# score threshold, then [post_nms_idxs] keeps those that survive NMS.
keypoints = []
for kps in output3[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    keypoints.append([list(map(int, kp[:2])) for kp in kps])

bboxes = []
for bbox in output3[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    bboxes.append(list(map(int, bbox.tolist())))

visualize(image3, bboxes, keypoints)

In [None]:
# Post-process the prediction for the fourth sampled batch and draw it.
image4 = (images4[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
scores4 = output4[0]['scores'].detach().cpu().numpy()

# Threshold this sample's own scores (scores4); the previous code reused the
# `scores` array left behind by the second sample's cell.
high_scores_idxs = np.where(scores4 > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
post_nms_idxs = torchvision.ops.nms(output4[0]['boxes'][high_scores_idxs], output4[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

# Selection happens in two steps: [high_scores_idxs] keeps detections above the
# score threshold, then [post_nms_idxs] keeps those that survive NMS.
keypoints = []
for kps in output4[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    keypoints.append([list(map(int, kp[:2])) for kp in kps])

bboxes = []
for bbox in output4[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    bboxes.append(list(map(int, bbox.tolist())))

visualize(image4, bboxes, keypoints)

In [None]:
# Run the current model on one image loaded from disk.
image = Image.open("/home/jc-merlab/428.jpg")
print(type(image))

# Tensor on the inference device, with a leading batch dimension added in place.
image = F.to_tensor(image).to(device)
image.unsqueeze_(0)
print(image.shape)
# Split along the batch dimension: the model takes a list of image tensors.
image = [single for single in image]
# print(type(images))
# images = list(image.to(device) for image in images)

with torch.no_grad():
    model.to(device)
    model.eval()
    output = model(image)
    
    

In [None]:
# Post-process the single-image prediction and draw it.
# A separate name keeps `image` (the model input list) intact, so the cell can be re-run.
image_np = (image[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
scores = output[0]['scores'].detach().cpu().numpy()

high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

# Selection happens in two steps: [high_scores_idxs] keeps detections above the
# score threshold, then [post_nms_idxs] keeps those that survive NMS.

# Predicted class label of every kept detection (`labels` was previously undefined here).
labels = output[0]['labels'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy().tolist()

# Keep only the first keypoint of each detection, wrapped in a list because
# visualize() expects a list of keypoints per object.
keypoints = []
for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    keypoints.append([list(map(int, kps[0,0:2]))])
#     keypoints.append([list(map(int, kp[:2])) for kp in kps])

# Keypoints ordered by predicted label; sorting on the label alone avoids
# comparing the keypoint lists when two detections share a label.
keypoints_ = [kp for _, kp in sorted(zip(labels, keypoints), key=lambda pair: pair[0])]

bboxes = []
for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
    bboxes.append(list(map(int, bbox.tolist())))

visualize(image_np, bboxes, keypoints)

In [None]:
# Run a model restored from a state_dict on every frame of a video and write the
# annotated frames to disk.
weights_path = 'keypointsrcnn_weights.pth'
# get_model already loads the state_dict found at weights_path, so no second
# load_state_dict call is needed.
model = get_model(num_keypoints=6, weights_path=weights_path)

model.to(device)
model.eval()

output_dir = "/home/jc-merlab/Pictures/Data/video_results/"
os.makedirs(output_dir, exist_ok=True)  # cv2.imwrite does not create missing folders

# Create a VideoCapture object and read from input file
# If the input is the camera, pass 0 instead of the video file name
cap = cv2.VideoCapture('/home/jc-merlab/nov1_v1.avi')

# Check if camera opened successfully
if not cap.isOpened():
    print("Error opening video stream or file")

i = 0
while cap.isOpened():
    # Capture frame-by-frame
    print(i)
    ret, frame = cap.read()
    if not ret:
        # End of stream (or read error): without this break the loop never
        # terminates, because the capture object stays "opened".
        break

    # OpenCV decodes frames as BGR, whereas ClassDataset feeds the model RGB images.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = [F.to_tensor(frame_rgb).to(device)]  # the model expects a list of CxHxW tensors

    with torch.no_grad():
        output = model(image)

    image_np = (image[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
    scores = output[0]['scores'].detach().cpu().numpy()

    high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
    post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

    # First keep the detections above the score threshold ([high_scores_idxs]),
    # then keep those that survive NMS ([post_nms_idxs]).
    keypoints = [
        [list(map(int, kp[:2])) for kp in kps]
        for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]
    bboxes = [
        list(map(int, bbox.tolist()))
        for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
    ]

    img = visualize(image_np, bboxes, keypoints)
    plt.close('all')  # visualize opens one figure per call; avoid piling them up per frame

    # Convert back to BGR, the channel order cv2.imwrite expects.
    cv2.imwrite(output_dir + "out_image_" + str(i) + ".jpg", cv2.cvtColor(img, cv2.COLOR_RGB2BGR))

    i = i+1

cap.release()

# Closes all the frames
cv2.destroyAllWindows()

In [None]:
import os
import moviepy.video.io.ImageSequenceClip
# Assemble the saved inference frames into a video.
image_folder="/home/jc-merlab/Pictures/Data/video_results/"

fps=1


def _frame_index(name):
    """Sort key that orders 'out_image_<N>.jpg' files by their numeric index N.

    os.listdir returns names in arbitrary order, and a plain string sort would
    put 'out_image_10' before 'out_image_2'. Names without a numeric suffix are
    placed last, ordered alphabetically.
    """
    suffix = os.path.splitext(name)[0].rsplit("_", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), name)
    return (1, 0, name)


frame_names = sorted(
    (name for name in os.listdir(image_folder) if name.endswith(".jpg")),
    key=_frame_index,
)
image_files = [os.path.join(image_folder, name) for name in frame_names]

# The clip plays the files in the order given, so the list must already be in frame order.
clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(image_files, fps=fps)
clip.write_videofile('my_video.mp4')

In [None]:
# Scratch arithmetic: how many whole times 72 fits into 2500.
2500 // 72

In [None]:
import cv2
import numpy as np
import torch
import time

# Load a trained keypoint model, run it on every frame of a video, print the
# per-frame inference time and save each annotated frame as a numbered JPEG.
weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_ld_b1_e25_v2.pth'
# torch.load unpickles the whole model object, so only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled models - confirm against the installed version.
model = torch.load(weights_path, map_location=device)
# model = get_model(num_keypoints=6, weights_path=weights_path)
# model.load_state_dict(torch.load('keypointsrcnn_weights.pth'))

model.to(device)
model.eval()

# print(type(model))
# Create a VideoCapture object and read from input file
# If the input is the camera, pass 0 instead of the video file name
cap = cv2.VideoCapture('/home/jc-merlab/Pictures/Data/inference_data/test_video_3d.avi')

# Check if camera opened successfully
if not cap.isOpened():
    print("Error opening video stream or file")
i = 0  # frame counter, also used to number the saved images
print(type(i))
while cap.isOpened():
    # Capture frame-by-frame
    print(i)
    ret, frame = cap.read()
    if not ret:
        # End of the video (or a failed read): stop processing.
        break

    image = Image.fromarray(frame)

    image = F.to_tensor(image).to(device)
    image.unsqueeze_(0)
    image = list(image)  # the model is called with a list of per-image tensors

    with torch.no_grad():
        # CUDA kernels run asynchronously; synchronise around the forward pass
        # so the measured interval covers the actual GPU work.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        # time.time() takes no arguments (the original passed an undefined
        # `now`); perf_counter is the monotonic clock meant for intervals.
        start = time.perf_counter()
        output = model(image)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        stop = time.perf_counter()
        print("time", (stop - start))

    # Convert the input tensor back to an HxWxC uint8 array for drawing.
    image = (image[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
    scores = output[0]['scores'].detach().cpu().numpy()

    high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
    post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

    # Below, detections are filtered twice:
    # first by [high_scores_idxs] (score above the threshold),
    # then by [post_nms_idxs] (survivors of non-maximum suppression).

    keypoints = []
    for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        keypoints.append([list(map(int, kp[:2])) for kp in kps])

    bboxes = []
    for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        bboxes.append(list(map(int, bbox.tolist())))
    img = visualize(image, bboxes, keypoints)

    labels = []
    for label in output[0]['labels'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        labels.append(label)
    # Keypoints ordered by the predicted label of their detection.
    keypoints_ = [x for _,x in sorted(zip(labels,keypoints))]

    cv2.imwrite("/home/jc-merlab/Pictures/Data/video_results_01/out_image_" + str(i) + ".jpg", img)

    i = i+1

cap.release()

# Closes all the frames
cv2.destroyAllWindows()

In [None]:
import cv2
import numpy as np
import torch
import time
import os
import json
import torch, torchvision
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms import functional as F
import shutil

# Run the keypoint model on every .jpg in input_folder, then copy each image to
# the output folder together with a same-named JSON file holding its keypoints.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_sim_b1_e25_v0.pth'
# torch.load unpickles the whole model object, so only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model = torch.load(weights_path, map_location=device)
# model = get_model(num_keypoints=6, weights_path=weights_path)
# model.load_state_dict(torch.load('keypointsrcnn_weights.pth'))

model.to(device)
model.eval()

# Specify input and output folders
input_folder = '/home/jc-merlab/Pictures/Data/occ_sim_append/'
output_frames_folder = '/home/jc-merlab/Pictures/Data/occ_sim_append_op/'
# output_json_folder = '/home/jc-merlab/Pictures/Data/keypoint_jsons'

# Check if output folders exist, create them if not
os.makedirs(output_frames_folder, exist_ok=True)
# os.makedirs(output_json_folder, exist_ok=True)

# Process images in the folder
i = 0  # running id written into each JSON record
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.jpg'):
        continue

    image_path = os.path.join(input_folder, filename)
    image = cv2.imread(image_path)

    image = F.to_tensor(image).to(device)
    image.unsqueeze_(0)
    image = list(image)  # the model is called with a list of per-image tensors

    with torch.no_grad():
        output = model(image)
    scores = output[0]['scores'].detach().cpu().numpy()

    high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
    post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

    # Only the first keypoint of each surviving detection is kept, as [x, y].
    keypoints = []
    for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        keypoints.append(list(map(int, kps[0,0:2])))

    bboxes = []
    for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        bboxes.append(list(map(int, bbox.tolist())))

    labels = []
    for label in output[0]['labels'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
        labels.append(label)

    # Order keypoints and boxes by the predicted label of their detection.
    keypoints_ = [x for _,x in sorted(zip(labels,keypoints))]
    print(keypoints_)
    bbox_ = [x for _,x in sorted(zip(labels,bboxes))]
    # Debug output of the 4th box; guarded because an image may yield fewer
    # than four detections, which previously raised IndexError.
    if len(bbox_) > 3:
        print(bbox_[3])

    # Modify the 4th keypoint as per the requirement
#     if len(keypoints_) >= 4 and len(bbox_) >= 4:
#         fourth_bbox = bbox_[3]
#         # Replace 4th keypoint with the top-left corner of the 4th bounding box
#         keypoints_[3] = [fourth_bbox[0], fourth_bbox[1]]
#         # Append the bottom-right corner of the 4th bounding box as a new keypoint
#         keypoints_.append([fourth_bbox[2], fourth_bbox[3]])

    print(keypoints_)

    # Generate JSON data
    image_filename_base = os.path.splitext(filename)[0]  # Remove '.jpg' extension
    json_data = {
        "id": i,
        "image_rgb": filename,
        "keypoints": [[kp] for kp in keypoints_]
    }

    # Copy image to output folder
    output_image_path = os.path.join(output_frames_folder, filename)
    shutil.copyfile(image_path, output_image_path)

    # Save JSON with matching filename
    output_json_path = os.path.join(output_frames_folder, f"{image_filename_base}.json")
    with open(output_json_path, 'w') as f:
        json.dump(json_data, f)

    i += 1

print("Processing complete!")

In [3]:
import os
import cv2
import torch
import torchvision
import numpy as np
import json
import shutil
from torchvision.transforms import functional as F

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_planning_b1_e50_v8.pth'
# torch.load unpickles the whole model object, so only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled models - confirm against the installed version.
model = torch.load(weights_path, map_location=device)
# model = get_model(num_keypoints=6, weights_path=weights_path)
# model.load_state_dict(torch.load('keypointsrcnn_weights.pth'))

model.to(device)
model.eval()

# Specify input and output folders
input_folder = '/home/jc-merlab/Pictures/ip_test_folder'
output_frames_folder = '/home/jc-merlab/Pictures/Data/'

# Check if output folders exist, create them if not
os.makedirs(output_frames_folder, exist_ok=True)

def load_ground_truth(json_path):
    """Read annotated keypoints from a JSON file.

    The file must hold an object with a 'keypoints' list; each entry is indexed
    as entry[0][0], entry[0][1] and entry[0][2].

    Returns:
        A list of [x, y, flag] lists where x and y are converted with int() and
        the third value is passed through unchanged (the accuracy code in this
        cell treats a flag of 0 as an invisible keypoint).
    """
    with open(json_path, 'r') as handle:
        annotation = json.load(handle)

    ground_truth_keypoints = []
    for entry in annotation['keypoints']:
        point = entry[0]
        ground_truth_keypoints.append([int(point[0]), int(point[1]), point[2]])
    return ground_truth_keypoints

def calculate_accuracy(predicted_keypoints, ground_truth_keypoints, margin=10):
    """
    Score predicted keypoints against ground truth using a pixel-distance margin.

    Predictions and ground truth are paired by position (first with first, and
    so on); surplus items on either side are left unpaired. A prediction counts
    as correct when its (x, y) lies within ``margin`` pixels (Euclidean
    distance) of the paired ground-truth point. The same margin is applied to
    visible and invisible keypoints.

    Args:
        predicted_keypoints: iterable of keypoints whose first two values are
            x and y. Items may be torch tensors (on any device) or plain
            sequences / NumPy arrays.
        ground_truth_keypoints: sequence of [x, y, flag]; a flag equal to 0
            marks the keypoint as invisible.
        margin: maximum distance in pixels for a prediction to count as correct.

    Returns:
        (accuracy, invisible_accuracy, total_invisible) where
        accuracy is the percentage of ALL ground-truth keypoints that were
        matched (0.0 when there is no ground truth),
        invisible_accuracy is the percentage of paired invisible keypoints that
        were matched (0 when none were paired), and
        total_invisible is the number of paired invisible keypoints.
    """
    total = len(ground_truth_keypoints)
    correct = 0
    correct_invisible = 0
    total_invisible = 0

    for pred_kp, gt_kp in zip(predicted_keypoints, ground_truth_keypoints):
        if hasattr(pred_kp, 'detach'):
            # Tensor input: bring it to host memory as a NumPy array.
            pred_kp = pred_kp.detach().cpu().numpy()
        # Use only x, y for the distance calculation.
        pred_xy = np.asarray(pred_kp[:2], dtype=float)
        gt_xy = np.asarray(gt_kp[:2], dtype=float)
        dist = np.linalg.norm(pred_xy - gt_xy)

        within_margin = dist <= margin
        if gt_kp[2] == 0:  # Invisible keypoint
            total_invisible += 1
            if within_margin:
                correct_invisible += 1
        elif within_margin:  # Visible keypoint
            correct += 1

    correct_total = correct + correct_invisible
    # Guard against an empty ground-truth list, which used to raise ZeroDivisionError.
    accuracy = (correct_total / total) * 100 if total > 0 else 0.0
    invisible_accuracy = (correct_invisible / total_invisible) * 100 if total_invisible > 0 else 0
    return accuracy, invisible_accuracy, total_invisible

# Process images in the folder: run the model on every .jpg, keep the best
# detection per label, draw predictions and ground truth, and accumulate
# accuracy statistics.
accuracies = []
invisible_accuracies = []
total_invisible_keypoints = 0
total_inference_time = []  # never filled here; kept for the commented-out timing report
i = 0  # number of images processed
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.jpg'):
        continue

    image_path = os.path.join(input_folder, filename)
    image = cv2.imread(image_path)

    tensor_image = F.to_tensor(image).to(device)
    tensor_image.unsqueeze_(0)
    tensor_image = list(tensor_image)  # the model is called with a list of per-image tensors

    with torch.no_grad():
        output = model(tensor_image)

    scores = output[0]['scores'].detach().cpu().numpy()

    # Keep every detection scoring above 0.01; the best one per label is picked below.
    high_scores_idxs = np.where(scores > 0.01)[0].tolist()
    confidence = output[0]['scores'][high_scores_idxs].detach().cpu().numpy()

    bboxes = []
    for bbox in output[0]['boxes'][high_scores_idxs].detach().cpu().numpy():
        bboxes.append(list(map(int, bbox.tolist())))

    labels = []
    for label in output[0]['labels'][high_scores_idxs].detach().cpu().numpy():
        labels.append(label)

    # One row per detection: [x, y, score, label], using the detection's first keypoint.
    keypoints = []
    for idx, kps in enumerate(output[0]['keypoints'][high_scores_idxs].detach().cpu().numpy()):
        keypoints.append(list(map(int, kps[0, 0:2])) + [confidence[idx]] + [labels[idx]])

    if keypoints:
        keypoints = torch.stack([torch.tensor(kp, dtype=torch.float32) for kp in keypoints]).to(device)

        # For every distinct label keep the row with the highest score.
        # label_slot[n] is the position of row n's label within unique_labels.
        unique_labels, label_slot = torch.unique(keypoints[:, 3], return_inverse=True)
        # Build the comparison index on the same device as the data instead of
        # calling .cuda(), so the CPU fallback selected above keeps working.
        slot_ids = torch.arange(len(unique_labels), device=keypoints.device).unsqueeze(1)
        # Rows are labels, columns are detections; scores of other labels become 0.
        masked_scores = keypoints[:, 2].unsqueeze(0) * (label_slot == slot_ids)
        best_scores, best_indices = torch.max(masked_scores, dim=1)
        keypoints = keypoints[best_indices]
    else:
        # No detection passed the threshold: torch.stack would fail on an empty
        # list, so use an empty (0, 4) tensor and let the image score 0.
        keypoints = torch.empty((0, 4), dtype=torch.float32, device=device)

    # Load ground truth keypoints from the JSON that shares the image's base name.
    # splitext only strips the extension, so names containing dots still match.
    json_filename = os.path.splitext(filename)[0] + '.json'
    json_path = os.path.join(input_folder, json_filename)
    ground_truth_keypoints = load_ground_truth(json_path)

    # Predicted keypoints: filled circles, blue in OpenCV's BGR order.
    for point in keypoints:
        x, y, c, l = point
        cv2.circle(image, (int(x), int(y)), 12, (255, 0, 0), -1)

    # Ground truth keypoints: (0, 255, 255) is yellow in BGR.
    for x, y, _ in ground_truth_keypoints:
        cv2.circle(image, (x, y), radius=8, color=(0, 255, 255), thickness=-1)

    # Save the modified image to the output folder
    output_image_path = os.path.join(output_frames_folder, filename)
    cv2.imwrite(output_image_path, image)

    # Calculate and store accuracy
    accuracy, invisible_accuracy, num_invisible = calculate_accuracy(keypoints, ground_truth_keypoints, margin=10)
    accuracies.append(accuracy)
    invisible_accuracies.append(invisible_accuracy)
#     total_inference_time.append(inference_time)
    total_invisible_keypoints += num_invisible
    print(f"Accuracy for {filename}: {accuracy}%")
    print(f"Invisible Keypoint Accuracy for {filename}: {invisible_accuracy}%")

    i += 1

# Print overall accuracy. np.mean of an empty list warns and yields nan, so the
# empty case is handled explicitly.
overall_accuracy = np.mean(accuracies) if accuracies else float('nan')
overall_invisible_accuracy = np.mean(invisible_accuracies) if invisible_accuracies else float('nan')
avg_inference_time = np.mean(total_inference_time) if total_inference_time else float('nan')
print(f"Overall accuracy: {overall_accuracy}%")
print(f"Overall invisible keypoint accuracy: {overall_invisible_accuracy}%")
print(f"Total number of invisible keypoints: {total_invisible_keypoints}")
# print(f"Average inference time: {avg_inference_time}")


print("Processing complete!")

cuda
Accuracy for 012986.jpg: 0.0%
Invisible Keypoint Accuracy for 012986.jpg: 0.0%
Overall accuracy: 0.0%
Overall invisible keypoint accuracy: 0.0%
Total number of invisible keypoints: 3
Processing complete!


In [None]:
import os
import cv2
import torch
import torchvision
import numpy as np
import json
import shutil
from torchvision.transforms import functional as F

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# weights_path = '/home/jc-merlab/Pictures/Data/lama_kp_trained_models/trained_models/keypointsrcnn_lama_b4_e25_v1.pth'
weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_planning_b1_e50_v8.pth'

# torch.load unpickles the whole model object, so only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled models - confirm against the installed version.
model = torch.load(weights_path, map_location=device)
# model = get_model(num_keypoints=6, weights_path=weights_path)
# model.load_state_dict(torch.load('keypointsrcnn_weights.pth'))

model.to(device)
model.eval()

# Specify input and output folders
input_folder = '/home/jc-merlab/Pictures/Test_Data/occ_vids/exp_01/gt/'
output_frames_folder = '/home/jc-merlab/Pictures/Data/occ_phys_test_data/prediction_with_line/'

# Check if output folders exist, create them if not
os.makedirs(output_frames_folder, exist_ok=True)

def load_ground_truth(json_path):
    """Load the annotated keypoints stored under the 'keypoints' key of a JSON file.

    Every entry is read through its first element: positions 0 and 1 are
    converted with int() and position 2 is returned as stored.

    Returns:
        list of [x, y, flag] lists, one per annotated keypoint.
    """
    def _to_triplet(entry):
        point = entry[0]
        return [int(point[0]), int(point[1]), point[2]]

    with open(json_path, 'r') as gt_file:
        data = json.load(gt_file)

    return list(map(_to_triplet, data['keypoints']))

def calculate_accuracy(predicted_keypoints, ground_truth_keypoints, margin=10):
    """
    Score predicted keypoints against ground truth using a pixel-distance margin.

    Predictions and ground truth are paired by position (first with first, and
    so on); surplus items on either side are left unpaired. A prediction counts
    as correct when its (x, y) lies within ``margin`` pixels (Euclidean
    distance) of the paired ground-truth point. The same margin is applied to
    visible and invisible keypoints.

    Args:
        predicted_keypoints: iterable of keypoints whose first two values are
            x and y. Items may be torch tensors (on any device) or plain
            sequences / NumPy arrays.
        ground_truth_keypoints: sequence of [x, y, flag]; a flag equal to 0
            marks the keypoint as invisible.
        margin: maximum distance in pixels for a prediction to count as correct.

    Returns:
        (accuracy, invisible_accuracy, total_invisible) where
        accuracy is the percentage of ALL ground-truth keypoints that were
        matched (0.0 when there is no ground truth),
        invisible_accuracy is the percentage of paired invisible keypoints that
        were matched (0 when none were paired), and
        total_invisible is the number of paired invisible keypoints.
    """
    total = len(ground_truth_keypoints)
    correct = 0
    correct_invisible = 0
    total_invisible = 0

    for pred_kp, gt_kp in zip(predicted_keypoints, ground_truth_keypoints):
        if hasattr(pred_kp, 'detach'):
            # Tensor input: bring it to host memory as a NumPy array.
            pred_kp = pred_kp.detach().cpu().numpy()
        # Use only x, y for the distance calculation.
        pred_xy = np.asarray(pred_kp[:2], dtype=float)
        gt_xy = np.asarray(gt_kp[:2], dtype=float)
        dist = np.linalg.norm(pred_xy - gt_xy)

        within_margin = dist <= margin
        if gt_kp[2] == 0:  # Invisible keypoint
            total_invisible += 1
            if within_margin:
                correct_invisible += 1
        elif within_margin:  # Visible keypoint
            correct += 1

    correct_total = correct + correct_invisible
    # Guard against an empty ground-truth list, which used to raise ZeroDivisionError.
    accuracy = (correct_total / total) * 100 if total > 0 else 0.0
    invisible_accuracy = (correct_invisible / total_invisible) * 100 if total_invisible > 0 else 0
    return accuracy, invisible_accuracy, total_invisible

# Process images in the folder: run the model on every .jpg, keep the best
# detection per label, draw boxes, predictions and the ground-truth chain, and
# accumulate accuracy statistics.
accuracies = []
invisible_accuracies = []
total_invisible_keypoints = 0
total_inference_time = []  # never filled here; kept for the commented-out timing report
i = 0  # number of images processed
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.jpg'):
        continue

    image_path = os.path.join(input_folder, filename)
    image = cv2.imread(image_path)

    tensor_image = F.to_tensor(image).to(device)
    tensor_image.unsqueeze_(0)
    tensor_image = list(tensor_image)  # the model is called with a list of per-image tensors

    with torch.no_grad():
        output = model(tensor_image)

    scores = output[0]['scores'].detach().cpu().numpy()

    # Keep every detection scoring above 0.01; the best one per label is picked below.
    high_scores_idxs = np.where(scores > 0.01)[0].tolist()
    confidence = output[0]['scores'][high_scores_idxs].detach().cpu().numpy()

    bboxes = []
    for bbox in output[0]['boxes'][high_scores_idxs].detach().cpu().numpy():
        bboxes.append(list(map(int, bbox.tolist())))

    labels = []
    for label in output[0]['labels'][high_scores_idxs].detach().cpu().numpy():
        labels.append(label)

    # One row per detection: [x, y, score, label], using the detection's first keypoint.
    keypoints = []
    for idx, kps in enumerate(output[0]['keypoints'][high_scores_idxs].detach().cpu().numpy()):
        keypoints.append(list(map(int, kps[0, 0:2])) + [confidence[idx]] + [labels[idx]])

    if keypoints:
        keypoints = torch.stack([torch.tensor(kp, dtype=torch.float32) for kp in keypoints]).to(device)

        # For every distinct label keep the row with the highest score.
        # label_slot[n] is the position of row n's label within unique_labels.
        unique_labels, label_slot = torch.unique(keypoints[:, 3], return_inverse=True)
        # Build the comparison index on the same device as the data instead of
        # calling .cuda(), so the CPU fallback selected above keeps working.
        slot_ids = torch.arange(len(unique_labels), device=keypoints.device).unsqueeze(1)
        # Rows are labels, columns are detections; scores of other labels become 0.
        masked_scores = keypoints[:, 2].unsqueeze(0) * (label_slot == slot_ids)
        best_scores, best_indices = torch.max(masked_scores, dim=1)
        keypoints = keypoints[best_indices]
    else:
        # No detection passed the threshold: torch.stack would fail on an empty
        # list, so use an empty (0, 4) tensor and let the image score 0.
        keypoints = torch.empty((0, 4), dtype=torch.float32, device=device)

    # Load ground truth keypoints from the JSON that shares the image's base name.
    # splitext only strips the extension, so names containing dots still match.
    json_filename = os.path.splitext(filename)[0] + '.json'
    json_path = os.path.join(input_folder, json_filename)
    ground_truth_keypoints = load_ground_truth(json_path)

    # Drawing order matters: later shapes cover earlier ones. The ground truth
    # is drawn once, last, so it stays on top; the earlier duplicate pass was
    # fully overdrawn by this one and has been removed.

    # Bounding boxes of all thresholded detections (green, thickness 2).
    for bbox in bboxes:
        x_min, y_min, x_max, y_max = bbox
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), thickness=2)

    # Detected keypoints (blue in OpenCV's BGR order).
    for point in keypoints:
        x, y, c, l = point
        cv2.circle(image, (int(x), int(y)), 9, (255, 0, 0), -1)

    # Ground truth keypoints in red for comparison.
    for x, y, _ in ground_truth_keypoints:
        cv2.circle(image, (x, y), radius=8, color=(0, 0, 255), thickness=-1)

    # Draw lines between consecutive ground truth keypoints.
    for j in range(len(ground_truth_keypoints) - 1):
        start_point = tuple(ground_truth_keypoints[j][:2])
        end_point = tuple(ground_truth_keypoints[j + 1][:2])
        cv2.line(image, start_point, end_point, color=(255, 0, 0), thickness=3)

    # Save the modified image to the output folder
    output_image_path = os.path.join(output_frames_folder, filename)
    cv2.imwrite(output_image_path, image)

    # Calculate and store accuracy
    accuracy, invisible_accuracy, num_invisible = calculate_accuracy(keypoints, ground_truth_keypoints, margin=10)
    accuracies.append(accuracy)
    invisible_accuracies.append(invisible_accuracy)
#     total_inference_time.append(inference_time)
    total_invisible_keypoints += num_invisible
    print(f"Accuracy for {filename}: {accuracy}%")
    print(f"Invisible Keypoint Accuracy for {filename}: {invisible_accuracy}%")

    i += 1

# Print overall accuracy. np.mean of an empty list warns and yields nan, so the
# empty case is handled explicitly.
overall_accuracy = np.mean(accuracies) if accuracies else float('nan')
overall_invisible_accuracy = np.mean(invisible_accuracies) if invisible_accuracies else float('nan')
avg_inference_time = np.mean(total_inference_time) if total_inference_time else float('nan')
print(f"Overall accuracy: {overall_accuracy}%")
print(f"Overall invisible keypoint accuracy: {overall_invisible_accuracy}%")
print(f"Total number of invisible keypoints: {total_invisible_keypoints}")
# print(f"Average inference time: {avg_inference_time}")


print("Processing complete!")

In [11]:
import os
import cv2
import torch
import torchvision
import numpy as np
import json
from torchvision.transforms import functional as F

# Run the keypoint model on every .jpg of input_folder, keep the best detection
# per label, draw the keypoints on the image, and write all keypoints to one
# text file.

# Check for CUDA availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Running on: {device}")

# Load pre-trained model
weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_planning_b1_e50_v8.pth'
# torch.load unpickles the whole model object, so only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model = torch.load(weights_path, map_location=device)
model.to(device)
model.eval()

# Input and output paths
input_folder = '/home/jc-merlab/Pictures/Test_Data/occ_vids/exp_01/gt/'
output_path = '/home/jc-merlab/Pictures/Test_Data/occ_vids/exp_01/gt/save_keypoints/'
os.makedirs(output_path, exist_ok=True)  # Ensure output directory exists

# Dictionary to store all keypoints, keyed by frame name
all_keypoints = {}

# Process images in the folder
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.jpg'):
        continue

    image_path = os.path.join(input_folder, filename)
    image = cv2.imread(image_path)

    # Convert image to tensor (with a leading batch dimension)
    tensor_image = F.to_tensor(image).to(device).unsqueeze(0)

    # Run model inference
    with torch.no_grad():
        output = model(tensor_image)

    # Extract keypoints and confidence scores of detections scoring above 0.7
    scores = output[0]['scores'].detach().cpu().numpy()
    high_scores_idxs = np.where(scores > 0.7)[0].tolist()

    confidence = output[0]['scores'][high_scores_idxs].detach().cpu().numpy()
    labels = output[0]['labels'][high_scores_idxs].detach().cpu().numpy()

    # One row per detection: [x, y, score, label], using the detection's first keypoint.
    keypoints = []
    for idx, kps in enumerate(output[0]['keypoints'][high_scores_idxs].detach().cpu().numpy()):
        keypoints.append(list(map(int, kps[0, 0:2])) + [confidence[idx]] + [labels[idx]])

    if keypoints:
        keypoints = torch.stack([torch.tensor(kp, dtype=torch.float32) for kp in keypoints]).to(device)

        # For every distinct label keep the row with the highest score.
        # label_slot[n] is the position of row n's label within unique_labels.
        unique_labels, label_slot = torch.unique(keypoints[:, 3], return_inverse=True)
        # Build the comparison index on the same device as the data instead of
        # calling .cuda(), so the CPU fallback selected above keeps working.
        slot_ids = torch.arange(len(unique_labels), device=keypoints.device).unsqueeze(1)
        masked_scores = keypoints[:, 2].unsqueeze(0) * (label_slot == slot_ids)
        best_scores, best_indices = torch.max(masked_scores, dim=1)
        keypoints = keypoints[best_indices]
        keypoints_list = keypoints.tolist()
    else:
        # No detection passed the threshold; torch.stack would fail on an empty
        # list, so record the frame with no keypoints instead of crashing.
        keypoints_list = []
    print("Keypoints after label index", keypoints_list)

    # Integer (x, y) pairs only; reshape keeps an (N, 2) shape even when N is 0.
    keypoints_all = np.array([[int(kp[0]), int(kp[1])] for kp in keypoints_list], dtype=int).reshape(-1, 2)

    print(keypoints_all.shape)

    for kps in keypoints_all:
        x, y = int(kps[0]), int(kps[1])
        cv2.circle(image, (x, y), 5, (0, 0, 255), -1)

    # Store in dictionary with frame name
    frame_key = filename.replace('.jpg', '')
    all_keypoints[frame_key] = keypoints_all.tolist()

    # Save the image with drawn keypoints
    output_image_path = os.path.join(output_path, filename)
    cv2.imwrite(output_image_path, image)

# Save all keypoints in one text file, one "<frame>: <keypoints>" line per frame
txt_output_path = os.path.join(output_path, "keypoints.txt")
with open(txt_output_path, 'w') as f:
    for frame, frame_keypoints in all_keypoints.items():
        f.write(f"{frame}: {frame_keypoints}\n")


print(f"Processing complete! Keypoints saved in {txt_output_path} and images saved with drawn keypoints.")
            
        



Running on: cuda
Keypoints after label index [[252.0, 442.0, 0.9998915195465088, 1.0], [255.0, 312.0, 0.9999997615814209, 2.0], [207.0, 283.0, 0.9999959468841553, 3.0], [157.0, 254.0, 0.999987006187439, 4.0], [172.0, 229.0, 0.9999657869338989, 5.0], [198.0, 164.0, 0.9999083280563354, 6.0], [225.0, 97.0, 0.9996922016143799, 7.0], [205.0, 73.0, 0.9864763021469116, 8.0], [230.0, 44.0, 0.9905106425285339, 9.0]]
(9, 2)
Keypoints after label index [[254.0, 441.0, 0.9996466636657715, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [207.0, 283.0, 0.9999953508377075, 3.0], [158.0, 254.0, 0.999982476234436, 4.0], [173.0, 228.0, 0.9999561309814453, 5.0], [198.0, 164.0, 0.9999051094055176, 6.0], [226.0, 97.0, 0.999650239944458, 7.0], [205.0, 72.0, 0.990409255027771, 8.0], [230.0, 45.0, 0.9917991161346436, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.999935507774353, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [207.0, 283.0, 0.9999967813491821, 3.0], [158.0, 254.0, 0.999980330467224

Keypoints after label index [[251.0, 441.0, 0.9998308420181274, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [214.0, 273.0, 0.9999963045120239, 3.0], [171.0, 234.0, 0.9999827146530151, 4.0], [191.0, 212.0, 0.9999685287475586, 5.0], [231.0, 155.0, 0.9997351765632629, 6.0], [271.0, 95.0, 0.9997947812080383, 7.0], [260.0, 68.0, 0.9950980544090271, 8.0], [294.0, 46.0, 0.9961163997650146, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.9999338388442993, 1.0], [255.0, 311.0, 0.9999996423721313, 2.0], [214.0, 273.0, 0.9999960660934448, 3.0], [172.0, 234.0, 0.99997878074646, 4.0], [192.0, 212.0, 0.9999760389328003, 5.0], [232.0, 154.0, 0.999760091304779, 6.0], [273.0, 94.0, 0.9997716546058655, 7.0], [261.0, 67.0, 0.9907596111297607, 8.0], [296.0, 47.0, 0.9961928129196167, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.999923825263977, 1.0], [255.0, 311.0, 0.9999994039535522, 2.0], [214.0, 272.0, 0.9999960660934448, 3.0], [172.0, 233.0, 0.9999843835830688, 4.0], [193.0,

Keypoints after label index [[252.0, 442.0, 0.9999604225158691, 1.0], [255.0, 311.0, 0.9999986886978149, 2.0], [225.0, 263.0, 0.9999905824661255, 3.0], [194.0, 216.0, 0.9999579191207886, 4.0], [219.0, 199.0, 0.9998288154602051, 5.0], [273.0, 153.0, 0.9997923970222473, 6.0], [329.0, 105.0, 0.999832272529602, 7.0], [329.0, 75.0, 0.9919801950454712, 8.0], [367.0, 68.0, 0.9955319166183472, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9998249411582947, 1.0], [255.0, 312.0, 0.999998927116394, 2.0], [226.0, 263.0, 0.9999923706054688, 3.0], [195.0, 215.0, 0.9999754428863525, 4.0], [220.0, 199.0, 0.9998650550842285, 5.0], [275.0, 152.0, 0.9993796348571777, 6.0], [331.0, 106.0, 0.9998262524604797, 7.0], [329.0, 74.0, 0.9966117739677429, 8.0], [370.0, 69.0, 0.9961179494857788, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 441.0, 0.9998313188552856, 1.0], [255.0, 311.0, 0.9999988079071045, 2.0], [226.0, 263.0, 0.9999887943267822, 3.0], [196.0, 214.0, 0.9999699592590332, 4.0], [22

Keypoints after label index [[250.0, 441.0, 0.9998865127563477, 1.0], [255.0, 311.0, 0.9999991655349731, 2.0], [238.0, 257.0, 0.9999988079071045, 3.0], [219.0, 203.0, 0.9999847412109375, 4.0], [247.0, 192.0, 0.9999579191207886, 5.0], [313.0, 160.0, 0.9999364614486694, 6.0], [379.0, 129.0, 0.9996788501739502, 7.0], [387.0, 97.0, 0.9987529516220093, 8.0], [429.0, 104.0, 0.9964649677276611, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.999842643737793, 1.0], [255.0, 311.0, 0.9999991655349731, 2.0], [238.0, 256.0, 0.9999984502792358, 3.0], [220.0, 202.0, 0.9999465942382812, 4.0], [248.0, 192.0, 0.9999549388885498, 5.0], [314.0, 161.0, 0.9999710321426392, 6.0], [381.0, 131.0, 0.9997472167015076, 7.0], [390.0, 99.0, 0.9992740750312805, 8.0], [431.0, 106.0, 0.9968286156654358, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.9999476671218872, 1.0], [255.0, 311.0, 0.9999990463256836, 2.0], [239.0, 256.0, 0.999997615814209, 3.0], [222.0, 202.0, 0.9999922513961792, 4.0], [

Keypoints after label index [[250.0, 442.0, 0.9998635053634644, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [251.0, 253.0, 0.9999979734420776, 3.0], [245.0, 197.0, 0.9998792409896851, 4.0], [275.0, 193.0, 0.9999178647994995, 5.0], [348.0, 179.0, 0.9998975992202759, 6.0], [421.0, 164.0, 0.9998501539230347, 7.0], [440.0, 137.0, 0.9972777962684631, 8.0], [476.0, 157.0, 0.9962764382362366, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9998527765274048, 1.0], [255.0, 311.0, 0.9999992847442627, 2.0], [252.0, 254.0, 0.9999979734420776, 3.0], [247.0, 197.0, 0.999941349029541, 4.0], [276.0, 193.0, 0.9999260902404785, 5.0], [348.0, 180.0, 0.9997377991676331, 6.0], [421.0, 166.0, 0.999811589717865, 7.0], [441.0, 138.0, 0.9976656436920166, 8.0], [478.0, 160.0, 0.9959408044815063, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9998433589935303, 1.0], [256.0, 311.0, 0.9999991655349731, 2.0], [252.0, 254.0, 0.9999979734420776, 3.0], [248.0, 197.0, 0.9998986721038818, 4.0],

Keypoints after label index [[252.0, 442.0, 0.9999147653579712, 1.0], [255.0, 311.0, 0.9999994039535522, 2.0], [266.0, 253.0, 0.9999949932098389, 3.0], [276.0, 197.0, 0.9999556541442871, 4.0], [305.0, 200.0, 0.9998435974121094, 5.0], [379.0, 206.0, 0.9997004270553589, 6.0], [453.0, 213.0, 0.9997860789299011, 7.0], [482.0, 192.0, 0.9983035326004028, 8.0], [510.0, 225.0, 0.9953402280807495, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.999833345413208, 1.0], [255.0, 311.0, 0.9999994039535522, 2.0], [266.0, 254.0, 0.9999954700469971, 3.0], [276.0, 198.0, 0.9999746084213257, 4.0], [307.0, 201.0, 0.9999101161956787, 5.0], [380.0, 208.0, 0.9995797276496887, 6.0], [454.0, 215.0, 0.9998494386672974, 7.0], [483.0, 195.0, 0.9983261227607727, 8.0], [511.0, 228.0, 0.9952347874641418, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.99988853931427, 1.0], [255.0, 311.0, 0.9999996423721313, 2.0], [268.0, 254.0, 0.9999929666519165, 3.0], [278.0, 198.0, 0.9999377727508545, 4.0], 

Keypoints after label index [[250.0, 442.0, 0.9998753070831299, 1.0], [255.0, 311.0, 0.9999996423721313, 2.0], [279.0, 258.0, 0.9999973773956299, 3.0], [302.0, 204.0, 0.9987466335296631, 4.0], [330.0, 214.0, 0.9997922778129578, 5.0], [400.0, 239.0, 0.9999178647994995, 6.0], [471.0, 263.0, 0.9998791217803955, 7.0], [506.0, 254.0, 0.9969280362129211, 8.0], [522.0, 294.0, 0.9960975646972656, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9998539686203003, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [280.0, 258.0, 0.9999984502792358, 3.0], [303.0, 204.0, 0.9983341097831726, 4.0], [331.0, 215.0, 0.9997965693473816, 5.0], [401.0, 241.0, 0.9999164342880249, 6.0], [472.0, 266.0, 0.9993956089019775, 7.0], [507.0, 257.0, 0.9965211153030396, 8.0], [522.0, 297.0, 0.9929667711257935, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.999915361404419, 1.0], [256.0, 311.0, 0.9999994039535522, 2.0], [281.0, 258.0, 0.9999982118606567, 3.0], [304.0, 205.0, 0.9974937438964844, 4.0]

Keypoints after label index [[252.0, 442.0, 0.9998457431793213, 1.0], [255.0, 311.0, 0.9999996423721313, 2.0], [289.0, 262.0, 0.9999924898147583, 3.0], [321.0, 214.0, 0.999447762966156, 4.0], [347.0, 230.0, 0.9998154044151306, 5.0], [411.0, 268.0, 0.9998323917388916, 6.0], [477.0, 308.0, 0.9998999834060669, 7.0], [514.0, 305.0, 0.9858681559562683, 8.0], [518.0, 349.0, 0.9939180612564087, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.9998575448989868, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [289.0, 262.0, 0.999992847442627, 3.0], [321.0, 214.0, 0.9992977380752563, 4.0], [346.0, 229.0, 0.9997777342796326, 5.0], [411.0, 268.0, 0.9998270869255066, 6.0], [477.0, 308.0, 0.9998966455459595, 7.0], [515.0, 305.0, 0.9859316349029541, 8.0], [518.0, 349.0, 0.9940631985664368, 9.0]]
(9, 2)
Keypoints after label index [[252.0, 442.0, 0.9998327493667603, 1.0], [255.0, 311.0, 0.9999997615814209, 2.0], [289.0, 262.0, 0.9999927282333374, 3.0], [321.0, 214.0, 0.9993122816085815, 4.0],

Keypoints after label index [[250.0, 442.0, 0.9998694658279419, 1.0], [255.0, 311.0, 0.9999996423721313, 2.0], [284.0, 261.0, 0.9999927282333374, 3.0], [314.0, 210.0, 0.9996733665466309, 4.0], [340.0, 224.0, 0.9999780654907227, 5.0], [412.0, 243.0, 0.9998109936714172, 6.0], [485.0, 264.0, 0.9998791217803955, 7.0], [522.0, 257.0, 0.9953539371490479, 8.0], [530.0, 299.0, 0.992976188659668, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9998199343681335, 1.0], [255.0, 311.0, 0.9999995231628418, 2.0], [284.0, 260.0, 0.9999911785125732, 3.0], [313.0, 210.0, 0.9998107552528381, 4.0], [339.0, 223.0, 0.9999775886535645, 5.0], [412.0, 242.0, 0.9999089241027832, 6.0], [485.0, 260.0, 0.9999263286590576, 7.0], [521.0, 255.0, 0.9938099980354309, 8.0], [529.0, 298.0, 0.9973949193954468, 9.0]]
(9, 2)
Keypoints after label index [[255.0, 443.0, 0.999754011631012, 1.0], [255.0, 311.0, 0.9999994039535522, 2.0], [284.0, 260.0, 0.9999868869781494, 3.0], [313.0, 209.0, 0.9997208714485168, 4.0],

Keypoints after label index [[250.0, 442.0, 0.9999346733093262, 1.0], [255.0, 311.0, 0.9999995231628418, 2.0], [280.0, 258.0, 0.999996542930603, 3.0], [303.0, 204.0, 0.9993190765380859, 4.0], [330.0, 216.0, 0.9998654127120972, 5.0], [405.0, 212.0, 0.9998815059661865, 6.0], [479.0, 210.0, 0.9994195699691772, 7.0], [513.0, 200.0, 0.9968094229698181, 8.0], [525.0, 240.0, 0.9956795573234558, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.999953031539917, 1.0], [255.0, 311.0, 0.9999995231628418, 2.0], [279.0, 257.0, 0.9999980926513672, 3.0], [303.0, 204.0, 0.9995323419570923, 4.0], [330.0, 216.0, 0.9998133778572083, 5.0], [404.0, 211.0, 0.9996777772903442, 6.0], [478.0, 207.0, 0.9994078874588013, 7.0], [512.0, 197.0, 0.9954190254211426, 8.0], [522.0, 237.0, 0.9949377775192261, 9.0]]
(9, 2)
Keypoints after label index [[250.0, 442.0, 0.9999083280563354, 1.0], [255.0, 311.0, 0.9999994039535522, 2.0], [279.0, 257.0, 0.9999977350234985, 3.0], [302.0, 204.0, 0.9996416568756104, 4.0],

In [None]:
import os
import cv2
import torch
import torchvision
import numpy as np
import json
import shutil
from torchvision.transforms import functional as F
import time

# Select the inference device: GPU when CUDA is available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Load the trained model. The checkpoint holds the whole pickled model object
# (not just a state_dict), which is why the result can be moved to a device
# directly.
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints that come from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles; pass weights_only=False there -- confirm
# against the installed version.
weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_planning_b1_e50_v8.pth'
# map_location lets a checkpoint that was saved from a GPU session load on a
# CPU-only machine, matching the CPU fallback chosen for `device` above.
model = torch.load(weights_path, map_location=device).to(device)

# Inference mode: call once; .to(device)/.eval() are idempotent, so there is no
# need to repeat them.
model.eval()

# Specify input and output folders
input_folder = '/home/jc-merlab/Pictures/Data/occ_panda_phys_test_data/'
output_frames_folder = '/home/jc-merlab/Pictures/Data/occ_phys_test_data/panda_kprcnn_op/'

# Create the output folder if it does not exist yet
os.makedirs(output_frames_folder, exist_ok=True)

# Run the loaded model on every frame of a recorded video, draw the detected
# boxes/keypoints with `visualize` (defined in another cell of this notebook)
# and save each annotated frame as a JPEG.
video_path = '/home/jc-merlab/Pictures/Test_Data/ycb_test_01.avi'
annotated_dir = '/home/jc-merlab/Pictures/Test_Data/vid_occ_kp/'

SCORE_THRESHOLD = 0.7      # keep detections scoring above this value
NMS_IOU_THRESHOLD = 0.3    # IoU threshold used for non-maximum suppression

# cv2.imwrite reports failure through its return value instead of raising, so a
# missing directory would silently drop every frame. Create it up front.
os.makedirs(annotated_dir, exist_ok=True)

# Create a VideoCapture object and read from input file
# If the input is the camera, pass 0 instead of the video file name
cap = cv2.VideoCapture(video_path)

# Check if the video opened successfully
if not cap.isOpened():
    print("Error opening video stream or file")

# Loop-invariant setup: do it once instead of once per frame.
model.to(device)
model.eval()

i = 0
try:
    while cap.isOpened():
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break
        print(i)

        # F.to_tensor accepts the HxWxC uint8 ndarray directly, so no PIL round
        # trip is needed. The model expects a list of CxHxW tensors.
        # NOTE(review): OpenCV decodes frames as BGR and the channel order is
        # passed through unchanged here -- confirm this matches the channel
        # order the model was trained with.
        image = [F.to_tensor(frame).to(device)]

        with torch.no_grad():
            start = time.perf_counter()
            output = model(image)
            if device == 'cuda':
                # Wait for queued GPU work so the timing covers the full forward pass.
                torch.cuda.synchronize()
            stop = time.perf_counter()
        print("time", (stop - start))

        # Back to an HxWxC uint8 array for drawing.
        image = (image[0].permute(1, 2, 0).detach().cpu().numpy() * 255).astype(np.uint8)
        scores = output[0]['scores'].detach().cpu().numpy()

        # Indexes of detections with scores above the threshold
        high_scores_idxs = np.where(scores > SCORE_THRESHOLD)[0].tolist()
        # Indexes (into the high-score subset) left after applying NMS
        post_nms_idxs = torchvision.ops.nms(
            output[0]['boxes'][high_scores_idxs],
            output[0]['scores'][high_scores_idxs],
            NMS_IOU_THRESHOLD,
        ).cpu().numpy()

        # First select the detections above the score threshold
        # ([high_scores_idxs]), then those that survive NMS ([post_nms_idxs]).
        kept_keypoints = output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
        kept_boxes = output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()

        # Only the first two values of each keypoint are used, truncated to int.
        keypoints = [[list(map(int, kp[:2])) for kp in kps] for kps in kept_keypoints]
        bboxes = [list(map(int, bbox.tolist())) for bbox in kept_boxes]

        img = visualize(image, bboxes, keypoints)

        out_path = os.path.join(annotated_dir, "out_image_" + str(i) + ".jpg")
        if not cv2.imwrite(out_path, img):
            print("Failed to write", out_path)

        i = i + 1
finally:
    # Release the capture even if a frame fails to process.
    cap.release()
    # Closes all the frames
    cv2.destroyAllWindows()
